/* 16-bit signed integer dot product
* Altivec-assisted version
* Copyright 2004 Phil Karn
* May be used under the terms of the GNU Lesser General Public License (LGPL)
*/
#include <stdint.h>
#include <stdlib.h>
#include "fec.h"
/* Descriptor for the AltiVec dot product routines */
struct dotprod {
  /* Number of coefficients */
  int len;

  /* Eight copies of the coefficient set. Copy i is shifted by i
   * 16-bit elements (i = 0..7), so that one copy matches every
   * possible position of the input data within a 16-byte block.
   */
  signed short *coeffs[8];
};
/* Create and return a descriptor for use with the dot product function.
 *
 * coeffs - array of len 16-bit signed coefficients; copied, so the caller
 *          keeps ownership of the array
 * len    - number of coefficients
 *
 * Returns an opaque descriptor to be released with freedp_av(), or NULL
 * when coeffs is NULL, len is not positive, or memory cannot be allocated.
 */
void *initdp_av(signed short coeffs[],int len){
  struct dotprod *dp;
  int i,j;

  if(coeffs == NULL || len <= 0)
    return NULL;

  dp = calloc(1,sizeof *dp);
  if(dp == NULL)
    return NULL;
  dp->len = len;

  /* Make 8 copies of the coefficients, one for each data alignment.
   * Copy i starts i elements into its buffer; calloc() leaves the
   * padding before and after the coefficients zeroed. Each buffer is
   * sized in whole 8-element vectors so the last (shifted) coefficient
   * still fits.
   *
   * NOTE(review): dotprod_av() accesses these buffers through vector
   * pointers, which assumes the allocator returns 16-byte aligned
   * memory. calloc() only guarantees alignment suitable for standard
   * types - confirm this holds on the target platform.
   */
  for(i=0;i<8;i++){
    dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
    if(dp->coeffs[i] == NULL){
      /* Out of memory: release everything allocated so far */
      for(j=0;j<i;j++)
        free(dp->coeffs[j]);
      free(dp);
      return NULL;
    }
    for(j=0;j<len;j++)
      dp->coeffs[i][j+i] = coeffs[j];
  }
  return (void *)dp;
}
/* Free a dot product descriptor created earlier by initdp_av().
 *
 * p - descriptor returned by initdp_av(); NULL is accepted and ignored,
 *     since initdp_av() returns NULL when it has nothing to create.
 */
void freedp_av(void *p){
  struct dotprod *dp = (struct dotprod *)p;
  int i;

  if(dp == NULL)
    return;

  /* free(NULL) is a no-op, so unset slots need no special handling */
  for(i=0;i<8;i++)
    free(dp->coeffs[i]);
  free(dp);
}
/* Compute a dot product given a descriptor and an input array.
 * The length is taken from the descriptor.
 *
 * p - descriptor returned by initdp_av()
 * a - input samples; at least dp->len elements. The routine reads whole
 *     16-byte blocks, so elements sharing the first and last block with
 *     the input are loaded as well; they are multiplied by the zero
 *     padding in the shifted coefficient copy and do not affect the sum.
 *
 * Returns the dot product, accumulated with saturating 32-bit arithmetic
 * (vec_msums/vec_adds/vec_sums), or 0 when p is NULL.
 */
long dotprod_av(void *p,signed short a[]){
  struct dotprod *dp = (struct dotprod *)p;
  uintptr_t addr;
  int al;
  vector signed short *ar,*d;
  vector signed int sums0,sums1,sums2,sums3;
  union { vector signed int v; signed int w[4];} s;
  int nblocks;

  if(dp == NULL)
    return 0;

  /* Round ar down to the beginning of the 16-byte block containing the
   * 0th element of the input buffer, then set d to the one of the 8 sets
   * of shifted coefficients that matches the input's offset in the block.
   * The address arithmetic uses uintptr_t so the pointer is not truncated
   * on targets where pointers are wider than int.
   */
  addr = (uintptr_t)a;
  ar = (vector signed short *)(addr & ~(uintptr_t)15);
  al = (int)((addr & 15)/sizeof(signed short));
  d = (vector signed short *)dp->coeffs[al];

  /* Number of 8-element blocks spanned by the shifted coefficients */
  nblocks = (dp->len+al-1)/8+1;

  /* Sum into four vectors each holding four 32-bit partial sums */
  sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
  while(nblocks >= 4){
    sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
    sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
    sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
    sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
    nblocks -= 4;
  }
  /* Fold the four accumulators into sums0 */
  sums0 = vec_adds(sums0,sums1);
  sums2 = vec_adds(sums2,sums3);
  sums0 = vec_adds(sums0,sums2);

  /* Handle the 0..3 blocks left over from the unrolled loop */
  while(nblocks-- > 0){
    sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
  }

  /* Sum 4 partial sums into final result; the total is read back
   * from the last element of the result vector
   */
  s.v = vec_sums(sums0,(vector signed int)(0));
  return s.w[3];
}