C++程序  |  1283行  |  34.25 KB

/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ---- includes ----------------------------------------------------------- */

#include "b_BasicEm/Math.h"
#include "b_BasicEm/Functions.h"

/* ---- related objects  --------------------------------------------------- */

/* ---- typedefs ----------------------------------------------------------- */

/* ---- constants ---------------------------------------------------------- */

/* ------------------------------------------------------------------------- */

/* ========================================================================= */
/*                                                                           */
/* ---- \ghd{ external functions } ----------------------------------------- */
/*                                                                           */
/* ========================================================================= */

#if defined( HW_SSE2 )
	extern int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
	extern int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
#endif

#if defined( HW_FR71 )
	int32 bbs_dotProduct_fr71( const int16* vec1A, const int16* vec2A, uint32 sizeA );
#endif

/* ------------------------------------------------------------------------- */

uint16 bbs_sqrt32( uint32 valA )
{
	uint32 rootL = 0;
	uint32 expL = 0;
	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
	switch( expL >> 1 )
	{
		case 15: rootL += ( ( rootL + 0x8000 ) * ( rootL + 0x8000 ) <= valA ) << 15;
		case 14: rootL += ( ( rootL + 0x4000 ) * ( rootL + 0x4000 ) <= valA ) << 14;
		case 13: rootL += ( ( rootL + 0x2000 ) * ( rootL + 0x2000 ) <= valA ) << 13;
		case 12: rootL += ( ( rootL + 0x1000 ) * ( rootL + 0x1000 ) <= valA ) << 12;
		case 11: rootL += ( ( rootL + 0x0800 ) * ( rootL + 0x0800 ) <= valA ) << 11;
		case 10: rootL += ( ( rootL + 0x0400 ) * ( rootL + 0x0400 ) <= valA ) << 10;
		case 9:  rootL += ( ( rootL + 0x0200 ) * ( rootL + 0x0200 ) <= valA ) << 9;
		case 8:  rootL += ( ( rootL + 0x0100 ) * ( rootL + 0x0100 ) <= valA ) << 8;
		case 7:  rootL += ( ( rootL + 0x0080 ) * ( rootL + 0x0080 ) <= valA ) << 7;
		case 6:  rootL += ( ( rootL + 0x0040 ) * ( rootL + 0x0040 ) <= valA ) << 6;
		case 5:  rootL += ( ( rootL + 0x0020 ) * ( rootL + 0x0020 ) <= valA ) << 5;
		case 4:  rootL += ( ( rootL + 0x0010 ) * ( rootL + 0x0010 ) <= valA ) << 4;
		case 3:  rootL += ( ( rootL + 0x0008 ) * ( rootL + 0x0008 ) <= valA ) << 3;
		case 2:  rootL += ( ( rootL + 0x0004 ) * ( rootL + 0x0004 ) <= valA ) << 2;
		case 1:  rootL += ( ( rootL + 0x0002 ) * ( rootL + 0x0002 ) <= valA ) << 1;
		case 0:  rootL += ( ( rootL + 0x0001 ) * ( rootL + 0x0001 ) <= valA ) << 0;
	}

	return ( uint16 )rootL;
}

/* ------------------------------------------------------------------------- */

uint8 bbs_sqrt16( uint16 valA )
{
	uint16 rootL = 0;
	rootL += ( ( rootL + 0x80 ) * ( rootL + 0x80 ) <= valA ) << 7;
	rootL += ( ( rootL + 0x40 ) * ( rootL + 0x40 ) <= valA ) << 6;
	rootL += ( ( rootL + 0x20 ) * ( rootL + 0x20 ) <= valA ) << 5;
	rootL += ( ( rootL + 0x10 ) * ( rootL + 0x10 ) <= valA ) << 4;
	rootL += ( ( rootL + 0x08 ) * ( rootL + 0x08 ) <= valA ) << 3;
	rootL += ( ( rootL + 0x04 ) * ( rootL + 0x04 ) <= valA ) << 2;
	rootL += ( ( rootL + 0x02 ) * ( rootL + 0x02 ) <= valA ) << 1;
	rootL += ( ( rootL + 0x01 ) * ( rootL + 0x01 ) <= valA ) << 0;
	return ( uint8 )rootL;
}

/* ------------------------------------------------------------------------- */

/* table of sqrt and slope values */
const uint32 bbs_fastSqrt32_tableG[] = 
{
	268435456, 1016, 272596992, 1000, 276692992, 987, 280735744, 972, 
	284717056, 959, 288645120, 946, 292519936, 933, 296341504, 922, 
	300118016, 910, 303845376, 899, 307527680, 889, 311169024, 878, 
	314765312, 869, 318324736, 858, 321839104, 850, 325320704, 840, 
	328761344, 832, 332169216, 824, 335544320, 815, 338882560, 807, 
	342188032, 799, 345460736, 792, 348704768, 785, 351920128, 777, 
	355102720, 771, 358260736, 764, 361390080, 757, 364490752, 751, 
	367566848, 745, 370618368, 739, 373645312, 732, 376643584, 727, 
	379621376, 722, 382578688, 715, 385507328, 711, 388419584, 705, 
	391307264, 700, 394174464, 695, 397021184, 689, 399843328, 686, 
	402653184, 680, 405438464, 675, 408203264, 672, 410955776, 666, 
	413683712, 663, 416399360, 658, 419094528, 653, 421769216, 650, 
	424431616, 646, 427077632, 641, 429703168, 638, 432316416, 634, 
	434913280, 630, 437493760, 627, 440061952, 622, 442609664, 620, 
	445149184, 615, 447668224, 613, 450179072, 609, 452673536, 605, 
	455151616, 602, 457617408, 600, 460075008, 595, 462512128, 593, 
	464941056, 590, 467357696, 587, 469762048, 583, 472150016, 581, 
	474529792, 578, 476897280, 575, 479252480, 572, 481595392, 569, 
	483926016, 567, 486248448, 564, 488558592, 561, 490856448, 559, 
	493146112, 556, 495423488, 553, 497688576, 552, 499949568, 548, 
	502194176, 546, 504430592, 544, 506658816, 541, 508874752, 539, 
	511082496, 537, 513282048, 534, 515469312, 533, 517652480, 529, 
	519819264, 528, 521981952, 526, 524136448, 523, 526278656, 521, 
	528412672, 519, 530538496, 517, 532656128, 515, 534765568, 514 
};

uint16 bbs_fastSqrt32( uint32 valA )
{
	uint32 expL = 0;
	uint32 valL;
	uint32 offsL;
	uint32 indexL = 0;

	if( valA == 0 ) return 0;

	/* compute closest even size exponent of valA */
	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;

	valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 12 ) ) >> 13;
	indexL = ( valL >> 24 ) & 0xFE;

	return ( bbs_fastSqrt32_tableG[ indexL ] + offsL * bbs_fastSqrt32_tableG[ indexL + 1 ] ) >> ( 28 - ( expL >> 1 ) );
}

/* ------------------------------------------------------------------------- */

/* table of 1/sqrt (1.31) and negative slope (.15) values 
   referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
const uint32 bbs_invSqrt32_tableG[] = 
{
	2147483648u, 1001, 2114682880, 956, 2083356672, 915, 2053373952, 877,
	2024636416, 840, 1997111296, 808, 1970634752, 776, 1945206784, 746,
	1920761856, 720, 1897168896, 693, 1874460672, 669, 1852538880, 646,
	1831370752, 625, 1810890752, 604, 1791098880, 584, 1771962368, 567,
	1753382912, 548, 1735426048, 533, 1717960704, 516, 1701052416, 502,
	1684602880, 487, 1668644864, 474, 1653112832, 461, 1638006784, 448,
	1623326720, 436, 1609039872, 426, 1595080704, 414, 1581514752, 404,
	1568276480, 394, 1555365888, 384, 1542782976, 375, 1530494976, 367,
	1518469120, 357, 1506770944, 350, 1495302144, 342, 1484095488, 334,
	1473150976, 327, 1462435840, 320, 1451950080, 313, 1441693696, 307,
	1431633920, 300, 1421803520, 294, 1412169728, 289, 1402699776, 282,
	1393459200, 277, 1384382464, 272, 1375469568, 266, 1366753280, 262,
	1358168064, 257, 1349746688, 251, 1341521920, 248, 1333395456, 243,
	1325432832, 238, 1317634048, 235, 1309933568, 230, 1302396928, 227,
	1294958592, 222, 1287684096, 219, 1280507904, 216, 1273430016, 211,
	1266515968, 209, 1259667456, 205, 1252950016, 202, 1246330880, 198,
	1239842816, 196, 1233420288, 192, 1227128832, 190, 1220902912, 187,
	1214775296, 184, 1208745984, 181, 1202814976, 179, 1196949504, 176,
	1191182336, 173, 1185513472, 171, 1179910144, 169, 1174372352, 166,
	1168932864, 164, 1163558912, 162, 1158250496, 160, 1153007616, 157,
	1147863040, 155, 1142784000, 154, 1137737728, 151, 1132789760, 149,
	1127907328, 148, 1123057664, 145, 1118306304, 144, 1113587712, 142,
	1108934656, 140, 1104347136, 138, 1099825152, 137, 1095335936, 135,
	1090912256, 134, 1086521344, 131, 1082228736, 131, 1077936128, 128		
};

uint32 bbs_invSqrt32( uint32 valA )
{

	uint32 expL = 0;
	uint32 valL;
	uint32 offsL;
	uint32 indexL = 0;

	if( valA == 0U ) return 0x80000000;

	/* compute closest even size exponent of valA */
	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
	
	valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 9 ) ) >> 10;
	indexL = ( valL >> 24 ) & 0xFE;
	
	return ( bbs_invSqrt32_tableG[ indexL ] - offsL * bbs_invSqrt32_tableG[ indexL + 1 ] ) >> ( expL >> 1 );
}

/* ------------------------------------------------------------------------- */

/* table of 1/( x + 1 ) (2.30) and negative slope (.14) values
   referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
const int32 bbs_inv32_tableG[] = 
{
	1073741824, 1986, 1041203200, 1870, 1010565120, 1762, 981696512, 1664,
	954433536,  1575, 928628736,  1491, 904200192,  1415, 881016832, 1345,
	858980352,  1278, 838041600,  1218, 818085888,  1162, 799047680, 1108,
	780894208,  1059, 763543552,  1013, 746946560,  970,  731054080, 930,
	715816960,  891,  701218816,  856,  687194112,  823,  673710080, 791,
	660750336,  761,  648282112,  732,  636289024,  706,  624721920, 681,
	613564416,  657,  602800128,  635,  592396288,  613,  582352896, 592,
	572653568,  573,  563265536,  554,  554188800,  537,  545390592, 520,
};

int32 bbs_inv32( int32 valA )
{

	uint32 expL = 0;
	int32 signL = ( ( valA >> 30 ) & 0xFFFFFFFE ) + 1;
	int32 valL = signL * valA;
	int32 offsL;
	uint32 indexL = 0;

	if( valL <= ( int32 ) 1 ) return 0x40000000 * signL;

	/* compute size exponent of valL */
	expL += ( ( valL >> ( expL + 0x10 ) ) != 0 ) << 4;
	expL += ( ( valL >> ( expL + 0x08 ) ) != 0 ) << 3;
	expL += ( ( valL >> ( expL + 0x04 ) ) != 0 ) << 2;
	expL += ( ( valL >> ( expL + 0x02 ) ) != 0 ) << 1;
	expL += ( ( valL >> ( expL + 0x01 ) ) != 0 );
	
	valL = ( ( valL << ( 30 - expL ) ) - 1073741824 ); /*( 1U << 30 )*/
	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 10 ) ) >> 11;
	indexL = ( valL >> 24 ) & 0xFE;
	
	return signL * ( ( ( ( bbs_inv32_tableG[ indexL ] - offsL * bbs_inv32_tableG[ indexL + 1 ] ) >> ( expL - 1 ) ) + 1 ) >> 1 );
}

/* ------------------------------------------------------------------------- */

uint32 bbs_intLog2( uint32 valA )
{
	uint32 expL = 0;
	expL += 0x10 * ( ( valA >> ( expL + 0x10 ) ) != 0 );
	expL += 0x08 * ( ( valA >> ( expL + 0x08 ) ) != 0 );
	expL += 0x04 * ( ( valA >> ( expL + 0x04 ) ) != 0 );
	expL += 0x02 * ( ( valA >> ( expL + 0x02 ) ) != 0 );
	expL += 0x01 * ( ( valA >> ( expL + 0x01 ) ) != 0 );
	return expL;
}

/* ------------------------------------------------------------------------- */

const uint32 bbs_pow2M1_tableG[] = 
{
	0,			713,	46769127,	721,	94047537,	729,	141840775,	737,
	190154447,	745,	238994221,	753,	288365825,	761,	338275050,	769,
	388727751,	778,	439729846,	786,	491287318,	795,	543406214,	803,
	596092647,	812,	649352798,	821,	703192913,	830,	757619310,	839,
	812638371,	848,	868256550,	857,	924480371,	867,	981316430,	876,
	1038771393, 886,	1096851999, 895,	1155565062, 905,	1214917468, 915,
	1274916179, 925,	1335568233, 935,	1396880745, 945,	1458860907, 956,
	1521515988, 966,	1584853338, 976,	1648880387, 987,	1713604645, 998,
	1779033703, 1009,	1845175238, 1020,	1912037006, 1031,	1979626852, 1042,
	2047952702, 1053,	2117022572, 1065,	2186844564u, 1077,	2257426868u, 1088,
	2328777762u, 1100,	2400905617u, 1112,	2473818892u, 1124,	2547526141u, 1136,
	2622036010u, 1149,	2697357237u, 1161,	2773498659u, 1174,	2850469207u, 1187,
	2928277909u, 1200,	3006933892u, 1213,	3086446383u, 1226,	3166824708u, 1239,
	3248078296u, 1253,	3330216677u, 1266,	3413249486u, 1280,	3497186464u, 1294,
	3582037455u, 1308,	3667812413u, 1323,	3754521399u, 1337,	3842174584u, 1352,
	3930782250u, 1366,	4020354790u, 1381,	4110902710u, 1396,	4202436633u, 1411
};

uint32 bbs_pow2M1( uint32 valA )
{
	uint32 offsL = ( valA & 0x03FFFFFF ) >> 10;
	uint16 indexL = ( ( valA & 0xFC000000 ) >> 26 ) << 1;
	return bbs_pow2M1_tableG[ indexL ] + offsL * bbs_pow2M1_tableG[ indexL + 1 ];	
}

/* ------------------------------------------------------------------------- */

uint32 bbs_pow2( int32 valA )
{
	int32 shiftL = 16 - ( valA >> 27 );
	uint32 offsL  = ( uint32 )( valA << 5 );
	if( shiftL == 32 ) return 1;
	return ( 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
}

/* ------------------------------------------------------------------------- */

uint32 bbs_exp( int32 valA )
{
	int32 adjustedL;
	int32 shiftL;
	int32 offsL;

	/* check boundaries to avoid overflow */
	if( valA < -1488522236 )
	{
		return 0;
	}
	else if( valA > 1488522236 )
	{
		return 0xFFFFFFFF;
	}

	/* multily valA with 1/ln(2) in order to use function 2^x instead of e^x */
	adjustedL = ( valA >> 16 ) * 94548 + ( ( ( ( ( uint32 )valA ) & 0x0FFFF ) * 47274 ) >> 15 );

	shiftL = 16 - ( adjustedL >> 27 );
	if( shiftL == 32 ) return 1;
	offsL  = ( uint32 )( adjustedL << 5 );
	return ( ( int32 ) 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
}

/* ------------------------------------------------------------------------- */

int16 bbs_satS16( int32 valA )
{
	if( valA > 32767 ) return 32767;
	if( valA < -32768 ) return -32768;
	return valA;
}

/* ------------------------------------------------------------------------- */

#if defined( HW_i586 ) || defined( HW_i686 )

/* Windows section */
#if defined( WIN32 ) && !defined( WIN64 )

/* disable warning "no return value"*/
#pragma warning( disable : 4035 )

/** 
 * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0 
 */
int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__asm
	{	
			push    esi
			push    edi

			mov     eax, vec1A
			mov     ebx, vec2A

			mov     ecx, sizeA

			pxor    mm4, mm4
			pxor    mm6, mm6

			pxor    mm7, mm7
			shr		ecx, 4

		inner_loop_start:
			movq    mm0, 0[eax]
			paddd   mm7, mm4

			movq    mm1, 0[ebx]
			paddd   mm7, mm6

			movq    mm2, 8[eax]
			pmaddwd mm0, mm1

			movq    mm3, 8[ebx]

			movq    mm4, 16[eax]
			pmaddwd mm2, mm3

			movq    mm5, 16[ebx]
			paddd   mm7, mm0

			movq    mm6, 24[eax]
			pmaddwd mm4, mm5

			pmaddwd mm6, 24[ebx]
			paddd   mm7, mm2

			add     eax, 32
			add     ebx, 32

			dec     ecx
			jnz     inner_loop_start

			paddd   mm7, mm4

			paddd   mm7, mm6
        
			movq    mm0, mm7

			psrlq   mm0, 32

			paddd   mm7, mm0

			movd    eax, mm7
			
			emms
			pop     edi
			pop     esi
	}
}

#pragma warning( default : 4035 )

/* gcc compiler section */
#elif defined( epl_LINUX ) || defined( CYGWIN )

/**
 * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
 */
int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	int32 resultL;

	__asm__ __volatile__(

			"movl %1,%%eax\n\t"
			"movl %2,%%ebx\n\t"

			"movl %3,%%ecx\n\t"

			"pxor %%mm4,%%mm4\n\t"
			"pxor %%mm6,%%mm6\n\t"

			"pxor %%mm7, %%mm7\n\t"
			"shrl $4, %%ecx\n\t"

			"\n1:\t"
			"movq 0(%%eax),%%mm0\n\t"
			"paddd %%mm4,%%mm7\n\t"

			"movq 0( %%ebx ),%%mm1\n\t"
			"paddd %%mm6,%%mm7\n\t"
			
			"movq 8( %%eax ),%%mm2\n\t"
			"pmaddwd %%mm1,%%mm0\n\t"

			"movq 8( %%ebx ),%%mm3\n\t"

			"movq 16( %%eax ),%%mm4\n\t"
			"pmaddwd %%mm3,%%mm2\n\t"

			"movq 16( %%ebx ),%%mm5\n\t"
			"paddd %%mm0,%%mm7\n\t"

			"movq 24( %%eax ),%%mm6\n\t"
			"pmaddwd %%mm5,%%mm4\n\t"

			"pmaddwd 24( %%ebx ),%%mm6\n\t"
			"paddd %%mm2,%%mm7\n\t"

			"addl $32,%%eax\n\t"
			"addl $32,%%ebx\n\t"

			"decl %%ecx\n\t"
			"jnz 1b\n\t"

			"paddd %%mm4,%%mm7\n\t"
			"paddd %%mm6,%%mm7\n\t"
        
			"movq  %%mm7,%%mm0\n\t"

			"psrlq $32,%%mm0\n\t"

			"paddd %%mm0,%%mm7\n\t"

			"movd %%mm7,%0\n\t"
			
			"emms\n\t"

		: "=&g" ( resultL )
		: "g" ( vec1A ), "g" ( vec2A ), "g" ( sizeA )
		: "si", "di", "ax", "bx", "cx", "st", "memory" );

	return resultL;
}

#endif /* epl_LINUX, CYGWIN */

#endif /* HW_i586 || HW_i686 */

/* ------------------------------------------------------------------------- */

#ifdef HW_TMS320C6x
/**
 * Calls fast assembler version of dotproduct for DSP. 
 * dotProduct_C62x is implemented in file dotprod.asm and expects input vectors
 * of even length.
 */
int32 bbs_dotProduct_dsp( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	if( sizeA & 1 )
	{
		int32 resultL;		
		resultL = dotProduct_C62x( vec1A, vec2A, sizeA - 1 );
		return resultL + ( int32 ) *( vec1A + sizeA - 1 ) * *( vec2A + sizeA - 1 );
	}
	else
	{
		return dotProduct_C62x( vec1A, vec2A, sizeA );
	}
}
#endif /* HW_TMS320C6x */

/* ------------------------------------------------------------------------- */

/* 16 dot product for the PS2/EE processor */
/* input vectors MUST be 128 bit aligned ! */

#if defined( epl_LINUX ) && defined( HW_EE )

int32 bbs_dotProduct_EE( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	int32 resultL = 0,
	      iL = sizeA >> 3,
	      jL = sizeA - ( iL << 3 );

	if( iL > 0 )
	{
		/* multiply-add elements of input vectors in sets of 8 */
		int32 accL[ 4 ], t1L, t2L, t3L;
		asm volatile (
			"pxor %4, %2, %2\n\t"			/* reset 8 accumulators (LO and HI register) to 0 */
			"pmtlo %4\n\t"		
			"pmthi %4\n\t"

			"\n__begin_loop:\t"

			"lq %2,0(%0)\n\t"				/* load 8 pairs of int16 */
			"lq %3,0(%1)\n\t"			

			"addi %0,%0,16\n\t"				/* vec1L += 16 */
			"addi %1,%1,16\n\t"				/* vec2L += 16 */
			"addi %7,%7,-1\n\t"				/* iL-- */

			"pmaddh %4,%2,%3\n\t"			/* parallel multiply-add of 8 pairs of int16 */

			"bgtzl %7,__begin_loop\n\t"		/* if iL > 0 goto _begin_loop */

			"pmflo %2\n\t"					/* parallel add 8 accumulators , store remaining 4 accumulators in tmpL */
			"pmfhi %3\n\t"
			"paddw %4,%2,%3\n\t"						
			"sq %4,0(%8)\n\t"
		: "=r" ( vec1A ), "=r" ( vec2A ), "=r" ( t1L ), "=r" ( t2L ), "=r" ( t3L )
		: "0" ( vec1A ), "1" ( vec2A ), "r" ( iL ), "r" ( accL )
		: "memory" );

		/* add 4 parallel accumulators */
		resultL += accL[ 0 ] + accL[ 1 ] + accL[ 2 ] + accL[ 3 ];
	}
	
	/* multiply-add remaining elements of input vectors */
	for( ; jL--; ) resultL += ( int32 ) *vec1A++ * *vec2A++;

	return resultL;
}

#endif

/* ------------------------------------------------------------------------- */

#if defined( HW_ARMv5TE )

/* fast 16 dot product for ARM9E cores (DSP extensions).
 * input vectors must be 32 bit aligned
 */
int32 bbs_dotProduct_arm9e( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	int32 accuL = 0;

	int32* v1PtrL = ( int32* )vec1A;
	int32* v2PtrL = ( int32* )vec2A;

	for( ; sizeA >= 4; sizeA -= 4 )
	{
		__asm {
		    smlabb accuL, *v1PtrL, *v2PtrL, accuL;
		    smlatt accuL, *v1PtrL, *v2PtrL, accuL;
		}
		v1PtrL++; v2PtrL++;
	    __asm {
		    smlabb accuL, *v1PtrL, *v2PtrL, accuL;
		    smlatt accuL, *v1PtrL, *v2PtrL, accuL;
		}
		v1PtrL++; v2PtrL++;
	}

	vec1A = ( int16* )v1PtrL;
	vec2A = ( int16* )v2PtrL;

	/* multiply-add remaining elements of input vectors */
	for( ; sizeA > 0; sizeA-- ) accuL += ( int32 )*vec1A++ * *vec2A++;

	return accuL;
}

#endif

/* ------------------------------------------------------------------------- */

/**
 * Computes a fast dot product using standard C
 */
int32 bbs_dotProduct_stdc( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	int32 accuL = 0;

	for( ; sizeA >= 8; sizeA -= 8 )
	{
		accuL += ( int32 ) *vec1A * *vec2A;
		accuL += ( int32 ) *( vec1A + 1 ) * *( vec2A + 1 );
		accuL += ( int32 ) *( vec1A + 2 ) * *( vec2A + 2 );
		accuL += ( int32 ) *( vec1A + 3 ) * *( vec2A + 3 );

		accuL += ( int32 ) *( vec1A + 4 ) * *( vec2A + 4 );
		accuL += ( int32 ) *( vec1A + 5 ) * *( vec2A + 5 );
		accuL += ( int32 ) *( vec1A + 6 ) * *( vec2A + 6 );
		accuL += ( int32 ) *( vec1A + 7 ) * *( vec2A + 7 );

		vec1A += 8;
		vec2A += 8;
	}

	for( ; sizeA; sizeA-- ) accuL += ( int32 ) *vec1A++ * *vec2A++;

	return accuL;
}

/* ------------------------------------------------------------------------- */

int32 bbs_dotProductInt16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
/* PC */
#if ( defined( HW_i586 ) || defined( HW_i686 ) )

	#if defined( HW_SSE2 )
		uint32 size16L = sizeA & 0xfffffff0;
		if( size16L )
		{	
			if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 ) 
			{
				return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA );
			}
			else
			{
				return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA );
			}
		}
	#elif !defined( WIN64 )
		/* MMX version (not supported by 64-bit compiler) */
		uint32 size16L = sizeA & 0xfffffff0;
		if( size16L )
		{	
			if( sizeA == size16L )
			{
				return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L );
			}
			return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L )
					+ bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
		} /* if( size16L ) */
	#endif

	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );

/* Playstation 2 */
#elif defined( HW_EE ) && defined( epl_LINUX ) 

	if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 ) 
	{
		return bbs_dotProduct_EE( vec1A, vec2A, sizeA );
	}
	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );

/* ARM9E */
#elif defined( HW_ARMv5TE )

	return bbs_dotProduct_arm9e( vec1A, vec2A, sizeA );

/* TI C6000 */
#elif defined( HW_TMS320C6x )

	return bbs_dotProduct_dsp( vec1A, vec2A, sizeA );
	
#elif defined( HW_FR71 )

	uint32 size16L = sizeA & 0xfffffff0;
	if( size16L )
	{	
		if( sizeA == size16L )
		{
			return bbs_dotProduct_fr71( vec1A, vec2A, size16L );
		}
		return bbs_dotProduct_fr71( vec1A, vec2A, size16L )
				+ bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
	}

	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );

#endif

	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
}

/* ------------------------------------------------------------------------- */

/* table of fermi and slope values (result: 2.30; offset: .12) 
   referenced in b_NeuralNetEm/FastMlpNet.c, not not rename or remove */
const uint32 bbs_fermi_tableG[] = 
{
	45056,      8,     77824,      13,    131072,     21,    217088,     34,
	356352,     57,    589824,     94,    974848,     155,   1609728,    255,
	2654208,    418,   4366336,    688,   7184384,    1126,  11796480,   1834,
	19308544,   2970,  31473664,   4748,  50921472,   7453,  81448960,   11363,
	127991808,  16573, 195874816,  22680, 288772096,  28469, 405381120,  32102,
	536870912,  32101, 668356608,  28469, 784965632,  22680, 877862912,  16573,
	945745920,  11363, 992288768,  7453,  1022816256, 4748,  1042264064, 2970,
	1054429184, 1834,  1061941248, 1126,  1066553344, 688,   1069371392, 418,
	1071083520, 255,   1072128000, 155,   1072762880, 94,    1073147904, 57,
	1073381376, 34,    1073520640, 21,    1073606656, 13,    1073659904, 8,
};

int32 bbs_fermi( int32 valA )
{
	uint32 indexL = ( ( valA >> 15 ) + 20 ) << 1;
	uint32 offsL  = ( ( valA & 0x00007FFF ) + 4 ) >> 3;
	if( valA <  -655360 ) return 1;
	if( valA >=  655360 ) return 1073741824 - 1; /* ( 1 << 30 ) */
	return ( bbs_fermi_tableG[ indexL ] + offsL * bbs_fermi_tableG[ indexL + 1 ] );
}

/* ------------------------------------------------------------------------- */

void bbs_uint32ReduceToNBits( uint32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
{
	int32 posHighestBitL = bbs_intLog2( *argPtrA ) + 1;
	int32 shiftL = posHighestBitL - nBitsA;
	if( shiftL > 0 )
	{
		( *argPtrA ) >>= shiftL;
		( *bbpPtrA ) -= shiftL;
	}
}

/* ------------------------------------------------------------------------- */

void bbs_int32ReduceToNBits( int32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
{
	int32 posHighestBitL = bbs_intLog2( bbs_abs( *argPtrA ) ) + 1;
	int32 shiftL = posHighestBitL - nBitsA;
	if( shiftL > 0 )
	{
		( *argPtrA ) >>= shiftL;
		( *bbpPtrA ) -= shiftL;
	}
}

/* ------------------------------------------------------------------------- */

uint32 bbs_convertU32( uint32 srcA, int32 srcBbpA, int32 dstBbpA )
{
	if( dstBbpA >= srcBbpA )
	{
		uint32 shiftL = dstBbpA - srcBbpA;
		if( srcA > ( ( uint32 )0xFFFFFFFF >> shiftL ) )
		{
			/* overflow */
			return ( uint32 )0xFFFFFFFF;
		}
		else
		{
			return srcA << shiftL;
		}
	}
	else
	{
		uint32 shiftL = srcBbpA - dstBbpA;
		uint32 addL = 1L << ( shiftL - 1 );
		if( srcA + addL < addL )
		{
			/* rounding would cause overflow */
			return srcA >> shiftL;
		}
		else
		{
			return ( srcA + addL ) >> shiftL;
		}
	}
}

/* ------------------------------------------------------------------------- */

int32 bbs_convertS32( int32 srcA, int32 srcBbpA, int32 dstBbpA )
{
	if( dstBbpA >= srcBbpA )
	{
		uint32 shiftL = ( uint32 )( dstBbpA - srcBbpA );
		if( srcA > ( ( int32 )0x7FFFFFFF >> shiftL ) )
		{
			/* overflow */
			return ( uint32 )0x7FFFFFFF;
		}
		else if( srcA < ( ( int32 )0x80000000 >> shiftL ) )
		{
			/* underflow */
			return ( int32 )0x80000000;
		}
		else
		{
			return srcA << shiftL;
		}
	}
	else
	{
		uint32 shiftL = ( uint32 )( srcBbpA - dstBbpA );
		int32 addL = 1L << ( shiftL - 1 );
		if( srcA + addL < addL )
		{
			/* rounding would cause overflow */
			return srcA >> shiftL;
		}
		else
		{
			return ( srcA + addL ) >> shiftL;
		}
	}
}

/* ------------------------------------------------------------------------- */

int32 bbs_vecPowerFlt16( const int16 *xA, int16 nxA )
{
/*	#if defined( HW_TMS320C5x )
		uint32 rL;
		power( ( int16* ) xA, ( int32* ) &rL, nxA );  // does not work properly in DSPLib version 2.20.02
		return ( rL >> 1 );
	#else*/
		/* needs to be optimized */
		int32 rL = 0;
		for( ; nxA--; )
		{
			rL += ( int32 ) *xA * *xA;
			xA++;
		}
		return rL;
/*	#endif */
}

/* ------------------------------------------------------------------------- */

void bbs_mulU32( uint32 v1A, uint32 v2A, uint32* manPtrA, int32* expPtrA )
{
	uint32 log1L = bbs_intLog2( v1A );
	uint32 log2L = bbs_intLog2( v2A );

	if( log1L + log2L < 32 )
	{
		*manPtrA = v1A * v2A;
		*expPtrA = 0;
	}
	else
	{
		uint32 v1L = v1A;
		uint32 v2L = v2A;
		uint32 exp1L = 0;
		uint32 exp2L = 0;
		if( log1L > 15 && log2L > 15 ) 
		{
			exp1L = log1L - 15;
			exp2L = log2L - 15;
			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
		}
		else if( log1L > 15 ) 
		{
			exp1L = log1L + log2L - 31;
			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
		}
		else
		{
			exp2L = log1L + log2L - 31;
			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
		}

		*manPtrA = v1L * v2L;
		*expPtrA = exp1L + exp2L;
	}
}

/* ------------------------------------------------------------------------- */

void bbs_mulS32( int32 v1A, int32 v2A, int32* manPtrA, int32* expPtrA )
{
	uint32 log1L = bbs_intLog2( v1A > 0 ? v1A : -v1A );
	uint32 log2L = bbs_intLog2( v2A > 0 ? v2A : -v2A );

	if( log1L + log2L < 30 )
	{
		*manPtrA = v1A * v2A;
		*expPtrA = 0;
	}
	else
	{
		int32 v1L = v1A;
		int32 v2L = v2A;
		int32 exp1L = 0;
		int32 exp2L = 0;
		if( log1L > 14 && log2L > 14 ) 
		{
			exp1L = log1L - 14;
			exp2L = log2L - 14;
			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
		}
		else if( log1L > 14 ) 
		{
			exp1L = log1L + log2L - 29;
			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
		}
		else
		{
			exp2L = log1L + log2L - 29;
			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
		}

		*manPtrA = v1L * v2L;
		*expPtrA = exp1L + exp2L;
	}
}

/* ------------------------------------------------------------------------- */

void bbs_vecSqrNorm32( const int32* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
{
	uint32 sumL = 0;
	int32 sumExpL = 0;

	uint32 iL;
	for( iL = 0; iL < sizeA; iL++ )
	{
		int32 vL = vecA[ iL ];
		int32 logL = bbs_intLog2( vL > 0 ? vL : -vL );
		int32 expL = ( logL > 14 ) ? logL - 14 : 0;
		uint32 prdL;

		if( expL >= 1 )
		{
			vL = ( ( vL >> ( expL - 1 ) ) + 1 ) >> 1;
		}
		else
		{
			vL = vL >> expL;
		}

		prdL = vL * vL;
		expL <<= 1; /* now exponent of product */

		if( sumExpL > expL )
		{
			uint32 shrL = sumExpL - expL;
			prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
		}
		else if( expL > sumExpL )
		{
			uint32 shrL = expL - sumExpL;
			sumL = ( ( sumL >> ( shrL - 1 ) ) + 1 ) >> 1;
			sumExpL += shrL;
		}

		sumL += prdL;

		if( sumL > 0x80000000 )
		{
			sumL = ( sumL + 1 ) >> 1;
			sumExpL++;
		}
	}

	/* make exponent even */
	if( ( sumExpL & 1 ) != 0 )
	{
		sumL = ( sumL + 1 ) >> 1;
		sumExpL++;
	}

	if( manPtrA != NULL ) *manPtrA = sumL;
	if( expPtrA != NULL ) *expPtrA = sumExpL;
}

/* ------------------------------------------------------------------------- */

void bbs_vecSqrNorm16( const int16* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
{
	uint32 sumL = 0;
	int32 sumExpL = 0;

	uint32 iL;
	for( iL = 0; iL < sizeA; iL++ )
	{
		int32 vL = vecA[ iL ];
		uint32 prdL = vL * vL;

		if( sumExpL > 0 )
		{
			uint32 shrL = sumExpL;
			prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
		}

		sumL += prdL;

		if( sumL > 0x80000000 )
		{
			sumL = ( sumL + 1 ) >> 1;
			sumExpL++;
		}
	}

	/* make exponent even */
	if( ( sumExpL & 1 ) != 0 )
	{
		sumL = ( sumL + 1 ) >> 1;
		sumExpL++;
	}

	if( manPtrA != NULL ) *manPtrA = sumL;
	if( expPtrA != NULL ) *expPtrA = sumExpL;
}

/* ------------------------------------------------------------------------- */

uint32 bbs_vecNorm16( const int16* vecA, uint32 sizeA )
{
	uint32 manL;
	uint32 expL;
	bbs_vecSqrNorm16( vecA, sizeA, &manL, &expL );
	manL = bbs_sqrt32( manL );
	return manL << ( expL >> 1 );
}

/* ------------------------------------------------------------------------- */

void bbs_matMultiplyFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA )
{
	#if defined( HW_TMS320C5x )
		/* operands need to be in internal memory for mmul*/
		if( x1A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE ||
			x2A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE )
		{
			int16 iL,jL,kL;
			int16 *ptr1L, *ptr2L;
			int32 sumL;
			
			for( iL = 0; iL < row1A; iL++ )
			{
				for( jL = 0; jL < col2A; jL++ )
				{
					ptr1L = ( int16* ) x1A + iL * col1A;
					ptr2L = ( int16* ) x2A + jL;
					sumL = 0;
					for( kL = 0; kL < col1A; kL++ )
					{
						sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
						ptr2L += col2A;
					}
					*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
				}
			}
		}
		else mmul( ( int16* ) x1A, row1A, col1A, ( int16* ) x2A, col1A, col2A, rA );
	
	#elif defined( HW_ARMv4 ) || defined( HW_ARMv5TE )

		int32 iL, jL, kL;
		int16 *ptr1L, *ptr2L;
		int32 sumL;
		for( iL = 0; iL < row1A; iL++ )
		{
			for( jL = 0; jL < col2A; jL++ )
			{
				ptr1L = ( int16* ) x1A + iL * col1A;
				ptr2L = ( int16* ) x2A + jL;
				sumL = 0;
				for( kL = col1A; kL >= 4; kL -= 4 )
				{
					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
					ptr2L += col2A;
				}
				for( ; kL > 0; kL-- )
				{
					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
					ptr2L += col2A;
				}
				*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
			}
		}
	#else
		/* needs to be optimized */
		int16 iL,jL,kL;
		int16 *ptr1L, *ptr2L;
		int32 sumL;
		
		for( iL = 0; iL < row1A; iL++ )
		{
			for( jL = 0; jL < col2A; jL++ )
			{
				ptr1L = ( int16* ) x1A + iL * col1A;
				ptr2L = ( int16* ) x2A + jL;
				sumL = 0;
				for( kL = 0; kL < col1A; kL++ )
				{
					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
					ptr2L += col2A;
				}
				*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
			}
		}
	#endif
}

/* ------------------------------------------------------------------------- */

void bbs_matMultiplyTranspFlt16( const int16 *x1A, int16 row1A, int16 col1A, 
								 const int16 *x2A, int16 col2A, int16 *rA )
{
	const int16* ptr1L = x1A;

	int32 iL;
	for( iL = row1A; iL > 0 ; iL-- )
	{
		int32 jL;
		const int16* ptr2L = x2A;
		for( jL = col2A; jL > 0 ; jL-- )
		{
			int32 kL;
			int32 sumL = 0;
			for( kL = col1A >> 2; kL > 0; kL-- )
			{
				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
			}
			for( kL = col1A & 3; kL > 0; kL-- )
			{
				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
			}
			*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
			ptr1L -= col1A;
		}
		ptr1L += col1A;
	}
}

/* ------------------------------------------------------------------------- */


#ifndef mtrans
uint16 bbs_matTrans( int16 *xA, int16 rowA, int16 colA, int16 *rA )
{
	/* needs to be optimized */
	int16 iL;
	for( iL = colA; iL--; )
	{
		int16* sL = xA++;
		int16 jL;
		for( jL = rowA; jL--; )
		{
			*rA++ = *sL;
			sL += colA;
		}
	}
	return 0;
}
#endif

/* ------------------------------------------------------------------------- */
#ifndef atan2_16
int16 bbs_atan2( int16 nomA, int16 denomA )
{
	int16 phL, argL;	
	
	if( nomA == denomA ) return 8192;
	argL = ( ( int32 ) nomA << 15 ) / denomA;
	
	/* 0.318253*2 x      20857 .15
	  +0.003314*2 x^2      217 .15
	  -0.130908*2 x^3    -8580 .15
	  +0.068542*2 x^4     4491 .15
	  -0.009159*2 x^5     -600 .15 */	

	phL = -600;
	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 4481;
	phL = ( ( ( int32 ) phL * argL ) >> 15 ) - 8580;
	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 217;
	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 20857;
	phL = ( ( int32 ) phL * argL ) >> 15;

	return phL >> 1; /* /2 */
}

/* needs to be optimized */
uint16 bbs_vecPhase( int16 *reA, int16 *imA, int16 *phaseA, uint16 sizeA )
{
	for( ; sizeA--; )
	{
		int16 reL = *reA++;
		int16 imL = *imA++;
		int16 phL = 0;
		
		if( reL < 0 )
		{
			reL = -reL;
			if( imL < 0 )
			{
				imL = -imL;
				if( reL > imL ) 
				{
					phL = -32768 + bbs_atan2( imL, reL );
				}
				else
				{
					phL = -16384 - bbs_atan2( reL, imL );
				}
			}
			else
			{
				if( reL > imL ) 
				{
					phL = -( -32768 + bbs_atan2( imL, reL ) );
				}
				else
				{
					if( imL == 0 ) phL = 0;
					else phL = 16384 + bbs_atan2( reL, imL );
				}
			}
		}
		else
		{
			if( imL < 0 )
			{
				imL = -imL;
				if( reL > imL ) 
				{
					phL = -bbs_atan2( imL, reL );
				}
				else
				{
					phL = -16384 + bbs_atan2( reL, imL );
				}
			}
			else
			{
				if( reL > imL ) 
				{
					phL = bbs_atan2( imL, reL );
				}
				else
				{
					if( imL == 0 ) phL = 0;
					else phL = 16384 - bbs_atan2( reL, imL );
				}
			}
		}
		
		*phaseA++ = phL;
	}
	return 0;
}

#endif

/* ------------------------------------------------------------------------- */