/* * Copyright (C) 2008 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* ---- includes ----------------------------------------------------------- */ #include "b_BasicEm/Math.h" #include "b_BasicEm/Functions.h" /* ---- related objects --------------------------------------------------- */ /* ---- typedefs ----------------------------------------------------------- */ /* ---- constants ---------------------------------------------------------- */ /* ------------------------------------------------------------------------- */ /* ========================================================================= */ /* */ /* ---- \ghd{ external functions } ----------------------------------------- */ /* */ /* ========================================================================= */ #if defined( HW_SSE2 ) extern int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ); extern int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ); #endif #if defined( HW_FR71 ) int32 bbs_dotProduct_fr71( const int16* vec1A, const int16* vec2A, uint32 sizeA ); #endif /* ------------------------------------------------------------------------- */ uint16 bbs_sqrt32( uint32 valA ) { uint32 rootL = 0; uint32 expL = 0; expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; switch( expL >> 1 ) { case 15: rootL += ( ( rootL + 0x8000 ) * ( rootL + 0x8000 ) <= valA ) << 15; case 14: rootL += ( ( rootL + 0x4000 ) * ( rootL + 0x4000 ) <= valA ) << 14; case 13: rootL += ( ( rootL + 0x2000 ) * ( rootL + 0x2000 ) <= valA ) << 13; case 12: rootL += ( ( rootL + 0x1000 ) * ( rootL + 0x1000 ) <= valA ) << 12; case 11: rootL += ( ( rootL + 0x0800 ) * ( rootL + 0x0800 ) <= valA ) << 11; case 10: rootL += ( ( rootL + 0x0400 ) * ( rootL + 0x0400 ) <= valA ) << 10; case 9: rootL += ( ( rootL + 0x0200 ) * ( rootL + 0x0200 ) <= valA ) << 9; case 8: rootL += ( ( rootL + 0x0100 ) * ( rootL + 0x0100 ) <= valA ) << 8; case 7: rootL += ( ( rootL + 0x0080 ) * ( rootL + 0x0080 ) <= valA ) << 7; case 6: rootL += ( ( rootL + 0x0040 ) * ( rootL + 0x0040 ) <= valA ) << 6; case 5: rootL += ( ( rootL + 0x0020 ) * ( rootL + 0x0020 ) <= valA ) << 5; case 4: rootL += ( ( rootL + 0x0010 ) * ( rootL + 0x0010 ) <= valA ) << 4; case 3: rootL += ( ( rootL + 0x0008 ) * ( rootL + 0x0008 ) <= valA ) << 3; case 2: rootL += ( ( rootL + 0x0004 ) * ( rootL + 0x0004 ) <= valA ) << 2; case 1: rootL += ( ( rootL + 0x0002 ) * ( rootL + 0x0002 ) <= valA ) << 1; case 0: rootL += ( ( rootL + 0x0001 ) * ( rootL + 0x0001 ) <= valA ) << 0; } return ( uint16 )rootL; } /* ------------------------------------------------------------------------- */ uint8 bbs_sqrt16( uint16 valA ) { uint16 rootL = 0; rootL += ( ( rootL + 0x80 ) * ( rootL + 0x80 ) <= valA ) << 7; rootL += ( ( rootL + 0x40 ) * ( rootL + 0x40 ) <= valA ) << 6; rootL += ( ( rootL + 0x20 ) * ( rootL + 0x20 ) <= valA ) << 5; rootL += ( ( rootL + 0x10 ) * ( rootL + 0x10 ) <= valA ) << 4; rootL += ( ( rootL + 0x08 ) * ( rootL + 0x08 ) <= valA ) << 3; rootL += ( ( rootL + 0x04 ) * ( rootL + 0x04 ) <= valA ) << 2; rootL += ( ( rootL + 0x02 ) * ( rootL + 0x02 ) <= valA ) << 1; rootL += ( ( rootL + 0x01 ) * ( rootL + 0x01 ) <= valA ) << 0; return ( uint8 )rootL; } /* ------------------------------------------------------------------------- */ /* table of sqrt and slope values */ const uint32 bbs_fastSqrt32_tableG[] = { 268435456, 1016, 272596992, 1000, 276692992, 987, 280735744, 972, 284717056, 959, 288645120, 946, 292519936, 933, 296341504, 922, 300118016, 910, 303845376, 899, 307527680, 889, 311169024, 878, 314765312, 869, 318324736, 858, 321839104, 850, 325320704, 840, 328761344, 832, 332169216, 824, 335544320, 815, 338882560, 807, 342188032, 799, 345460736, 792, 348704768, 785, 351920128, 777, 355102720, 771, 358260736, 764, 361390080, 757, 364490752, 751, 367566848, 745, 370618368, 739, 373645312, 732, 376643584, 727, 379621376, 722, 382578688, 715, 385507328, 711, 388419584, 705, 391307264, 700, 394174464, 695, 397021184, 689, 399843328, 686, 402653184, 680, 405438464, 675, 408203264, 672, 410955776, 666, 413683712, 663, 416399360, 658, 419094528, 653, 421769216, 650, 424431616, 646, 427077632, 641, 429703168, 638, 432316416, 634, 434913280, 630, 437493760, 627, 440061952, 622, 442609664, 620, 445149184, 615, 447668224, 613, 450179072, 609, 452673536, 605, 455151616, 602, 457617408, 600, 460075008, 595, 462512128, 593, 464941056, 590, 467357696, 587, 469762048, 583, 472150016, 581, 474529792, 578, 476897280, 575, 479252480, 572, 481595392, 569, 483926016, 567, 486248448, 564, 488558592, 561, 490856448, 559, 493146112, 556, 495423488, 553, 497688576, 552, 499949568, 548, 502194176, 546, 504430592, 544, 506658816, 541, 508874752, 539, 511082496, 537, 513282048, 534, 515469312, 533, 517652480, 529, 519819264, 528, 521981952, 526, 524136448, 523, 526278656, 521, 528412672, 519, 530538496, 517, 532656128, 515, 534765568, 514 }; uint16 bbs_fastSqrt32( uint32 valA ) { uint32 expL = 0; uint32 valL; uint32 offsL; uint32 indexL = 0; if( valA == 0 ) return 0; /* compute closest even size exponent of valA */ expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */ offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 12 ) ) >> 13; indexL = ( valL >> 24 ) & 0xFE; return ( bbs_fastSqrt32_tableG[ indexL ] + offsL * bbs_fastSqrt32_tableG[ indexL + 1 ] ) >> ( 28 - ( expL >> 1 ) ); } /* ------------------------------------------------------------------------- */ /* table of 1/sqrt (1.31) and negative slope (.15) values referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */ const uint32 bbs_invSqrt32_tableG[] = { 2147483648u, 1001, 2114682880, 956, 2083356672, 915, 2053373952, 877, 2024636416, 840, 1997111296, 808, 1970634752, 776, 1945206784, 746, 1920761856, 720, 1897168896, 693, 1874460672, 669, 1852538880, 646, 1831370752, 625, 1810890752, 604, 1791098880, 584, 1771962368, 567, 1753382912, 548, 1735426048, 533, 1717960704, 516, 1701052416, 502, 1684602880, 487, 1668644864, 474, 1653112832, 461, 1638006784, 448, 1623326720, 436, 1609039872, 426, 1595080704, 414, 1581514752, 404, 1568276480, 394, 1555365888, 384, 1542782976, 375, 1530494976, 367, 1518469120, 357, 1506770944, 350, 1495302144, 342, 1484095488, 334, 1473150976, 327, 1462435840, 320, 1451950080, 313, 1441693696, 307, 1431633920, 300, 1421803520, 294, 1412169728, 289, 1402699776, 282, 1393459200, 277, 1384382464, 272, 1375469568, 266, 1366753280, 262, 1358168064, 257, 1349746688, 251, 1341521920, 248, 1333395456, 243, 1325432832, 238, 1317634048, 235, 1309933568, 230, 1302396928, 227, 1294958592, 222, 1287684096, 219, 1280507904, 216, 1273430016, 211, 1266515968, 209, 1259667456, 205, 1252950016, 202, 1246330880, 198, 1239842816, 196, 1233420288, 192, 1227128832, 190, 1220902912, 187, 1214775296, 184, 1208745984, 181, 1202814976, 179, 1196949504, 176, 1191182336, 173, 1185513472, 171, 1179910144, 169, 1174372352, 166, 1168932864, 164, 1163558912, 162, 1158250496, 160, 1153007616, 157, 1147863040, 155, 1142784000, 154, 1137737728, 151, 1132789760, 149, 1127907328, 148, 1123057664, 145, 1118306304, 144, 1113587712, 142, 1108934656, 140, 1104347136, 138, 1099825152, 137, 1095335936, 135, 1090912256, 134, 1086521344, 131, 1082228736, 131, 1077936128, 128 }; uint32 bbs_invSqrt32( uint32 valA ) { uint32 expL = 0; uint32 valL; uint32 offsL; uint32 indexL = 0; if( valA == 0U ) return 0x80000000; /* compute closest even size exponent of valA */ expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */ offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 9 ) ) >> 10; indexL = ( valL >> 24 ) & 0xFE; return ( bbs_invSqrt32_tableG[ indexL ] - offsL * bbs_invSqrt32_tableG[ indexL + 1 ] ) >> ( expL >> 1 ); } /* ------------------------------------------------------------------------- */ /* table of 1/( x + 1 ) (2.30) and negative slope (.14) values referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */ const int32 bbs_inv32_tableG[] = { 1073741824, 1986, 1041203200, 1870, 1010565120, 1762, 981696512, 1664, 954433536, 1575, 928628736, 1491, 904200192, 1415, 881016832, 1345, 858980352, 1278, 838041600, 1218, 818085888, 1162, 799047680, 1108, 780894208, 1059, 763543552, 1013, 746946560, 970, 731054080, 930, 715816960, 891, 701218816, 856, 687194112, 823, 673710080, 791, 660750336, 761, 648282112, 732, 636289024, 706, 624721920, 681, 613564416, 657, 602800128, 635, 592396288, 613, 582352896, 592, 572653568, 573, 563265536, 554, 554188800, 537, 545390592, 520, }; int32 bbs_inv32( int32 valA ) { uint32 expL = 0; int32 signL = ( ( valA >> 30 ) & 0xFFFFFFFE ) + 1; int32 valL = signL * valA; int32 offsL; uint32 indexL = 0; if( valL <= ( int32 ) 1 ) return 0x40000000 * signL; /* compute size exponent of valL */ expL += ( ( valL >> ( expL + 0x10 ) ) != 0 ) << 4; expL += ( ( valL >> ( expL + 0x08 ) ) != 0 ) << 3; expL += ( ( valL >> ( expL + 0x04 ) ) != 0 ) << 2; expL += ( ( valL >> ( expL + 0x02 ) ) != 0 ) << 1; expL += ( ( valL >> ( expL + 0x01 ) ) != 0 ); valL = ( ( valL << ( 30 - expL ) ) - 1073741824 ); /*( 1U << 30 )*/ offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 10 ) ) >> 11; indexL = ( valL >> 24 ) & 0xFE; return signL * ( ( ( ( bbs_inv32_tableG[ indexL ] - offsL * bbs_inv32_tableG[ indexL + 1 ] ) >> ( expL - 1 ) ) + 1 ) >> 1 ); } /* ------------------------------------------------------------------------- */ uint32 bbs_intLog2( uint32 valA ) { uint32 expL = 0; expL += 0x10 * ( ( valA >> ( expL + 0x10 ) ) != 0 ); expL += 0x08 * ( ( valA >> ( expL + 0x08 ) ) != 0 ); expL += 0x04 * ( ( valA >> ( expL + 0x04 ) ) != 0 ); expL += 0x02 * ( ( valA >> ( expL + 0x02 ) ) != 0 ); expL += 0x01 * ( ( valA >> ( expL + 0x01 ) ) != 0 ); return expL; } /* ------------------------------------------------------------------------- */ const uint32 bbs_pow2M1_tableG[] = { 0, 713, 46769127, 721, 94047537, 729, 141840775, 737, 190154447, 745, 238994221, 753, 288365825, 761, 338275050, 769, 388727751, 778, 439729846, 786, 491287318, 795, 543406214, 803, 596092647, 812, 649352798, 821, 703192913, 830, 757619310, 839, 812638371, 848, 868256550, 857, 924480371, 867, 981316430, 876, 1038771393, 886, 1096851999, 895, 1155565062, 905, 1214917468, 915, 1274916179, 925, 1335568233, 935, 1396880745, 945, 1458860907, 956, 1521515988, 966, 1584853338, 976, 1648880387, 987, 1713604645, 998, 1779033703, 1009, 1845175238, 1020, 1912037006, 1031, 1979626852, 1042, 2047952702, 1053, 2117022572, 1065, 2186844564u, 1077, 2257426868u, 1088, 2328777762u, 1100, 2400905617u, 1112, 2473818892u, 1124, 2547526141u, 1136, 2622036010u, 1149, 2697357237u, 1161, 2773498659u, 1174, 2850469207u, 1187, 2928277909u, 1200, 3006933892u, 1213, 3086446383u, 1226, 3166824708u, 1239, 3248078296u, 1253, 3330216677u, 1266, 3413249486u, 1280, 3497186464u, 1294, 3582037455u, 1308, 3667812413u, 1323, 3754521399u, 1337, 3842174584u, 1352, 3930782250u, 1366, 4020354790u, 1381, 4110902710u, 1396, 4202436633u, 1411 }; uint32 bbs_pow2M1( uint32 valA ) { uint32 offsL = ( valA & 0x03FFFFFF ) >> 10; uint16 indexL = ( ( valA & 0xFC000000 ) >> 26 ) << 1; return bbs_pow2M1_tableG[ indexL ] + offsL * bbs_pow2M1_tableG[ indexL + 1 ]; } /* ------------------------------------------------------------------------- */ uint32 bbs_pow2( int32 valA ) { int32 shiftL = 16 - ( valA >> 27 ); uint32 offsL = ( uint32 )( valA << 5 ); if( shiftL == 32 ) return 1; return ( 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL ); } /* ------------------------------------------------------------------------- */ uint32 bbs_exp( int32 valA ) { int32 adjustedL; int32 shiftL; int32 offsL; /* check boundaries to avoid overflow */ if( valA < -1488522236 ) { return 0; } else if( valA > 1488522236 ) { return 0xFFFFFFFF; } /* multily valA with 1/ln(2) in order to use function 2^x instead of e^x */ adjustedL = ( valA >> 16 ) * 94548 + ( ( ( ( ( uint32 )valA ) & 0x0FFFF ) * 47274 ) >> 15 ); shiftL = 16 - ( adjustedL >> 27 ); if( shiftL == 32 ) return 1; offsL = ( uint32 )( adjustedL << 5 ); return ( ( int32 ) 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL ); } /* ------------------------------------------------------------------------- */ int16 bbs_satS16( int32 valA ) { if( valA > 32767 ) return 32767; if( valA < -32768 ) return -32768; return valA; } /* ------------------------------------------------------------------------- */ #if defined( HW_i586 ) || defined( HW_i686 ) /* Windows section */ #if defined( WIN32 ) && !defined( WIN64 ) /* disable warning "no return value"*/ #pragma warning( disable : 4035 ) /** * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0 */ int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) { __asm { push esi push edi mov eax, vec1A mov ebx, vec2A mov ecx, sizeA pxor mm4, mm4 pxor mm6, mm6 pxor mm7, mm7 shr ecx, 4 inner_loop_start: movq mm0, 0[eax] paddd mm7, mm4 movq mm1, 0[ebx] paddd mm7, mm6 movq mm2, 8[eax] pmaddwd mm0, mm1 movq mm3, 8[ebx] movq mm4, 16[eax] pmaddwd mm2, mm3 movq mm5, 16[ebx] paddd mm7, mm0 movq mm6, 24[eax] pmaddwd mm4, mm5 pmaddwd mm6, 24[ebx] paddd mm7, mm2 add eax, 32 add ebx, 32 dec ecx jnz inner_loop_start paddd mm7, mm4 paddd mm7, mm6 movq mm0, mm7 psrlq mm0, 32 paddd mm7, mm0 movd eax, mm7 emms pop edi pop esi } } #pragma warning( default : 4035 ) /* gcc compiler section */ #elif defined( epl_LINUX ) || defined( CYGWIN ) /** * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0 */ int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) { int32 resultL; __asm__ __volatile__( "movl %1,%%eax\n\t" "movl %2,%%ebx\n\t" "movl %3,%%ecx\n\t" "pxor %%mm4,%%mm4\n\t" "pxor %%mm6,%%mm6\n\t" "pxor %%mm7, %%mm7\n\t" "shrl $4, %%ecx\n\t" "\n1:\t" "movq 0(%%eax),%%mm0\n\t" "paddd %%mm4,%%mm7\n\t" "movq 0( %%ebx ),%%mm1\n\t" "paddd %%mm6,%%mm7\n\t" "movq 8( %%eax ),%%mm2\n\t" "pmaddwd %%mm1,%%mm0\n\t" "movq 8( %%ebx ),%%mm3\n\t" "movq 16( %%eax ),%%mm4\n\t" "pmaddwd %%mm3,%%mm2\n\t" "movq 16( %%ebx ),%%mm5\n\t" "paddd %%mm0,%%mm7\n\t" "movq 24( %%eax ),%%mm6\n\t" "pmaddwd %%mm5,%%mm4\n\t" "pmaddwd 24( %%ebx ),%%mm6\n\t" "paddd %%mm2,%%mm7\n\t" "addl $32,%%eax\n\t" "addl $32,%%ebx\n\t" "decl %%ecx\n\t" "jnz 1b\n\t" "paddd %%mm4,%%mm7\n\t" "paddd %%mm6,%%mm7\n\t" "movq %%mm7,%%mm0\n\t" "psrlq $32,%%mm0\n\t" "paddd %%mm0,%%mm7\n\t" "movd %%mm7,%0\n\t" "emms\n\t" : "=&g" ( resultL ) : "g" ( vec1A ), "g" ( vec2A ), "g" ( sizeA ) : "si", "di", "ax", "bx", "cx", "st", "memory" ); return resultL; } #endif /* epl_LINUX, CYGWIN */ #endif /* HW_i586 || HW_i686 */ /* ------------------------------------------------------------------------- */ #ifdef HW_TMS320C6x /** * Calls fast assembler version of dotproduct for DSP. * dotProduct_C62x is implemented in file dotprod.asm and expects input vectors * of even length. */ int32 bbs_dotProduct_dsp( const int16* vec1A, const int16* vec2A, uint32 sizeA ) { if( sizeA & 1 ) { int32 resultL; resultL = dotProduct_C62x( vec1A, vec2A, sizeA - 1 ); return resultL + ( int32 ) *( vec1A + sizeA - 1 ) * *( vec2A + sizeA - 1 ); } else { return dotProduct_C62x( vec1A, vec2A, sizeA ); } } #endif /* HW_TMS320C6x */ /* ------------------------------------------------------------------------- */ /* 16 dot product for the PS2/EE processor */ /* input vectors MUST be 128 bit aligned ! */ #if defined( epl_LINUX ) && defined( HW_EE ) int32 bbs_dotProduct_EE( const int16* vec1A, const int16* vec2A, uint32 sizeA ) { int32 resultL = 0, iL = sizeA >> 3, jL = sizeA - ( iL << 3 ); if( iL > 0 ) { /* multiply-add elements of input vectors in sets of 8 */ int32 accL[ 4 ], t1L, t2L, t3L; asm volatile ( "pxor %4, %2, %2\n\t" /* reset 8 accumulators (LO and HI register) to 0 */ "pmtlo %4\n\t" "pmthi %4\n\t" "\n__begin_loop:\t" "lq %2,0(%0)\n\t" /* load 8 pairs of int16 */ "lq %3,0(%1)\n\t" "addi %0,%0,16\n\t" /* vec1L += 16 */ "addi %1,%1,16\n\t" /* vec2L += 16 */ "addi %7,%7,-1\n\t" /* iL-- */ "pmaddh %4,%2,%3\n\t" /* parallel multiply-add of 8 pairs of int16 */ "bgtzl %7,__begin_loop\n\t" /* if iL > 0 goto _begin_loop */ "pmflo %2\n\t" /* parallel add 8 accumulators , store remaining 4 accumulators in tmpL */ "pmfhi %3\n\t" "paddw %4,%2,%3\n\t" "sq %4,0(%8)\n\t" : "=r" ( vec1A ), "=r" ( vec2A ), "=r" ( t1L ), "=r" ( t2L ), "=r" ( t3L ) : "0" ( vec1A ), "1" ( vec2A ), "r" ( iL ), "r" ( accL ) : "memory" ); /* add 4 parallel accumulators */ resultL += accL[ 0 ] + accL[ 1 ] + accL[ 2 ] + accL[ 3 ]; } /* multiply-add remaining elements of input vectors */ for( ; jL--; ) resultL += ( int32 ) *vec1A++ * *vec2A++; return resultL; } #endif /* ------------------------------------------------------------------------- */ #if defined( HW_ARMv5TE ) /* fast 16 dot product for ARM9E cores (DSP extensions). * input vectors must be 32 bit aligned */ int32 bbs_dotProduct_arm9e( const int16* vec1A, const int16* vec2A, uint32 sizeA ) { int32 accuL = 0; int32* v1PtrL = ( int32* )vec1A; int32* v2PtrL = ( int32* )vec2A; for( ; sizeA >= 4; sizeA -= 4 ) { __asm { smlabb accuL, *v1PtrL, *v2PtrL, accuL; smlatt accuL, *v1PtrL, *v2PtrL, accuL; } v1PtrL++; v2PtrL++; __asm { smlabb accuL, *v1PtrL, *v2PtrL, accuL; smlatt accuL, *v1PtrL, *v2PtrL, accuL; } v1PtrL++; v2PtrL++; } vec1A = ( int16* )v1PtrL; vec2A = ( int16* )v2PtrL; /* multiply-add remaining elements of input vectors */ for( ; sizeA > 0; sizeA-- ) accuL += ( int32 )*vec1A++ * *vec2A++; return accuL; } #endif /* ------------------------------------------------------------------------- */ /** * Computes a fast dot product using standard C */ int32 bbs_dotProduct_stdc( const int16* vec1A, const int16* vec2A, uint32 sizeA ) { int32 accuL = 0; for( ; sizeA >= 8; sizeA -= 8 ) { accuL += ( int32 ) *vec1A * *vec2A; accuL += ( int32 ) *( vec1A + 1 ) * *( vec2A + 1 ); accuL += ( int32 ) *( vec1A + 2 ) * *( vec2A + 2 ); accuL += ( int32 ) *( vec1A + 3 ) * *( vec2A + 3 ); accuL += ( int32 ) *( vec1A + 4 ) * *( vec2A + 4 ); accuL += ( int32 ) *( vec1A + 5 ) * *( vec2A + 5 ); accuL += ( int32 ) *( vec1A + 6 ) * *( vec2A + 6 ); accuL += ( int32 ) *( vec1A + 7 ) * *( vec2A + 7 ); vec1A += 8; vec2A += 8; } for( ; sizeA; sizeA-- ) accuL += ( int32 ) *vec1A++ * *vec2A++; return accuL; } /* ------------------------------------------------------------------------- */ int32 bbs_dotProductInt16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) { /* PC */ #if ( defined( HW_i586 ) || defined( HW_i686 ) ) #if defined( HW_SSE2 ) uint32 size16L = sizeA & 0xfffffff0; if( size16L ) { if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 ) { return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA ); } else { return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA ); } } #elif !defined( WIN64 ) /* MMX version (not supported by 64-bit compiler) */ uint32 size16L = sizeA & 0xfffffff0; if( size16L ) { if( sizeA == size16L ) { return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L ); } return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L ) + bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L ); } /* if( size16L ) */ #endif return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); /* Playstation 2 */ #elif defined( HW_EE ) && defined( epl_LINUX ) if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 ) { return bbs_dotProduct_EE( vec1A, vec2A, sizeA ); } return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); /* ARM9E */ #elif defined( HW_ARMv5TE ) return bbs_dotProduct_arm9e( vec1A, vec2A, sizeA ); /* TI C6000 */ #elif defined( HW_TMS320C6x ) return bbs_dotProduct_dsp( vec1A, vec2A, sizeA ); #elif defined( HW_FR71 ) uint32 size16L = sizeA & 0xfffffff0; if( size16L ) { if( sizeA == size16L ) { return bbs_dotProduct_fr71( vec1A, vec2A, size16L ); } return bbs_dotProduct_fr71( vec1A, vec2A, size16L ) + bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L ); } return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); #endif return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); } /* ------------------------------------------------------------------------- */ /* table of fermi and slope values (result: 2.30; offset: .12) referenced in b_NeuralNetEm/FastMlpNet.c, not not rename or remove */ const uint32 bbs_fermi_tableG[] = { 45056, 8, 77824, 13, 131072, 21, 217088, 34, 356352, 57, 589824, 94, 974848, 155, 1609728, 255, 2654208, 418, 4366336, 688, 7184384, 1126, 11796480, 1834, 19308544, 2970, 31473664, 4748, 50921472, 7453, 81448960, 11363, 127991808, 16573, 195874816, 22680, 288772096, 28469, 405381120, 32102, 536870912, 32101, 668356608, 28469, 784965632, 22680, 877862912, 16573, 945745920, 11363, 992288768, 7453, 1022816256, 4748, 1042264064, 2970, 1054429184, 1834, 1061941248, 1126, 1066553344, 688, 1069371392, 418, 1071083520, 255, 1072128000, 155, 1072762880, 94, 1073147904, 57, 1073381376, 34, 1073520640, 21, 1073606656, 13, 1073659904, 8, }; int32 bbs_fermi( int32 valA ) { uint32 indexL = ( ( valA >> 15 ) + 20 ) << 1; uint32 offsL = ( ( valA & 0x00007FFF ) + 4 ) >> 3; if( valA < -655360 ) return 1; if( valA >= 655360 ) return 1073741824 - 1; /* ( 1 << 30 ) */ return ( bbs_fermi_tableG[ indexL ] + offsL * bbs_fermi_tableG[ indexL + 1 ] ); } /* ------------------------------------------------------------------------- */ void bbs_uint32ReduceToNBits( uint32* argPtrA, int32* bbpPtrA, uint32 nBitsA ) { int32 posHighestBitL = bbs_intLog2( *argPtrA ) + 1; int32 shiftL = posHighestBitL - nBitsA; if( shiftL > 0 ) { ( *argPtrA ) >>= shiftL; ( *bbpPtrA ) -= shiftL; } } /* ------------------------------------------------------------------------- */ void bbs_int32ReduceToNBits( int32* argPtrA, int32* bbpPtrA, uint32 nBitsA ) { int32 posHighestBitL = bbs_intLog2( bbs_abs( *argPtrA ) ) + 1; int32 shiftL = posHighestBitL - nBitsA; if( shiftL > 0 ) { ( *argPtrA ) >>= shiftL; ( *bbpPtrA ) -= shiftL; } } /* ------------------------------------------------------------------------- */ uint32 bbs_convertU32( uint32 srcA, int32 srcBbpA, int32 dstBbpA ) { if( dstBbpA >= srcBbpA ) { uint32 shiftL = dstBbpA - srcBbpA; if( srcA > ( ( uint32 )0xFFFFFFFF >> shiftL ) ) { /* overflow */ return ( uint32 )0xFFFFFFFF; } else { return srcA << shiftL; } } else { uint32 shiftL = srcBbpA - dstBbpA; uint32 addL = 1L << ( shiftL - 1 ); if( srcA + addL < addL ) { /* rounding would cause overflow */ return srcA >> shiftL; } else { return ( srcA + addL ) >> shiftL; } } } /* ------------------------------------------------------------------------- */ int32 bbs_convertS32( int32 srcA, int32 srcBbpA, int32 dstBbpA ) { if( dstBbpA >= srcBbpA ) { uint32 shiftL = ( uint32 )( dstBbpA - srcBbpA ); if( srcA > ( ( int32 )0x7FFFFFFF >> shiftL ) ) { /* overflow */ return ( uint32 )0x7FFFFFFF; } else if( srcA < ( ( int32 )0x80000000 >> shiftL ) ) { /* underflow */ return ( int32 )0x80000000; } else { return srcA << shiftL; } } else { uint32 shiftL = ( uint32 )( srcBbpA - dstBbpA ); int32 addL = 1L << ( shiftL - 1 ); if( srcA + addL < addL ) { /* rounding would cause overflow */ return srcA >> shiftL; } else { return ( srcA + addL ) >> shiftL; } } } /* ------------------------------------------------------------------------- */ int32 bbs_vecPowerFlt16( const int16 *xA, int16 nxA ) { /* #if defined( HW_TMS320C5x ) uint32 rL; power( ( int16* ) xA, ( int32* ) &rL, nxA ); // does not work properly in DSPLib version 2.20.02 return ( rL >> 1 ); #else*/ /* needs to be optimized */ int32 rL = 0; for( ; nxA--; ) { rL += ( int32 ) *xA * *xA; xA++; } return rL; /* #endif */ } /* ------------------------------------------------------------------------- */ void bbs_mulU32( uint32 v1A, uint32 v2A, uint32* manPtrA, int32* expPtrA ) { uint32 log1L = bbs_intLog2( v1A ); uint32 log2L = bbs_intLog2( v2A ); if( log1L + log2L < 32 ) { *manPtrA = v1A * v2A; *expPtrA = 0; } else { uint32 v1L = v1A; uint32 v2L = v2A; uint32 exp1L = 0; uint32 exp2L = 0; if( log1L > 15 && log2L > 15 ) { exp1L = log1L - 15; exp2L = log2L - 15; v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; } else if( log1L > 15 ) { exp1L = log1L + log2L - 31; v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; } else { exp2L = log1L + log2L - 31; v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; } *manPtrA = v1L * v2L; *expPtrA = exp1L + exp2L; } } /* ------------------------------------------------------------------------- */ void bbs_mulS32( int32 v1A, int32 v2A, int32* manPtrA, int32* expPtrA ) { uint32 log1L = bbs_intLog2( v1A > 0 ? v1A : -v1A ); uint32 log2L = bbs_intLog2( v2A > 0 ? v2A : -v2A ); if( log1L + log2L < 30 ) { *manPtrA = v1A * v2A; *expPtrA = 0; } else { int32 v1L = v1A; int32 v2L = v2A; int32 exp1L = 0; int32 exp2L = 0; if( log1L > 14 && log2L > 14 ) { exp1L = log1L - 14; exp2L = log2L - 14; v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; } else if( log1L > 14 ) { exp1L = log1L + log2L - 29; v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; } else { exp2L = log1L + log2L - 29; v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; } *manPtrA = v1L * v2L; *expPtrA = exp1L + exp2L; } } /* ------------------------------------------------------------------------- */ void bbs_vecSqrNorm32( const int32* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA ) { uint32 sumL = 0; int32 sumExpL = 0; uint32 iL; for( iL = 0; iL < sizeA; iL++ ) { int32 vL = vecA[ iL ]; int32 logL = bbs_intLog2( vL > 0 ? vL : -vL ); int32 expL = ( logL > 14 ) ? logL - 14 : 0; uint32 prdL; if( expL >= 1 ) { vL = ( ( vL >> ( expL - 1 ) ) + 1 ) >> 1; } else { vL = vL >> expL; } prdL = vL * vL; expL <<= 1; /* now exponent of product */ if( sumExpL > expL ) { uint32 shrL = sumExpL - expL; prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1; } else if( expL > sumExpL ) { uint32 shrL = expL - sumExpL; sumL = ( ( sumL >> ( shrL - 1 ) ) + 1 ) >> 1; sumExpL += shrL; } sumL += prdL; if( sumL > 0x80000000 ) { sumL = ( sumL + 1 ) >> 1; sumExpL++; } } /* make exponent even */ if( ( sumExpL & 1 ) != 0 ) { sumL = ( sumL + 1 ) >> 1; sumExpL++; } if( manPtrA != NULL ) *manPtrA = sumL; if( expPtrA != NULL ) *expPtrA = sumExpL; } /* ------------------------------------------------------------------------- */ void bbs_vecSqrNorm16( const int16* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA ) { uint32 sumL = 0; int32 sumExpL = 0; uint32 iL; for( iL = 0; iL < sizeA; iL++ ) { int32 vL = vecA[ iL ]; uint32 prdL = vL * vL; if( sumExpL > 0 ) { uint32 shrL = sumExpL; prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1; } sumL += prdL; if( sumL > 0x80000000 ) { sumL = ( sumL + 1 ) >> 1; sumExpL++; } } /* make exponent even */ if( ( sumExpL & 1 ) != 0 ) { sumL = ( sumL + 1 ) >> 1; sumExpL++; } if( manPtrA != NULL ) *manPtrA = sumL; if( expPtrA != NULL ) *expPtrA = sumExpL; } /* ------------------------------------------------------------------------- */ uint32 bbs_vecNorm16( const int16* vecA, uint32 sizeA ) { uint32 manL; uint32 expL; bbs_vecSqrNorm16( vecA, sizeA, &manL, &expL ); manL = bbs_sqrt32( manL ); return manL << ( expL >> 1 ); } /* ------------------------------------------------------------------------- */ void bbs_matMultiplyFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA ) { #if defined( HW_TMS320C5x ) /* operands need to be in internal memory for mmul*/ if( x1A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE || x2A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE ) { int16 iL,jL,kL; int16 *ptr1L, *ptr2L; int32 sumL; for( iL = 0; iL < row1A; iL++ ) { for( jL = 0; jL < col2A; jL++ ) { ptr1L = ( int16* ) x1A + iL * col1A; ptr2L = ( int16* ) x2A + jL; sumL = 0; for( kL = 0; kL < col1A; kL++ ) { sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); ptr2L += col2A; } *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ } } } else mmul( ( int16* ) x1A, row1A, col1A, ( int16* ) x2A, col1A, col2A, rA ); #elif defined( HW_ARMv4 ) || defined( HW_ARMv5TE ) int32 iL, jL, kL; int16 *ptr1L, *ptr2L; int32 sumL; for( iL = 0; iL < row1A; iL++ ) { for( jL = 0; jL < col2A; jL++ ) { ptr1L = ( int16* ) x1A + iL * col1A; ptr2L = ( int16* ) x2A + jL; sumL = 0; for( kL = col1A; kL >= 4; kL -= 4 ) { sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); ptr2L += col2A; } for( ; kL > 0; kL-- ) { sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); ptr2L += col2A; } *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ } } #else /* needs to be optimized */ int16 iL,jL,kL; int16 *ptr1L, *ptr2L; int32 sumL; for( iL = 0; iL < row1A; iL++ ) { for( jL = 0; jL < col2A; jL++ ) { ptr1L = ( int16* ) x1A + iL * col1A; ptr2L = ( int16* ) x2A + jL; sumL = 0; for( kL = 0; kL < col1A; kL++ ) { sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); ptr2L += col2A; } *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ } } #endif } /* ------------------------------------------------------------------------- */ void bbs_matMultiplyTranspFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA ) { const int16* ptr1L = x1A; int32 iL; for( iL = row1A; iL > 0 ; iL-- ) { int32 jL; const int16* ptr2L = x2A; for( jL = col2A; jL > 0 ; jL-- ) { int32 kL; int32 sumL = 0; for( kL = col1A >> 2; kL > 0; kL-- ) { sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); } for( kL = col1A & 3; kL > 0; kL-- ) { sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); } *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ ptr1L -= col1A; } ptr1L += col1A; } } /* ------------------------------------------------------------------------- */ #ifndef mtrans uint16 bbs_matTrans( int16 *xA, int16 rowA, int16 colA, int16 *rA ) { /* needs to be optimized */ int16 iL; for( iL = colA; iL--; ) { int16* sL = xA++; int16 jL; for( jL = rowA; jL--; ) { *rA++ = *sL; sL += colA; } } return 0; } #endif /* ------------------------------------------------------------------------- */ #ifndef atan2_16 int16 bbs_atan2( int16 nomA, int16 denomA ) { int16 phL, argL; if( nomA == denomA ) return 8192; argL = ( ( int32 ) nomA << 15 ) / denomA; /* 0.318253*2 x 20857 .15 +0.003314*2 x^2 217 .15 -0.130908*2 x^3 -8580 .15 +0.068542*2 x^4 4491 .15 -0.009159*2 x^5 -600 .15 */ phL = -600; phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 4481; phL = ( ( ( int32 ) phL * argL ) >> 15 ) - 8580; phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 217; phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 20857; phL = ( ( int32 ) phL * argL ) >> 15; return phL >> 1; /* /2 */ } /* needs to be optimized */ uint16 bbs_vecPhase( int16 *reA, int16 *imA, int16 *phaseA, uint16 sizeA ) { for( ; sizeA--; ) { int16 reL = *reA++; int16 imL = *imA++; int16 phL = 0; if( reL < 0 ) { reL = -reL; if( imL < 0 ) { imL = -imL; if( reL > imL ) { phL = -32768 + bbs_atan2( imL, reL ); } else { phL = -16384 - bbs_atan2( reL, imL ); } } else { if( reL > imL ) { phL = -( -32768 + bbs_atan2( imL, reL ) ); } else { if( imL == 0 ) phL = 0; else phL = 16384 + bbs_atan2( reL, imL ); } } } else { if( imL < 0 ) { imL = -imL; if( reL > imL ) { phL = -bbs_atan2( imL, reL ); } else { phL = -16384 + bbs_atan2( reL, imL ); } } else { if( reL > imL ) { phL = bbs_atan2( imL, reL ); } else { if( imL == 0 ) phL = 0; else phL = 16384 - bbs_atan2( reL, imL ); } } } *phaseA++ = phL; } return 0; } #endif /* ------------------------------------------------------------------------- */