;  libFLAC - Free Lossless Audio Codec library
;  Copyright (C) 2004,2005,2006,2007  Josh Coalson
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;
;  - Redistributions of source code must retain the above copyright
;  notice, this list of conditions and the following disclaimer.
;
;  - Redistributions in binary form must reproduce the above copyright
;  notice, this list of conditions and the following disclaimer in the
;  documentation and/or other materials provided with the distribution.
;
;  - Neither the name of the Xiph.org Foundation nor the names of its
;  contributors may be used to endorse or promote products derived from
;  this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.text
	.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]

; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; this is a PowerPC/AltiVec assembly version which requires bps<=16 (or actual
; bps<=15 for mid-side coding, since that uses an extra bit)
; this should be fast; the inner loop is unrolled (it takes no more than
; 3*(order/4) instructions, all of which are arithmetic), and all of the
; coefficients and all relevant history stay in registers, so the outer loop
; has only one load from memory (the residual)
; I have not yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop
; the branch mechanism may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method
; (a commented C sketch of the equivalent scalar routine appears at the end of
; this file)

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31		; for quadword-aligned stack data

	slwi r6,r6,2		; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8		; r4 = data+data_len

	mfspr r0,256		; cache old vrsave
	addis r31,0,hi16(0xfffffc00)
	ori r31,r31,lo16(0xfffffc00)
	mtspr 256,r31		; declare VRs in vrsave

	cmplw cr0,r8,r4		; i<data_len
	bc 4,0,L1400

	; load coefficients into v0-v7 and initial history into v8-v15
	li r31,0xf
	and r31,r8,r31		; r31: data%4
	li r11,16
	subf r31,r31,r11	; r31: 4-(data%4)
	slwi r31,r31,3		; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v18,-1
	vsro v18,v18,v0		; v18: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2	; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v17,0,r5		; v17: coefficient alignment permutation vector
	vperm v17,v17,v17,v0	; v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11		; v16: history alignment permutation vector
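	; the block below pulls in the coefficient and history quadwords, permuting
	; them into the reversed/aligned layout the inner loop expects; after each
	; quadword it compares r5 against qlp_coeff+order*4 and, once all
	; coefficients are loaded, masks the final coefficient vector and selects
	; the entry point (L1300-L1307) into the unrolled inner loop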
	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16
	cmplw cr0,r5,r10
	bc 12,0,L1101
	vand v0,v0,v18
	addis r31,0,hi16(L1307)
	ori r31,r31,lo16(L1307)
	b L1199

L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,hi16(L1306)
	ori r31,r31,lo16(L1306)
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	addis r31,0,hi16(L1305)
	ori r31,r31,lo16(L1305)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	addis r31,0,hi16(L1304)
	ori r31,r31,lo16(L1304)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	addis r31,0,hi16(L1303)
	ori r31,r31,lo16(L1303)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	addis r31,0,hi16(L1302)
	ori r31,r31,lo16(L1302)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	addis r31,0,hi16(L1301)
	ori r31,r31,lo16(L1301)
	b L1199

L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	addis r31,0,hi16(L1300)
	ori r31,r31,lo16(L1300)

L1199:
	mtctr r31

	; set up invariant vectors
	vspltish v16,0		; v16: zero vector
	li r10,-12
	lvsr v17,r10,r8		; v17: result shift vector
	lvsl v18,r10,r3		; v18: residual shift back vector
	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9	; v19: lp_quantization vector

L1200:
	vmulosh v20,v0,v8	; v20: sum vector
	bcctr 20,0

L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4	; increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16	; v20[3]: sum
	vsraw v20,v20,v19	; v20[3]: sum >> lp_quantization
	lvewx v21,0,r3		; v21[n]: *residual
	vperm v21,v21,v21,v18	; v21[3]: *residual
	vaddsws v20,v21,v20	; v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4	; increment shift vector
	vperm v21,v20,v20,v17	; v21[n]: shift for storage
	vsldoi v17,v17,v17,12	; increment shift vector
	stvewx v21,0,r8
	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4	; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4		; i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0		; restore old vrsave
	lmw r31,-4(r1)
	blr

_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]

; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31		; for quadword-aligned stack data

	slwi r6,r6,2		; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8		; r4 = data+data_len

	mfspr r0,256		; cache old vrsave
	addis r31,0,hi16(0xffc00000)
	ori r31,r31,lo16(0xffc00000)
	mtspr 256,r31		; declare VRs in vrsave

	cmplw cr0,r8,r4		; i<data_len
	bc 4,0,L2400

	; load coefficients into v0-v1 and initial history into v2-v3
	li r31,0xf
	and r31,r8,r31		; r31: data%4
	li r11,16
	subf r31,r31,r11	; r31: 4-(data%4)
	slwi r31,r31,3		; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0		; v6: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2	; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v5,0,r5		; v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0	; v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11		; v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10
	bc 12,0,L2101
	vand v0,v0,v6
	addis r31,0,hi16(L2301)
	ori r31,r31,lo16(L2301)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	addis r31,0,hi16(L2300)
	ori r31,r31,lo16(L2300)

L2199:
	mtctr r31

	; set up invariant vectors
	vspltish v4,0		; v4: zero vector
	li r10,-12
	lvsr v5,r10,r8		; v5: result shift vector
	lvsl v6,r10,r3		; v6: residual shift back vector
	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9		; v7: lp_quantization vector

L2200:
	vmulosh v8,v0,v2	; v8: sum vector
	bcctr 20,0

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4	; v8[3]: sum
	vsraw v8,v8,v7		; v8[3]: sum >> lp_quantization
	lvewx v9,0,r3		; v9[n]: *residual
	vperm v9,v9,v9,v6	; v9[3]: *residual
	vaddsws v8,v9,v8	; v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4	; increment shift vector
	vperm v9,v8,v8,v5	; v9[n]: shift for storage
	vsldoi v5,v5,v5,12	; increment shift vector
	stvewx v9,0,r8
	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4	; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4		; i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0		; restore old vrsave
	lmw r31,-4(r1)
	blr
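
; --------------------------------------------------------------------------
; For reference, a minimal C sketch of the scalar algorithm that both routines
; above implement (see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()). It is
; illustrative only: the function and variable names here are not the exact
; ones used in lpc.c, and it assumes data[] is preceded by at least `order`
; warm-up samples, as the real callers guarantee. A 32-bit sum suffices only
; because these routines require bps<=16 (see the note above).
;
;	#include <stdint.h>
;
;	static void lpc_restore_signal_sketch(
;		const int32_t residual[], unsigned data_len,
;		const int32_t qlp_coeff[], unsigned order,
;		int lp_quantization, int32_t data[])
;	{
;		for(int i = 0; i < (int)data_len; i++) {
;			int32_t sum = 0;
;			for(int j = 0; j < (int)order; j++)
;				sum += qlp_coeff[j] * data[i - j - 1];	/* predict from history */
;			data[i] = residual[i] + (sum >> lp_quantization);
;		}
;	}
;
; The AltiVec code keeps qlp_coeff[] and the history window entirely in vector
; registers and evaluates the inner sum with vmulosh/vaddsws/vsumsws, so the
; only per-sample memory traffic is the residual load and the data store.
; --------------------------------------------------------------------------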