///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// Syntax: GNU assembler, AArch64 (A64 + Advanced SIMD). Comments use "//".

// ---------------------------------------------------------------------------
// push_v_regs
// Saves q8-q15 (4 x 32 bytes) and x8-x17, x29, x30 (6 x 16 bytes) with
// pre-decrement stores: 224 bytes in total, so sp keeps its 16-byte
// alignment. Must be paired with pop_v_regs.
// ---------------------------------------------------------------------------
.macro push_v_regs
    stp     q8, q9, [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    stp     X8, X9, [sp, #-16]!
    stp     X10, X11, [sp, #-16]!
    stp     X12, X13, [sp, #-16]!
    stp     X14, X15, [sp, #-16]!
    stp     X16, X17, [sp, #-16]!
    stp     X29, X30, [sp, #-16]!
.endm

// ---------------------------------------------------------------------------
// pop_v_regs
// Restores everything push_v_regs saved, in exact reverse order.
// ---------------------------------------------------------------------------
.macro pop_v_regs
    ldp     X29, X30, [sp], #16
    ldp     X16, X17, [sp], #16
    ldp     X14, X15, [sp], #16
    ldp     X12, X13, [sp], #16
    ldp     X10, X11, [sp], #16
    ldp     X8, X9, [sp], #16
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8, q9, [sp], #32
.endm

// ---------------------------------------------------------------------------
// swp reg1, reg2
// Exchanges two operands through x16 (x16 is clobbered). In this file it is
// only invoked with 64-bit vector elements such as v4.D[0].
// NOTE(review): the macro name matches the mnemonic of the Armv8.1 LSE SWP
// instruction; that instruction is not used in this file.
// ---------------------------------------------------------------------------
.macro swp reg1, reg2
    MOV     x16, \reg1
    MOV     \reg1, \reg2
    MOV     \reg2, x16
.endm

.text
.p2align 2
.global ixheaacd_imdct_using_fft_armv8

// ---------------------------------------------------------------------------
// ixheaacd_imdct_using_fft_armv8
//
// Fixed-point complex FFT: one radix-8 or radix-4 first stage that gathers
// the input through a byte index table, followed by x8 in-place radix-4
// stages that apply 16-bit twiddle factors.
//
// Inputs, as consumed by the code below:
//   x0 : base of a table block. Byte index tables are read at x0+11600,
//        x0+11856, x0+11920 and x0+11936; 16-bit twiddle pairs are read
//        starting at x0+8528.
//   w1 : transform length N. 0x400, 0x200, 0x100, 0x80 and 0x40 are matched
//        explicitly; any other value takes the COND_6 path (presumably
//        N = 0x20 - TODO confirm against callers).
//   x2 : input buffer; one element = two 32-bit words (8 bytes).
//   x3 : output buffer; N elements are written by the first stage and then
//        updated in place by the later stages.
//
// Output: results are left in the buffer passed in x3. x0 is overwritten and
//         carries no result.
//
// Saved and restored here: x8-x17, x29, x30, q8-q15.
// Clobbered: x0-x7, v0-v7, v16-v31, condition flags.
// x29 and x30 are used as plain scratch registers between the prologue and
// the epilogue, so frame-pointer based unwinding is not possible inside.
// ---------------------------------------------------------------------------
ixheaacd_imdct_using_fft_armv8:
    push_v_regs

    // Candidate index-table addresses; the dispatch below picks one into x4.
    LDR     X29, =11600
    ADD     X4, X0, X29
    LDR     X29, =11856
    ADD     X5, X0, X29
    LDR     X29, =11920
    ADD     X6, X0, X29
    LDR     X29, =11936
    ADD     X7, X0, X29

    // Dispatch on N. Each case sets x8 = number of radix-4 stages that follow
    // the first stage and x4 = index table for the first stage.
    // The compares use w1 rather than x1: every later use of the length reads
    // it through w1, and if the C prototype declares it as a 32-bit integer
    // (not visible from this file) AAPCS64 leaves bits 63:32 of x1
    // unspecified, so a 64-bit compare could miss a valid length.
COND_1:
    CMP     W1, #0x400
    BNE     COND_2
    MOV     X8, #4
    B       RADIX_4_FIRST_START

COND_2:
    CMP     W1, #0x200
    BNE     COND_3
    MOV     X8, #3
    MOV     X4, X5
    B       RADIX_8_FIRST_START

COND_3:
    CMP     W1, #0x100
    BNE     COND_4
    MOV     X8, #3
    MOV     X4, X5
    B       RADIX_4_FIRST_START

COND_4:
    CMP     W1, #0x80
    BNE     COND_5
    MOV     X8, #2
    MOV     X4, X6
    B       RADIX_8_FIRST_START

COND_5:
    CMP     W1, #0x40
    BNE     COND_6
    MOV     X8, #2
    MOV     X4, X6
    B       RADIX_4_FIRST_START

COND_6:
    MOV     X8, #1
    MOV     X4, X7
    // falls through into the radix-8 first stage

    // -----------------------------------------------------------------------
    // Radix-8 first stage.
    // w9 = N/32 passes. Each pass consumes four index-table bytes (one per
    // vector lane) and eight input elements per lane, and writes 32 output
    // elements. x1 becomes the load stride 2*N bytes (= N/4 elements).
    // -----------------------------------------------------------------------
RADIX_8_FIRST_START:
    LSR     W9, W1, #5
    LSL     W1, W1, #1

RADIX_8_FIRST_LOOP:
    // x5, x6, x7, x11 = per-lane input pointers, all starting at the buffer.
    MOV     X5, X2
    MOV     X6, X2
    MOV     X7, X2
    MOV     X11, X2

    // Lane 0: element index from the table, scaled by 8 bytes per element.
    // Loads land at offsets 0, 2s, s, 3s (s = x1), then x5 is rewound.
    LDRB    W12, [X4]
    ADD     X5, X5, X12, LSL #3
    LD2     {v0.S, v1.S}[0], [X5], X1
    ADD     X5, X5, X1
    LD2     {v4.S, v5.S}[0], [X5], X1
    SUB     X5, X5, X1, LSL #1
    LD2     {v2.S, v3.S}[0], [X5], X1
    ADD     X5, X5, X1
    LD2     {v6.S, v7.S}[0], [X5], X1
    SUB     X5, X5, X1, LSL #2

    // Lane 1.
    LDRB    W12, [X4, #1]
    ADD     X6, X6, X12, LSL #3
    LD2     {v0.S, v1.S}[1], [X6], X1
    ADD     X6, X6, X1
    LD2     {v4.S, v5.S}[1], [X6], X1
    SUB     X6, X6, X1, LSL #1
    LD2     {v2.S, v3.S}[1], [X6], X1
    ADD     X6, X6, X1
    LD2     {v6.S, v7.S}[1], [X6], X1
    SUB     X6, X6, X1, LSL #2

    // Lanes 2 and 3; from here on the loads are interleaved with butterfly
    // arithmetic, so the instruction order must be kept as is.
    LDRB    W12, [X4, #2]
    ADD     X7, X7, X12, LSL #3
    LD2     {v0.S, v1.S}[2], [X7], X1
    ADD     X7, X7, X1
    LD2     {v4.S, v5.S}[2], [X7], X1
    SUB     X7, X7, X1, LSL #1
    LDRB    W12, [X4, #3]
    ADD     X11, X11, X12, LSL #3
    LD2     {v0.S, v1.S}[3], [X11], X1
    ADD     X11, X11, X1
    LD2     {v4.S, v5.S}[3], [X11], X1
    SUB     X11, X11, X1, LSL #1
    ADD     v8.4S, v0.4S, v4.4S
    LD2     {v2.S, v3.S}[2], [X7], X1
    ADD     X7, X7, X1
    SUB     v9.4S, v0.4S, v4.4S
    LD2     {v6.S, v7.S}[2], [X7], X1
    SUB     X7, X7, X1, LSL #2
    ADD     v0.4S, v1.4S, v5.4S
    LD2     {v2.S, v3.S}[3], [X11], X1
    ADD     X11, X11, X1
    SUB     v4.4S, v1.4S, v5.4S
    LD2     {v6.S, v7.S}[3], [X11], X1
    SUB     X11, X11, X1, LSL #2

    // Advance the index table by the four bytes just used and move every
    // lane pointer by half a stride to reach the other four inputs.
    ADD     X4, X4, #4
    ADD     X5, X5, X1, LSR #1
    ADD     X6, X6, X1, LSR #1
    ADD     X7, X7, X1, LSR #1
    ADD     X11, X11, X1, LSR #1

    ADD     v1.4S, v2.4S, v6.4S
    LD2     {v14.S, v15.S}[0], [X5], X1
    SUB     v5.4S, v2.4S, v6.4S
    LD2     {v10.S, v11.S}[0], [X5], X1
    ADD     v2.4S, v3.4S, v7.4S
    LD2     {v12.S, v13.S}[0], [X5], X1
    SUB     v6.4S, v3.4S, v7.4S
    LD2     {v14.S, v15.S}[1], [X6], X1
    ADD     v3.4S, v9.4S, v6.4S
    LD2     {v10.S, v11.S}[1], [X6], X1
    SUB     v7.4S, v9.4S, v6.4S
    LD2     {v12.S, v13.S}[1], [X6], X1
    SUB     v6.4S, v4.4S, v5.4S
    LD2     {v14.S, v15.S}[2], [X7], X1
    ADD     v9.4S, v4.4S, v5.4S
    LD2     {v10.S, v11.S}[2], [X7], X1
    ADD     v4.4S, v8.4S, v1.4S
    LD2     {v12.S, v13.S}[2], [X7], X1
    SUB     v5.4S, v8.4S, v1.4S
    LD2     {v14.S, v15.S}[3], [X11], X1
    ADD     v8.4S, v0.4S, v2.4S
    LD2     {v10.S, v11.S}[3], [X11], X1
    SUB     v0.4S, v0.4S, v2.4S
    LD2     {v12.S, v13.S}[3], [X11], X1
    LD2     {v1.S, v2.S}[0], [X5], X1
    ADD     v17.4S, v14.4S, v12.4S
    LD2     {v1.S, v2.S}[1], [X6], X1
    SUB     v16.4S, v14.4S, v12.4S
    LD2     {v1.S, v2.S}[2], [X7], X1
    ADD     v14.4S, v15.4S, v13.4S
    LD2     {v1.S, v2.S}[3], [X11], X1
    SUB     v12.4S, v15.4S, v13.4S
    ADD     v15.4S, v10.4S, v1.4S
    SUB     v13.4S, v10.4S, v1.4S
    ADD     v10.4S, v11.4S, v2.4S
    SUB     v1.4S, v11.4S, v2.4S
    ADD     v11.4S, v17.4S, v15.4S
    SUB     v2.4S, v17.4S, v15.4S
    ADD     v17.4S, v14.4S, v10.4S
    SUB     v15.4S, v14.4S, v10.4S
    ADD     v14.4S, v16.4S, v12.4S
    SUB     v10.4S, v16.4S, v12.4S
    ADD     v16.4S, v13.4S, v1.4S
    SUB     v12.4S, v13.4S, v1.4S
    ADD     v1.4S, v14.4S, v12.4S
    SUB     v13.4S, v14.4S, v12.4S
    SUB     v12.4S, v16.4S, v10.4S

    // UZP1/UZP2 split each 32-bit value into its low and high 16-bit halves
    // so that it can be multiplied by a 16-bit constant in two steps.
    UZP1    v22.8H, v1.8H, v1.8H
    UZP2    v23.8H, v1.8H, v1.8H
    ADD     v14.4S, v16.4S, v10.4S
    UZP1    v26.8H, v13.8H, v13.8H
    UZP2    v27.8H, v13.8H, v13.8H
    ADD     v16.4S, v4.4S, v11.4S
    UZP1    v24.8H, v12.8H, v12.8H
    UZP2    v25.8H, v12.8H, v12.8H
    SUB     v10.4S, v4.4S, v11.4S
    UZP1    v28.8H, v14.8H, v14.8H
    UZP2    v29.8H, v14.8H, v14.8H
    ADD     v4.4S, v8.4S, v17.4S
    MOV     W14, #0x5a82                // 23170, about 2^15 / sqrt(2)
    SUB     v11.4S, v8.4S, v17.4S
    ADD     v8.4S, v5.4S, v15.4S
    SUB     v17.4S, v5.4S, v15.4S
    SUB     v5.4S, v0.4S, v2.4S
    ADD     v15.4S, v0.4S, v2.4S
    DUP     v31.4H, W14

    // value * 0x5a82 in Q15: (low half * c) >> 15, plus 2 * (high half * c).
    UMULL   v19.4S, v26.4H, v31.4H
    UMULL   v18.4S, v28.4H, v31.4H
    SSHR    v19.4S, v19.4S, #15
    SSHR    v18.4S, v18.4S, #15
    SQDMLAL v19.4S, v27.4H, v31.4H
    SQDMLAL v18.4S, v29.4H, v31.4H
    UMULL   v13.4S, v24.4H, v31.4H
    UMULL   v14.4S, v22.4H, v31.4H
    ADD     v20.4S, v3.4S, v19.4S
    SUB     v21.4S, v3.4S, v19.4S
    ADD     v30.4S, v6.4S, v18.4S
    SUB     v6.4S, v6.4S, v18.4S
    SSHR    v13.4S, v13.4S, #15
    SSHR    v14.4S, v14.4S, #15
    SQDMLAL v13.4S, v25.4H, v31.4H
    SQDMLAL v14.4S, v23.4H, v31.4H
    ADD     v3.4S, v7.4S, v13.4S
    SUB     v19.4S, v7.4S, v13.4S
    ADD     v1.4S, v9.4S, v14.4S
    SUB     v18.4S, v9.4S, v14.4S

    // Exchange whole registers (both 64-bit halves) before the transpose.
    swp     v17.D[0], v8.D[0]
    swp     v17.D[1], v8.D[1]
    swp     v4.D[0], v16.D[0]
    swp     v4.D[1], v16.D[1]

    // Transpose the lane-major results for storing and scale them by 2^3.
    TRN1    v12.4S, v4.4S, v20.4S
    TRN2    v22.4S, v4.4S, v20.4S
    SHL     v12.4S, v12.4S, #3
    TRN1    v9.4S, v17.4S, v3.4S
    TRN2    v2.4S, v17.4S, v3.4S
    SHL     v22.4S, v22.4S, #3
    SHL     v9.4S, v9.4S, #3
    TRN1    v24.4S, v10.4S, v21.4S
    TRN2    v7.4S, v10.4S, v21.4S
    SHL     v2.4S, v2.4S, #3
    SHL     v24.4S, v24.4S, #3
    TRN1    v13.4S, v16.4S, v6.4S
    TRN2    v23.4S, v16.4S, v6.4S
    SHL     v7.4S, v7.4S, #3
    SHL     v13.4S, v13.4S, #3
    TRN1    v10.4S, v5.4S, v18.4S
    TRN2    v3.4S, v5.4S, v18.4S
    SHL     v23.4S, v23.4S, #3
    SHL     v10.4S, v10.4S, #3
    TRN1    v26.4S, v8.4S, v19.4S
    TRN2    v4.4S, v8.4S, v19.4S
    SHL     v3.4S, v3.4S, #3
    SHL     v26.4S, v26.4S, #3
    TRN1    v25.4S, v11.4S, v30.4S
    TRN2    v8.4S, v11.4S, v30.4S
    SHL     v4.4S, v4.4S, #3
    SHL     v25.4S, v25.4S, #3
    TRN1    v27.4S, v15.4S, v1.4S
    TRN2    v5.4S, v15.4S, v1.4S
    SHL     v8.4S, v8.4S, #3
    SHL     v27.4S, v27.4S, #3
    swp     v9.D[0], v12.D[1]
    SHL     v5.4S, v5.4S, #3
    swp     v2.D[0], v22.D[1]
    swp     v24.D[1], v26.D[0]
    swp     v7.D[1], v4.D[0]
    swp     v10.D[0], v13.D[1]
    swp     v3.D[0], v23.D[1]
    swp     v27.D[0], v25.D[1]
    swp     v5.D[0], v8.D[1]

    // Store 8 x 32 bytes = 32 output elements.
    MOV     X15, #32
    ST2     {v12.4S, v13.4S}, [X3], X15
    ST2     {v24.4S, v25.4S}, [X3], X15
    ST2     {v22.4S, v23.4S}, [X3], X15
    ST2     {v7.4S, v8.4S}, [X3], X15
    ST2     {v9.4S, v10.4S}, [X3], X15
    ST2     {v26.4S, v27.4S}, [X3], X15
    ST2     {v2.4S, v3.4S}, [X3], X15
    ST2     {v4.4S, v5.4S}, [X3], X15
    SUBS    X9, X9, #1
    BNE     RADIX_8_FIRST_LOOP

    // Restore x1 = N, rewind x3 by the N*8 bytes just written, and set up
    // the parameters of the following radix-4 stages.
    LSR     X1, X1, #1
    LSL     X15, X1, #3
    SUB     X3, X3, X15
    MOV     X5, #8
    MOV     X4, #32
    LSR     X15, X1, #5
    MOV     X6, X15                     // x6 = N/32 inner-loop passes
    B       RADIX_4_FIRST_ENDS
RADIX_8_FIRST_ENDS:

    // -----------------------------------------------------------------------
    // Radix-4 first stage.
    // w9 = N/16 passes. Each pass consumes four index-table bytes (one per
    // vector lane) and four input elements per lane, and writes 16 output
    // elements. x1 becomes the load stride 2*N bytes (= N/4 elements).
    // -----------------------------------------------------------------------
RADIX_4_FIRST_START:
    LSR     W9, W1, #4
    LSL     W1, W1, #1

RADIX_4_LOOP:
    MOV     X5, X2
    MOV     X6, X2
    MOV     X7, X2
    MOV     X11, X2

    // Lane 0: loads land at offsets 0, 2s, s, 3s (s = x1).
    LDRB    W12, [X4, #0]
    ADD     X5, X5, X12, LSL #3
    LD2     {v0.S, v1.S}[0], [X5], X1
    ADD     X5, X5, X1
    LD2     {v8.S, v9.S}[0], [X5], X1
    SUB     X5, X5, X1, LSL #1
    LD2     {v4.S, v5.S}[0], [X5], X1
    ADD     X5, X5, X1
    LD2     {v12.S, v13.S}[0], [X5], X1

    // Lane 1.
    LDRB    W12, [X4, #1]
    ADD     X6, X6, X12, LSL #3
    LD2     {v0.S, v1.S}[1], [X6], X1
    ADD     X6, X6, X1
    LD2     {v8.S, v9.S}[1], [X6], X1
    SUB     X6, X6, X1, LSL #1
    LD2     {v4.S, v5.S}[1], [X6], X1
    ADD     X6, X6, X1
    LD2     {v12.S, v13.S}[1], [X6], X1

    // Lanes 2 and 3, interleaved with the first butterfly additions.
    LDRB    W12, [X4, #2]
    ADD     X7, X7, X12, LSL #3
    LD2     {v0.S, v1.S}[2], [X7], X1
    ADD     X7, X7, X1
    LD2     {v8.S, v9.S}[2], [X7], X1
    LDRB    W12, [X4, #3]
    ADD     X11, X11, X12, LSL #3
    LD2     {v0.S, v1.S}[3], [X11], X1
    ADD     X11, X11, X1
    LD2     {v8.S, v9.S}[3], [X11], X1
    SUB     X7, X7, X1, LSL #1
    ADD     v16.4S, v0.4S, v8.4S
    LD2     {v4.S, v5.S}[2], [X7], X1
    ADD     X7, X7, X1
    ADD     v18.4S, v1.4S, v9.4S
    LD2     {v12.S, v13.S}[2], [X7], X1
    SUB     X11, X11, X1, LSL #1
    SUB     v20.4S, v0.4S, v8.4S
    LD2     {v4.S, v5.S}[3], [X11], X1
    ADD     X11, X11, X1
    SUB     v22.4S, v1.4S, v9.4S
    LD2     {v12.S, v13.S}[3], [X11], X1
    ADD     X4, X4, #4                  // next four index-table bytes

    // Remaining radix-4 butterfly additions and subtractions.
    ADD     v24.4S, v4.4S, v12.4S
    ADD     v26.4S, v5.4S, v13.4S
    SUB     v28.4S, v4.4S, v12.4S
    SUB     v30.4S, v5.4S, v13.4S
    ADD     v17.4S, v16.4S, v24.4S
    ADD     v11.4S, v18.4S, v26.4S
    SUB     v19.4S, v16.4S, v24.4S
    SUB     v15.4S, v18.4S, v26.4S
    ADD     v8.4S, v20.4S, v30.4S
    SUB     v9.4S, v22.4S, v28.4S
    ADD     v13.4S, v22.4S, v28.4S
    SUB     v12.4S, v20.4S, v30.4S

    // Transpose for storing and scale the results by 2^2.
    TRN1    v0.4S, v17.4S, v8.4S
    TRN2    v8.4S, v17.4S, v8.4S
    SHL     v0.4S, v0.4S, #2
    TRN1    v4.4S, v19.4S, v12.4S
    TRN2    v12.4S, v19.4S, v12.4S
    SHL     v8.4S, v8.4S, #2
    SHL     v4.4S, v4.4S, #2
    TRN1    v1.4S, v11.4S, v9.4S
    TRN2    v9.4S, v11.4S, v9.4S
    SHL     v12.4S, v12.4S, #2
    SHL     v1.4S, v1.4S, #2
    TRN1    v5.4S, v15.4S, v13.4S
    TRN2    v13.4S, v15.4S, v13.4S
    SHL     v9.4S, v9.4S, #2
    SHL     v5.4S, v5.4S, #2
    swp     v4.D[0], v0.D[1]
    SHL     v13.4S, v13.4S, #2
    swp     v12.D[0], v8.D[1]
    swp     v5.D[0], v1.D[1]
    swp     v13.D[0], v9.D[1]

    // Store 4 x 32 bytes = 16 output elements.
    MOV     X15, #32
    ST2     {v0.4S, v1.4S}, [X3], X15
    ST2     {v8.4S, v9.4S}, [X3], X15
    ST2     {v4.4S, v5.4S}, [X3], X15
    ST2     {v12.4S, v13.4S}, [X3], X15
    SUBS    W9, W9, #1
    BNE     RADIX_4_LOOP

    // Restore x1 = N, rewind x3 by the N*8 bytes just written, and set up
    // the parameters of the following radix-4 stages.
    LSR     X1, X1, #1
    SUB     X3, X3, X1, LSL #3
    MOV     X5, #4
    MOV     X4, #64
    LSR     X6, X1, #4                  // x6 = N/16 inner-loop passes

    // -----------------------------------------------------------------------
    // Remaining radix-4 stages, in place on the output buffer.
    //   x30 : output buffer base (x3 is reused as scratch from here on)
    //   x0  : twiddle table base (original x0 + 8528)
    //   x8  : stages left
    //   x5  : middle-loop passes per stage (groups of four columns)
    //   x6  : inner-loop passes per column group
    //   x4  : twiddle step; x4*4 bytes separate neighbouring columns
    //   x12 : byte distance between the four legs of one butterfly (x5*32)
    // -----------------------------------------------------------------------
RADIX_4_FIRST_ENDS:
    MOV     x30, X3
    LSR     X5, X5, #2
    LDR     X14, =8528
    ADD     X0, X0, X14

OUTER_LOOP_R4:
    MOV     X14, x30                    // x14 = running data pointer
    MOV     X7, X5                      // x7  = middle-loop counter
    MOV     X2, #0                      // x2  = twiddle byte offset of column
    MOV     X9, X0
    LSL     X12, X5, #5

MIDDLE_LOOP_R4:
    // Gather three twiddle pairs per column, one column per lane, at byte
    // offsets k, 2k and 3k from x0, where k grows by x4*4 per column:
    //   v20/v21 -> leg 1, v22/v23 -> leg 2, v24/v25 -> leg 3.
    LD2     {v20.H, v21.H}[0], [X9], X2
    LD2     {v22.H, v23.H}[0], [X9], X2
    ADD     X11, X2, X4, LSL #2
    LD2     {v24.H, v25.H}[0], [X9]
    ADD     X10, X0, X11
    LD2     {v20.H, v21.H}[1], [X10], X11
    LD2     {v22.H, v23.H}[1], [X10], X11
    ADD     X2, X11, X4, LSL #2
    LD2     {v24.H, v25.H}[1], [X10]
    ADD     X9, X0, X2
    LD2     {v20.H, v21.H}[2], [X9], X2
    LD2     {v22.H, v23.H}[2], [X9], X2
    ADD     X11, X2, X4, LSL #2
    LD2     {v24.H, v25.H}[2], [X9]
    ADD     X10, X0, X11
    LD2     {v20.H, v21.H}[3], [X10], X11
    LD2     {v22.H, v23.H}[3], [X10], X11
    ADD     X2, X11, X4, LSL #2
    LD2     {v24.H, v25.H}[3], [X10]
    ADD     X9, X0, X2
    MOV     X10, X6                     // x10 = inner-loop counter

INNER_LOOP_R4:
    // Leg 0 is loaded as 32-bit pairs and halved. Legs 1-3 are loaded with
    // LD4 so that every 32-bit word arrives split into 16-bit halves (first
    // word low/high, second word low/high). Each leg is then multiplied by
    // its twiddle pair as a complex product built from the halves:
    //   ((low >> 1) * w) >> 15, plus high * w.
    LD2     {v30.4S, v31.4S}, [X14], X12
    SSHR    v30.4S, v30.4S, #1
    LD4     {v16.4H, v17.4H, v18.4H, v19.4H}, [X14], X12
    SSHR    v31.4S, v31.4S, #1
    USHR    v16.4H, v16.4H, #1
    LD4     {v26.4H, v27.4H, v28.4H, v29.4H}, [X14], X12
    USHR    v18.4H, v18.4H, #1
    SMULL   v11.4S, v16.4H, v20.4H
    SMLSL   v11.4S, v18.4H, v21.4H
    LD4     {v0.4H, v1.4H, v2.4H, v3.4H}, [X14], X12
    SMULL   v12.4S, v16.4H, v21.4H
    SMLAL   v12.4S, v18.4H, v20.4H
    USHR    v26.4H, v26.4H, #1
    USHR    v28.4H, v28.4H, #1
    LSL     x29, X12, #2                // x29 = 4 * leg distance, used below
    SUB     X14, X14, X12, LSL #2       // back to leg 0 of this butterfly
    USHR    v0.4H, v0.4H, #1
    USHR    v2.4H, v2.4H, #1
    SMULL   v13.4S, v26.4H, v22.4H
    SMLSL   v13.4S, v28.4H, v23.4H
    SSHR    v11.4S, v11.4S, #15
    SMULL   v14.4S, v26.4H, v23.4H
    SMLAL   v14.4S, v28.4H, v22.4H
    SMULL   v15.4S, v0.4H, v24.4H
    SMLSL   v15.4S, v2.4H, v25.4H
    SMLAL   v11.4S, v17.4H, v20.4H
    SMLSL   v11.4S, v19.4H, v21.4H
    SSHR    v12.4S, v12.4S, #15
    SSHR    v13.4S, v13.4S, #15
    SSHR    v14.4S, v14.4S, #15
    SSHR    v15.4S, v15.4S, #15
    SMLAL   v12.4S, v17.4H, v21.4H
    SMLAL   v12.4S, v19.4H, v20.4H
    SMULL   v5.4S, v0.4H, v25.4H
    SMLAL   v5.4S, v2.4H, v24.4H
    SMLAL   v13.4S, v27.4H, v22.4H
    SMLSL   v13.4S, v29.4H, v23.4H
    SMLAL   v14.4S, v27.4H, v23.4H
    SMLAL   v14.4S, v29.4H, v22.4H
    SMLAL   v15.4S, v1.4H, v24.4H
    SMLSL   v15.4S, v3.4H, v25.4H
    SSHR    v5.4S, v5.4S, #15
    SMLAL   v5.4S, v1.4H, v25.4H
    SMLAL   v5.4S, v3.4H, v24.4H

    // First column group of a stage only (x7 == x5): lane 0 uses twiddle
    // offset 0, so its products are replaced by the plain inputs halved
    // (presumably because that twiddle equals one - TODO confirm).
    // x14 is walked over legs 1-3 and ends up back on leg 0.
    SUBS    x17, X7, X5
    BNE     BYPASS_IF
    ADD     X14, X14, X12
    LDR     W3, [X14]
    ADD     X14, X14, X12
    ASR     W3, W3, #1
    MOV     v11.S[0], W3
    LDR     W3, [X14]
    ADD     X14, X14, X12
    ASR     W3, W3, #1
    MOV     v13.S[0], W3
    LDR     W3, [X14]
    ASR     W3, W3, #1
    MOV     v15.S[0], W3
    SUB     X14, X14, X12, LSL #1
    ADD     X14, X14, #4                // second 32-bit word of leg 1
    LDR     W3, [X14]
    ADD     X14, X14, X12
    ASR     W3, W3, #1
    MOV     v12.S[0], W3
    LDR     W3, [X14]
    ADD     X14, X14, X12
    ASR     W3, W3, #1
    MOV     v14.S[0], W3
    LDR     W3, [X14]
    ADD     X14, X14, X12
    ASR     W3, W3, #1
    MOV     v5.S[0], W3
    SUB     X14, X14, #4
    SUB     X14, X14, x29

BYPASS_IF:
    // Radix-4 butterfly on leg 0 (v30/v31) and the rotated legs
    // 1 (v11/v12), 2 (v13/v14) and 3 (v15/v5); results go back to the four
    // leg positions in order.
    ADD     v6.4S, v30.4S, v13.4S
    ADD     v7.4S, v31.4S, v14.4S
    SUB     v30.4S, v30.4S, v13.4S
    SUB     v31.4S, v31.4S, v14.4S
    ADD     v8.4S, v11.4S, v15.4S
    ADD     v9.4S, v12.4S, v5.4S
    SUB     v15.4S, v11.4S, v15.4S
    SUB     v14.4S, v12.4S, v5.4S
    ADD     v10.4S, v6.4S, v8.4S
    ADD     v11.4S, v7.4S, v9.4S
    ADD     v12.4S, v30.4S, v14.4S
    SUB     v13.4S, v31.4S, v15.4S
    SUB     v6.4S, v6.4S, v8.4S
    ST2     {v10.4S, v11.4S}, [X14], X12
    SUB     v7.4S, v7.4S, v9.4S
    SUB     v8.4S, v30.4S, v14.4S
    ST2     {v12.4S, v13.4S}, [X14], X12
    ADD     v9.4S, v31.4S, v15.4S
    ST2     {v6.4S, v7.4S}, [X14], X12
    ST2     {v8.4S, v9.4S}, [X14], X12
    SUBS    X10, X10, #1
    BNE     INNER_LOOP_R4

    // Rewind over the N*8 bytes covered by the inner loop and step to the
    // next group of four columns (32 bytes).
    SUB     X14, X14, X1, LSL #3
    ADD     X14, X14, #32
    SUBS    X7, X7, #1
    BNE     MIDDLE_LOOP_R4

    // Next stage: a quarter of the twiddle step and inner passes, four times
    // the column groups.
    LSR     X4, X4, #2
    LSL     X5, X5, #2
    LSR     X6, X6, #2
    SUBS    X8, X8, #1
    BNE     OUTER_LOOP_R4

END_LOOPS:
    pop_v_regs
    RET