///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/


// push_v_regs
// Reserves a single 224-byte save area below the incoming sp and stores
// q8-q15, x8-x17 and x29/x30 into it. Slot offsets from the new sp:
//   #0   x29,x30    #16  x16,x17    #32  x14,x15    #48  x12,x13
//   #64  x10,x11    #80  x8,x9      #96  q14,q15    #128 q12,q13
//   #160 q10,q11    #192 q8,q9
// This is the same layout that ten successive pre-decrement pair stores
// (q8/q9 first, x29/x30 last) produce, so the matching restore sequence is
// unaffected.
.macro push_v_regs
    sub             sp, sp, #224
    stp             q8, q9, [sp, #192]
    stp             q10, q11, [sp, #160]
    stp             q12, q13, [sp, #128]
    stp             q14, q15, [sp, #96]
    stp             x8, x9, [sp, #80]
    stp             x10, x11, [sp, #64]
    stp             x12, x13, [sp, #48]
    stp             x14, x15, [sp, #32]
    stp             x16, x17, [sp, #16]
    stp             x29, x30, [sp]
.endm
// pop_v_regs
// Reloads x29/x30, x8-x17 and q8-q15 from the 224-byte save area at sp and
// then releases the area with one sp adjustment. Slot offsets from sp:
//   #0   x29,x30    #16  x16,x17    #32  x14,x15    #48  x12,x13
//   #64  x10,x11    #80  x8,x9      #96  q14,q15    #128 q12,q13
//   #160 q10,q11    #192 q8,q9
// All loads are issued before sp moves, so the saved data is never below sp
// while it is still needed.
.macro pop_v_regs
    ldp             x29, x30, [sp]
    ldp             x16, x17, [sp, #16]
    ldp             x14, x15, [sp, #32]
    ldp             x12, x13, [sp, #48]
    ldp             x10, x11, [sp, #64]
    ldp             x8, x9, [sp, #80]
    ldp             q14, q15, [sp, #96]
    ldp             q12, q13, [sp, #128]
    ldp             q10, q11, [sp, #160]
    ldp             q8, q9, [sp, #192]
    add             sp, sp, #224
.endm

// swp reg1, reg2
// Exchanges the contents of the two operands using x16 as the temporary.
// Clobbers: x16. In this file the operands are 64-bit vector elements
// (for example v17.D[0] and v8.D[0]).
.macro swp reg1, reg2
    mov             x16, \reg1              // temporary = first operand
    mov             \reg1, \reg2            // first operand = second operand
    mov             \reg2, x16              // second operand = temporary
.endm
.text
.p2align 2
.global ixheaacd_imdct_using_fft_armv8
//------------------------------------------------------------------------------
// ixheaacd_imdct_using_fft_armv8 - entry and first-stage dispatch
//
// Registers on entry, as used by the code in this file:
//   X0 - base address of a table block; byte offsets 11600, 11856, 11920 and
//        11936 from it are used as byte-index tables by the first stage, and
//        the later stages read 16-bit values starting at X0 + 8528
//   W1 - transform length
//        NOTE(review): presumably declared as a 32-bit integer in the C
//        prototype - confirm. The rest of the routine already consumes it
//        through W1, so the comparisons below use W1 as well; the upper half
//        of X1 is not guaranteed to be zero for a 32-bit argument.
//   X2 - base of the input buffer read by the first stage
//   X3 - base of the output buffer written by the first stage
//
// Dispatch performed here:
//   length   index table    X8   first stage entered
//   0x400    X0 + 11600     4    RADIX_4_FIRST_START
//   0x200    X0 + 11856     3    RADIX_8_FIRST_START
//   0x100    X0 + 11856     3    RADIX_4_FIRST_START
//   0x80     X0 + 11920     2    RADIX_8_FIRST_START
//   0x40     X0 + 11920     2    RADIX_4_FIRST_START
//   other    X0 + 11936     1    RADIX_8_FIRST_START (by fall-through)
//
// On exit from this block: X4 = selected index table, X8 = number of
// OUTER_LOOP_R4 passes run after the first stage. X29 is used as scratch;
// it is saved by push_v_regs together with the other registers this routine
// modifies.
//------------------------------------------------------------------------------
ixheaacd_imdct_using_fft_armv8:
    push_v_regs

    // Candidate index-table addresses. Every offset fits a 16-bit immediate,
    // so a plain MOV is used instead of a literal-pool load.
    MOV             X29, #11600
    ADD             X4, X0, X29
    MOV             X29, #11856
    ADD             X5, X0, X29
    MOV             X29, #11920
    ADD             X6, X0, X29
    MOV             X29, #11936
    ADD             X7, X0, X29

COND_1: CMP         W1, #0x400
    BNE             COND_2
    MOV             X8, #4
    B               RADIX_4_FIRST_START


COND_2: CMP         W1, #0x200
    BNE             COND_3
    MOV             X8, #3
    MOV             X4, X5
    B               RADIX_8_FIRST_START

COND_3: CMP         W1, #0x100
    BNE             COND_4
    MOV             X8, #3
    MOV             X4, X5
    B               RADIX_4_FIRST_START

COND_4: CMP         W1, #0x80
    BNE             COND_5
    MOV             X8, #2
    MOV             X4, X6
    B               RADIX_8_FIRST_START

COND_5: CMP         W1, #0x40
    BNE             COND_6
    MOV             X8, #2
    MOV             X4, X6
    B               RADIX_4_FIRST_START
COND_6:
    // Any other length: falls through into RADIX_8_FIRST_START.
    MOV             X8, #1
    MOV             X4, X7


//------------------------------------------------------------------------------
// First stage, radix-8 variant (per the label names).
//
// In:  W1 = transform length, X2 = input base, X3 = output pointer,
//      X4 = byte-index table.
// Each loop iteration reads four index bytes from X4, forms four input
// pointers X2 + index * 8, gathers eight pairs of 32-bit words per pointer
// (pairs are X1/2 bytes apart once X1 has been doubled below), combines them
// with add/subtract butterflies and a multiply by 0x5a82, then writes 256
// bytes to X3.
// NOTE(review): each pair is presumably one complex value (real, imaginary)
// and the byte table presumably holds digit-reversed indices - confirm.
//
// Out: X1 restored to the length, X3 rewound to the start of the output,
//      X5 = 8, X4 = 32, X6 = length / 32, then branches to RADIX_4_FIRST_ENDS.
//------------------------------------------------------------------------------
RADIX_8_FIRST_START:
    LSR             W9 , W1, #5             // X9 = iteration count = length >> 5
    LSL             W1, W1, #1              // X1 = 2 * length, used as a byte stride

RADIX_8_FIRST_LOOP:

    // Four working pointers, all starting at the input base.
    MOv             X5 , X2
    MOv             X6 , X2
    MOv             X7 , X2
    MOv             X11 , X2

    // Lane 0: pointer X5 = X2 + table[0] * 8.
    // Pairs are fetched at byte offsets 0, 2*X1, X1 and 3*X1 from the
    // pointer; the ADD/SUB after each post-indexed load steers to the next
    // offset and the final SUB returns the pointer to its starting value.
    LDRB            W12, [X4]
    ADD             X5, X5, X12, LSL #3
    LD2             {v0.S, v1.S}[0], [X5], X1
    ADD             X5, X5, X1
    LD2             {v4.S, v5.S}[0], [X5], X1
    SUB             X5, X5, X1, LSL #1
    LD2             {v2.S, v3.S}[0], [X5], X1
    ADD             X5, X5, X1
    LD2             {v6.S, v7.S}[0], [X5], X1
    SUB             X5, X5, X1, LSL #2

    // Lane 1: pointer X6 = X2 + table[1] * 8, same access pattern.
    LDRB            W12, [X4, #1]
    ADD             X6, X6, X12, LSL #3
    LD2             {v0.S, v1.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v4.S, v5.S}[1], [X6] , X1
    SUB             X6, X6, X1, LSL #1
    LD2             {v2.S, v3.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v6.S, v7.S}[1], [X6], X1
    SUB             X6, X6, X1, LSL #2


    // Lane 2: pointer X7 = X2 + table[2] * 8. Only the offset 0 and 2*X1
    // pairs are loaded here; the other two follow below, interleaved with
    // arithmetic.
    LDRB            W12, [X4, #2]
    ADD             X7, X7, X12, LSL #3
    LD2             {v0.S, v1.S}[2], [X7] , X1
    ADD             X7, X7, X1
    LD2             {v4.S, v5.S}[2], [X7] , X1
    SUB             X7, X7, X1, LSL #1

    // Lane 3: pointer X11 = X2 + table[3] * 8, handled like lane 2.
    LDRB            W12, [X4, #3]
    ADD             X11, X11, X12, LSL #3
    LD2             {v0.S, v1.S}[3], [X11] , X1
    ADD             X11, X11, X1
    LD2             {v4.S, v5.S}[3], [X11] , X1
    SUB             X11, X11, X1, LSL #1


    // Sums and differences of the offset-0 and offset-2*X1 pairs, interleaved
    // with the remaining lane 2 and lane 3 loads (offsets X1 and 3*X1).
    ADD             v8.4S, v0.4S, v4.4S
    LD2             {v2.S, v3.S}[2], [X7] , X1
    ADD             X7, X7, X1


    SUB             v9.4S, v0.4S, v4.4S
    LD2             {v6.S, v7.S}[2], [X7], X1
    SUB             X7, X7, X1, LSL #2


    ADD             v0.4S, v1.4S, v5.4S
    LD2             {v2.S, v3.S}[3], [X11] , X1
    ADD             X11, X11, X1

    SUB             v4.4S, v1.4S, v5.4S
    LD2             {v6.S, v7.S}[3], [X11], X1
    SUB             X11, X11, X1, LSL #2

    ADD             X4, X4, #4              // consume four index bytes

    // Move all four pointers forward by X1/2 bytes to reach the second group
    // of four pairs (byte offsets X1/2, 3*X1/2, 5*X1/2 and 7*X1/2).
    ADD             X5, X5, X1, LSR #1
    ADD             X6, X6, X1, LSR #1
    ADD             X7, X7, X1, LSR #1
    ADD             X11, X11, X1, LSR #1


    // Remaining butterflies on the first group, interleaved with loads of the
    // second group into v14/v15, v10/v11 and v12/v13.
    ADD             v1.4S, v2.4S, v6.4S
    LD2             {v14.S, v15.S}[0], [X5] , X1


    SUB             v5.4S, v2.4S, v6.4S
    LD2             {v10.S, v11.S}[0], [X5] , X1


    ADD             v2.4S, v3.4S, v7.4S
    LD2             {v12.S, v13.S}[0], [X5] , X1


    SUB             v6.4S, v3.4S, v7.4S
    LD2             {v14.S, v15.S}[1], [X6] , X1

    ADD             v3.4S, v9.4S, v6.4S
    LD2             {v10.S, v11.S}[1], [X6] , X1

    SUB             v7.4S, v9.4S, v6.4S
    LD2             {v12.S, v13.S}[1], [X6] , X1

    SUB             v6.4S, v4.4S, v5.4S
    LD2             {v14.S, v15.S}[2], [X7] , X1

    ADD             v9.4S, v4.4S, v5.4S
    LD2             {v10.S, v11.S}[2], [X7] , X1

    ADD             v4.4S, v8.4S, v1.4S
    LD2             {v12.S, v13.S}[2], [X7] , X1

    SUB             v5.4S, v8.4S, v1.4S
    LD2             {v14.S, v15.S}[3], [X11] , X1

    ADD             v8.4S, v0.4S, v2.4S
    LD2             {v10.S, v11.S}[3], [X11] , X1

    SUB             v0.4S, v0.4S, v2.4S
    LD2             {v12.S, v13.S}[3], [X11] , X1

    // Last pair of the second group goes into v1/v2 (free again at this
    // point), interleaved with the first butterflies on the second group.
    LD2             {v1.S, v2.S}[0], [X5], X1

    ADD             v17.4S, v14.4S, v12.4S

    LD2             {v1.S, v2.S}[1], [X6] , X1

    SUB             v16.4S, v14.4S, v12.4S

    LD2             {v1.S, v2.S}[2], [X7] , X1

    ADD             v14.4S, v15.4S, v13.4S

    LD2             {v1.S, v2.S}[3], [X11] , X1

    SUB             v12.4S, v15.4S, v13.4S

    ADD             v15.4S, v10.4S, v1.4S
    SUB             v13.4S, v10.4S, v1.4S
    ADD             v10.4S, v11.4S, v2.4S
    SUB             v1.4S, v11.4S, v2.4S

    ADD             v11.4S, v17.4S, v15.4S
    SUB             v2.4S, v17.4S, v15.4S
    ADD             v17.4S, v14.4S, v10.4S
    SUB             v15.4S, v14.4S, v10.4S

    ADD             v14.4S, v16.4S, v12.4S
    SUB             v10.4S, v16.4S, v12.4S
    ADD             v16.4S, v13.4S, v1.4S
    SUB             v12.4S, v13.4S, v1.4S

    ADD             v1.4S , v14.4S, v12.4S
    SUB             v13.4S, v14.4S, v12.4S
    SUB             v12.4S, v16.4S, v10.4S


    // Split each 32-bit lane of v1, v13, v12 and v14 into its low 16-bit half
    // (UZP1) and high 16-bit half (UZP2) for the 32x16 multiply below.
    UZP1            v22.8H, v1.8H, v1.8H
    UZP2            v23.8H, v1.8H, v1.8H
    ADD             v14.4S, v16.4S, v10.4S

    UZP1            v26.8H, v13.8H, v13.8H
    UZP2            v27.8H, v13.8H, v13.8H
    ADD             v16.4S, v4.4S, v11.4S

    UZP1            v24.8H, v12.8H, v12.8H
    UZP2            v25.8H, v12.8H, v12.8H
    SUB             v10.4S, v4.4S, v11.4S

    UZP1            v28.8H, v14.8H, v14.8H
    UZP2            v29.8H, v14.8H, v14.8H
    ADD             v4.4S, v8.4S, v17.4S

    MOv             W14, #0x5a82            // 0x5a82 = 23170, about 2^15 / sqrt(2)

    SUB             v11.4S, v8.4S, v17.4S

    ADD             v8.4S, v5.4S, v15.4S
    SUB             v17.4S, v5.4S, v15.4S
    SUB             v5.4S, v0.4S, v2.4S
    ADD             v15.4S, v0.4S, v2.4S

    // Multiply four 32-bit vectors by 0x5a82 with a 2^-15 scale:
    //   (unsigned low half * c) >> 15  +  2 * (signed high half * c)
    // The doubling comes from SQDMLAL, which also saturates the accumulate.
    DUP             v31.4H, W14

    UMULL           v19.4S, v26.4H, v31.4H
    UMULL           v18.4S, v28.4H, v31.4H
    SSHR            v19.4S, v19.4S, #15
    SSHR            v18.4S, v18.4S, #15


    SQDMLAL         v19.4S, v27.4H, v31.4H
    SQDMLAL         v18.4S, v29.4H, v31.4H


    UMULL           v13.4S, v24.4H, v31.4H
    UMULL           v14.4S, v22.4H, v31.4H

    ADD             v20.4S, v3.4S, v19.4S
    SUB             v21.4S, v3.4S, v19.4S
    ADD             v30.4S, v6.4S, v18.4S
    SUB             v6.4S, v6.4S, v18.4S

    SSHR            v13.4S, v13.4S, #15
    SSHR            v14.4S, v14.4S, #15

    SQDMLAL         v13.4S, v25.4H, v31.4H
    SQDMLAL         v14.4S, v23.4H, v31.4H

    ADD             v3.4S, v7.4S, v13.4S
    SUB             v19.4S, v7.4S, v13.4S
    ADD             v1.4S, v9.4S, v14.4S
    SUB             v18.4S, v9.4S, v14.4S

    // Exchange whole registers v17 <-> v8 and v4 <-> v16, one 64-bit half at
    // a time (swp goes through x16).
    swp             v17.D[0], v8.D[0]
    swp             v17.D[1], v8.D[1]
    swp             v4.D[0], v16.D[0]
    swp             v4.D[1], v16.D[1]

    // Reorder the results with TRN1/TRN2 plus 64-bit half swaps, and scale
    // every result by 8 (SHL #3) along the way.
    TRN1            v12.4S, v4.4S, v20.4S
    TRN2            v22.4S, v4.4S, v20.4S

    SHL             v12.4S, v12.4S, #3
    TRN1            v9.4S, v17.4S, v3.4S
    TRN2            v2.4S, v17.4S, v3.4S
    SHL             v22.4S, v22.4S, #3

    SHL             v9.4S, v9.4S, #3
    TRN1            v24.4S, v10.4S, v21.4S
    TRN2            v7.4S, v10.4S, v21.4S
    SHL             v2.4S, v2.4S, #3

    SHL             v24.4S, v24.4S, #3
    TRN1            v13.4S, v16.4S, v6.4S
    TRN2            v23.4S, v16.4S, v6.4S
    SHL             v7.4S, v7.4S, #3

    SHL             v13.4S, v13.4S, #3
    TRN1            v10.4S, v5.4S, v18.4S
    TRN2            v3.4S, v5.4S, v18.4S
    SHL             v23.4S, v23.4S, #3

    SHL             v10.4S, v10.4S, #3
    TRN1            v26.4S, v8.4S, v19.4S
    TRN2            v4.4S, v8.4S, v19.4S
    SHL             v3.4S, v3.4S, #3

    SHL             v26.4S, v26.4S, #3
    TRN1            v25.4S, v11.4S, v30.4S
    TRN2            v8.4S, v11.4S, v30.4S
    SHL             v4.4S, v4.4S, #3

    SHL             v25.4S, v25.4S, #3
    TRN1            v27.4S, v15.4S, v1.4S
    TRN2            v5.4S, v15.4S, v1.4S
    SHL             v8.4S, v8.4S, #3

    SHL             v27.4S, v27.4S, #3
    swp             v9.D[0], v12.D[1]
    SHL             v5.4S, v5.4S, #3
    swp             v2.D[0], v22.D[1]

    swp             v24.D[1], v26.D[0]
    swp             v7.D[1], v4.D[0]
    swp             v10.D[0], v13.D[1]
    swp             v3.D[0], v23.D[1]
    swp             v27.D[0], v25.D[1]
    swp             v5.D[0], v8.D[1]

    // Eight interleaving stores of 32 bytes each (256 bytes per iteration).
    MOv             X15, #32
    ST2             {v12.4S, v13.4S}, [X3], X15
    ST2             {v24.4S, v25.4S}, [X3], X15
    ST2             {v22.4S, v23.4S}, [X3], X15
    ST2             {v7.4S, v8.4S}, [X3], X15
    ST2             {v9.4S, v10.4S}, [X3], X15
    ST2             {v26.4S, v27.4S}, [X3], X15
    ST2             {v2.4S, v3.4S}, [X3], X15
    ST2             {v4.4S, v5.4S}, [X3], X15


    SUBS            X9, X9, #1
    BNE             RADIX_8_FIRST_LOOP

    LSR             X1, X1, #1              // undo the doubling: X1 = length again
    LSL             X15, X1, #3
    SUB             X3, X3, X15             // rewind X3 by length * 8 bytes

    // Parameters consumed at RADIX_4_FIRST_ENDS after a radix-8 first stage.
    MOv             X5, #8
    MOv             X4, #32
    LSR             X15, X1, #5
    MOv             X6, X15                 // X6 = length >> 5
    B               RADIX_4_FIRST_ENDS
RADIX_8_FIRST_ENDS:                         // no branch to this label is visible in this file

//------------------------------------------------------------------------------
// First stage, radix-4 variant (per the label names).
//
// In:  W1 = transform length, X2 = input base, X3 = output pointer,
//      X4 = byte-index table.
// Each loop iteration reads four index bytes from X4, forms four input
// pointers X2 + index * 8, gathers four pairs of 32-bit words per pointer
// (pairs are X1 bytes apart once X1 has been doubled below), combines them
// with add/subtract butterflies and writes 128 bytes to X3.
// NOTE(review): each pair is presumably one complex value (real, imaginary)
// - confirm.
//
// Out: X1 restored to the length, X3 rewound to the start of the output,
//      X5 = 4, X4 = 64, X6 = length / 16; falls through to RADIX_4_FIRST_ENDS.
//------------------------------------------------------------------------------
RADIX_4_FIRST_START:

    LSR             W9, W1, #4              // W9 = iteration count = length >> 4
    LSL             W1, W1, #1              // X1 = 2 * length, used as a byte stride
RADIX_4_LOOP:

    // Four working pointers, all starting at the input base.
    MOv             X5 , X2
    MOv             X6 , X2
    MOv             X7 , X2
    MOv             X11 , X2

    // Lane 0: pointer X5 = X2 + table[0] * 8. Pairs are fetched at byte
    // offsets 0, 2*X1, X1 and 3*X1; the ADD/SUB between the post-indexed
    // loads steers to the next offset.
    LDRB            W12, [X4, #0]
    ADD             X5, X5, X12, LSL #3

    LD2             {v0.S, v1.S}[0], [X5] , X1
    ADD             X5, X5, X1
    LD2             {v8.S, v9.S}[0], [X5] , X1
    SUB             X5, X5, X1, LSL #1
    LD2             {v4.S, v5.S}[0], [X5] , X1
    ADD             X5, X5, X1
    LD2             {v12.S, v13.S}[0], [X5] , X1

    // Lane 1: pointer X6 = X2 + table[1] * 8, same access pattern.
    LDRB            W12, [X4, #1]
    ADD             X6, X6, X12, LSL #3
    LD2             {v0.S, v1.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v8.S, v9.S}[1], [X6] , X1
    SUB             X6, X6, X1, LSL #1
    LD2             {v4.S, v5.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v12.S, v13.S}[1], [X6] , X1

    // Lane 2: pointer X7 = X2 + table[2] * 8; only the offset 0 and 2*X1
    // pairs are loaded here.
    LDRB            W12, [X4, #2]
    ADD             X7, X7, X12, LSL #3

    LD2             {v0.S, v1.S}[2], [X7] , X1
    ADD             X7, X7, X1
    LD2             {v8.S, v9.S}[2], [X7] , X1


    // Lane 3: pointer X11 = X2 + table[3] * 8, handled like lane 2.
    LDRB            W12, [X4, #3]
    ADD             X11, X11, X12 , LSL #3


    LD2             {v0.S, v1.S}[3], [X11] , X1
    ADD             X11, X11, X1
    LD2             {v8.S, v9.S}[3], [X11] , X1

    // Remaining lane 2 and lane 3 loads (offsets X1 and 3*X1), interleaved
    // with the sums and differences of the offset-0 and offset-2*X1 pairs.
    SUB             X7, X7, X1, LSL #1
    ADD             v16.4S, v0.4S, v8.4S
    LD2             {v4.S, v5.S}[2], [X7] , X1
    ADD             X7, X7, X1
    ADD             v18.4S, v1.4S, v9.4S
    LD2             {v12.S, v13.S}[2], [X7] , X1

    SUB             X11, X11, X1, LSL #1
    SUB             v20.4S, v0.4S, v8.4S
    LD2             {v4.S, v5.S}[3], [X11] , X1
    ADD             X11, X11, X1
    SUB             v22.4S, v1.4S, v9.4S
    LD2             {v12.S, v13.S}[3], [X11] , X1

    ADD             X4, X4, #4              // consume four index bytes

    // Sums and differences of the offset-X1 and offset-3*X1 pairs.
    ADD             v24.4S, v4.4S, v12.4S
    ADD             v26.4S, v5.4S, v13.4S
    SUB             v28.4S, v4.4S, v12.4S
    SUB             v30.4S, v5.4S, v13.4S

    // Second butterfly level: combine the two sets of partial results.
    ADD             v17.4S, v16.4S, v24.4S
    ADD             v11.4S, v18.4S, v26.4S
    SUB             v19.4S, v16.4S, v24.4S
    SUB             v15.4S, v18.4S, v26.4S

    ADD             v8.4S, v20.4S, v30.4S
    SUB             v9.4S, v22.4S, v28.4S
    ADD             v13.4S, v22.4S, v28.4S
    SUB             v12.4S, v20.4S, v30.4S

    // Reorder the results with TRN1/TRN2 plus 64-bit half swaps, and scale
    // every result by 4 (SHL #2) along the way.
    TRN1            v0.4S, v17.4S, v8.4S
    TRN2            v8.4S, v17.4S, v8.4S

    SHL             v0.4S, v0.4S, #2
    TRN1            v4.4S, v19.4S, v12.4S
    TRN2            v12.4S, v19.4S, v12.4S
    SHL             v8.4S, v8.4S, #2

    SHL             v4.4S, v4.4S, #2
    TRN1            v1.4S, v11.4S, v9.4S
    TRN2            v9.4S, v11.4S, v9.4S
    SHL             v12.4S, v12.4S, #2

    SHL             v1.4S, v1.4S, #2
    TRN1            v5.4S, v15.4S, v13.4S
    TRN2            v13.4S, v15.4S, v13.4S
    SHL             v9.4S, v9.4S, #2

    SHL             v5.4S, v5.4S, #2
    swp             v4.D[0], v0.D[1]
    SHL             v13.4S, v13.4S, #2

    swp             v12.D[0], v8.D[1]
    swp             v5.D[0], v1.D[1]
    swp             v13.D[0], v9.D[1]

    // Four interleaving stores of 32 bytes each (128 bytes per iteration).
    MOv             X15, #32
    ST2             {v0.4S, v1.4S}, [X3], X15
    ST2             {v8.4S, v9.4S}, [X3], X15
    ST2             {v4.4S, v5.4S}, [X3], X15
    ST2             {v12.4S, v13.4S}, [X3], X15


    SUBS            W9, W9, #1
    BNE             RADIX_4_LOOP

    LSR             X1, X1, #1              // undo the doubling: X1 = length again
    SUB             X3, X3, X1, LSL #3      // rewind X3 by length * 8 bytes
    // Parameters consumed at RADIX_4_FIRST_ENDS after a radix-4 first stage.
    MOv             X5, #4
    MOv             X4, #64
    LSR             X6, X1, #4              // X6 = length >> 4

//------------------------------------------------------------------------------
// Remaining stages: three nested loops run in place on the buffer at X3.
//
// In:  X0 = table block base, X1 = transform length, X3 = buffer base,
//      X4, X5, X6 = values set by the first stage, X8 = outer pass count.
// Loop roles:
//   OUTER_LOOP_R4  - X8 passes; after each pass X4 >>= 2, X5 <<= 2, X6 >>= 2.
//   MIDDLE_LOOP_R4 - X7 counts down from X5; each iteration gathers three
//                    sets of 16-bit multiplier pairs from the table at
//                    X0 + 8528 and starts 32 bytes further into the buffer.
//   INNER_LOOP_R4  - X10 counts down from X6; each iteration processes four
//                    32-byte rows that are X12 = X5 * 32 bytes apart.
// NOTE(review): the 16-bit table entries are presumably twiddle factors
// (cos, sin) - confirm.
//
// x30 (the link register) and x29 are used as scratch; pop_v_regs reloads
// both from the stack before RET.
//------------------------------------------------------------------------------
RADIX_4_FIRST_ENDS:

    MOv             x30, X3                 // keep the buffer base; LR is reloaded by pop_v_regs
    LSR             X5, X5, #2

    LDR             X14, =8528
    ADD             X0, X0, X14             // X0 = start of the 16-bit multiplier table

OUTER_LOOP_R4:

    MOv             X14, x30                // restart from the buffer base

    MOv             X7, X5                  // middle loop counter
    MOv             X2, #0                  // table byte offset / stride for lane 0
    MOv             X9, X0
    LSL             X12, X5, #5             // row stride in bytes = X5 * 32
MIDDLE_LOOP_R4:

    // Fill lanes 0..3 of v20/v21, v22/v23 and v24/v25 with 16-bit pairs.
    // For each lane the three pairs are read at offset, 2 * offset and
    // 3 * offset from X0 (the post-index equals the offset), and the offset
    // grows by 4 * X4 bytes from one lane to the next.
    LD2             {v20.H, v21.H}[0], [X9], X2
    LD2             {v22.H, v23.H}[0], [X9], X2
    ADD             X11, X2, X4, LSL #2
    LD2             {v24.H, v25.H}[0], [X9]
    ADD             X10, X0, X11

    LD2             {v20.H, v21.H}[1], [X10], X11
    LD2             {v22.H, v23.H}[1], [X10], X11
    ADD             X2, X11, X4, LSL #2
    LD2             {v24.H, v25.H}[1], [X10]
    ADD             X9, X0, X2

    LD2             {v20.H, v21.H}[2], [X9], X2
    LD2             {v22.H, v23.H}[2], [X9], X2
    ADD             X11, X2, X4, LSL #2
    LD2             {v24.H, v25.H}[2], [X9]
    ADD             X10, X0, X11

    LD2             {v20.H, v21.H}[3], [X10], X11
    LD2             {v22.H, v23.H}[3], [X10], X11
    ADD             X2, X11, X4, LSL #2
    LD2             {v24.H, v25.H}[3], [X10]
    ADD             X9, X0, X2              // X2 and X9 carry over to the next middle iteration

    MOv             X10, X6                 // inner loop counter
INNER_LOOP_R4:

    // Row 0 is loaded as 32-bit pairs and halved. Rows 1..3 are loaded with
    // LD4 so that, for each 32-bit word, the low and high 16-bit halves land
    // in separate registers:
    //   row 1: v16 (low) v17 (high) of the first word, v18 / v19 of the second
    //   row 2: v26 / v27 and v28 / v29
    //   row 3: v0  / v1  and v2  / v3
    LD2             {v30.4S, v31.4S}, [X14], X12
    SSHR            v30.4S, v30.4S, #1
    LD4             {v16.4H, v17.4H, v18.4H, v19.4H}, [X14], X12
    SSHR            v31.4S, v31.4S, #1

    USHR            v16.4H, v16.4H, #1      // low halves are treated as unsigned and halved
    LD4             {v26.4H, v27.4H, v28.4H, v29.4H}, [X14], X12
    USHR            v18.4H, v18.4H, #1

    // Row 1 times (v20, v21): v11 = a*v20 - b*v21 and v12 = a*v21 + b*v20,
    // built as (low-half products) >> 15 plus the high-half products.
    SMULL           v11.4S, v16.4H, v20.4H
    SMLSL           v11.4S, v18.4H, v21.4H

    LD4             {v0.4H, v1.4H, v2.4H, v3.4H}, [X14], X12
    SMULL           v12.4S, v16.4H, v21.4H
    SMLAL           v12.4S, v18.4H, v20.4H

    USHR            v26.4H, v26.4H, #1
    USHR            v28.4H, v28.4H, #1

    LSL             x29, X12, #2            // x29 = 4 * row stride, used on the first middle iteration
    SUB             X14, X14, X12, LSL #2   // back to row 0 of this group

    USHR            v0.4H, v0.4H, #1
    USHR            v2.4H, v2.4H, #1

    // Row 2 times (v22, v23) into v13 / v14.
    SMULL           v13.4S, v26.4H, v22.4H
    SMLSL           v13.4S, v28.4H, v23.4H

    SSHR            v11.4S, v11.4S, #15

    SMULL           v14.4S, v26.4H, v23.4H
    SMLAL           v14.4S, v28.4H, v22.4H

    // Row 3 times (v24, v25) into v15 / v5.
    SMULL           v15.4S, v0.4H, v24.4H
    SMLSL           v15.4S, v2.4H, v25.4H

    SMLAL           v11.4S, v17.4H, v20.4H
    SMLSL           v11.4S, v19.4H, v21.4H

    SSHR            v12.4S, v12.4S, #15
    SSHR            v13.4S, v13.4S, #15
    SSHR            v14.4S, v14.4S, #15
    SSHR            v15.4S, v15.4S, #15

    SMLAL           v12.4S, v17.4H, v21.4H
    SMLAL           v12.4S, v19.4H, v20.4H

    SMULL           v5.4S, v0.4H, v25.4H
    SMLAL           v5.4S, v2.4H, v24.4H

    SMLAL           v13.4S, v27.4H, v22.4H
    SMLSL           v13.4S, v29.4H, v23.4H

    SMLAL           v14.4S, v27.4H, v23.4H
    SMLAL           v14.4S, v29.4H, v22.4H

    SMLAL           v15.4S, v1.4H, v24.4H
    SMLSL           v15.4S, v3.4H, v25.4H

    SSHR            v5.4S, v5.4S, #15

    SMLAL           v5.4S, v1.4H, v25.4H
    SMLAL           v5.4S, v3.4H, v24.4H

    // On the first middle iteration only (X7 == X5), lane 0 of the six
    // products is overwritten with the corresponding input word from rows
    // 1..3, arithmetically shifted right by one, instead of the multiplied
    // value. x17 only receives the difference; just the flags are used.
    SUBS            x17, X7, X5
    BNE             BYPASS_IF

    ADD             X14, X14, X12           // row 1

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1

    MOv             v11.S[0], W3

    LDR             W3, [X14]               // row 2, first word
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOv             v13.S[0], W3

    LDR             W3, [X14]               // row 3, first word
    ASR             W3, W3, #1
    MOv             v15.S[0], W3

    SUB             X14, X14, X12, LSL #1   // back to row 1 ...
    ADD             X14, X14, #4            // ... second word of the pair

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOv             v12.S[0], W3

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOv             v14.S[0], W3

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOv             v5.S[0], W3

    SUB             X14, X14, #4

    SUB             X14, X14, x29           // X14 is back at row 0 of this group

BYPASS_IF:

    // Butterfly on row 0 (v30 / v31) and the three products, written back in
    // place to the same four rows.
    ADD             v6.4S, v30.4S, v13.4S
    ADD             v7.4S, v31.4S, v14.4S
    SUB             v30.4S, v30.4S, v13.4S
    SUB             v31.4S, v31.4S, v14.4S
    ADD             v8.4S, v11.4S, v15.4S
    ADD             v9.4S, v12.4S, v5.4S

    SUB             v15.4S, v11.4S, v15.4S
    SUB             v14.4S, v12.4S, v5.4S


    ADD             v10.4S, v6.4S, v8.4S
    ADD             v11.4S, v7.4S, v9.4S
    ADD             v12.4S, v30.4S, v14.4S
    SUB             v13.4S, v31.4S, v15.4S

    SUB             v6.4S, v6.4S, v8.4S
    ST2             {v10.4S, v11.4S}, [X14], X12
    SUB             v7.4S, v7.4S, v9.4S

    SUB             v8.4S, v30.4S, v14.4S
    ST2             {v12.4S, v13.4S}, [X14], X12
    ADD             v9.4S, v31.4S, v15.4S

    ST2             {v6.4S, v7.4S}, [X14], X12
    ST2             {v8.4S, v9.4S}, [X14], X12
    SUBS            X10, X10, #1
    BNE             INNER_LOOP_R4

    SUB             X14, X14, X1, LSL #3    // rewind by length * 8 bytes ...
    ADD             X14, X14, #32           // ... then step 32 bytes for the next middle iteration

    SUBS            X7, X7, #1
    BNE             MIDDLE_LOOP_R4

    // Parameters for the next outer pass.
    LSR             X4, X4, #2
    LSL             X5, X5, #2
    LSR             X6, X6, #2
    SUBS            X8, X8, #1
    BNE             OUTER_LOOP_R4
END_LOOPS:
    pop_v_regs                              // reloads the saved registers, including x29 and x30
    RET