// push_v_regs: spill registers at function entry.
// Pushes 10 pairs (160 bytes); sp remains 16-byte aligned throughout
// since every stp pre-decrements by 16.
// d8-d15 are callee-saved under AAPCS64 and must be preserved.
// NOTE(review): x8-x17 are caller-saved and x29/x30 is the frame/link
// pair — saving them here is not required by the AArch64 ABI and looks
// carried over from the 32-bit ARM port; harmless but redundant.
// The stack layout created here must be unwound in exactly the reverse
// order by pop_v_regs — keep the two macros in sync.
.macro push_v_regs
    stp             d8, d9, [sp, #-16]!
    stp             d10, d11, [sp, #-16]!
    stp             d12, d13, [sp, #-16]!
    stp             d14, d15, [sp, #-16]!
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!
.endm
// pop_v_regs: reload every pair pushed by push_v_regs, in exact reverse
// order (x29/x30 first, d8/d9 last), restoring sp by 160 bytes total.
// Keep this macro in strict mirror-order sync with push_v_regs: any
// change to one macro's push order silently corrupts the restored
// register values here.
.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
    ldp             d14, d15, [sp], #16
    ldp             d12, d13, [sp], #16
    ldp             d10, d11, [sp], #16
    ldp             d8, d9, [sp], #16
.endm

.text
.p2align 2
    .global ixheaacd_sbr_qmfanal32_winadds

//-----------------------------------------------------------------------
// ixheaacd_sbr_qmfanal32_winadds  (AArch64, AAPCS64)
//
// SBR 32-band QMF-analysis windowing ("win-adds") kernel, ported from a
// 32-bit ARM build.  The commented-out STMFD/VPUSH/LDR lines below are
// remnants of that port and record where the trailing arguments lived
// on the 32-bit stack.
//
// Observed register roles (argument semantics inferred from the
// leftover port comments — TODO confirm against the C prototype):
//   x0, x1 : 16-bit sample/window streams, read 4 halfwords at a time
//            with LD1 {..4h}, row stride 128 bytes (x6)
//   x2, x3 : 16-bit coefficient streams, read de-interleaved with
//            LD2 {..4h, ..4h}, advanced 256 bytes per tap row (#16+#240)
//   x4     : "winAdd" output base — 32-bit accumulators via ST1 {v30.4s}
//   x5     : filter-state buffer (16-bit); reused later as loop counter
//   x6     : time-domain input pointer (16-bit); reused later as stride
//   x7     : input stride in samples (doubled into x9 = byte stride)
//
// Structure:
//   Stage 1 (LOOP): copy 32 strided input shorts into the state buffer
//     in reversed order (descending pre-decrement stores), 8 per pass,
//     4 passes (w10 counts 3..0 with BPL).
//   Stage 2 (LOOP_1): software-pipelined 5-tap widening MAC.  Each
//     result is v30.4s = sum of 5 SMULL/SMLAL 4h*4h products.  The loop
//     body runs 7 times (x5 = 7..1 with BGT) after a pipelined prologue;
//     with the epilogue, 8 result vectors (32 words) are stored to each
//     of two output regions: [x4] and [x11] = [x4 + 128].
//
// Clobbers x4-x17 and v0-v30 freely; x19/x20 and d8-d15 are saved and
// restored (push_v_regs / pop_v_regs also spill x8-x17, x29/x30).
//-----------------------------------------------------------------------
ixheaacd_sbr_qmfanal32_winadds:         // PROC

    // STMFD sp!, {x4-x12, x14}
    push_v_regs
    stp             x19, x20, [sp, #-16]!   // x19/x20: callee-saved, used below
    //VPUSH       {D8 - D15}
    //LDR w5,  [SP, #108]              //filterStates
    //sxtw x5,w5
    //LDR w6,  [SP, #112]              //timeIn
    //sxtw x6,w6
    //LDR w7,  [SP, #116]              //stride
    //sxtw x7,w7

    LSL             x9, x7, #1              // x9 = stride * sizeof(int16) (bytes)


    MOV             x20, x4                 // stash output base for stage 2
    ADD             x5, x5, #64             // state ptr -> end of 32-short region
    MOV             w10, #3                 // stage-1 pass counter (3..0 => 4 passes)

    //ADD         x5, x5, #56
    //MOV         x10, #1
    ////SUB         x6, x6, x9
    //CMP           x7, #1
    //MOV         x11, #-8
    //BGT         LOOP_SKIP_ODD

// Stage 1: reverse-copy 32 strided 16-bit input samples into the state
// buffer.  Loads walk forward by the byte stride x9; stores walk
// backward from x5 with pre-decrement, so the 32 samples land reversed.
LOOP:
    LDRSH           w4  , [x6]              // sample 0
    ADD             x6, x6, x9
    LDRSH           w8  , [x6]              // sample 1
    ADD             x6, x6, x9
    LDRSH           w11  , [x6]             // sample 2
    ADD             x6, x6, x9
    LDRSH           w12 , [x6]              // sample 3
    ADD             x6, x6, x9

    STRH            w4  , [x5 , #-2]!       // descending stores => reversed order
    STRH            w8  , [x5 , #-2]!
    STRH            w11  , [x5 , #-2]!
    STRH            w12 , [x5 , #-2]!

    LDRSH           w4  , [x6]              // samples 4..7, same pattern
    ADD             x6, x6, x9
    LDRSH           w8  , [x6]
    ADD             x6, x6, x9
    LDRSH           w11  , [x6]
    ADD             x6, x6, x9
    LDRSH           w12 , [x6]
    ADD             x6, x6, x9

    STRH            w4  , [x5 , #-2]!
    STRH            w8  , [x5 , #-2]!
    STRH            w11  , [x5 , #-2]!
    STRH            w12 , [x5 , #-2]!
    SUBS            w10, w10, #1

    BPL             LOOP                    // loop while w10 >= 0 (4 passes total)


// Disabled NEON variants of stage 1 (even- and odd-stride paths), kept
// from an earlier revision of this port.
//LOOP:
//  LD1 {v0.4h} , [x6], #8
//  LD1 {v1.4h} , [x6], #8
//
//  REV64  v4.4h , v0.4h
//  REV64  v5.4h , v1.4h
//
//  ST1 {v4.4h} , [x5] , x11
//  ST1 {v5.4h} , [x5] , x11
//
//  LD2 {v2.4h} , [x6], #8
//  LD1 {v3.4h} , [x6], #8
//
//  REV64  v6.4h , v2.4h
//  REV64  v7.4h , v3.4h
//
//  ST1 {v6.4h} , [x5] , x11
//  ST1 {v7.4h} , [x5] , x11
//
//    SUBS        x10, x10, #1
//    BPL         LOOP
//  B       SKIP_LOOP
//
//LOOP_SKIP_ODD:
//  LD2 {v0.4h , v1.4h} , [x6], #16
//  LD2 {v2.4h , v3.4h} , [x6], #16
//
//  REV64  v1.4h , v0.4h
//  REV64  v3.4h , v2.4h
//
//  ST1 {v1.4h} , [x5], x11
//  ST1 {v3.4h} , [x5], x11
//
//  LD2 {v4.4h , v5.4h} , [x6], #16
//  LD2 {v6.4h , v7.4h} , [x6], #16
//
//
//  REV64  v5.4h , v4.4h
//  REV64  v7.4h , v6.4h
//
//  ST1 {v5.4h} , [x5], x11
//  ST1 {v7.4h} , [x5], x11
//
//    SUBS        x10, x10, #1
//    BPL         LOOP_SKIP_ODD

SKIP_LOOP:

    //LDR w4,  [SP, #104]              //winAdd
    // sxtw x4,w4

// Stage 2 pipelined prologue: issue the loads for the first two 5-tap
// accumulations and compute the first (x0/x2-stream) one into v30.4s.
// Note the reuse pattern throughout: the second register of each
// LD2 {vA, vB} pair is never consumed — it is immediately re-targeted
// by a following LD1 {vB} as the next tap's window data.
    MOV             x4, x20                 // restore output base
    MOV             x5, #8                  // 8 output groups in total
    LD1             {v0.4h}, [x0], #8       // tap 0 window data
    MOV             x6, #64

    LSL             x6, x6, #1              // x6 = 128: LD1 row stride in bytes
    LD2             {v1.4h, v2.4h}, [x2], #16  // tap 0 coeffs (v2 discarded)
    MOV             x7, #244                // NOTE(review): x7 is never read
                                            // again in this function — looks
                                            // dead; confirm before removing

    MOV             x9, x0                  // remember x0 for the next group
    ADD             x0, x0, #120            // next row: +120 (+8 already) = +128

    MOV             x11, x4
    LD1             {v2.4h}, [x0], x6       // tap 1 window (overwrites LD2's v2)
    ADD             x11, x11, #128          // x11 = second output region (+32 words)




    MOV             x10, x2                 // remember x2 for the next group
    ADD             x2, x2, #240            // next coeff row: +240 (+16) = +256

    sMULL           v30.4s, v0.4h, v1.4h    // tap 0: widening 16x16 -> 32 MAC
    LD2             {v3.4h, v4.4h}, [x2], #16
    ADD             x2, x2, #240


    LD1             {v4.4h}, [x0], x6       // tap 2 window (overwrites LD2's v4)
    sMLAL           v30.4s, v2.4h, v3.4h    // tap 1

    LD2             {v5.4h, v6.4h}, [x2], #16


    ADD             x2, x2, #240
    LD1             {v6.4h}, [x0], x6       // tap 3 window
    sMLAL           v30.4s, v4.4h, v5.4h    // tap 2

    LD2             {v7.4h, v8.4h}, [x2], #16


    ADD             x2, x2, #240
    LD1             {v8.4h}, [x0], x6       // tap 4 window
    sMLAL           v30.4s, v6.4h, v7.4h    // tap 3

    MOV             x0, x9                  // rewind x0 to next-group position
    LD2             {v9.4h, v10.4h}, [x2], #16


    ADD             x2, x2, #240
    LD1             {v10.4h}, [x1], #8      // preload x1/x3-stream tap 0
    sMLAL           v30.4s, v8.4h, v9.4h    // tap 4 — first result ready in v30



    MOV             x9, x1                  // same pipelined load pattern for
    LD2             {v11.4h, v12.4h}, [x3], #16  // the x1/x3 stream
    ADD             x1, x1, #120


    MOV             x2, x10                 // rewind x2 to next-group position
    LD1             {v12.4h}, [x1], x6
    MOV             x10, x3

    ADD             x3, x3, #240
    LD2             {v13.4h, v14.4h}, [x3], #16
    ADD             x3, x3, #240


    LD2             {v15.4h, v16.4h}, [x3], #16

    LD1             {v14.4h}, [x1], x6
    ADD             x3, x3, #240



    LD1             {v16.4h}, [x1], x6
    SUB             x5, x5, #1              // prologue consumed one group

    LD2             {v17.4h, v18.4h}, [x3], #16


    ADD             x3, x3, #240
    LD1             {v18.4h}, [x1], x6

    MOV             x1, x9                  // rewind x1
    LD2             {v19.4h, v20.4h}, [x3], #16

    ADD             x3, x3, #240

    MOV             x3, x10                 // rewind x3


// Stage 2 steady state (runs 7 times): store the x0/x2-stream result
// computed on the previous round, compute + store the x1/x3-stream
// result from the already-loaded v10..v19, then load and accumulate the
// next x0/x2-stream group and prefetch the next x1/x3-stream operands.
LOOP_1:


    LD1             {v0.4h}, [x0], #8

    MOV             x9, x0
    LD2             {v1.4h, v2.4h}, [x2], #16
    ADD             x0, x0, #120

    MOV             x10, x2
    ST1             { v30.4s}, [x4], #16    // emit previous x0/x2 result
    ADD             x2, x2, #240


    sMULL           v30.4s, v10.4h, v11.4h  // x1/x3 stream: taps 0..4
    LD1             {v2.4h}, [x0], x6
    sMLAL           v30.4s, v12.4h, v13.4h

    sMLAL           v30.4s, v14.4h, v15.4h
    LD2             {v3.4h, v4.4h}, [x2], #16
    sMLAL           v30.4s, v16.4h, v17.4h

    sMLAL           v30.4s, v18.4h, v19.4h
    LD1             {v4.4h}, [x0], x6
    ADD             x2, x2, #240

    ST1             { v30.4s}, [x11], #16   // emit x1/x3 result (2nd region)


    sMULL           v30.4s, v0.4h, v1.4h    // x0/x2 stream: taps 0..4
    LD2             {v5.4h, v6.4h}, [x2], #16
    sMLAL           v30.4s, v2.4h, v3.4h



    ADD             x2, x2, #240
    LD1             {v6.4h}, [x0], x6
    sMLAL           v30.4s, v4.4h, v5.4h

    LD2             {v7.4h, v8.4h}, [x2], #16


    ADD             x2, x2, #240
    LD1             {v8.4h}, [x0], x6
    sMLAL           v30.4s, v6.4h, v7.4h

    MOV             x0, x9                  // rewind x0 for next round
    LD2             {v9.4h, v10.4h}, [x2], #16



    ADD             x2, x2, #240
    LD1             {v10.4h}, [x1], #8      // prefetch next x1/x3-stream taps
    MOV             x2, x10                 // rewind x2

    MOV             x9, x1
    LD2             {v11.4h, v12.4h}, [x3], #16
    ADD             x1, x1, #120


    sMLAL           v30.4s, v8.4h, v9.4h    // x0/x2 tap 4 — result kept in v30
    LD1             {v12.4h}, [x1], x6
    MOV             x10, x3


    ADD             x3, x3, #240
    LD2             {v13.4h, v14.4h}, [x3], #16
    ADD             x3, x3, #240



    LD1             {v14.4h}, [x1], x6
    LD2             {v15.4h, v16.4h}, [x3], #16
    ADD             x3, x3, #240


    LD1             {v16.4h}, [x1], x6
    LD2             {v17.4h, v18.4h}, [x3], #16
    ADD             x3, x3, #240


    LD1             {v18.4h}, [x1], x6
    SUBS            x5, x5, #1

    MOV             x1, x9                  // rewind x1
    LD2             {v19.4h, v20.4h}, [x3], #16

    ADD             x3, x3, #240

    MOV             x3, x10                 // rewind x3

    BGT             LOOP_1                  // loop while x5 > 0 (7 iterations)

// Epilogue: drain the pipeline — store the last x0/x2-stream result,
// then compute and store the last x1/x3-stream result.
    ST1             { v30.4s}, [x4], #16
    sMULL           v30.4s, v10.4h, v11.4h
    sMLAL           v30.4s, v12.4h, v13.4h

    sMLAL           v30.4s, v14.4h, v15.4h
    sMLAL           v30.4s, v16.4h, v17.4h
    sMLAL           v30.4s, v18.4h, v19.4h

    ST1             { v30.4s}, [x11], #16

    //VPOP        {D8 - D15}
    // LDMFD sp!, {x4-x12, x15}
    ldp             x19, x20, [sp], #16     // restore callee-saved pair
    pop_v_regs                              // mirror of push_v_regs
    ret
    // ENDP