//.include "ihevc_neon_macros.s"
// push_v_regs: spill x8-x15 and the frame/link pair (x29, x30) to the stack.
// Allocates one 80-byte frame (keeps sp 16-byte aligned) laid out as:
//   [sp, #0]  x29, x30
//   [sp, #16] x14, x15
//   [sp, #32] x12, x13
//   [sp, #48] x10, x11
//   [sp, #64] x8,  x9
.macro push_v_regs
    stp             x29, x30, [sp, #-80]!
    stp             x14, x15, [sp, #16]
    stp             x12, x13, [sp, #32]
    stp             x10, x11, [sp, #48]
    stp             x8, x9, [sp, #64]
.endm
// pop_v_regs: reload x8-x15, x29 and x30 from the 80-byte frame at sp and
// release that frame. Slot layout expected at entry:
//   [sp, #0]  x29, x30
//   [sp, #16] x14, x15
//   [sp, #32] x12, x13
//   [sp, #48] x10, x11
//   [sp, #64] x8,  x9
.macro pop_v_regs
    ldp             x14, x15, [sp, #16]
    ldp             x12, x13, [sp, #32]
    ldp             x10, x11, [sp, #48]
    ldp             x8, x9, [sp, #64]
    ldp             x29, x30, [sp], #80
.endm

.text
.p2align 2
        .global ixheaacd_postradixcompute4


//------------------------------------------------------------------------------
// ixheaacd_postradixcompute4
//
// Final radix-4 butterfly pass, hard coded for an FFT length of 16.
//
// In:   x0 = y, output buffer of 32-bit words (written)
//       x1 = x, input buffer of 32-bit words (read)
//       x2 = not read by this routine
//       w3 = npoints; the code below is only correct for npoints == 16
//            (presumably a 32-bit signed C argument - confirm against the
//            C prototype)
// Out:  nothing returned in registers
// Uses: x0, x1, x3-x12, x14, flags (x8-x15, x29, x30 are saved and restored)
//
// Data layout as implemented:
//   * Input is consumed in groups of 8 consecutive words x_0..x_7.
//     x1 walks the first stream, x4 = x + 2*npoints bytes walks the second.
//   * Output is treated as four rows y0..y3 that are 32 bytes apart.
//     Every group produces one pair of words per row:
//         y0: n00, n01    y1: n10, n11    y2: n20, n21    y3: n30, n31
//     and the column position then advances by 8 bytes.
//   * Two loop iterations x two groups = 4 pairs per row = 32 output words.
//------------------------------------------------------------------------------
ixheaacd_postradixcompute4:

    push_v_regs                         // conservative save of the scratch registers used below

    // Second input stream starts 2*npoints bytes after the first. Only the
    // low 32 bits of x3 are consumed: the upper half of a register carrying
    // a 32-bit argument is not guaranteed to be zero or sign-extended.
    ADD             x4, x1, w3, sxtw #1 // x1 -> first stream, x4 -> second stream
    MOV             x3, #2              // loop counter: two iterations


POSTRADIX4_START:

    // ---- group read through x1 --------------------------------------------
    LDP             w5, w6, [x1], #8    // x_0 : x_1
    LDP             w7, w8, [x1], #8    // x_2 : x_3
    LDP             w9, w10, [x1], #8   // x_4 : x_5
    LDP             w11, w12, [x1], #8  // x_6 : x_7

    ADD             w14, w5, w9         // xh0_0 = x_0 + x_4
    SUB             w5, w5, w9          // xl0_0 = x_0 - x_4

    ADD             w9, w6, w10         // xh1_0 = x_1 + x_5
    SUB             w6, w6, w10         // xl1_0 = x_1 - x_5

    ADD             w10, w7, w11        // xh0_1 = x_2 + x_6
    SUB             w7, w7, w11         // xl0_1 = x_2 - x_6

    ADD             w11, w8, w12        // xh1_1 = x_3 + x_7
    SUB             w8, w8, w12         // xl1_1 = x_3 - x_7

    ADD             w12, w14, w10       // n00 = xh0_0 + xh0_1
    SUB             w14, w14, w10       // n20 = xh0_0 - xh0_1

    ADD             w10, w9, w11        // n01 = xh1_0 + xh1_1
    SUB             w9, w9, w11         // n21 = xh1_0 - xh1_1

    ADD             w11, w5, w8         // n10 = xl0_0 + xl1_1
    SUB             w5, w5, w8          // n30 = xl0_0 - xl1_1

    ADD             w8, w6, w7          // n31 = xl1_0 + xl0_1
    SUB             w6, w6, w7          // n11 = xl1_0 - xl0_1

    STP             w12, w10, [x0]      // y0[h2], y0[h2 + 1] = n00, n01
    STP             w11, w6, [x0, #32]  // y1[h2], y1[h2 + 1] = n10, n11
    STP             w14, w9, [x0, #64]  // y2[h2], y2[h2 + 1] = n20, n21
    STP             w5, w8, [x0, #96]   // y3[h2], y3[h2 + 1] = n30, n31
    ADD             x0, x0, #8          // x0 -> y0[h2 + 2]

    // ---- group read through x4 (same butterfly, same register roles) -------
    LDP             w5, w6, [x4], #8    // x_8 : x_9
    LDP             w7, w8, [x4], #8    // x_a : x_b
    LDP             w9, w10, [x4], #8   // x_c : x_d
    LDP             w11, w12, [x4], #8  // x_e : x_f

    ADD             w14, w5, w9         // xh0_0 = x_8 + x_c
    SUB             w5, w5, w9          // xl0_0 = x_8 - x_c

    ADD             w9, w6, w10         // xh1_0 = x_9 + x_d
    SUB             w6, w6, w10         // xl1_0 = x_9 - x_d

    ADD             w10, w7, w11        // xh0_1 = x_a + x_e
    SUB             w7, w7, w11         // xl0_1 = x_a - x_e

    ADD             w11, w8, w12        // xh1_1 = x_b + x_f
    SUB             w8, w8, w12         // xl1_1 = x_b - x_f

    ADD             w12, w14, w10       // n00 = xh0_0 + xh0_1
    SUB             w14, w14, w10       // n20 = xh0_0 - xh0_1

    ADD             w10, w9, w11        // n01 = xh1_0 + xh1_1
    SUB             w9, w9, w11         // n21 = xh1_0 - xh1_1

    ADD             w11, w5, w8         // n10 = xl0_0 + xl1_1
    SUB             w5, w5, w8          // n30 = xl0_0 - xl1_1

    ADD             w8, w6, w7          // n31 = xl1_0 + xl0_1
    SUB             w6, w6, w7          // n11 = xl1_0 - xl0_1

    STP             w12, w10, [x0]      // y0[h2 + 2], y0[h2 + 3] = n00, n01
    STP             w11, w6, [x0, #32]  // y1[h2 + 2], y1[h2 + 3] = n10, n11
    STP             w14, w9, [x0, #64]  // y2[h2 + 2], y2[h2 + 3] = n20, n21
    STP             w5, w8, [x0, #96]   // y3[h2 + 2], y3[h2 + 3] = n30, n31
    ADD             x0, x0, #8          // x0 -> y0[h2 + 4]

    // Each stream just consumed 32 bytes; skip the 32 bytes the other stream
    // consumed so both pointers land on their next unread group.
    ADD             x1, x1, #32
    ADD             x4, x4, #32

    SUBS            w3, w3, #1
    BGT             POSTRADIX4_START

    pop_v_regs
    ret