.text
.p2align 2
.global ixheaacd_complex_fft_p2_asm
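
@ In-place complex FFT for power-of-2 lengths: a digit-reversed radix-4
@ first stage, further radix-4 stages, and a final radix-2 stage when
@ log2(npoints) is odd.
@ The register/stack layout below is inferred from the code, not from a
@ documented ABI (treat it as an assumption): r0 = twiddle table,
@ r1 = npoints, r2 = input buffer, r3 = output buffer (ptr_y).  The STMFD
@ spills r0-r3 to [sp, #0x28..#0x34]; [sp, #0x00..#0x24] is local scratch.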

ixheaacd_complex_fft_p2_asm:
    STMFD           sp!, {r0-r12, lr}   @ spill args and callee-saved regs
    SUB             sp, sp, #0x28       @ 0x28 bytes of local scratch
    LDR             r0, [sp, #0x2c]     @ npoints (spilled r1)
    EOR             r0, r0, r0, ASR #31 @ |npoints| (no-op for positive input)
    CLZ             r0, r0
    SUB             r12, r0, #16        @dig_rev_shift = norm32(npoints) + 1 - 16@
    SUB             r0, r0, #1          @ norm32(npoints)
    RSB             r0, r0, #0x1e       @ r0 = 30 - norm32(npoints) = log2(npoints)
    AND             r1, r0, #1          @ not_power_4 = log2(npoints) & 1
    STR             r1, [sp, #0x14]     @ save the radix-2 flag for the final stage
    MOV             r1, r0, ASR #1      @ number of radix-4 stages
    LDR             r0, [sp, #0x2c]     @npoints
    STR             r1, [sp, #-4]!      @ push the radix-4 stage count
    MOV             lr, r0, LSL #1      @(npoints >>1) * 4: byte stride of N/4 complex words
    MOV             r0, #0              @ i = 0

FIRST_STAGE_R4:
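@ Build the digit-reversed load index for i (r0): the three mask rounds
@ reverse the bits of each 16-bit half (swap bit pairs, then nibbles, then
@ bytes within halfwords); the LSR by dig_rev_shift then keeps the
@ log2(npoints) significant bits.  Equivalent C for the rounds:
@   j = ((i & 0x33333333) << 2) | ((i >> 2) & 0x33333333);
@   j = ((j & 0x0F0F0F0F) << 4) | ((j >> 4) & 0x0F0F0F0F);
@   j = ((j & 0x00FF00FF) << 8) | ((j >> 8) & 0x00FF00FF);
@   j >>= dig_rev_shift;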
    LDR             r4, =0x33333333
    LDR             r5, =0x0F0F0F0F
    AND             r6, r4, r0
    AND             r7, r4, r0, LSR #2
    ORR             r4, r7, r6, LSL #2
    AND             r6, r5, r4
    AND             r7, r5, r4, LSR #4
    ORR             r4, r7, r6, LSL #4
    BIC             r6, r4, #0x0000FF00
    BIC             r7, r4, #0x00FF0000
    MOV             r7, r7, LSR #8
    ORR             r4, r7, r6, LSL #8
    LDR             r5, [sp, #0x18]     @ not_power_4 flag (offsets +4 from the push)
    MOV             r10, r4, LSR r12
    CMP             r5, #0
    ADDNE           r10, r10, #1        @ odd log2: (r10 + 1) & ~1 rounds the
    BICNE           r10, r10, #1        @ digit-reversed index up to even

    ADD             r1, r2, r10, LSL #2 @ ptr_x + digit-reversed offset (WORD32s)
    LDRD            r4, [r1]            @r4=x0r,  r5=x0i
    ADD             r1, r1, lr
    LDRD            r8, [r1]            @r8=x1r,  r9=x1i
    ADD             r1, r1, lr
    LDRD            r6, [r1]            @r6=x2r,  r7=x2i
    ADD             r1, r1, lr
    LDRD            r10, [r1]           @r10=x3r, r11=x3i
    ADD             r0, r0, #4          @ i += 4
    CMP             r0, lr, ASR #1      @ all npoints/4 butterflies done? (lr>>1 = npoints)
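@ Radix-4 butterfly with unit twiddles: with s = x0 + x2, d = x0 - x2,
@ t = x1 + x3, u = x1 - x3, the outputs are (s + t), (s - t) and d -/+ j*u;
@ the multiply by j is folded into the final real/imaginary swap with a
@ sign change (operand names follow the comments).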

    ADD             r4, r4, r6          @x0r = x0r + x2r@
    ADD             r5, r5, r7          @x0i = x0i + x2i@
    SUB             r6, r4, r6, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r7, r5, r7, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r8, r8, r10         @x1r = x1r + x3r@
    ADD             r9, r9, r11         @x1i = x1i + x3i@
    SUB             r1, r8, r10, lsl#1  @x3r = x1r - (x3r << 1)@
    SUB             r11, r9, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r8          @x0r = x0r + x1r@
    ADD             r5, r5, r9          @x0i = x0i + x1i@
    SUB             r8, r4, r8, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r9, r5, r9, lsl#1   @x1i = x0i - (x1i << 1)
    ADD             r6, r6, r11         @x2r = x2r + x3i@
    SUB             r7, r7, r1          @x2i = x2i - x3r@
    SUB             r10, r6, r11, lsl#1 @x3i = x2r - (x3i << 1)@
    ADD             r11, r7, r1, lsl#1  @x3r = x2i + (x3r << 1)@

    STMIA           r3!, {r4-r11}       @ store the four outputs to ptr_y
    BLT             FIRST_STAGE_R4
    LDR             r1, [sp], #4        @ pop the radix-4 stage count
    LDR             r0, [sp, #0x2c]     @ npoints
    MOV             r12, #0x40          @nodespacing = 64@
    STR             r12, [sp, #0x1c]
    LDR             r12, [sp, #0x2c]
    SUB             r3, r3, r0, LSL #3  @ rewind ptr_y to the start (npoints * 8 bytes)
    SUBS            r1, r1, #1          @ stages left after the first one
    STR             r3, [sp, #0x34]
    MOV             r4, r12, ASR #4     @ butterflies per twiddle group = npoints >> 4
    MOV             r0, #4              @ del = 4
    STR             r4, [sp, #0x18]
    STR             r1, [sp, #0x20]
    BLE             RADIX2              @ no further radix-4 stages
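
@ Each OUTER_LOOP pass runs one more radix-4 stage: del (r0) grows 4x per
@ stage while nodespacing and the per-group butterfly count shrink 4x (see
@ the updates just before the BGT OUTER_LOOP).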
OUTER_LOOP:
    LDR             r1, [sp, #0x28]     @ twiddle table base (saved r0)
    LDR             r12, [sp, #0x34]    @WORD32 *data = ptr_y@
    STR             r1, [sp, #0x10]
    LDR             r1, [sp, #0x18]     @ butterfly count for the trivial-twiddle group

    MOV             r0, r0, LSL #3      @(del<<1) * 4
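@ First butterfly column of the stage: the twiddles are (1, 0), so these
@ butterflies need no multiplies.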
LOOP_TRIVIAL_TWIDDLE:
    LDRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0
    LDRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    LDRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    LDRD            r10, [r12]          @r10=x3r, r11=x3i


    ADD             r4, r4, r8          @x0r = x0r + x2r@
    ADD             r5, r5, r9          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl #1  @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl #1  @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r2, r6, r10, lsl #1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl #1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl #1  @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl #1  @x1i = x0i - (x1i << 1)
    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r2          @x2i = x2i - x3r@
    SUB             r10, r8, r11, lsl#1 @x3i = x2r - (x3i << 1)@
    ADD             r11, r9, r2, lsl#1  @x3r = x2i + (x3r << 1)

    STRD            r10, [r12]          @r10=x3i, r11=x3r
    SUB             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    SUB             r12, r12, r0
    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    SUB             r12, r12, r0
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0, lsl #2

    SUBS            r1, r1, #1
    BNE             LOOP_TRIVIAL_TWIDDLE

    MOV             r0, r0, ASR #3      @ byte stride back to del
    LDR             r4, [sp, #0x1c]     @ nodespacing
    LDR             r3, [sp, #0x34]     @ ptr_y
    MUL             r1, r0, r4          @ nodespacing * del
    ADD             r12, r3, #8         @ data = ptr_y + 1 complex element (j=0 column done)
    STR             r1, [sp, #0x24]
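@ The alternating shift series below forms r3 = r1/4 + r1/8 - r1/16 + ...,
@ which converges on r1/3: a divide-free approximation of
@ (nodespacing * del) / 3, pushed as the j-limit that ends SECOND_LOOP.
@ Until the matching ADD sp, sp, #4, stack offsets carry a +4 for this push.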
    MOV             r3, r1, ASR #2
    ADD             r3, r3, r1, ASR #3
    SUB             r3, r3, r1, ASR #4
    ADD             r3, r3, r1, ASR #5
    SUB             r3, r3, r1, ASR #6
    ADD             r3, r3, r1, ASR #7
    SUB             r3, r3, r1, ASR #8
    STR             r3, [sp, #-4]!
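@ SECOND_LOOP steps j by nodespacing (r4); three pre-indexed loads at
@ stride r4*8 fetch the (h, l) pairs w1 = w[j], w2 = w[2j], w3 = w[3j],
@ which are parked just below sp ([sp, #-4]..[sp, #-24]) to keep all
@ registers free for the butterfly.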
SECOND_LOOP:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del<<1) * 4
    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY:
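@ Each SMULL / LSR #31 / ORR ..., LSL #1 triplet below is a 32x32
@ fractional multiply: SMULL forms the 64-bit product and the shift/OR
@ keeps bits 62..31, i.e. (int32_t)(((int64_t)a * b) >> 31).  A C sketch
@ of the helper the comments call ixheaacd_mult32 (name taken from the
@ comments; the exact C source is not shown here):
@   static inline int32_t mult32(int32_t a, int32_t b) {
@       return (int32_t)(((int64_t)a * (int64_t)b) >> 31);
@   }
@ Four such products make one complex multiply per input:
@   x1r' = mult32(x1r, w1l) - mult32(x1i, w1h)
@   x1i' = mult32(x1r, w1h) + mult32(x1i, w1l)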

    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6          @ x1i' = x1r*w1h + x1i*w1l
    SUB             r6, r4, r5          @ x1r' = x1r*w1l - x1i*w1h

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r9, r9, r8          @ x2i' = x2r*w2h + x2i*w2l
    SUB             r8, r4, r5          @ x2r' = x2r*w2l - x2i*w2h

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r11, r11, r10       @ x3i' = x3r*w3h + x3i*w3l
    SUB             r10, r4, r5         @ x3r' = x3r*w3l - x3i*w3h

    LDR             r4, [r12, -r0, lsl #1]! @ rewind to x0: r4=x0r
    LDR             r5, [r12, #4]       @ r5=x0i


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i, r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]   @ npoints
    LDR             r4, [sp, #8+4]      @ j
    SUB             r1, r12, r1, LSL #3 @ rewind data by npoints * 8 bytes
    LDR             r6, [sp, #0x1c+4]   @ nodespacing
    ADD             r12, r1, #8
    LDR             r7, [sp, #0]        @ j-limit (~ nodespacing * del / 3)
    ADD             r4, r4, r6          @ j += nodespacing
    CMP             r4, r7
    BLE             SECOND_LOOP

SECOND_LOOP_2:
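@ Second j-section: the w3 index has crossed a 512-entry section of the
@ twiddle table, so its fetch rewinds the pointer by 2048 bytes (512 * 8)
@ and the x3 combine below swaps its ADD/SUB terms relative to RADIX4_BFLY.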
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @ 512 *4
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]

    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY_2:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1
    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6          @ x1i' = x1r*w1h + x1i*w1l
    SUB             r6, r4, r5          @ x1r' = x1r*w1l - x1i*w1h

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r9, r9, r8          @ x2i' = x2r*w2h + x2i*w2l
    SUB             r8, r4, r5          @ x2r' = x2r*w2l - x2i*w2h

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r10, r11, r10       @ x3r' = x3i*w3l + x3r*w3h (wrapped section)
    SUB             r11, r5, r4         @ x3i' = x3i*w3h - x3r*w3l

    LDR             r4, [r12, -r0, lsl #1]! @ rewind to x0: r4=x0r
    LDR             r5, [r12, #4]       @ r5=x0i


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i, r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_2
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]   @ npoints
    LDR             r4, [sp, #8+4]      @ j
    SUB             r1, r12, r1, LSL #3 @ rewind data by npoints * 8 bytes
    LDR             r6, [sp, #0x1c+4]   @ nodespacing
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x24+4]   @ nodespacing * del
    ADD             r4, r4, r6          @ j += nodespacing
    CMP             r4, r7, ASR #1      @ section 2 runs to (nodespacing * del) / 2
    BLE             SECOND_LOOP_2
    LDR             r7, [sp, #0]        @ ~ (nodespacing * del) / 3
    CMP             r4, r7, LSL #1      @ past 2/3 of the range: go to section 4
    BGT             SECOND_LOOP_4

SECOND_LOOP_3:
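@ Third j-section: the w2 index has wrapped as well, so the 2048-byte
@ rewind moves before the w2 fetch and both the x2 and x3 combines are
@ sign-adjusted.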
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @ 512 *4
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]


RADIX4_BFLY_3:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6          @ x1i' = x1r*w1h + x1i*w1l
    SUB             r6, r4, r5          @ x1r' = x1r*w1l - x1i*w1h

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r8, r9, r8          @ x2r' = x2i*w2l + x2r*w2h (wrapped section)
    SUB             r9, r5, r4          @ x2i' = x2i*w2h - x2r*w2l

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r10, r11, r10       @ x3r' = x3i*w3l + x3r*w3h (wrapped section)
    SUB             r11, r5, r4         @ x3i' = x3i*w3h - x3r*w3l

    LDR             r4, [r12, -r0, lsl #1]! @ rewind to x0: r4=x0r
    LDR             r5, [r12, #4]       @ r5=x0i


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i, r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_3
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]   @ npoints
    LDR             r4, [sp, #8+4]      @ j
    SUB             r1, r12, r1, LSL #3 @ rewind data by npoints * 8 bytes
    LDR             r6, [sp, #0x1c+4]   @ nodespacing
    ADD             r12, r1, #8
    LDR             r7, [sp, #0]        @ ~ (nodespacing * del) / 3
    ADD             r4, r4, r6          @ j += nodespacing
    CMP             r4, r7, LSL #1      @ section 3 runs to twice that limit
    BLE             SECOND_LOOP_3

SECOND_LOOP_4:
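@ Fourth j-section: w2 and w3 each rewind by 512 entries, and the butterfly
@ flips the x1i/x3i signs (SUB/ADD swap) to match the wrapped twiddles.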
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @ 512 *4
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @ 512 *4
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@


    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY_4:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6          @ x1i' = x1r*w1h + x1i*w1l
    SUB             r6, r4, r5          @ x1r' = x1r*w1l - x1i*w1h

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r8, r9, r8          @ x2r' = x2i*w2l + x2r*w2h (wrapped section)
    SUB             r9, r5, r4          @ x2i' = x2i*w2h - x2r*w2l

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r11, r11, r10       @ x3i' = x3i*w3l + x3r*w3h (both indices wrapped)
    SUB             r10, r5, r4         @ x3r' = x3i*w3h - x3r*w3l

    LDR             r4, [r12, -r0, lsl #1]! @ rewind to x0: r4=x0r
    LDR             r5, [r12, #4]       @ r5=x0i


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    SUB             r7, r7, r11         @x1i = x1i - x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    ADD             r11, r7, r11, lsl#1 @x3i = x1i + (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i, r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_4
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]   @ npoints
    LDR             r4, [sp, #8+4]      @ j
    SUB             r1, r12, r1, LSL #3 @ rewind data by npoints * 8 bytes
    LDR             r6, [sp, #0x1c+4]   @ nodespacing
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x24+4]   @ nodespacing * del
    ADD             r4, r4, r6          @ j += nodespacing
    CMP             r4, r7              @ last section runs to nodespacing * del
    BLT             SECOND_LOOP_4
    ADD             sp, sp, #4          @ pop the SECOND_LOOP j-limit

    LDR             r1, [sp, #0x1c]
    MOV             r0, r0, LSL #2      @ del <<= 2 for the next stage
    MOV             r1, r1, ASR #2      @ nodespacing >>= 2
    STR             r1, [sp, #0x1c]
    LDR             r1, [sp, #0x18]
    MOV             r1, r1, ASR #2      @ butterflies per group >>= 2
    STR             r1, [sp, #0x18]
    LDR             r1, [sp, #0x20]
    SUBS            r1, r1, #1          @ --stages
    STR             r1, [sp, #0x20]
    BGT             OUTER_LOOP

RADIX2:
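@ Final radix-2 stage, needed only when log2(npoints) is odd (flag saved
@ at [sp, #0x14] during setup).  Each butterfly multiplies x1 by a twiddle
@ and forms (x0 +/- x1) / 2; the work is split into two half-length loops
@ because the twiddle index wraps the table halfway through.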
    LDR             r1, [sp, #0x14]     @ not_power_4 flag from setup
    CMP             r1, #0
    BEQ             EXIT                @ even log2(npoints): no radix-2 stage
    LDR             r12, [sp, #0x1c]    @ nodespacing left by the last stage
    LDR             r1, [sp, #0x28]     @ twiddle table base
    CMP             r12, #0
    LDRNE           r12, [sp, #0x1c]
    MOVEQ           r4, #1              @ twiddle step, in table entries
    MOVNE           r4, r12, LSL #1
    MOVS            r3, r0              @ r0 = del = npoints >> 1 butterflies
    BEQ             EXIT

    MOV             r3, r3, ASR #1      @ first half of the butterflies
    LDR             r5, [sp, #0x34]     @ data = ptr_y
    MOV             r0, r0, LSL #3      @(del<<1) * 4
    STR             r1, [sp, #-4]       @ scratch slot: running twiddle pointer
RADIX2_BFLY:
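@ r6/r7 = x0, r8/r9 = x1, r0 = byte distance between the pair; the running
@ twiddle pointer lives in the scratch word at [sp, #-4] and advances by
@ r4*8 bytes per butterfly.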
    LDR             r1, [sp, #-4]
    LDRD            r6, [r5]            @r6=x0r, r7=x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @r8=x1r, r9=x1i

    LDR             r2, [r1]
    SUBS            r3, r3, #1


    SMULL           r1, r11, r8, r2     @mult32x16hin32(x1r,W1h)
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @mult32x16hin32(x1i,W1h)
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #-4]
    LDR             r2, [r1, #4]
    ADD             r1, r1, r4, LSL #3
    STR             r1, [sp, #-4]

    SMULL           r1, r8, r8, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    SUB             r8, r8, r10         @ x1r' = x1r*w1l - x1i*w1h
    ADD             r9, r9, r11         @ x1i' = x1i*w1l + x1r*w1h


    ADD             r10, r8, r6         @(x0r + x1r) >> 1
    ASR             r10, r10, #1
    ADD             r11, r9, r7         @(x0i + x1i) >> 1
    ASR             r11, r11, #1
    SUB             r8, r6, r8          @(x0r - x1r) >> 1
    ASR             r8, r8, #1
    SUB             r9, r7, r9          @(x0i - x1i) >> 1
    ASR             r9, r9, #1

    STRD            r8, [r5]
    SUB             r5, r5, r0
    STRD            r10, [r5], #8

    BNE             RADIX2_BFLY

    LDR             r1, [sp, #0x28]     @ restart from the twiddle table base
    MOV             r3, r0, ASR #4      @ second half: del >> 1 butterflies
    STR             r1, [sp, #-4]
RADIX2_BFLY_2:
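@ Second half: the twiddle pointer was restarted from the table base above,
@ and the post-multiply combine swaps real/imaginary terms for the wrapped
@ quadrant before forming the same halved sum/difference outputs.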
    LDR             r1, [sp, #-4]
    LDRD            r6, [r5]            @r6=x0r, r7=x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @r8=x1r, r9=x1i

    LDR             r2, [r1]
    SUBS            r3, r3, #1



    SMULL           r1, r11, r8, r2     @mult32x16hin32(x1r,W1h)
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @mult32x16hin32(x1i,W1h)
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #-4]
    LDR             r2, [r1, #4]
    ADD             r1, r1, r4, LSL #3
    STR             r1, [sp, #-4]

    SMULL           r1, r8, r8, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    ADD             r11, r11, r9        @ x1r' = x1r*w1h + x1i*w1l (wrapped quadrant)
    SUB             r9, r10, r8         @ x1i' = x1i*w1h - x1r*w1l
    MOV             r8, r11

    ADD             r10, r8, r6         @(x0r + x1r) >> 1
    ASR             r10, r10, #1
    ADD             r11, r9, r7         @(x0i + x1i) >> 1
    ASR             r11, r11, #1
    SUB             r8, r6, r8          @(x0r - x1r) >> 1
    ASR             r8, r8, #1
    SUB             r9, r7, r9          @(x0i - x1i) >> 1
    ASR             r9, r9, #1

    STRD            r8, [r5]
    SUB             r5, r5, r0
    STRD            r10, [r5], #8

    BNE             RADIX2_BFLY_2

EXIT:
    ADD             sp, sp, #0x38       @ drop 0x28 locals + 0x10 spilled r0-r3
    LDMFD           sp!, {r4-r12, pc}