@/******************************************************************************
@ *
@ * Copyright (C) 2018 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/


.text
.p2align 2

    .global ixheaacd_esbr_cos_sin_mod_loop1
ixheaacd_esbr_cos_sin_mod_loop1:
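@ARMv7 NEON implementation of one eSBR cos/sin modulation loop. The register
@mapping below is inferred from the code; the parameter names are assumptions,
@not taken from a header:
@  r0 : WORD32 *subband       input buffer, read from both ends (r0 up, r4 down)
@  r1 : WORD32  M             size parameter; the loop below runs M/4 times
@  r2 : const WORD32 *twiddle interleaved {cos, sin} pairs (assumed lane order)
@  r3 : WORD32 *subband_t     output buffer, written from both ends (r3 up, r5 down)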

    STMFD           sp!, {r4-r12, r14}  @save callee-saved core registers and lr
    VPUSH           {D8-D11}            @save callee-saved NEON registers
@generating load addresses
    ADD             r4, r0, r1, lsl #3  @psubband1 = subband + 2*M words ...
    SUB             r4, r4, #4          @... minus one word (last element)
    ADD             r5, r3, r1, lsl #3  @psubband1_t = subband_t + 2*M words ...
    SUB             r5, r5, #8          @... minus two words (last pair)
    MOV             r6, r1, ASR #2      @loop counter: M/4 iterations
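@Each LOOP1 iteration runs four near-identical parts. Every part consumes one
@twiddle pair from r2, two input words via the ascending pointer (r0) and two
@via the descending pointer (r4). Parts 1 and 3 store one result pair through
@r3 (ascending) plus a second pair 64 words ahead; parts 2 and 4 store through
@r5 (descending), again with a second pair 64 words ahead.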

LOOP1:
@first part
    vld1.32         {d0}, [r2]!         @d0 = next twiddle pair
    vrev64.32       d1, d0              @d1 = twiddle pair with lanes swapped
    vld1.32         {d2[0]}, [r0]!      @low input word, pointer steps up one word
    ADD             r7, r0, #252        @256 bytes (64 words) past the word just read
    vld1.32         {d2[1]}, [r7]       @its partner 64 words ahead
    vld1.32         {d3[0]}, [r4]       @low word from the descending pointer
    ADD             r7, r4, #256
    vld1.32         {d3[1]}, [r7]       @its partner 64 words ahead
    SUB             r4, r4, #4          @step psubband1 down one word

    VMULL.S32       q2, d0, d2          @qsub 2nd
    VMULL.S32       q3, d0, d3          @add 2nd
    VMULL.S32       q4, d1, d2          @add 1st
    VMULL.S32       q5, d1, d3          @qsub 1st

    vadd.I64        q0, q4, q3
    VQSUB.S64       Q1, Q5, Q2

    VSHRN.I64       D0, Q0, #32
    VSHRN.I64       D2, Q1, #32
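@Each VMULL.S32 above produced two 64-bit products of Q31 words; VADD.I64 and
@VQSUB.S64 (saturating) combine them, and VSHRN.I64 #32 keeps the high 32 bits
@of each sum, i.e. the top word of a 32x32 multiply: a fixed-point complex
@multiply by the twiddle pair.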
    VMOV.32         D3, D0              @copy so both lane-1 results can be stored as a pair
    VST2.32         {D0[0], D2[0]}, [R3]!  @store result pair at the front
    ADD             r7, r3, #248        @256 bytes past the pair just written
    VST2.32         {D2[1], D3[1]}, [R7]   @store second pair 64 words ahead

@second part
    vld1.32         {d0}, [r2]!
    vrev64.32       d1, d0
    vld1.32         {d2[0]}, [r0]!
    ADD             R7, R0, #252
    vld1.32         {d2[1]}, [r7]
    vld1.32         {d3[0]}, [r4]
    ADD             R7, R4, #256
    vld1.32         {d3[1]}, [r7]
    SUB             r4, r4, #4
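@same products as the first part, but with the add/subtract roles swapped
@(q5 + q2 and q4 - q3 instead of q4 + q3 and q5 - q2); the results go to the
@descending output pointer r5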

    VMULL.S32       q2, d0, d2          @add 2nd
    VMULL.S32       q3, d0, d3          @sub 2nd
    VMULL.S32       q4, d1, d2          @sub 1st
    VMULL.S32       q5, d1, d3          @add 1st

    VADD.I64        Q0, Q5, Q2
    VQSUB.S64       Q1, Q4, Q3

    VSHRN.I64       D0, Q0, #32
    VSHRN.I64       D2, Q1, #32
    VMOV.32         D3, D0
    VST2.32         {D0[0], D2[0]}, [R5]   @store result pair at the back
    ADD             R7, R5, #256
    VST2.32         {D2[1], D3[1]}, [R7]   @store second pair 64 words ahead
    SUB             r5, r5, #8          @step psubband1_t down one pair
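@the third and fourth parts repeat the first and second with the next two
@twiddle pairs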
@third part
    vld1.32         {d0}, [r2]!
    vrev64.32       d1, d0
    vld1.32         {d2[0]}, [r0]!
    ADD             r7, r0, #252
    vld1.32         {d2[1]}, [r7]
    vld1.32         {d3[0]}, [r4]
    ADD             r7, r4, #256
    vld1.32         {d3[1]}, [r7]
    SUB             r4, r4, #4

    VMULL.S32       q2, d0, d2          @qsub 2nd
    VMULL.S32       q3, d0, d3          @add 2nd
    VMULL.S32       q4, d1, d2          @add 1st
    VMULL.S32       q5, d1, d3          @qsub 1st

    vadd.I64        q0, q4, q3
    VQSUB.S64       Q1, Q5, Q2

    VSHRN.I64       D0, Q0, #32
    VSHRN.I64       D2, Q1, #32
    VMOV.32         D3, D0
    VST2.32         {D0[0], D2[0]}, [R3]!
    ADD             r7, r3, #248
    VST2.32         {D2[1], D3[1]}, [R7]

@fourth part
    vld1.32         {d0}, [r2]!
    vrev64.32       d1, d0
    vld1.32         {d2[0]}, [r0]!
    ADD             R7, R0, #252
    vld1.32         {d2[1]}, [r7]
    vld1.32         {d3[0]}, [r4]
    ADD             R7, R4, #256
    vld1.32         {d3[1]}, [r7]
    SUB             r4, r4, #4

    VMULL.S32       q2, d0, d2          @add 2nd
    VMULL.S32       q3, d0, d3          @sub 2nd
    VMULL.S32       q4, d1, d2          @sub 1st
    VMULL.S32       q5, d1, d3          @add 1st

    VADD.I64        Q0, Q5, Q2
    VQSUB.S64       Q1, Q4, Q3

    VSHRN.I64       D0, Q0, #32
    VSHRN.I64       D2, Q1, #32
    VMOV.32         D3, D0
    VST2.32         {D0[0], D2[0]}, [R5]   @store result pair at the back
    ADD             R7, R5, #256
    SUBS            R6, R6, #1          @decrement loop counter (scheduled between stores)
    VST2.32         {D2[1], D3[1]}, [R7]   @store second pair 64 words ahead
    SUB             r5, r5, #8          @step psubband1_t down one pair

    BGT             LOOP1
    VPOP            {D8-D11}            @restore callee-saved NEON registers
    LDMFD           sp!, {r4-r12, r15}  @restore core registers and return (pop into pc)