///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/


.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!
.endm
.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm
.text
.global ixheaacd_over_lap_add2_armv8


ixheaacd_over_lap_add2_armv8:
    push_v_regs
    MOV             X8, X5
    SUB             X12, X5, #1
    LSL             X9, X5, #2
    LSL             X12, X12, #2
    ADD             X10, X0, X9
    ADD             X7, X1, X12
    ADD             X4, X4, #1
    LD2             {V0.4H, V1.4H}, [X10], #16
    LSL             X11, X6, #2
    SUB             X7, X7, #12
    SUB             X4, X4, #16
    MOV             X12, #-16
    MOV             X13, #1
    ADD             X14, X4, #1
    NEG             X14, X14
    DUP             V21.4S, W4
    LD2             {V6.4H, V7.4H}, [X7], X12
    LSL             X4, X13, X14
    REV64           V4.4H, V6.4H
    DUP             V20.4S, W4
    REV64           V5.4H, V7.4H
    MOV             X4, X3

    MOV             X9, X2
    LD2             {V2.4H, V3.4H}, [X3], #16

    UMULL           V23.4S, V0.4H, V2.4H
    UMLSL           V23.4S, V4.4H, V3.4H
    LD2             {V8.4H, V9.4H}, [X10], #16
    SSHR            V23.4S, V23.4S, #16
    LD2             {V10.4H, V11.4H}, [X3], #16
    SMLAL           V23.4S, V1.4H, V2.4H
    SMLSL           V23.4S, V5.4H, V3.4H
    LD2             {V14.4H, V15.4H}, [X7], X12
    REV64           V12.4H, V14.4H
    REV64           V13.4H, V15.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V22.4S, V22.4S, V21.4S
    MOV             V24.16B, V22.16B
    SUB             X8, X8, #8

LOOP_1:

    LD2             {V0.4H, V1.4H}, [X10], #16
    UMULL           V19.4S, V8.4H, V10.4H
    LD2             {V2.4H, V3.4H}, [X3], #16
    UMLSL           V19.4S, V12.4H, V11.4H
    LD2             {V6.4H, V7.4H}, [X7], X12
    UMULL           V23.4S, V0.4H, V2.4H
    REV64           V4.4H, V6.4H
    UMLSL           V23.4S, V4.4H, V3.4H
    REV64           V5.4H, V7.4H
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[0], [X2], X11
    SMLAL           V19.4S, V9.4H, V10.4H
    ST1             {V24.S}[1], [X2], X11
    SSHR            V23.4S, V23.4S, #16
    ST1             {V24.S}[2], [X2], X11
    SMLAL           V23.4S, V1.4H, V2.4H

    ST1             {V24.S}[3], [X2], X11
    SMLSL           V19.4S, V13.4H, V11.4H
    SMLSL           V23.4S, V5.4H, V3.4H

    LD2             {V8.4H, V9.4H}, [X10], #16
    LD2             {V10.4H, V11.4H}, [X3], #16


    LD2             {V14.4H, V15.4H}, [X7], X12
    SQADD           V18.4S, V19.4S, V20.4S
    REV64           V12.4H, V14.4H
    REV64           V13.4H, V15.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X2], X11
    SSHL            V22.4S, V22.4S, V21.4S


    MOV             V24.16B, V22.16B
    SUBS            X8, X8, #8

    ST1             {V16.S}[1], [X2], X11
    ST1             {V16.S}[2], [X2], X11
    ST1             {V16.S}[3], [X2], X11


    BGT             LOOP_1


    ST1             {V24.S}[0], [X2], X11
    UMULL           V19.4S, V8.4H, V10.4H
    UMLSL           V19.4S, V12.4H, V11.4H
    ST1             {V24.S}[1], [X2], X11
    ST1             {V24.S}[2], [X2], X11
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[3], [X2], X11
    SMLAL           V19.4S, V9.4H, V10.4H
    SMLSL           V19.4S, V13.4H, V11.4H
    MOV             X12, #12
    MOV             V30.S[0], W5
    MOV             V31.S[0], W6
    SMULL           V29.4S, V30.4H, V31.4H
    MOV             W7, V29.S[0]

    LSL             W10, W5, #1
    SQADD           V18.4S, V19.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B

    ST1             {V16.S}[0], [X2], X11
    LSL             X7, X7, #2

    ST1             {V16.S}[1], [X2], X11
    ADD             X7, X7, X9

    ST1             {V16.S}[2], [X2], X11
    ST1             {V16.S}[3], [X2], X11

    SUB             X11, X10, #1
    LSL             X10, X11, #2
    ADD             X10, X0, X10
    LSL             X11, X11, #1
    SUB             X10, X10, X12
    LSL             X8, X6, #2
    MOV             X12, #-16
    ADD             X11, X11, X4

    LD1             {V6.4S}, [X10], X12
    SUB             X11, X11, #14


    REV64           V0.4S, V6.4S
    SQNEG           V0.4S, V0.4S


    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.4S, V1.4S
    REV64           V0.4S, V0.4S
    LD2             {V2.4H, V3.4H}, [X11], X12
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H

    LD2             {V4.4H, V5.4H}, [X1], #16

    UMULL           V23.4S, V1.4H, V3.4H
    UMLSL           V23.4S, V4.4H, V2.4H
    SSHR            V23.4S, V23.4S, #16
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V22.4S, V22.4S, V21.4S
    MOV             V24.16B, V22.16B


    LD1             {V14.4S}, [X10], X12
    UMULL           V23.4S, V1.4H, V3.4H
    UMLSL           V23.4S, V4.4H, V2.4H
    REV64           V8.4S, V14.4S
    SQNEG           V8.4S, V8.4S
    LD2             {V10.4H, V11.4H}, [X11], X12
    SSHR            V23.4S, V23.4S, #16
    LD2             {V12.4H, V13.4H}, [X1], #16
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    UZP1            V9.8H, V8.8H, V8.8H
    UZP2            V8.8H, V8.8H, V8.8H
    rev64           v9.4s, v9.4s
    rev64           v8.4s, v8.4s
    REV64           V10.4H, V10.4H
    REV64           V11.4H, V11.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SUB             X5, X5, #8
    SSHL            V22.4S, V22.4S, V21.4S
    MOV             V24.16B, V22.16B


LOOP_2:


    LD1             {V6.4S}, [X10], X12
    UMULL           V19.4S, V9.4H, V11.4H
    REV64           V0.4S, V6.4S
    SQNEG           V0.4S, V0.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.4S, V1.4S
    REV64           V0.4S, V0.4S
    LD2             {V2.4H, V3.4H}, [X11], X12
    REV64           V2.8H, V2.8H
    REV64           V3.8H, V3.8H

    LD2             {V4.4H, V5.4H}, [X1], #16
    UMLSL           V19.4S, V12.4H, V10.4H
    ST1             {V24.S}[0], [X7], X8
    UMULL           V23.4S, V1.4H, V3.4H
    ST1             {V24.S}[1], [X7], X8
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[2], [X7], X8
    UMLSL           V23.4S, V4.4H, V2.4H
    ST1             {V24.S}[3], [X7], X8
    SMLAL           V19.4S, V8.4H, V11.4H
    LD1             {V14.4S}, [X10], X12
    SSHR            V23.4S, V23.4S, #16
    SMLSL           V19.4S, V13.4H, V10.4H
    LD2             {V10.4H, V11.4H}, [X11], X12
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    REV64           V8.4S, V14.4S
    LD2             {V12.4H, V13.4H}, [X1], #16
    SQNEG           V8.4S, V8.4S
    REV64           V11.4H, V11.4h
    REV64           V10.4H, V10.4H
    SQADD           V18.4S, V19.4S, V20.4S
    UZP1            V9.8H, V8.8H, V8.8H
    UZP2            V8.8H, V8.8H, V8.8H
    rev64           v9.4s, v9.4s
    rev64           v8.4s, v8.4s
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    SUBS            X5, X5, #8
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X7], X8
    SSHL            V22.4S, V22.4S, V21.4S
    ST1             {V16.S}[1], [X7], X8
    MOV             V24.16B, V22.16B

    ST1             {V16.S}[2], [X7], X8
    ST1             {V16.S}[3], [X7], X8

    BGT             LOOP_2

    ST1             {V24.S}[0], [X7], X8
    UMULL           V19.4S, V9.4H, V11.4H
    UMLSL           V19.4S, V12.4H, V10.4H
    ST1             {V24.S}[1], [X7], X8
    ST1             {V24.S}[2], [X7], X8
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[3], [X7], X8

    SMLAL           V19.4S, V8.4H, V11.4H
    SMLSL           V19.4S, V13.4H, V10.4H
    SQADD           V18.4S, V19.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B

    ST1             {V16.S}[0], [X7], X8
    ST1             {V16.S}[1], [X7], X8
    ST1             {V16.S}[2], [X7], X8
    ST1             {V16.S}[3], [X7], X8

    pop_v_regs
    RET