///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/


// push_v_regs: spill q8-q15 and x8-x13, x16-x17, x20-x23 to the stack.
// Allocates one 224-byte frame (4 * 32 bytes of q pairs + 6 * 16 bytes of
// x pairs) and leaves sp pointing at its base.  Frame layout, relative to
// the new sp:
//   +0   x20,x21   +16  x16,x17   +32  x22,x23
//   +48  x12,x13   +64  x10,x11   +80  x8,x9
//   +96  q14,q15   +128 q12,q13   +160 q10,q11   +192 q8,q9
// pop_v_regs must be used to undo this macro.
.macro push_v_regs
    sub             sp, sp, #224
    stp             x20, x21, [sp]
    stp             x16, x17, [sp, #16]
    stp             x22, x23, [sp, #32]
    stp             x12, x13, [sp, #48]
    stp             x10, x11, [sp, #64]
    stp             x8, x9, [sp, #80]
    stp             q14, q15, [sp, #96]
    stp             q12, q13, [sp, #128]
    stp             q10, q11, [sp, #160]
    stp             q8, q9, [sp, #192]
.endm
// pop_v_regs: reload every register spilled by push_v_regs and release the
// 224-byte frame.  Offsets are relative to sp as left by push_v_regs:
//   +0   x20,x21   +16  x16,x17   +32  x22,x23
//   +48  x12,x13   +64  x10,x11   +80  x8,x9
//   +96  q14,q15   +128 q12,q13   +160 q10,q11   +192 q8,q9
.macro pop_v_regs
    ldp             x20, x21, [sp]
    ldp             x16, x17, [sp, #16]
    ldp             x22, x23, [sp, #32]
    ldp             x12, x13, [sp, #48]
    ldp             x10, x11, [sp, #64]
    ldp             x8, x9, [sp, #80]
    ldp             q14, q15, [sp, #96]
    ldp             q12, q13, [sp, #128]
    ldp             q10, q11, [sp, #160]
    ldp             q8, q9, [sp, #192]
    add             sp, sp, #224
.endm

// swp reg1, reg2: exchange the contents of two registers.
// Clobbers x16, which is used as the temporary; neither operand may be x16.
// The temporary is moved with 64-bit MOVs, so the operands are expected to
// be X registers.
// NOTE(review): this macro is not invoked anywhere in the visible part of
// this file.
.macro swp reg1, reg2
    MOV             X16, \reg1      // x16 = reg1 (saved copy)
    MOV             \reg1, \reg2    // reg1 = reg2
    MOV             \reg2, x16      // reg2 = original reg1
.endm
.text
.global ixheaacd_pretwiddle_compute_armv8

// ixheaacd_pretwiddle_compute_armv8
//
// Combines two 32-bit input streams with 16-bit coefficient pairs taken from
// a table, applies a common shift, and writes interleaved result pairs to
// both ends of one output buffer.
//
// Register usage on entry, as established by the instructions below (the C
// prototype is not visible in this file, so meanings are inferred from use):
//   x0 : pointer to 32-bit input words, consumed at ascending addresses
//   x1 : pointer to 32-bit input words, consumed in blocks at descending
//        addresses
//   x2 : output pointer, written at ascending addresses
//   x3 : table base; pairs of 16-bit coefficients are read from x3 + 7500
//   x4 : count; x2 + 16 * x4 is the upper end of the output region, which is
//        filled downwards through x7
//   w5 : exponent; it is negated on entry and the negated value is the shift
//        applied to every result (left if positive, arithmetic right if not)
//
// Structure: one scalar element (ARM_PROLOGUE), then a software-pipelined
// NEON section handling 8 words from x0 and 8 words from x1 per block
// (NEON_PROLOGUE / CORE_LOOP / NEON_EPILOGUE), then a final block built from
// the last 7 words of each input (RESIDUE_NEON).
//
// NOTE(review): x4 is consumed as a full 64-bit register (LSL/CMP/SUB/ASR on
// x4).  If the C prototype declares this argument as a 32-bit integer,
// AAPCS64 does not guarantee the contents of the upper 32 bits - confirm
// against the caller.
// NOTE(review): CORE_LOOP is a do-while loop; ((x4 - 1) >> 2) - 2 must be at
// least 1 on entry to it or the counter wraps - confirm the range of x4.
// NOTE(review): the LD4/LD2/UZP splitting of 32-bit words into 16-bit halves
// described below assumes a little-endian target.
ixheaacd_pretwiddle_compute_armv8:

    push_v_regs

    LSL             x7, x4, #4          // x7 = x4 * 16
    ADD             x7, x2, x7          // x7 = x2 + 16 * x4
    SUB             x7, x7, #4          // x7 = x2 + 16 * x4 - 4
    MOV             x22, #7500
    ADD             x3, x3, x22         // x3 -> coefficient pairs at byte offset 7500
    MVN             w5, w5
    ADD             w5, w5, #1          // w5 = -w5 (two's complement negate)




// Scalar handling of the first element: one word from x0, one word from x1,
// one coefficient pair; produces two output words at x2.
ARM_PROLOGUE:
    LDRH            w21, [x3]           // first 16-bit coefficient of the pair
    LDRH            w22, [x3, #2]       // second 16-bit coefficient of the pair
    LSL             w22, w22, #16       // move coefficient into the top half
    LSL             w21, w21, #16       // move coefficient into the top half

    LDR             w8, [x3], #4        // advances x3 by 4; w8 is overwritten before any use
    LDR             w9, [x0], #4        // a = *x0, x0 += 4











// Each SMULL/ASR #32 pair keeps the high 32 bits of a signed 32x32 product.

    SMULL           X12, w9, w21
    ASR             X12, x12, #32       // w12 = hi32(a * c1)
    LDR             w10, [x1], #-4      // b = *x1, x1 -= 4
    SMULL           X11, w9, w22
    ASR             X11, x11, #32       // w11 = hi32(a * c2)
    SMULL           X23, w10, w22
    ASR             X23, x23, #32       // w23 = hi32(b * c2)
    ADD             w9, w12, w23
    SMULL           X6, w10, w21
    ASR             X6, x6, #32         // w6 = hi32(b * c1)


    MVN             w9, w9
    ADD             w9, w9, #1          // w9 = -(hi32(a*c1) + hi32(b*c2))
    SUB             w11, w11, w6        // w11 = hi32(a*c2) - hi32(b*c1)
    CMP             w5, #0
    BGT             NEXT                // negated exponent > 0: shift left
    MVN             w8, w5
    ADD             w8, w8, #1          // w8 = -w5 (non-negative shift count)
    ASR             w11, w11, w8
    ASR             w9, w9, w8
    B               NEXT1

NEXT:
    LSL             w11, w11, w5
    LSL             w9, w9, w5



NEXT1:
    STR             w9, [x2], #4        // first output pair goes to the front of the buffer
    STR             w11, [x2], #4

    // Select the byte stride between coefficient pairs: 4 (consecutive pairs)
    // when x4 == 0x100, otherwise 32 (every 8th pair, x3 moved to x3 + 32 in total).
    CMP             X4, #0x100
    BNE             NXT
    MOV             X6, #4
    B               NXT1
NXT:
    MOV             X6, #32
    ADD             X3, X3, #28

NXT1:
    SUB             X4, X4, #1
    ASR             X4, X4, #2          // x4 = (x4 - 1) >> 2
    SUB             x7, x7, #28         // x7 = x2_entry + 16 * x4_entry - 32: last 32-byte output slot













// Pipeline fill: load and compute the first 8-word block, and load the
// operands of the second block while doing so.

NEON_PROLOGUE:

    MOV             x8, #-32            // post-index step for the descending pointers (x1, x7)

    dup             v14.4s, w5          // per-lane shift count for SSHL (negative = right shift)

    SUB             X1, X1, #28         // x1 -> lowest address of the next 8-word block

    // Gather four coefficient pairs, x6 bytes apart:
    // v8 = first halfwords, v9 = second halfwords.
    LD2             {v8.h, v9.h}[0], [x3], x6
    LD2             {v8.h, v9.h}[1], [x3], x6
    LD2             {v8.h, v9.h}[2], [x3], x6
    LD2             {v8.h, v9.h}[3], [x3], x6

    rev64           v10.4h, v8.4h       // v10 = v8 with its four 16-bit lanes reversed
    rev64           v11.4h, v9.4h       // v11 = v9 with its four 16-bit lanes reversed

    // 8 words from x0 split by LD4 into 16-bit halves: v0/v1 = low/high halves
    // of words 0,2,4,6; v2/v3 = low/high halves of words 1,3,5,7.
    LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32

    // Same split for 8 words at x1 (v4/v5 = words 0,2,4,6; v6/v7 = words
    // 1,3,5,7), then step x1 down by 32 bytes.
    LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8

    rev64           v0.4h, v0.4h        // reverse lane order of the even-word halves
    rev64           v1.4h, v1.4h
    rev64           v4.4h, v4.4h
    rev64           v5.4h, v5.4h





    // Product pattern used throughout: (UMULL low-half * coef) >> 16, then
    // SMLAL adds signed high-half * coef.
    // NOTE(review): UMULL treats the coefficient as unsigned; presumably the
    // table values are below 0x8000 - confirm.

    uMULL           v30.4s, v2.4h, v9.4h
    uMULL           v28.4s, v4.4h, v9.4h
    uMULL           v26.4s, v2.4h, v8.4h
    uMULL           v24.4s, v4.4h, v8.4h

    ushR            v30.4s, v30.4s, #16
    ushR            v28.4s, v28.4s, #16
    ushR            v26.4s, v26.4s, #16
    ushR            v24.4s, v24.4s, #16

    sMLAL           v30.4s, v3.4h, v9.4h    // v30 = (v3:v2) * v9
    sMLAL           v28.4s, v5.4h, v9.4h    // v28 = (v5:v4) * v9
    sMLAL           v26.4s, v3.4h, v8.4h    // v26 = (v3:v2) * v8
    sMLAL           v24.4s, v5.4h, v8.4h    // v24 = (v5:v4) * v8

    ADD             v28.4s, v26.4s , v28.4s
    NEG             v28.4s, v28.4s          // v28 = -(v26 + v28)
    SUB             v30.4s, v30.4s , v24.4s // v30 = v30 - v24

    uMULL           v22.4s, v0.4h, v11.4h
    uMULL           v20.4s, v6.4h, v11.4h
    uMULL           v18.4s, v0.4h, v10.4h
    uMULL           v16.4s, v6.4h, v10.4h

    ushR            v22.4s, v22.4s, #16
    ushR            v20.4s, v20.4s, #16
    ushR            v18.4s, v18.4s, #16
    ushR            v16.4s, v16.4s, #16

    // From here the loads for the next block are interleaved with the
    // arithmetic; v8/v9 are free to be overwritten, v10/v11 are still in use.
    sMLAL           v22.4s, v1.4h, v11.4h   // v22 = (v1:v0) * v11
    LD2             {v8.h, v9.h}[0], [x3], x6

    sMLAL           v20.4s, v7.4h, v11.4h   // v20 = (v7:v6) * v11
    LD2             {v8.h, v9.h}[1], [x3], x6

    sMLAL           v18.4s, v1.4h, v10.4h   // v18 = (v1:v0) * v10
    LD2             {v8.h, v9.h}[2], [x3], x6

    sMLAL           v16.4s, v7.4h, v10.4h   // v16 = (v7:v6) * v10
    LD2             {v8.h, v9.h}[3], [x3], x6

    ADD             v20.4s, v20.4s , v18.4s

    NEG             v20.4s, v20.4s          // v20 = -(v20 + v18)
    rev64           v10.4h, v8.4h
    rev64           v11.4h, v9.4h
    SUB             v22.4s, v16.4s , v22.4s // v22 = v16 - v22
    LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32



    sshL            v20.4s, v20.4s, v14.4s
    LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8

    rev64           v0.4h, v0.4h
    rev64           v1.4h, v1.4h
    sshL            v22.4s, v22.4s, v14.4s

    rev64           v4.4h, v4.4h
    rev64           v5.4h, v5.4h
    sshL            v18.4s, v30.4s, v14.4s  // v18 = shifted v30 (second word of each front pair)


    sshL            v16.4s, v28.4s, v14.4s  // v16 = shifted v28 (first word of each front pair)








    SUB             X4, X4, #2              // loop count = ((x4 - 1) >> 2) - 2

// Steady state: store the block computed in the previous step, compute the
// block whose operands are already loaded, and load the operands of the block
// after that.
CORE_LOOP:
    uMULL           v30.4s, v2.4h, v9.4h
    MOV             v17.16B, v18.16B        // ST2 needs consecutive registers: pair v16 with v17
    ST2             { v16.4s, v17.4s}, [x2] // interleaved store to the front: v16[i], v17[i]
    ADD             x2, x2, #32
    uMULL           v28.4s, v4.4h, v9.4h

    uMULL           v26.4s, v2.4h, v8.4h
    MOV             v21.16B, v22.16B        // pair v20 with v21 for ST2
    ST2             { v20.4s, v21.4s}, [x7], x8 // interleaved store to the back; x7 -= 32
    uMULL           v24.4s, v4.4h, v8.4h

    ushR            v30.4s, v30.4s, #16
    ushR            v28.4s, v28.4s, #16
    ushR            v26.4s, v26.4s, #16
    ushR            v24.4s, v24.4s, #16

    sMLAL           v30.4s, v3.4h, v9.4h
    sMLAL           v28.4s, v5.4h, v9.4h
    sMLAL           v26.4s, v3.4h, v8.4h
    sMLAL           v24.4s, v5.4h, v8.4h

    ADD             v28.4s, v26.4s , v28.4s
    NEG             v28.4s, v28.4s          // v28 = -(v26 + v28)
    SUB             v30.4s, v30.4s , v24.4s // v30 = v30 - v24

    // v8/v9 are no longer needed for this block: refill them with the next
    // four coefficient pairs while v10/v11 (reversed copies) are still used.
    uMULL           v22.4s, v0.4h, v11.4h
    LD2             {v8.h, v9.h}[0], [x3], x6
    uMULL           v20.4s, v6.4h, v11.4h

    uMULL           v18.4s, v0.4h, v10.4h
    LD2             {v8.h, v9.h}[1], [x3], x6
    uMULL           v16.4s, v6.4h, v10.4h

    ushR            v22.4s, v22.4s, #16
    LD2             {v8.h, v9.h}[2], [x3], x6
    ushR            v20.4s, v20.4s, #16


    ushR            v18.4s, v18.4s, #16
    LD2             {v8.h, v9.h}[3], [x3], x6
    ushR            v16.4s, v16.4s, #16

    sMLAL           v22.4s, v1.4h, v11.4h

    sMLAL           v20.4s, v7.4h, v11.4h


    sMLAL           v18.4s, v1.4h, v10.4h


    sMLAL           v16.4s, v7.4h, v10.4h
    LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32    // next 8 words from x0
    ADD             v20.4s, v20.4s , v18.4s

    NEG             v20.4s, v20.4s          // v20 = -(v20 + v18)
    rev64           v10.4h, v8.4h           // reversed coefficients for the next block
    rev64           v11.4h, v9.4h

    SUB             v22.4s, v16.4s , v22.4s // v22 = v16 - v22
    LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8     // next 8 words from x1; x1 -= 32
    sshL            v20.4s, v20.4s, v14.4s


    sshL            v22.4s, v22.4s, v14.4s

    rev64           v0.4h, v0.4h
    rev64           v1.4h, v1.4h
    sshL            v18.4s, v30.4s, v14.4s

    rev64           v4.4h, v4.4h
    rev64           v5.4h, v5.4h
    sshL            v16.4s, v28.4s, v14.4s


    SUBS            x4, x4, #1
    BNE             CORE_LOOP





// Pipeline drain: store the last block computed in the loop, then compute and
// store the block whose operands were loaded by the final loop iteration.
// No new operands are loaded here.
NEON_EPILOGUE:
    uMULL           v30.4s, v2.4h, v9.4h
    MOV             v17.16B, v18.16B
    ST2             { v16.4s, v17.4s}, [x2]
    ADD             x2, x2, #32
    uMULL           v28.4s, v4.4h, v9.4h

    uMULL           v26.4s, v2.4h, v8.4h
    MOV             v21.16B, v22.16B

    ST2             { v20.4s, v21.4s}, [x7], x8
    uMULL           v24.4s, v4.4h, v8.4h

    ushR            v30.4s, v30.4s, #16
    ushR            v28.4s, v28.4s, #16
    ushR            v26.4s, v26.4s, #16
    ushR            v24.4s, v24.4s, #16

    sMLAL           v30.4s, v3.4h, v9.4h
    sMLAL           v28.4s, v5.4h, v9.4h
    sMLAL           v26.4s, v3.4h, v8.4h
    sMLAL           v24.4s, v5.4h, v8.4h

    ADD             v28.4s, v26.4s , v28.4s
    NEG             v28.4s, v28.4s
    SUB             v30.4s, v30.4s , v24.4s

    uMULL           v22.4s, v0.4h, v11.4h
    uMULL           v20.4s, v6.4h, v11.4h
    uMULL           v18.4s, v0.4h, v10.4h
    uMULL           v16.4s, v6.4h, v10.4h

    ushR            v22.4s, v22.4s, #16
    ushR            v20.4s, v20.4s, #16
    ushR            v18.4s, v18.4s, #16
    ushR            v16.4s, v16.4s, #16

    sMLAL           v22.4s, v1.4h, v11.4h
    sMLAL           v20.4s, v7.4h, v11.4h
    sMLAL           v18.4s, v1.4h, v10.4h
    sMLAL           v16.4s, v7.4h, v10.4h

    ADD             v20.4s, v20.4s , v18.4s
    NEG             v20.4s, v20.4s
    SUB             v22.4s, v16.4s , v22.4s


    sshL            v20.4s, v20.4s, v14.4s
    sshL            v22.4s, v22.4s, v14.4s
    sshL            v18.4s, v30.4s, v14.4s
    sshL            v16.4s, v28.4s, v14.4s
    MOV             v17.16B, v18.16B
    ST2             { v16.4s, v17.4s}, [x2]
    ADD             x2, x2, #32
    MOV             v21.16B, v22.16B
    ST2             { v20.4s, v21.4s}, [x7], x8


// Final block: only 7 words remain in each input, so the operands are
// assembled lane by lane with the missing lane left at zero, then run through
// the same arithmetic as a full block.
RESIDUE_NEON:
    MOV             x10, #-16               // x10 is not read again in this function
    movi            v3.2s, #0x00000000      // zero the lanes that receive no data
    movi            v4.2s, #0x00000000

    LD2             {v21.2s, v22.2s}, [x0], #16 // v21 = words 0,2; v22 = words 1,3
    MOV             v0.8B, v21.8B
    MOV             v2.8B, v22.8B

    LD1             {v1.s}[0], [x0], #4;    // word 4
    LD1             {v3.s}[0], [x0], #4;    // word 5 (v3 lane 1 stays zero)
    LD1             {v1.s}[1], [x0]         // word 6
    MOV             v21.8B, v0.8B

    // Split into 16-bit halves, matching the LD4 layout of a full block.
    UZP1            v0.4h, v21.4h, v1.4h    // low halves of words 0,2,4,6
    UZP2            v1.4h, v21.4h, v1.4h    // high halves of words 0,2,4,6
    MOV             v21.8B, v2.8B
    UZP1            v2.4h, v21.4h, v3.4h    // low halves of words 1,3,5 and a zero lane
    UZP2            v3.4h, v21.4h, v3.4h    // high halves of words 1,3,5 and a zero lane

    ADD             x1, x1, #4              // skip the slot of the missing eighth word

    LD1             {v6.s}[0], [x1], #4
    LD1             {v4.s}[1], [x1], #4     // v4 lane 0 stays zero
    LD1             {v6.s}[1], [x1], #4


    LD2             {v21.2s, v22.2s}, [x1], #16
    MOV             v5.8B, v21.8B
    MOV             v7.8B, v22.8B


    MOV             v21.8B, v4.8B
    UZP1            v4.4h, v21.4h, v5.4h    // low halves of the v4:v5 words
    UZP2            v5.4h, v21.4h, v5.4h    // high halves of the v4:v5 words
    MOV             v21.8B, v6.8B
    UZP1            v6.4h, v21.4h, v7.4h    // low halves of the v6:v7 words
    UZP2            v7.4h, v21.4h, v7.4h    // high halves of the v6:v7 words
    rev64           v0.4h, v0.4h
    rev64           v1.4h, v1.4h
    rev64           v4.4h, v4.4h
    rev64           v5.4h, v5.4h

    // Last four coefficient pairs.
    LD2             {v8.h, v9.h}[0], [x3], x6
    LD2             {v8.h, v9.h}[1], [x3], x6
    LD2             {v8.h, v9.h}[2], [x3], x6
    LD2             {v8.h, v9.h}[3], [x3], x6

    rev64           v10.4h, v8.4h
    rev64           v11.4h, v9.4h



    uMULL           v30.4s, v2.4h, v9.4h
    uMULL           v28.4s, v4.4h, v9.4h
    uMULL           v26.4s, v2.4h, v8.4h
    uMULL           v24.4s, v4.4h, v8.4h

    ushR            v30.4s, v30.4s, #16
    ushR            v28.4s, v28.4s, #16
    ushR            v26.4s, v26.4s, #16
    ushR            v24.4s, v24.4s, #16

    sMLAL           v30.4s, v3.4h, v9.4h
    sMLAL           v28.4s, v5.4h, v9.4h
    sMLAL           v26.4s, v3.4h, v8.4h
    sMLAL           v24.4s, v5.4h, v8.4h

    ADD             v28.4s, v26.4s , v28.4s
    NEG             v28.4s, v28.4s
    SUB             v30.4s, v30.4s , v24.4s

    uMULL           v22.4s, v0.4h, v11.4h
    uMULL           v20.4s, v6.4h, v11.4h
    uMULL           v18.4s, v0.4h, v10.4h
    uMULL           v16.4s, v6.4h, v10.4h

    ushR            v22.4s, v22.4s, #16
    ushR            v20.4s, v20.4s, #16
    ushR            v18.4s, v18.4s, #16
    ushR            v16.4s, v16.4s, #16

    sMLAL           v22.4s, v1.4h, v11.4h
    sMLAL           v20.4s, v7.4h, v11.4h
    sMLAL           v18.4s, v1.4h, v10.4h
    sMLAL           v16.4s, v7.4h, v10.4h

    ADD             v20.4s, v20.4s , v18.4s
    NEG             v20.4s, v20.4s
    SUB             v22.4s, v16.4s , v22.4s



    sshL            v20.4s, v20.4s, v14.4s
    sshL            v22.4s, v22.4s, v14.4s
    sshL            v18.4s, v30.4s, v14.4s
    sshL            v16.4s, v28.4s, v14.4s
    MOV             v21.16B, v22.16B
    ST2             { v20.4s, v21.4s}, [x7] // back half: all four pairs (32 bytes)
    mov             v17.16B, v18.16B
    ST2             {v16.2s, v17.2s}, [x2]  // front half: pairs 0 and 1 (16 bytes)
    ADD             x2, x2, #16

    ST2             {v16.s, v17.s}[2], [x2] // front half: pair 2 only; pair 3 is not stored
    ADD             x2, x2, #8






END1:
    pop_v_regs
    ret