//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  ih264_ihadamard_scaling_av8.s
// *
// * @brief
// *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
// *  of 16x16 intra-prediction
// *
// * @author
// *  Mohit
// *
// * @par List of Functions:
// *  - ih264_ihadamard_scaling_4x4_av8()
// *
// * @remarks
// *  None
// *
.include "ih264_neon_macros.s"

// *******************************************************************************
// */
// * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
// * of a 16x16 intra prediction macroblock, and then performs scaling.
// *
// * @par Description:
// *  The DC coefficients pass through a 2-stage inverse hadamard transform.
// *  The inverse transformed content is then scaled based on the Qp value.
// *
// * @param[in] pi2_src
// *  input 4x4 block of DC coefficients
// *
// * @param[out] pi2_out
// *  output 4x4 block
// *
// * @param[in] pu2_iscal_mat
// *  pointer to scaling list
// *
// * @param[in] pu2_weigh_mat
// *  pointer to weight matrix
// *
// * @param[in] u4_qp_div_6
// *  Floor (qp/6)
// *
// * @param[in] pi4_tmp
// * temporary buffer of size 1*16
// *
// * @returns none
// *
// * @remarks none
// *
// *******************************************************************************
// */
// *
// *******************************************************************************
// */
// void ih264_ihadamard_scaling_4x4(word16* pi2_src,
//        word16* pi2_out,
//        const uword16 *pu2_iscal_mat,
//        const uword16 *pu2_weigh_mat,
//        uword32 u4_qp_div_6,
//        word32* pi4_tmp)
//**************variables vs registers*****************************************
//x0 => *pi2_src
//x1 => *pi2_out
//x2 => *pu2_iscal_mat
//x3 => *pu2_weigh_mat
//x4=>   u4_qp_div_6

.text
.p2align 2

    .global ih264_ihadamard_scaling_4x4_av8
ih264_ihadamard_scaling_4x4_av8:

// Inverse 4x4 Hadamard transform of a block of sixteen 16-bit DC coefficients,
// followed by scaling. All intermediate arithmetic is done in 32-bit lanes.
//
// Inputs:
//   x0 = pi2_src        (16 x 16-bit coefficients, row-major)
//   x1 = pi2_out        (16 x 16-bit results, row-major)
//   x2 = pu2_iscal_mat  (only element [0] is read)
//   x3 = pu2_weigh_mat  (only element [0] is read)
//   w4 = u4_qp_div_6
//   x5 (pi4_tmp) is never referenced by this implementation.
//
// Per output element:
//   out = sat16((((h * iscal[0] * weigh[0]) << u4_qp_div_6) + 32) >> 6)
// where h is the Hadamard-transformed value; the "+32 >> 6" and the
// saturation both come from the single sqrshrn #6 at the end.
//
// v14/v15 are used as scratch below; push_v_regs/pop_v_regs are macros from
// ih264_neon_macros.s (definition not visible here - presumably they
// save/restore the callee-saved SIMD registers).
    push_v_regs

//=======================scale factor and shift count==============================

    ld1       {v16.h}[0], [x2]          // lane 0 = pu2_iscal_mat[0]
    ld1       {v15.h}[0], [x3]          // lane 0 = pu2_weigh_mat[0]
    dup       v14.4s, w4                // every lane = u4_qp_div_6 (shift count for sshl)

    umull     v15.4s, v15.4h, v16.4h    // only lane 0 of this product is meaningful
    dup       v15.4s, v15.s[0]          // every lane = pu2_weigh_mat[0]*pu2_iscal_mat[0]

//=======================stage 1: butterfly along each row=========================

    ld4       {v0.4h-v3.4h}, [x0]       // de-interleave: v0..v3 = columns 0..3, lane i = row i

    saddl     v4.4s, v0.4h, v3.4h       // a = col0 + col3   (widened to 32 bit)
    ssubl     v7.4s, v0.4h, v3.4h       // d = col0 - col3
    saddl     v5.4s, v1.4h, v2.4h       // b = col1 + col2
    ssubl     v6.4s, v1.4h, v2.4h       // c = col1 - col2

    add       v0.4s, v4.4s, v5.4s       // t0 = a + b
    sub       v2.4s, v4.4s, v5.4s       // t2 = a - b
    add       v1.4s, v7.4s, v6.4s       // t1 = d + c
    sub       v3.4s, v7.4s, v6.4s       // t3 = d - c

//=======================4x4 transpose of 32-bit lanes=============================
// before: v0..v3 hold t0..t3, one row per lane
// after : v0..v3 hold rows 0..3, one t-value per lane

    trn1      v4.4s, v0.4s, v1.4s       // even lanes of (v0, v1)
    trn2      v5.4s, v0.4s, v1.4s       // odd  lanes of (v0, v1)
    trn1      v6.4s, v2.4s, v3.4s       // even lanes of (v2, v3)
    trn2      v7.4s, v2.4s, v3.4s       // odd  lanes of (v2, v3)

    trn1      v0.2d, v4.2d, v6.2d       // row 0
    trn1      v1.2d, v5.2d, v7.2d       // row 1
    trn2      v2.2d, v4.2d, v6.2d       // row 2
    trn2      v3.2d, v5.2d, v7.2d       // row 3

//=======================stage 2: butterfly across the rows========================

    add       v4.4s, v0.4s, v3.4s       // a = row0 + row3
    sub       v7.4s, v0.4s, v3.4s       // d = row0 - row3
    add       v5.4s, v1.4s, v2.4s       // b = row1 + row2
    sub       v6.4s, v1.4s, v2.4s       // c = row1 - row2

    add       v0.4s, v4.4s, v5.4s       // output row 0 = a + b
    sub       v2.4s, v4.4s, v5.4s       // output row 2 = a - b
    add       v1.4s, v7.4s, v6.4s       // output row 1 = d + c
    sub       v3.4s, v7.4s, v6.4s       // output row 3 = d - c

//=======================scaling===================================================

    mul       v0.4s, v0.4s, v15.4s      // row 0 * (weigh[0]*iscal[0])
    mul       v1.4s, v1.4s, v15.4s      // row 1 * (weigh[0]*iscal[0])
    mul       v2.4s, v2.4s, v15.4s      // row 2 * (weigh[0]*iscal[0])
    mul       v3.4s, v3.4s, v15.4s      // row 3 * (weigh[0]*iscal[0])

    sshl      v0.4s, v0.4s, v14.4s      // row 0 << u4_qp_div_6
    sshl      v1.4s, v1.4s, v14.4s      // row 1 << u4_qp_div_6
    sshl      v2.4s, v2.4s, v14.4s      // row 2 << u4_qp_div_6
    sshl      v3.4s, v3.4s, v14.4s      // row 3 << u4_qp_div_6

    sqrshrn   v0.4h, v0.4s, #6          // row 0: (x + 32) >> 6, saturated to 16 bit
    sqrshrn   v1.4h, v1.4s, #6          // row 1: (x + 32) >> 6, saturated to 16 bit
    sqrshrn   v2.4h, v2.4s, #6          // row 2: (x + 32) >> 6, saturated to 16 bit
    sqrshrn   v3.4h, v3.4s, #6          // row 3: (x + 32) >> 6, saturated to 16 bit

    st1       {v0.4h-v3.4h}, [x1]       // write the 16 results (32 bytes), row-major

    pop_v_regs
    ret


// *******************************************************************************
// */
// * @brief This function performs a 2x2 inverse hadamard transform for chroma block
// *
// * @par Description:
// *  The DC coefficients pass through a 2-stage inverse hadamard transform.
// *  The inverse transformed content is then scaled based on the Qp value.
// *  The DC blocks of both the U and V planes are processed.
// *
// * @param[in] pi2_src
// *  input 1x8 block of coeffs. The first 4 are from U and the next 4 from V
// *
// * @param[out] pi2_out
// *  output 1x8 block
// *
// * @param[in] pu2_iscal_mat
// *  pointer to scaling list
// *
// * @param[in] pu2_weigh_mat
// *  pointer to weight matrix
// *
// * @param[in] u4_qp_div_6
// *  Floor (qp/6)
// *
// * @returns none
// *
// * @remarks none
// *
// *******************************************************************************
// */
// *
// *******************************************************************************
// */
// void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
//                                  WORD16* pi2_out,
//                                  const UWORD16 *pu2_iscal_mat,
//                                  const UWORD16 *pu2_weigh_mat,
//                                  UWORD32 u4_qp_div_6,
//                                  WORD32* pi4_tmp)

    .global ih264_ihadamard_scaling_2x2_uv_av8
ih264_ihadamard_scaling_2x2_uv_av8:

// 2x2 inverse Hadamard transform plus scaling of two groups of four 16-bit DC
// coefficients (the U group followed by the V group), done in one pass.
//
//Registers used
//   x0 : *pi2_src        (8 x 16-bit values are read)
//   x1 : *pi2_out        (8 x 16-bit values = 16 bytes are written)
//   x2 : *pu2_iscal_mat  (only element [0] is read)
//   x3 : *pu2_weigh_mat  (only element [0] is read)
//   x4 : u4_qp_div_6     (w4 is overwritten with u4_qp_div_6 - 5)
//
// Each result is (h * pu2_iscal_mat[0] * pu2_weigh_mat[0]) shifted by
// (u4_qp_div_6 - 5): sshl shifts left for a positive count and arithmetically
// right (no rounding) for a negative one. The 32-bit result is then truncated
// to 16 bits by xtn (no saturation).
//
// push_v_regs/pop_v_regs are macros from ih264_neon_macros.s (definition not
// visible here - presumably they save/restore the callee-saved SIMD registers).
    push_v_regs
    ld1       {v26.h}[0], [x2]          //lane 0 = pu2_iscal_mat[0]
    ld1       {v27.h}[0], [x3]          //lane 0 = pu2_weigh_mat[0]

    sub       w4, w4, #5                //qp/6 - 5
    dup       v28.4s, w4                //every lane = signed shift count for sshl

    ld2       {v0.4h, v1.4h}, [x0]      //load 8 dc coeffs, de-interleaved
                                        //i2_x4,i2_x6,i2_y4,i2_y6 -> v0
                                        //i2_x5,i2_x7,i2_y5,i2_y7 -> v1

    saddl     v2.4s, v0.4h, v1.4h       //i4_x0 = i4_x4 + i4_x5;...x2, y0, y2
    ssubl     v4.4s, v0.4h, v1.4h       //i4_x1 = i4_x4 - i4_x5;...x3, y1, y3

    umull     v30.4s, v26.4h, v27.4h    //pu2_iscal_mat[0]*pu2_weigh_mat[0] (lane 0 only is meaningful)
    dup       v30.4s, v30.s[0]          //broadcast the scale factor to every lane

    trn1      v0.4s, v2.4s, v4.4s       //i4_x0 i4_x1 i4_y0 i4_y1
    trn2      v1.4s, v2.4s, v4.4s       //i4_x2 i4_x3 i4_y2 i4_y3

    add       v2.4s, v0.4s, v1.4s       //i4_x4 = i4_x0+i4_x2;.. i4_x5, i4_y4, i4_y5
    sub       v3.4s, v0.4s, v1.4s       //i4_x6 = i4_x0-i4_x2;.. i4_x7, i4_y6, i4_y7

    mul       v2.4s, v2.4s, v30.4s      //apply the scale factor
    mul       v3.4s, v3.4s, v30.4s

    sshl      v2.4s, v2.4s, v28.4s      //shift by (qp/6 - 5)
    sshl      v3.4s, v3.4s, v28.4s

    xtn       v0.4h, v2.4s              //i4_x4 i4_x5 i4_y4 i4_y5 (upper 64 bits of v0 cleared)
    xtn       v1.4h, v3.4s              //i4_x6 i4_x7 i4_y6 i4_y7 (upper 64 bits of v1 cleared)

    // Only the low 64 bits of v0/v1 carry results, so interleave 2 x 32-bit
    // lanes from each: x4 x5 x6 x7 y4 y5 y6 y7 = exactly 16 bytes. Storing
    // .4s lanes here would write 32 bytes, i.e. 16 zero bytes beyond the
    // 8-element output block.
    st2       {v0.2s, v1.2s}, [x1]
    pop_v_regs
    ret