/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_neon_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*  Parthiban V
*
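* @par List of Functions:
*  - ihevc_weighted_pred_uni_neonintr()
*  - ihevc_weighted_pred_chroma_uni_neonintr()
*  - ihevc_weighted_pred_bi_neonintr()
*  - ihevc_weighted_pred_chroma_bi_neonintr()
*  - ihevc_weighted_pred_bi_default_neonintr()
*  - ihevc_weighted_pred_chroma_bi_default_neonintr()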
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"


/**
*******************************************************************************
*
* @brief
*  Uni-directional weighted prediction. Weights the 16-bit samples pointed to
* by pi2_src and stores the rounded, clipped 8-bit result at pu1_dst.
*
* @par Description:
*  dst = clip_u8( ( (src + lvl_shift) * wgt0 + off0 * 2^shift +
* (1 << (shift - 1)) ) >> shift )
*  Four samples of two consecutive rows are produced per inner iteration, so
* wd has to be a multiple of 4 and ht a multiple of 2.
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride (in 16-bit samples)
*
* @param[in] dst_strd
*  Destination stride (in bytes)
*
* @param[in] wgt0
*  Weight to be multiplied to the source (used as a 16-bit value)
*
* @param[in] off0
*  Offset to be added after rounding and shifting
*
* @param[in] shift
*  Right shift applied to the weighted sample; has to be at least 1
*
* @param[in] lvl_shift
*  Added to the source before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias;
    int32x4_t bias_t;
    int32x4_t neg_shift_t;

    /* Fold every sample independent term into a single bias:                 */
    /*   lvl_shift * wgt0 + offset scaled by 2^shift + rounding term          */
    /* The offset is scaled with a multiplication because off0 may be         */
    /* negative and left shifting a negative value is undefined in C.         */
    bias = lvl_shift * wgt0 + off0 * (1 << shift);
    bias += (1 << (shift - 1));
    bias_t = vmovq_n_s32(bias);

    /* vshlq_s32 with a negative count shifts right by 'shift'                */
    neg_shift_t = vmovq_n_s32(0 - shift);

    /* Each inner iteration handles a 4x2 tile (4 samples of 2 rows).         */
    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            WORD16 *pi2_src_nxt = pi2_src + src_strd;
            UWORD8 *pu1_dst_nxt = pu1_dst + dst_strd;
            int16x4_t src_r0, src_r1;
            int32x4_t acc_r0, acc_r1;
            uint16x4_t res16;
            uint8x8_t res8;

            src_r0 = vld1_s16((int16_t *)pi2_src);
            src_r1 = vld1_s16((int16_t *)pi2_src_nxt);

            /* Row 0: weight, add bias, shift right */
            acc_r0 = vmull_n_s16(src_r0, (int16_t)wgt0);
            acc_r0 = vaddq_s32(acc_r0, bias_t);
            acc_r0 = vshlq_s32(acc_r0, neg_shift_t);

            /* Row 1: weight, add bias, shift right */
            acc_r1 = vmull_n_s16(src_r1, (int16_t)wgt0);
            acc_r1 = vaddq_s32(acc_r1, bias_t);
            acc_r1 = vshlq_s32(acc_r1, neg_shift_t);

            /* Saturate s32 -> u16 -> u8. The 4 lane vector is duplicated     */
            /* because the u16 -> u8 narrowing needs a full 8 lane input;     */
            /* only the low 4 bytes are stored.                               */
            res16 = vqmovun_s32(acc_r0);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);

            res16 = vqmovun_s32(acc_r1);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);

            pi2_src += 4;
            pu1_dst += 4;
        }
        /* Rewind the wd samples consumed above and step down two rows */
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI

/**
*******************************************************************************
*
* @brief
*  Chroma uni-directional weighted prediction. Weights the 16-bit samples
* pointed to by pi2_src and stores the rounded, clipped 8-bit result at
* pu1_dst. Even samples of a row use the Cb parameters, odd samples the Cr
* parameters.
*
* @par Description:
*  dst = clip_u8( ( (src + lvl_shift) * wgt0_c + off0_c * 2^shift +
* (1 << (shift - 1)) ) >> shift ), with c = cb for even and c = cr for odd
* samples.
*  2 * wd samples are processed per row, four at a time, over two rows per
* outer iteration, so wd and ht both have to be multiples of 2.
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride (in 16-bit samples)
*
* @param[in] dst_strd
*  Destination stride (in bytes)
*
* @param[in] wgt0_cb
*  Weight to be multiplied to the even (Cb) samples
*
* @param[in] wgt0_cr
*  Weight to be multiplied to the odd (Cr) samples
*
* @param[in] off0_cb
*  Offset added to the even (Cb) samples after rounding and shifting
*
* @param[in] off0_cr
*  Offset added to the odd (Cr) samples after rounding and shifting
*
* @param[in] shift
*  Right shift applied to the weighted sample; has to be at least 1
*
* @param[in] lvl_shift
*  Added to the source before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source; each row holds 2 * wd samples
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias_cb, bias_cr;
    int32x4x2_t bias_zip;
    int16x4x2_t wgt_zip;
    int32x4_t bias_t;
    int16x4_t wgt_t;
    int32x4_t neg_shift_t;

    /* Per component bias: lvl_shift * weight + scaled offset + rounding.     */
    /* The offsets are scaled with a multiplication because they may be       */
    /* negative and left shifting a negative value is undefined in C.         */
    bias_cb = lvl_shift * wgt0_cb + off0_cb * (1 << shift);
    bias_cb += (1 << (shift - 1));

    bias_cr = lvl_shift * wgt0_cr + off0_cr * (1 << shift);
    bias_cr += (1 << (shift - 1));

    /* Interleave to { cb, cr, cb, cr } so that the lanes line up with the    */
    /* alternating samples of a row.                                          */
    bias_zip = vzipq_s32(vmovq_n_s32(bias_cb), vmovq_n_s32(bias_cr));
    bias_t = bias_zip.val[0];

    wgt_zip = vzip_s16(vdup_n_s16((int16_t)wgt0_cb), vdup_n_s16((int16_t)wgt0_cr));
    wgt_t = wgt_zip.val[0];

    /* vshlq_s32 with a negative count shifts right by 'shift'                */
    neg_shift_t = vmovq_n_s32(0 - shift);

    /* Each inner iteration handles 4 samples of 2 rows.                      */
    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            WORD16 *pi2_src_nxt = pi2_src + src_strd;
            UWORD8 *pu1_dst_nxt = pu1_dst + dst_strd;
            int16x4_t src_r0, src_r1;
            int32x4_t acc_r0, acc_r1;
            uint16x4_t res16;
            uint8x8_t res8;

            src_r0 = vld1_s16((int16_t *)pi2_src);
            src_r1 = vld1_s16((int16_t *)pi2_src_nxt);

            /* Row 0: weight, add bias, shift right */
            acc_r0 = vmull_s16(src_r0, wgt_t);
            acc_r0 = vaddq_s32(acc_r0, bias_t);
            acc_r0 = vshlq_s32(acc_r0, neg_shift_t);

            /* Row 1: weight, add bias, shift right */
            acc_r1 = vmull_s16(src_r1, wgt_t);
            acc_r1 = vaddq_s32(acc_r1, bias_t);
            acc_r1 = vshlq_s32(acc_r1, neg_shift_t);

            /* Saturate s32 -> u16 -> u8. The 4 lane vector is duplicated     */
            /* because the u16 -> u8 narrowing needs a full 8 lane input;     */
            /* only the low 4 bytes are stored.                               */
            res16 = vqmovun_s32(acc_r0);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);

            res16 = vqmovun_s32(acc_r1);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);

            pi2_src += 4;
            pu1_dst += 4;
        }
        /* Rewind the 2 * wd samples consumed above and step down two rows */
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI

/**
*******************************************************************************
*
* @brief
*  Bi-directional weighted prediction. Weights the 16-bit samples pointed to
* by pi2_src1 and pi2_src2, adds them and stores the rounded, clipped 8-bit
* result at pu1_dst.
*
* @par Description:
*  dst = clip_u8( ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* (off0 + off1 + 1) * 2^(shift - 1) ) >> shift )
*  Four samples of two consecutive rows are produced per inner iteration, so
* wd has to be a multiple of 4 and ht a multiple of 2.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1 (in 16-bit samples)
*
* @param[in] src_strd2
*  Source stride 2 (in 16-bit samples)
*
* @param[in] dst_strd
*  Destination stride (in bytes)
*
* @param[in] wgt0
*  Weight to be multiplied to source 1 (used as a 16-bit value)
*
* @param[in] off0
*  Offset 0
*
* @param[in] wgt1
*  Weight to be multiplied to source 2 (used as a 16-bit value)
*
* @param[in] off1
*  Offset 1
*
* @param[in] shift
*  Right shift applied to the weighted sum; has to be at least 1
*
* @param[in] lvl_shift1
*  Added to source 1 before weighting
*
* @param[in] lvl_shift2
*  Added to source 2 before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias;
    int32x4_t bias_t;
    int32x4_t neg_shift_t;

    /* Fold every sample independent term into a single bias. The combined    */
    /* offset-plus-rounding term is scaled with a multiplication because      */
    /* (off0 + off1 + 1) may be negative and left shifting a negative value   */
    /* is undefined in C.                                                     */
    bias = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    bias += (off0 + off1 + 1) * (1 << (shift - 1));
    bias_t = vmovq_n_s32(bias);

    /* vshlq_s32 with a negative count shifts right by 'shift'                */
    neg_shift_t = vmovq_n_s32(0 - shift);

    /* Each inner iteration handles a 4x2 tile (4 samples of 2 rows).         */
    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            WORD16 *pi2_src1_nxt = pi2_src1 + src_strd1;
            WORD16 *pi2_src2_nxt = pi2_src2 + src_strd2;
            UWORD8 *pu1_dst_nxt = pu1_dst + dst_strd;
            int16x4_t src1_r0, src1_r1;
            int16x4_t src2_r0, src2_r1;
            int32x4_t acc_r0, acc_r1;
            uint16x4_t res16;
            uint8x8_t res8;

            src1_r0 = vld1_s16((int16_t *)pi2_src1);
            src2_r0 = vld1_s16((int16_t *)pi2_src2);
            src1_r1 = vld1_s16((int16_t *)pi2_src1_nxt);
            src2_r1 = vld1_s16((int16_t *)pi2_src2_nxt);

            /* Row 0: weight both sources, add them and the bias, shift */
            acc_r0 = vmull_n_s16(src1_r0, (int16_t)wgt0);
            acc_r0 = vaddq_s32(acc_r0, vmull_n_s16(src2_r0, (int16_t)wgt1));
            acc_r0 = vaddq_s32(acc_r0, bias_t);
            acc_r0 = vshlq_s32(acc_r0, neg_shift_t);

            /* Row 1: weight both sources, add them and the bias, shift */
            acc_r1 = vmull_n_s16(src1_r1, (int16_t)wgt0);
            acc_r1 = vaddq_s32(acc_r1, vmull_n_s16(src2_r1, (int16_t)wgt1));
            acc_r1 = vaddq_s32(acc_r1, bias_t);
            acc_r1 = vshlq_s32(acc_r1, neg_shift_t);

            /* Saturate s32 -> u16 -> u8. The 4 lane vector is duplicated     */
            /* because the u16 -> u8 narrowing needs a full 8 lane input;     */
            /* only the low 4 bytes are stored.                               */
            res16 = vqmovun_s32(acc_r0);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);

            res16 = vqmovun_s32(acc_r1);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);

            pi2_src1 += 4;
            pi2_src2 += 4;
            pu1_dst += 4;
        }
        /* Rewind the wd samples consumed above and step down two rows */
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI

/**
*******************************************************************************
*
* @brief
*  Chroma bi-directional weighted prediction. Weights the 16-bit samples
* pointed to by pi2_src1 and pi2_src2, adds them and stores the rounded,
* clipped 8-bit result at pu1_dst. Even samples of a row use the Cb
* parameters, odd samples the Cr parameters.
*
* @par Description:
*  dst = clip_u8( ( (src1 + lvl_shift1) * wgt0_c + (src2 + lvl_shift2) *
* wgt1_c + (off0_c + off1_c + 1) * 2^(shift - 1) ) >> shift ), with c = cb
* for even and c = cr for odd samples.
*  2 * wd samples are processed per row, four at a time, over two rows per
* outer iteration, so wd and ht both have to be multiples of 2.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1 (in 16-bit samples)
*
* @param[in] src_strd2
*  Source stride 2 (in 16-bit samples)
*
* @param[in] dst_strd
*  Destination stride (in bytes)
*
* @param[in] wgt0_cb
*  Weight to be multiplied to the even (Cb) samples of source 1
*
* @param[in] wgt0_cr
*  Weight to be multiplied to the odd (Cr) samples of source 1
*
* @param[in] off0_cb
*  Offset 0 for the even (Cb) samples
*
* @param[in] off0_cr
*  Offset 0 for the odd (Cr) samples
*
* @param[in] wgt1_cb
*  Weight to be multiplied to the even (Cb) samples of source 2
*
* @param[in] wgt1_cr
*  Weight to be multiplied to the odd (Cr) samples of source 2
*
* @param[in] off1_cb
*  Offset 1 for the even (Cb) samples
*
* @param[in] off1_cr
*  Offset 1 for the odd (Cr) samples
*
* @param[in] shift
*  Right shift applied to the weighted sum; has to be at least 1
*
* @param[in] lvl_shift1
*  Added to source 1 before weighting
*
* @param[in] lvl_shift2
*  Added to source 2 before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source; each row holds 2 * wd samples
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias_cb, bias_cr;
    int32x4x2_t bias_zip;
    int16x4x2_t wgt0_zip, wgt1_zip;
    int32x4_t bias_t;
    int16x4_t wgt0_t, wgt1_t;
    int32x4_t neg_shift_t;

    /* Per component bias. The combined offset-plus-rounding term is scaled   */
    /* with a multiplication because it may be negative and left shifting a   */
    /* negative value is undefined in C.                                      */
    bias_cb = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    bias_cb += (off0_cb + off1_cb + 1) * (1 << (shift - 1));

    bias_cr = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    bias_cr += (off0_cr + off1_cr + 1) * (1 << (shift - 1));

    /* Interleave to { cb, cr, cb, cr } so that the lanes line up with the    */
    /* alternating samples of a row.                                          */
    bias_zip = vzipq_s32(vmovq_n_s32(bias_cb), vmovq_n_s32(bias_cr));
    bias_t = bias_zip.val[0];

    wgt0_zip = vzip_s16(vdup_n_s16((int16_t)wgt0_cb), vdup_n_s16((int16_t)wgt0_cr));
    wgt0_t = wgt0_zip.val[0];

    wgt1_zip = vzip_s16(vdup_n_s16((int16_t)wgt1_cb), vdup_n_s16((int16_t)wgt1_cr));
    wgt1_t = wgt1_zip.val[0];

    /* vshlq_s32 with a negative count shifts right by 'shift'                */
    neg_shift_t = vmovq_n_s32(0 - shift);

    /* Each inner iteration handles 4 samples of 2 rows.                      */
    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            WORD16 *pi2_src1_nxt = pi2_src1 + src_strd1;
            WORD16 *pi2_src2_nxt = pi2_src2 + src_strd2;
            UWORD8 *pu1_dst_nxt = pu1_dst + dst_strd;
            int16x4_t src1_r0, src1_r1;
            int16x4_t src2_r0, src2_r1;
            int32x4_t acc_r0, acc_r1;
            uint16x4_t res16;
            uint8x8_t res8;

            src1_r0 = vld1_s16((int16_t *)pi2_src1);
            src2_r0 = vld1_s16((int16_t *)pi2_src2);
            src1_r1 = vld1_s16((int16_t *)pi2_src1_nxt);
            src2_r1 = vld1_s16((int16_t *)pi2_src2_nxt);

            /* Row 0: weight both sources, add them and the bias, shift */
            acc_r0 = vmull_s16(src1_r0, wgt0_t);
            acc_r0 = vaddq_s32(acc_r0, vmull_s16(src2_r0, wgt1_t));
            acc_r0 = vaddq_s32(acc_r0, bias_t);
            acc_r0 = vshlq_s32(acc_r0, neg_shift_t);

            /* Row 1: weight both sources, add them and the bias, shift */
            acc_r1 = vmull_s16(src1_r1, wgt0_t);
            acc_r1 = vaddq_s32(acc_r1, vmull_s16(src2_r1, wgt1_t));
            acc_r1 = vaddq_s32(acc_r1, bias_t);
            acc_r1 = vshlq_s32(acc_r1, neg_shift_t);

            /* Saturate s32 -> u16 -> u8. The 4 lane vector is duplicated     */
            /* because the u16 -> u8 narrowing needs a full 8 lane input;     */
            /* only the low 4 bytes are stored.                               */
            res16 = vqmovun_s32(acc_r0);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);

            res16 = vqmovun_s32(acc_r1);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);

            pi2_src1 += 4;
            pi2_src2 += 4;
            pu1_dst += 4;
        }
        /* Rewind the 2 * wd samples consumed above and step down two rows */
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI

/**
*******************************************************************************
*
* @brief
*  Default (unweighted) bi-directional prediction. Averages the 16-bit
* samples pointed to by pi2_src1 and pi2_src2 and stores the rounded, clipped
* 8-bit result at pu1_dst.
*
* @par Description:
*  dst = clip_u8( ( (src1 + lvl_shift1) + (src2 + lvl_shift2) +
* (1 << (shift - 1)) ) >> shift ) where shift = SHIFT_14_MINUS_BIT_DEPTH + 1
*  Four samples of two consecutive rows are produced per inner iteration, so
* wd has to be a multiple of 4 and ht a multiple of 2.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1 (in 16-bit samples)
*
* @param[in] src_strd2
*  Source stride 2 (in 16-bit samples)
*
* @param[in] dst_strd
*  Destination stride (in bytes)
*
* @param[in] lvl_shift1
*  Added to source 1 before the sum (used as a 16-bit value)
*
* @param[in] lvl_shift2
*  Added to source 2 before the sum (used as a 16-bit value)
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift;
    int32x4_t round_t;
    int32x4_t neg_shift_t;
    int16x4_t lvl1_t;
    int16x4_t lvl2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;

    /* Rounding term added before the right shift */
    round_t = vmovq_n_s32(1 << (shift - 1));

    /* vshlq_s32 with a negative count shifts right by 'shift' */
    neg_shift_t = vmovq_n_s32(0 - shift);

    lvl1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* Each inner iteration handles a 4x2 tile (4 samples of 2 rows). */
    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            WORD16 *pi2_src1_nxt = pi2_src1 + src_strd1;
            WORD16 *pi2_src2_nxt = pi2_src2 + src_strd2;
            UWORD8 *pu1_dst_nxt = pu1_dst + dst_strd;
            int16x4_t src1_r0, src1_r1;
            int16x4_t src2_r0, src2_r1;
            int32x4_t acc_r0, acc_r1;
            uint16x4_t res16;
            uint8x8_t res8;

            src1_r0 = vld1_s16((int16_t *)pi2_src1);
            src2_r0 = vld1_s16((int16_t *)pi2_src2);
            src1_r1 = vld1_s16((int16_t *)pi2_src1_nxt);
            src2_r1 = vld1_s16((int16_t *)pi2_src2_nxt);

            /* Row 0: widen while adding the level shifts, sum, round, shift */
            acc_r0 = vaddl_s16(src1_r0, lvl1_t);
            acc_r0 = vaddq_s32(acc_r0, vaddl_s16(src2_r0, lvl2_t));
            acc_r0 = vaddq_s32(acc_r0, round_t);
            acc_r0 = vshlq_s32(acc_r0, neg_shift_t);

            /* Row 1: widen while adding the level shifts, sum, round, shift */
            acc_r1 = vaddl_s16(src1_r1, lvl1_t);
            acc_r1 = vaddq_s32(acc_r1, vaddl_s16(src2_r1, lvl2_t));
            acc_r1 = vaddq_s32(acc_r1, round_t);
            acc_r1 = vshlq_s32(acc_r1, neg_shift_t);

            /* Saturate s32 -> u16 -> u8. The 4 lane vector is duplicated     */
            /* because the u16 -> u8 narrowing needs a full 8 lane input;     */
            /* only the low 4 bytes are stored.                               */
            res16 = vqmovun_s32(acc_r0);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);

            res16 = vqmovun_s32(acc_r1);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);

            pi2_src1 += 4;
            pi2_src2 += 4;
            pu1_dst += 4;
        }
        /* Rewind the wd samples consumed above and step down two rows */
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT

/**
*******************************************************************************
*
* @brief
*  Chroma default (unweighted) bi-directional prediction. Averages the 16-bit
* samples pointed to by pi2_src1 and pi2_src2 and stores the rounded, clipped
* 8-bit result at pu1_dst.
*
* @par Description:
*  dst = clip_u8( ( (src1 + lvl_shift1) + (src2 + lvl_shift2) +
* (1 << (shift - 1)) ) >> shift ) where shift = SHIFT_14_MINUS_BIT_DEPTH + 1
*  2 * wd samples are processed per row, four at a time, over two rows per
* outer iteration, so wd and ht both have to be multiples of 2.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1 (in 16-bit samples)
*
* @param[in] src_strd2
*  Source stride 2 (in 16-bit samples)
*
* @param[in] dst_strd
*  Destination stride (in bytes)
*
* @param[in] lvl_shift1
*  Added to source 1 before the sum (used as a 16-bit value)
*
* @param[in] lvl_shift2
*  Added to source 2 before the sum (used as a 16-bit value)
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source; each row holds 2 * wd samples
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift;
    int32x4_t round_t;
    int32x4_t neg_shift_t;
    int16x4_t lvl1_t;
    int16x4_t lvl2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;

    /* Rounding term added before the right shift */
    round_t = vmovq_n_s32(1 << (shift - 1));

    /* vshlq_s32 with a negative count shifts right by 'shift' */
    neg_shift_t = vmovq_n_s32(0 - shift);

    lvl1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* Each inner iteration handles 4 samples of 2 rows. */
    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            WORD16 *pi2_src1_nxt = pi2_src1 + src_strd1;
            WORD16 *pi2_src2_nxt = pi2_src2 + src_strd2;
            UWORD8 *pu1_dst_nxt = pu1_dst + dst_strd;
            int16x4_t src1_r0, src1_r1;
            int16x4_t src2_r0, src2_r1;
            int32x4_t acc_r0, acc_r1;
            uint16x4_t res16;
            uint8x8_t res8;

            src1_r0 = vld1_s16((int16_t *)pi2_src1);
            src2_r0 = vld1_s16((int16_t *)pi2_src2);
            src1_r1 = vld1_s16((int16_t *)pi2_src1_nxt);
            src2_r1 = vld1_s16((int16_t *)pi2_src2_nxt);

            /* Row 0: widen while adding the level shifts, sum, round, shift */
            acc_r0 = vaddl_s16(src1_r0, lvl1_t);
            acc_r0 = vaddq_s32(acc_r0, vaddl_s16(src2_r0, lvl2_t));
            acc_r0 = vaddq_s32(acc_r0, round_t);
            acc_r0 = vshlq_s32(acc_r0, neg_shift_t);

            /* Row 1: widen while adding the level shifts, sum, round, shift */
            acc_r1 = vaddl_s16(src1_r1, lvl1_t);
            acc_r1 = vaddq_s32(acc_r1, vaddl_s16(src2_r1, lvl2_t));
            acc_r1 = vaddq_s32(acc_r1, round_t);
            acc_r1 = vshlq_s32(acc_r1, neg_shift_t);

            /* Saturate s32 -> u16 -> u8. The 4 lane vector is duplicated     */
            /* because the u16 -> u8 narrowing needs a full 8 lane input;     */
            /* only the low 4 bytes are stored.                               */
            res16 = vqmovun_s32(acc_r0);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);

            res16 = vqmovun_s32(acc_r1);
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);

            pi2_src1 += 4;
            pi2_src2 += 4;
            pu1_dst += 4;
        }
        /* Rewind the 2 * wd samples consumed above and step down two rows */
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT