/******************************************************************************
 *
 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevc_weighted_pred_neon_intr.c
 *
 * @brief
 *  Contains function definitions for weighted prediction used in inter
 *  prediction
 *
 * @author
 *  Parthiban V
 *
 * @par List of Functions:
 *  - ihevc_weighted_pred_uni_neonintr()
 *  - ihevc_weighted_pred_chroma_uni_neonintr()
 *  - ihevc_weighted_pred_bi_neonintr()
 *  - ihevc_weighted_pred_chroma_bi_neonintr()
 *  - ihevc_weighted_pred_bi_default_neonintr()
 *  - ihevc_weighted_pred_chroma_bi_default_neonintr()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"

/**
 *******************************************************************************
 *
 * @brief
 *  Does uni-weighted prediction on the array pointed to by pi2_src and
 *  stores the result at the location pointed to by pu1_dst.
 *  Assumptions: the function is optimized assuming ht is a multiple of 2
 *  and wd is a multiple of 4.
 *
 * @par Description:
 *  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
 *
 * @param[in] pi2_src
 *  Pointer to the source
 *
 * @param[out] pu1_dst
 *  Pointer to the destination
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] wgt0
 *  weight to be multiplied to the source
 *
 * @param[in] off0
 *  offset to be added after rounding and shifting
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
 *
 * @param[in] lvl_shift
 *  value added to the source before shift and offset
 *
 * @param[in] ht
 *  height of the source
 *
 * @param[in] wd
 *  width of the source
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* i4_tmp1_t and i4_tmp2_t are used to process two rows at a time:     */
    /* the height loop is unrolled, so two rows are computed and stored    */
    /* per iteration.                                                      */
    /* vcombine_u16 is used because narrowing leaves a 16x4 vector, which  */
    /* cannot be saturated and narrowed directly.                          */
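    /* A scalar sketch of one output pixel, equivalent to the vector code  */
    /* below (an illustrative note, not part of the original comments; the */
    /* inline clamp is what the saturating narrows perform):               */
    /*                                                                     */
    /*     WORD32 val = (pi2_src[x] * wgt0 + tmp_lvl_shift) >> shift;      */
    /*     pu1_dst[x] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));   */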
    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;
            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI

/**
 *******************************************************************************
 *
 * @brief
 *  Does chroma uni-weighted prediction on the array pointed to by pi2_src
 *  and stores the result at the location pointed to by pu1_dst.
 *  Assumptions: the function is optimized assuming ht is a multiple of 2
 *  and wd (per chroma component) is a multiple of 2.
 *
 * @par Description:
 *  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
 *
 * @param[in] pi2_src
 *  Pointer to the source
 *
 * @param[out] pu1_dst
 *  Pointer to the destination
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] wgt0_cb
 *  weight to be multiplied to the Cb source
 *
 * @param[in] wgt0_cr
 *  weight to be multiplied to the Cr source
 *
 * @param[in] off0_cb
 *  Cb offset to be added after rounding and shifting
 *
 * @param[in] off0_cr
 *  Cr offset to be added after rounding and shifting
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
 *
 * @param[in] lvl_shift
 *  value added to the source before shift and offset
 *
 * @param[in] ht
 *  height of the source
 *
 * @param[in] wd
 *  width of the source (per chroma component)
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
    int16x4x2_t wgt0;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);

    /* i4_tmp1_t and i4_tmp2_t are used to process two rows at a time:     */
    /* the height loop is unrolled, so two rows are computed and stored    */
    /* per iteration.                                                      */
    /* vcombine_u16 is used because narrowing leaves a 16x4 vector, which  */
    /* cannot be saturated and narrowed directly.                          */
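    /* A scalar sketch of one interleaved Cb/Cr pair, equivalent to the    */
    /* vector code below (illustrative only; even lanes carry Cb and odd   */
    /* lanes Cr, matching the vzip'd weight and offset vectors above;      */
    /* tmp_lvl_shift_cb/_cr name the two folded constants computed above   */
    /* and clip_u8() stands for the clamp to [0, 255] performed by the     */
    /* saturating narrows):                                                */
    /*                                                                     */
    /*     cb = clip_u8((pi2_src[2 * x] * wgt0_cb                          */
    /*                   + tmp_lvl_shift_cb) >> shift);                    */
    /*     cr = clip_u8((pi2_src[2 * x + 1] * wgt0_cr                      */
    /*                   + tmp_lvl_shift_cr) >> shift);                    */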
    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;
            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI

/**
 *******************************************************************************
 *
 * @brief
 *  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
 *  pi2_src2 and stores the result at the location pointed to by pu1_dst.
 *  Assumptions: the function is optimized assuming ht is a multiple of 2
 *  and wd is a multiple of 4.
 *
 * @par Description:
 *  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1
 *         + ((off0 + off1 + 1) << (shift - 1))) >> shift
 *
 * @param[in] pi2_src1
 *  Pointer to source 1
 *
 * @param[in] pi2_src2
 *  Pointer to source 2
 *
 * @param[out] pu1_dst
 *  Pointer to destination
 *
 * @param[in] src_strd1
 *  Source stride 1
 *
 * @param[in] src_strd2
 *  Source stride 2
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] wgt0
 *  weight to be multiplied to source 1
 *
 * @param[in] off0
 *  offset 0
 *
 * @param[in] wgt1
 *  weight to be multiplied to source 2
 *
 * @param[in] off1
 *  offset 1
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
 *
 * @param[in] lvl_shift1
 *  value added to source 1 before shift and offset
 *
 * @param[in] lvl_shift2
 *  value added to source 2 before shift and offset
 *
 * @param[in] ht
 *  height of the source
 *
 * @param[in] wd
 *  width of the source
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* i4_tmp1_t1/t2 and i4_tmp2_t1/t2 are used to process two rows at a   */
    /* time: the height loop is unrolled, so two rows are computed and     */
    /* stored per iteration.                                               */
    /* vcombine_u16 is used because narrowing leaves a 16x4 vector, which  */
    /* cannot be saturated and narrowed directly.                          */
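    /* A scalar sketch of one output pixel, equivalent to the vector code  */
    /* below (illustrative only; clip_u8() stands for the clamp to         */
    /* [0, 255] performed by the saturating narrows):                      */
    /*                                                                     */
    /*     val = (pi2_src1[x] * wgt0 + pi2_src2[x] * wgt1                  */
    /*            + tmp_lvl_shift) >> shift;                               */
    /*     pu1_dst[x] = clip_u8(val);                                      */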
    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI

/**
 *******************************************************************************
 *
 * @brief
 *  Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1
 *  and pi2_src2 and stores the result at the location pointed to by
 *  pu1_dst.
 *  Assumptions: the function is optimized assuming ht is a multiple of 2
 *  and wd (per chroma component) is a multiple of 2.
 *
 * @par Description:
 *  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1
 *         + ((off0 + off1 + 1) << (shift - 1))) >> shift
 *
 * @param[in] pi2_src1
 *  Pointer to source 1
 *
 * @param[in] pi2_src2
 *  Pointer to source 2
 *
 * @param[out] pu1_dst
 *  Pointer to destination
 *
 * @param[in] src_strd1
 *  Source stride 1
 *
 * @param[in] src_strd2
 *  Source stride 2
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] wgt0_cb
 *  weight to be multiplied to Cb of source 1
 *
 * @param[in] wgt0_cr
 *  weight to be multiplied to Cr of source 1
 *
 * @param[in] off0_cb
 *  Cb offset 0
 *
 * @param[in] off0_cr
 *  Cr offset 0
 *
 * @param[in] wgt1_cb
 *  weight to be multiplied to Cb of source 2
 *
 * @param[in] wgt1_cr
 *  weight to be multiplied to Cr of source 2
 *
 * @param[in] off1_cb
 *  Cb offset 1
 *
 * @param[in] off1_cr
 *  Cr offset 1
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
 *
 * @param[in] lvl_shift1
 *  value added to source 1 before shift and offset
 *
 * @param[in] lvl_shift2
 *  value added to source 2 before shift and offset
 *
 * @param[in] ht
 *  height of the source
 *
 * @param[in] wd
 *  width of the source (per chroma component)
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
    int16x4x2_t wgt0, wgt1;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);

    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);

    /* i4_tmp1_t1/t2 and i4_tmp2_t1/t2 are used to process two rows at a   */
    /* time: the height loop is unrolled, so two rows are computed and     */
    /* stored per iteration.                                               */
    /* vcombine_u16 is used because narrowing leaves a 16x4 vector, which  */
    /* cannot be saturated and narrowed directly.                          */
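    /* A scalar sketch of one Cb output sample, equivalent to the vector   */
    /* code below (illustrative only; Cr lanes use the _cr constants, and  */
    /* tmp_lvl_shift_cb names the folded Cb constant computed above):      */
    /*                                                                     */
    /*     val = (pi2_src1[2 * x] * wgt0_cb + pi2_src2[2 * x] * wgt1_cb    */
    /*            + tmp_lvl_shift_cb) >> shift;                            */
    /*     pu1_dst[2 * x] = clip_u8(val);                                  */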
    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);

            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI

/**
 *******************************************************************************
 *
 * @brief
 *  Does default bi-weighted prediction on the arrays pointed to by
 *  pi2_src1 and pi2_src2 and stores the result at the location pointed to
 *  by pu1_dst.
 *  Assumptions: the function is optimized assuming ht is a multiple of 2
 *  and wd is a multiple of 4.
 *
 * @par Description:
 *  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
 *        >> shift, where shift = 15 - BitDepth
 *
 * @param[in] pi2_src1
 *  Pointer to source 1
 *
 * @param[in] pi2_src2
 *  Pointer to source 2
 *
 * @param[out] pu1_dst
 *  Pointer to destination
 *
 * @param[in] src_strd1
 *  Source stride 1
 *
 * @param[in] src_strd2
 *  Source stride 2
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] lvl_shift1
 *  value added to source 1 before shift and offset
 *
 * @param[in] lvl_shift2
 *  value added to source 2 before shift and offset
 *
 * @param[in] ht
 *  height of the source
 *
 * @param[in] wd
 *  width of the source
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* i4_tmp1_t1/t2 and i4_tmp2_t1/t2 are used to process two rows at a   */
    /* time: the height loop is unrolled, so two rows are computed and     */
    /* stored per iteration.                                               */
    /* vcombine_u16 is used because narrowing leaves a 16x4 vector, which  */
    /* cannot be saturated and narrowed directly.                          */
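    /* A scalar sketch of one output pixel, equivalent to the vector code  */
    /* below (illustrative only; clip_u8() stands for the clamp to         */
    /* [0, 255] performed by the saturating narrows):                      */
    /*                                                                     */
    /*     val = ((pi2_src1[x] + lvl_shift1) + (pi2_src2[x] + lvl_shift2)  */
    /*            + tmp_lvl_shift) >> shift;                               */
    /*     pu1_dst[x] = clip_u8(val);                                      */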
    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT

/**
 *******************************************************************************
 *
 * @brief
 *  Does default chroma bi-weighted prediction on the arrays pointed to by
 *  pi2_src1 and pi2_src2 and stores the result at the location pointed to
 *  by pu1_dst.
 *  Assumptions: the function is optimized assuming ht is a multiple of 2
 *  and wd (per chroma component) is a multiple of 2.
 *
 * @par Description:
 *  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
 *        >> shift, where shift = 15 - BitDepth
 *
 * @param[in] pi2_src1
 *  Pointer to source 1
 *
 * @param[in] pi2_src2
 *  Pointer to source 2
 *
 * @param[out] pu1_dst
 *  Pointer to destination
 *
 * @param[in] src_strd1
 *  Source stride 1
 *
 * @param[in] src_strd2
 *  Source stride 2
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] lvl_shift1
 *  value added to source 1 before shift and offset
 *
 * @param[in] lvl_shift2
 *  value added to source 2 before shift and offset
 *
 * @param[in] ht
 *  height of the source
 *
 * @param[in] wd
 *  width of the source (per chroma component)
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* i4_tmp1_t1/t2 and i4_tmp2_t1/t2 are used to process two rows at a   */
    /* time: the height loop is unrolled, so two rows are computed and     */
    /* stored per iteration.                                               */
    /* vcombine_u16 is used because narrowing leaves a 16x4 vector, which  */
    /* cannot be saturated and narrowed directly.                          */
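    /* A scalar sketch of one output sample, equivalent to the vector code */
    /* below (illustrative only; interleaved Cb and Cr samples go through  */
    /* the same computation here, so no per-component constants are        */
    /* needed, and clip_u8() stands for the clamp to [0, 255]):            */
    /*                                                                     */
    /*     val = ((pi2_src1[x] + lvl_shift1) + (pi2_src2[x] + lvl_shift2)  */
    /*            + tmp_lvl_shift) >> shift;                               */
    /*     pu1_dst[x] = clip_u8(val);                                      */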
    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT