///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_inter_pred_chroma_copy.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*  Functions are coded in NEON assembly.
//*
//* @author
//*  Yogeswaran RS
//*
//* @par List of Functions:
//*  - ihevc_inter_pred_chroma_copy_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  Chroma inter prediction filter for copy
//*
//* @par Description:
//*  Copies the array of width 'wd' and height 'ht' from the location pointed
//*  by 'src' to the location pointed by 'dst'
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  WORD8 pointer to the filter coefficients
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_copy( UWORD8 *pu1_src,
//                                   UWORD8 *pu1_dst,
//                                   WORD32 src_strd,
//                                   WORD32 dst_strd,
//                                   WORD8 *pi1_coeff,
//                                   WORD32 ht,
//                                   WORD32 wd)
//**************Variables Vs Registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 => src_strd
//x3 => dst_strd
//x4 => *pi1_coeff
//x5 => ht
//x6 => wd

.text
.align 4

.globl ihevc_inter_pred_chroma_copy_av8

.type ihevc_inter_pred_chroma_copy_av8, %function

ihevc_inter_pred_chroma_copy_av8:
    LSL         x12,x6,#1                   //wd << 1
    CMP         x5,#0                       //checks ht == 0
    BLE         END_LOOPS
    AND         x8,x5,#3                    //ht & 3: remainder rows (ht may not be a multiple of 4)
    SUB         x5,x5,x8                    //ht rounded down to a multiple of 4
    TST         x12,#15                     //checks wd for multiple of 16
    BEQ         CORE_LOOP_WD_16
    TST         x12,#7                      //checks wd for multiples of 4 & 8
    BEQ         CORE_LOOP_WD_8
    SUB         x11,x12,#4
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_4_HT_2

OUTER_LOOP_WD_4:
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_INNER_LOOP_WD_4

INNER_LOOP_WD_4:
    LD1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x0,x0,#4                    //pu1_src += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    SUBS        x4,x4,#4                    //(wd - 4)
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x1,x1,#4                    //pu1_dst += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    BGT         INNER_LOOP_WD_4

END_INNER_LOOP_WD_4:
    SUBS        x5,x5,#4                    //ht - 4
    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
    BGT         OUTER_LOOP_WD_4
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_4_HT_2

END_LOOPS:
    RET

OUTER_LOOP_WD_4_HT_2:
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_LOOPS

INNER_LOOP_WD_4_HT_2:
    LD1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x0,x0,#4                    //pu1_src += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    SUBS        x4,x4,#4                    //(wd - 4)
    ADD         x1,x1,#4                    //pu1_dst += 4
    BGT         INNER_LOOP_WD_4_HT_2
    B           END_LOOPS

CORE_LOOP_WD_8:
    SUB         x11,x12,#8
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_8_HT_2

OUTER_LOOP_WD_8:
    SUBS        x4,x12,#0                   //checks wd
    BLE         END_INNER_LOOP_WD_8

INNER_LOOP_WD_8:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v1.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#8                    //wd - 8 (loop condition)
    LD1         {v2.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v2.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v3.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v3.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    BGT         INNER_LOOP_WD_8

END_INNER_LOOP_WD_8:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
    BGT         OUTER_LOOP_WD_8
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_8_HT_2
    B           END_LOOPS

OUTER_LOOP_WD_8_HT_2:
    SUBS        x4,x12,#0                   //checks wd
    BLE         END_LOOPS

INNER_LOOP_WD_8_HT_2:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v1.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    B           END_LOOPS

CORE_LOOP_WD_16:
    SUB         x11,x12,#16
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_16_HT_2

OUTER_LOOP_WD_16:
    SUBS        x4,x12,#0                   //checks wd
    BLE         END_INNER_LOOP_WD_16

INNER_LOOP_WD_16:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#16                   //wd - 16 (loop condition)
    LD1         {v2.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v2.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v3.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v3.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    BGT         INNER_LOOP_WD_16

END_INNER_LOOP_WD_16:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
    BGT         OUTER_LOOP_WD_16
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_16_HT_2
    B           END_LOOPS

OUTER_LOOP_WD_16_HT_2:
    SUBS        x4,x12,#0                   //checks wd
    BLE         END_LOOPS

INNER_LOOP_WD_16_HT_2:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    RET
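
//*****************************************************************************
//A minimal C sketch of what this routine computes, for reference only. It is
//an assumption reconstructed from the prototype and register comments above,
//not code taken from this project, and the function name is hypothetical.
//Each of the 'ht' rows copies 2*wd bytes (the LSL at entry doubles wd,
//consistent with interleaved Cb/Cr chroma samples), and pi1_coeff is unused
//in the copy case. UWORD8/WORD8/WORD32 stand for the unsigned char/signed
//char/int typedefs implied by the prototype.
//
//void inter_pred_chroma_copy_ref(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//                                WORD32 src_strd, WORD32 dst_strd,
//                                WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
//{
//    WORD32 row, col;
//    (void)pi1_coeff;                         //no filtering in the copy variant
//    for(row = 0; row < ht; row++)
//    {
//        for(col = 0; col < (wd << 1); col++) //2 bytes per interleaved Cb/Cr pair
//            pu1_dst[col] = pu1_src[col];
//        pu1_src += src_strd;                 //advance one source row
//        pu1_dst += dst_strd;                 //advance one destination row
//    }
//}
//*****************************************************************************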