///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_copy_w16out_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//* functions are coded using neon intrinsics and can be compiled using
//* rvct
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*  - ihevc_inter_pred_chroma_copy_w16out_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  chroma inter prediction "copy" with 16-bit output
//*
//* //par description:
//*  reads a block of (2 * wd) bytes by 'ht' rows from 'pu1_src', widens every
//*  byte to 16 bits, shifts it left by 6 and stores it to 'pi2_dst':
//*
//*      pi2_dst[row * dst_strd + col] = pu1_src[row * src_strd + col] << 6
//*
//*  for 0 <= col < 2 * wd (see remarks for the rows that are covered).
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride, in bytes
//*
//* //param[in] dst_strd
//*  integer destination stride, in 16-bit elements (it is doubled below to
//*  form a byte offset)
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not read by this function)
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array; each row holds 2 * wd source bytes
//*
//* //returns
//*  nothing
//*
//* //remarks
//*  - the function returns without touching memory when ht <= 0.
//*  - src_strd, dst_strd, ht and wd are 32-bit arguments; they are
//*    sign-extended on entry because AAPCS64 leaves the upper 32 bits of
//*    the argument registers unspecified.
//*  - path selection: when (ht & 6) == 6 or (2 * wd) is not a multiple of 8
//*    the 4-halfword-wide loops are used, otherwise the 8-halfword-wide
//*    software-pipelined loops are used.
//*  - 4-wide path: rows are processed four at a time; if ht is not a
//*    multiple of 4, one extra pass writes two more rows.
//*  - 8-wide path: rows are processed four at a time, or two rows in total
//*    when ht < 4. NOTE(review): no remainder pass exists on this path, so
//*    it presumably expects ht < 4 or ht % 4 == 0 - confirm against callers.
//*  - NOTE(review): the 4-wide loops load 8 source bytes per row and use the
//*    low 4, and the 8-wide prolog always preloads a second 8x4 tile; the
//*    source buffer is assumed to be readable slightly past the block -
//*    confirm against callers.
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
//                                         word16 *pi2_dst,
//                                         word32 src_strd,
//                                         word32 dst_strd,
//                                         word8 *pi1_coeff,
//                                         word32 ht,
//                                         word32 wd)

//**************variables vs registers*****************************************
// ABI: AAPCS64, GNU as syntax
//x0 => *pu1_src
//x1 => *pi2_dst
//x2 => src_strd
//x3 => dst_strd
//x4 => *pi1_coeff (unused, x4 is recycled as the per-row byte counter)
//x5 => ht
//x6 => wd
//
// written: x0-x12, x20 (saved/restored on the stack), v0-v7, v16, v18, v20,
//          v22, v24, v26, condition flags

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_copy_w16out_av8

.type ihevc_inter_pred_chroma_copy_w16out_av8, %function

ihevc_inter_pred_chroma_copy_w16out_av8:

    stp         x19, x20, [sp, #-16]!       // x20 is used as scratch; pair keeps sp 16-byte aligned

    // 32-bit integer arguments: make the full 64-bit registers well defined
    sxtw        x2, w2                      // x2  = src_strd (bytes)
    sxtw        x3, w3                      // x3  = dst_strd (16-bit elements)
    sxtw        x7, w5                      // x7  = ht
    sxtw        x12, w6                     // x12 = wd
    lsl         x12, x12, #1                // x12 = 2 * wd = source bytes per row

    cmp         x7, #0
    ble         end_loops                   // nothing to do for ht <= 0

    and         x8, x7, #3                  // x8 = ht % 4 (leftover rows)
    sub         x9, x7, x8                  // x9 = ht rounded down to a multiple of 4
    and         x11, x7, #6
    cmp         x11, #6
    beq         loop_ht_6                   // (ht & 6) == 6 -> always use the 4-wide path
    tst         x12, #7
    beq         core_loop_wd_8              // row length is a multiple of 8 bytes

//-----------------------------------------------------------------------------
// 4-wide path: each step converts 4 source bytes into 4 halfwords per row
//-----------------------------------------------------------------------------
loop_ht_6:
    sub         x11, x12, #4                // x11 = row bytes - 4, used to rewind to the row start
    lsl         x6, x3, #1                  // x6  = dst stride in bytes
    cmp         x9, #0
    beq         outer_loop_wd_4_ht_2        // fewer than 4 rows: only the 2-row pass runs

outer_loop_wd_4:
    subs        x4, x12, #0                 // x4 = bytes left in the current row group
    ble         end_inner_loop_wd_4

inner_loop_wd_4:                            // 4 columns x 4 rows per iteration
    ld1         {v0.8b}, [x0]               // row 0: 8 bytes loaded, low 4 used
    add         x5, x0, x2                  // x5 -> source row 1
    uxtl        v0.8h, v0.8b                // u8 -> u16
    add         x10, x1, x6                 // x10 -> destination row 1
    subs        x4, x4, #4                  // 4 bytes consumed; flags feed the bgt below
    shl         v0.2d, v0.2d, #6            // << 6 (16-bit lanes hold <= 255, no spill between lanes)
    ld1         {v22.8b}, [x5], x2          // row 1
    add         x0, x0, #4                  // next 4 source bytes
    st1         {v0.1d}, [x1]               // store 4 halfwords of row 0
    add         x1, x1, #8                  // next 4 destination halfwords
    uxtl        v22.8h, v22.8b
    ld1         {v24.8b}, [x5], x2          // row 2
    shl         v22.2d, v22.2d, #6
    uxtl        v24.8h, v24.8b
    st1         {v22.1d}, [x10], x6         // store row 1, step to row 2
    shl         v24.2d, v24.2d, #6
    ld1         {v26.8b}, [x5], x2          // row 3
    st1         {v24.1d}, [x10], x6         // store row 2, step to row 3
    uxtl        v26.8h, v26.8b
    shl         v26.2d, v26.2d, #6
    st1         {v26.1d}, [x10], x6         // store row 3
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        x9, x9, #4                  // 4 rows done
    sub         x0, x5, x11                 // source: start of the next row group
    sub         x1, x10, x11, lsl #1        // destination: start of the next row group
    bgt         outer_loop_wd_4

    cmp         x8, #0
    bgt         outer_loop_wd_4_ht_2        // leftover rows: run the 2-row pass

end_loops:
    ldp         x19, x20, [sp], #16
    ret

outer_loop_wd_4_ht_2:
    subs        x4, x12, #0                 // x4 = bytes left in the row
    ble         end_inner_loop_wd_4

inner_loop_wd_4_ht_2:                       // 4 columns x 2 rows per iteration
    ld1         {v0.8b}, [x0]               // row 0
    add         x5, x0, x2                  // x5 -> source row 1
    uxtl        v0.8h, v0.8b
    add         x10, x1, x6                 // x10 -> destination row 1
    subs        x4, x4, #4                  // flags feed the bgt below
    shl         v0.2d, v0.2d, #6
    ld1         {v22.8b}, [x5], x2          // row 1
    add         x0, x0, #4
    st1         {v0.1d}, [x1]               // store row 0
    add         x1, x1, #8
    uxtl        v22.8h, v22.8b
    shl         v22.2d, v22.2d, #6
    st1         {v22.1d}, [x10], x6         // store row 1
    bgt         inner_loop_wd_4_ht_2
    b           end_loops

//-----------------------------------------------------------------------------
// 8-wide path: 8x4 tiles, software pipelined (prolog / steady state / epilog)
//-----------------------------------------------------------------------------
core_loop_wd_8:
    lsl         x5, x3, #1                  // x5  = dst stride in bytes
    sub         x20, x12, x3, lsl #2
    neg         x11, x20                    // x11 = 4 * dst_strd - 2 * wd (halfwords)
    sub         x20, x12, x2, lsl #2
    neg         x8, x20                     // x8  = 4 * src_strd - 2 * wd (bytes)
    lsr         x4, x12, #3                 // tiles per row group
    mov         x7, x9
    mul         x7, x7, x4                  // x7 = 4 * (number of 8x4 tiles)
    sub         x4, x12, #0                 // x4 = bytes left in the current row group
    sub         x7, x7, #4                  // one tile is drained by the epilog
    cmp         x9, #0
    beq         core_loop_wd_8_ht_2         // fewer than 4 rows: 2-row loop

prolog:
    add         x6, x0, x2                  // x6  -> source row 1
    add         x10, x1, x5                 // x10 -> destination row 1
    ld1         {v1.8b}, [x0], #8           // tile 0, rows 0..3
    ld1         {v3.8b}, [x6], x2
    ld1         {v5.8b}, [x6], x2
    ld1         {v7.8b}, [x6], x2
    uxtl        v16.8h, v1.8b               // u8 -> u16
    uxtl        v18.8h, v3.8b
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b
    subs        x4, x4, #8                  // 8 bytes consumed; 'le' = row group finished
    shl         v0.8h, v16.8h, #6           // << 6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    shl         v6.8h, v22.8h, #6

    add         x20, x0, x8
    csel        x0, x20, x0, le             // row group finished: source jumps down 4 rows
    add         x6, x0, x2                  // x6 -> source row 1 of the next tile

    ld1         {v1.8b}, [x0], #8           // preload the next tile
    ld1         {v3.8b}, [x6], x2
    ld1         {v5.8b}, [x6], x2
    ld1         {v7.8b}, [x6], x2

    st1         {v0.8h}, [x1], #16          // store row 0 of tile 0
    add         x20, x1, x11, lsl #1
    csel        x1, x20, x1, le             // row group finished: destination jumps down 4 rows
    csel        x4, x12, x4, le             // ... and the byte counter is reloaded

    subs        x7, x7, #4                  // one tile accounted for
    blt         epilog_end                  // single tile: only its rows 1..3 remain
    beq         epilog                      // two tiles: drain without the steady-state loop

outer_loop_wd_8:
    st1         {v2.8h}, [x10], x5          // rows 1..3 of the previous tile
    uxtl        v16.8h, v1.8b               // widen the preloaded tile
    st1         {v4.8h}, [x10], x5
    uxtl        v18.8h, v3.8b
    st1         {v6.8h}, [x10], x5
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b

    subs        x4, x4, #8                  // 'le' = row group finished
    add         x20, x0, x8
    csel        x0, x20, x0, le             // source jumps down 4 rows when finished

    add         x6, x0, x2                  // x6 -> source row 1 of the next tile
    ld1         {v1.8b}, [x0], #8           // preload the next tile
    shl         v0.8h, v16.8h, #6
    ld1         {v3.8b}, [x6], x2
    shl         v2.8h, v18.8h, #6
    ld1         {v5.8b}, [x6], x2
    shl         v4.8h, v20.8h, #6
    ld1         {v7.8b}, [x6], x2
    add         x10, x1, x5                 // x10 -> destination row 1 of the current tile
    shl         v6.8h, v22.8h, #6
    st1         {v0.8h}, [x1], #16          // row 0 of the current tile

    add         x20, x1, x11, lsl #1
    csel        x1, x20, x1, le             // destination jumps down 4 rows when finished
    csel        x4, x12, x4, le             // reload the byte counter when finished

    subs        x7, x7, #4
    bgt         outer_loop_wd_8

epilog:
    st1         {v2.8h}, [x10], x5          // rows 1..3 of the previous tile
    uxtl        v16.8h, v1.8b               // widen the last preloaded tile
    st1         {v4.8h}, [x10], x5
    uxtl        v18.8h, v3.8b
    st1         {v6.8h}, [x10], x5
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b

    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    add         x10, x1, x5                 // x10 -> destination row 1 of the last tile
    shl         v6.8h, v22.8h, #6
    st1         {v0.8h}, [x1], #16          // row 0 of the last tile

epilog_end:
    st1         {v2.8h}, [x10], x5          // rows 1..3 of the last tile
    st1         {v4.8h}, [x10], x5
    st1         {v6.8h}, [x10], x5
    b           end_loops

core_loop_wd_8_ht_2:                        // 8 columns x 2 rows per iteration
    add         x6, x0, x2                  // x6  -> source row 1
    add         x10, x1, x5                 // x10 -> destination row 1
    ld1         {v1.8b}, [x0], #8           // row 0
    ld1         {v3.8b}, [x6], x2           // row 1
    uxtl        v16.8h, v1.8b
    uxtl        v18.8h, v3.8b
    subs        x12, x12, #8                // 8 bytes consumed; flags feed the bgt below
    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    st1         {v0.8h}, [x1], #16          // store row 0
    st1         {v2.8h}, [x10], x5          // store row 1
    bgt         core_loop_wd_8_ht_2

    ldp         x19, x20, [sp], #16
    ret