@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_planar.s
@*
@* @brief
@*  contains the function definition for chroma planar intra prediction.
@*  the function is hand written arm neon assembly (arm state, gnu assembler
@*  syntax)
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_planar_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma intra prediction filter for planar mode
@*
@* @par description:
@*  every reference sample and every output sample is a 2 byte pair (all
@*  sample indices below are scaled by 2 when turned into byte offsets;
@*  presumably the pair is interleaved cb/cr - confirm against the caller).
@*  for each byte of the pair at position (row, col) the routine computes
@*
@*      dst = ( (row + 1)      * src[nt - 1]
@*            + (nt - 1 - row) * src[2nt + 1 + col]
@*            + (col + 1)      * src[3nt + 1]
@*            + (nt - 1 - col) * src[2nt - 1 - row]
@*            + nt ) >> (log2(nt) + 1)
@*
@*  nt == 4 is handled by a dedicated one-row-per-iteration loop. any other
@*  nt goes through a loop that produces 8 columns by 4 rows per iteration;
@*  when nt == 16 that loop is run a second time for columns 8..15.
@*  nt is therefore expected to be 4, 8 or 16.
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (reference samples)
@*
@* @param[in] src_strd
@*  integer source stride (not read; r1 is reused as the row counter)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] dst_strd
@*  integer destination stride, in bytes
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering (not read by this routine)
@*
@* @param[in] pi1_coeff
@*  planar coefficients (not read by this routine; the weights come from
@*  the external table gau1_ihevc_planar_factor)
@*
@* @returns
@*  none
@*
@* @remarks
@*  d8 - d15 are callee saved under the aapcs and are used here as working
@*  registers, so they are pushed on entry and popped before returning.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_planar_a9q(uword8* pu1_ref,
@                                        word32 src_strd,
@                                        uword8* pu1_dst,
@                                        word32 dst_strd,
@                                        word32 nt,
@                                        word32 mode,
@                                        word32 pi1_coeff)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd         (overwritten with the row counter)
@r2 => *pu1_dst         (running destination pointer)
@r3 => dst_strd
@stack contents from #104 (40 bytes of core regs + 64 bytes of d8 - d15)
@   nt
@   mode
@   pi1_coeff
@
@r4  => nt               (cleared before the second 16x16 pass)
@r6  => walks down from &src[2nt-1], one pair per row
@r8  => 1, r9 => nt-1    (initial row+1 / nt-1-row values)
@r10 => 8 on the 4x4 path, saved &src[2nt-1] on the 8/16 path
@r12 => pointer into gau1_ihevc_planar_factor, starting at entry 1
@r14 => walks up from &src[2nt+1]
@d0  => src[nt-1] pair replicated     d1  => src[3nt+1] pair replicated
@d2  => nt (bytes)                    d3/d4 => src[2nt-1-row] pair replicated
@d5  => row+1      d6 => nt-1-row     d7  => 1
@d8/d9   => col+1 weights, each duplicated for the two bytes of a pair
@d10/d11 => src[2nt+1+col]
@d30/d31 => nt-1-col weights
@q7  => -(log2(nt)+1) (vshl by a negative count is a right shift)
@q8  => nt (rounding term)

.text
.align 4

.globl ihevc_intra_pred_chroma_planar_a9q
.extern gau1_ihevc_planar_factor

@ pc relative offset of the weight table. in arm state pc reads as the
@ address of the current instruction + 8, hence the "- ulbl1 - 8".
gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl1 - 8

.type ihevc_intra_pred_chroma_planar_a9q, %function

ihevc_intra_pred_chroma_planar_a9q:

    stmfd       sp!, {r4-r12, r14}          @save core registers (40 bytes)
    vpush       {d8 - d15}                  @save callee saved neon registers (64 bytes)
    ldr         r4,[sp,#104]                @loads nt (first stack argument)
    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
    add         r11,r11,pc                  @r11 = &gau1_ihevc_planar_factor[0]

    clz         r5, r4
    rsb         r5, r5, #32                 @32 - clz(nt) = log2(nt) + 1 for power of two nt
    vdup.16     q7, r5
    vneg.s16    q7, q7                      @shr value (so vneg)
    vdup.8      d2, r4                      @nt
    vdup.s16    q8, r4                      @nt

    sub         r6, r4, #1                  @nt-1
    add         r6, r0,r6,lsl #1            @2*(nt-1)
    ldr         r7, [r6]
    vdup.s16    d0, r7                      @src[nt-1] (low 16 bits = one pair)

    add         r6, r4, r4,lsl #1           @3nt
    add         r6, r6, #1                  @3nt + 1
    lsl         r6,r6,#1                    @2*(3nt + 1)

    add         r6, r6, r0
    ldr         r7, [r6]
    vdup.s16    d1, r7                      @src[3nt+1]

    add         r6, r4, r4                  @2nt
    add         r14, r6, #1                 @2nt+1
    lsl         r14,#1                      @2*(2nt+1)
    sub         r6, r6, #1                  @2nt-1
    lsl         r6,#1                       @2*(2nt-1)
    add         r6, r6, r0                  @&src[2nt-1]
    add         r14, r14, r0                @&src[2nt+1]

    mov         r8, #1                      @row+1 (row is first 0)
    sub         r9, r4, r8                  @nt-1-row (row is first 0)

    vdup.s8     d5, r8                      @row + 1
    vdup.s8     d6, r9                      @nt - 1 - row
    vmov        d7, d5                      @d7 = 1, used to inc row+1 and dec nt-1-row

    add         r12, r11, #1                @coeffs, starting at table entry 1
    mov         r1, r4                      @nt (row counter) (dec after every row)
    mov         r5, r2                      @copy of dst (not read again in this routine)
    mov         r10, #8                     @post increment used by the 4x4 coeff load
    mov         r0, r14                     @copy of &src[2nt+1] (not read again in this routine)

    cmp         r4, #4
    beq         tf_sz_4

    mov         r10,r6                      @keep &src[2nt-1] for the second 16x16 pass

tf_sz_8_16:
    vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
    vld1.s8     d8, [r12]!                  @load 8 coeffs [col+1]
    vmov        d9,d8
    vzip.8      d8,d9                       @duplicate each coeff for the two bytes of a pair
    vsub.s8     d30, d2, d8                 @[nt-1-col]
    vsub.s8     d31, d2, d9

@ main loop: 4 rows x 8 pairs (16 bytes) per iteration. the four rows are
@ interleaved for scheduling; accumulators are q6/q14 (row 0), q13/q12
@ (row 1), q11/q10 (row 2) and q6/q14 again (row 3).
loop_sz_8_16:

    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
    vmull.u8    q6, d5, d0                  @(row+1) * src[nt-1]
    ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
    vmlal.u8    q6, d6, d10                 @(nt-1-row) * src[2nt+1+col]
    vdup.s16    d4, r7                      @src[2nt-1-row]
    vmlal.u8    q6, d8, d1                  @(col+1) * src[3nt+1]
    vdup.s16    d3, r11                     @src[2nt-1-row]
    vmlal.u8    q6, d30, d4                 @(nt-1-col) * src[2nt-1-row]

    vmull.u8    q14,d5,d0
    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
    vmlal.u8    q14,d6,d11
    vadd.s8     d18, d5, d7                 @row++ [(row+1)++]
    vmlal.u8    q14,d31,d4
    vsub.s8     d19, d6, d7                 @[nt-1-row]--
    vmlal.u8    q14,d9,d1
    vdup.s16    d4, r7                      @src[2nt-1-row]

    vmull.u8    q13, d18, d0                @(row+1) * src[nt-1]
    vadd.i16    q6, q6, q8                  @add (nt)
    vmlal.u8    q13, d19, d10               @(nt-1-row) * src[2nt+1+col]
    vshl.s16    q6, q6, q7                  @shr
    vmlal.u8    q13, d8, d1                 @(col+1) * src[3nt+1]
    vadd.i16    q14,q14,q8
    vmlal.u8    q13, d30, d3                @(nt-1-col) * src[2nt-1-row]
    vshl.s16    q14,q14,q7

    vmull.u8    q12,d18,d0
    vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
    vmlal.u8    q12,d19,d11
    vsub.s8     d6, d19, d7                 @[nt-1-row]--
    vmlal.u8    q12,d9,d1
    vmovn.i16   d12, q6
    vmlal.u8    q12,d31,d3
    vmovn.i16   d13,q14

    vadd.i16    q13, q13, q8                @add (nt)
    vmull.u8    q11, d5, d0                 @(row+1) * src[nt-1]
    vshl.s16    q13, q13, q7                @shr
    vmlal.u8    q11, d6, d10                @(nt-1-row) * src[2nt+1+col]
    vst1.s32    {d12,d13}, [r2], r3         @store row 0, advance dst by dst_strd
    vmlal.u8    q11, d8, d1                 @(col+1) * src[3nt+1]
    vadd.i16    q12,q12,q8
    vmlal.u8    q11, d30, d4                @(nt-1-col) * src[2nt-1-row]
    vshl.s16    q12,q12,q7

    vmull.u8    q10,d5,d0
    vadd.s8     d18, d5, d7                 @row++ [(row+1)++]
    vmlal.u8    q10,d6,d11
    vsub.s8     d19, d6, d7                 @[nt-1-row]--
    vmlal.u8    q10,d31,d4
    ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
    vmlal.u8    q10,d9,d1
    vdup.s16    d3, r11                     @src[2nt-1-row]

    vadd.i16    q11, q11, q8                @add (nt)
    vmull.u8    q6, d18, d0                 @(row+1) * src[nt-1]
    vmovn.i16   d26, q13
    vmlal.u8    q6, d19, d10                @(nt-1-row) * src[2nt+1+col]
    vmovn.i16   d27,q12
    vmlal.u8    q6, d8, d1                  @(col+1) * src[3nt+1]
    vshl.s16    q11, q11, q7                @shr
    vmlal.u8    q6, d30, d3                 @(nt-1-col) * src[2nt-1-row]
    vadd.i16    q10,q10,q8

    vmull.u8    q14,d18,d0
    vst1.s32    {d26,d27}, [r2], r3         @store row 1
    vmlal.u8    q14,d19,d11
    vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
    vsub.s8     d6, d19, d7                 @[nt-1-row]--
    vmlal.u8    q14,d9,d1
    vmlal.u8    q14,d31,d3
    vshl.s16    q10,q10,q7

    vadd.i16    q6, q6 ,q8                  @add (nt)
    vmovn.i16   d22, q11
    vadd.i16    q14,q14,q8
    vmovn.i16   d23,q10
    vshl.s16    q6, q6, q7                  @shr
    vst1.s32    {d22,d23}, [r2], r3         @store row 2
    vshl.s16    q14,q14,q7
    vmovn.i16   d20, q6
    vmovn.i16   d21,q14
    vst1.s32    {d20,d21}, [r2], r3         @store row 3

    subs        r1, r1, #4
    bne         loop_sz_8_16

@ the left 8 columns are done. for nt == 16 rewind and process columns
@ 8..15. r4 is cleared so that the compare below fails after the second
@ pass. none of the instructions between the cmp and the final beq set
@ flags, so the beq is taken whenever this code is reached.
    cmp         r4,#16
    bne         end_loop

    sub         r4,#16                      @r4 = 0 : marks the second pass
    vdup.s8     d5, r8                      @row + 1
    vdup.s8     d6, r9                      @nt - 1 - row
    vmov        d7, d5                      @d7 = 1, used to inc row+1 and dec nt-1-row

    mov         r6,r10                      @back to &src[2nt-1]
    mov         r1,#16                      @row counter
    sub         r2,r2,r3,lsl #4             @dst back up by 16 rows
    add         r2,r2,#16                   @dst forward by 16 bytes (8 pairs)

    vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
    vld1.s8     d8, [r12]!                  @load next 8 coeffs [col+1]
    vmov        d9,d8
    vzip.8      d8,d9                       @duplicate each coeff for the two bytes of a pair
    vsub.s8     d30, d2, d8                 @[nt-1-col]
    vsub.s8     d31, d2, d9

    beq         loop_sz_8_16

tf_sz_4:
    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
    vmov        d9,d8
    vzip.8      d8,d9                       @d8 = coeffs 0..3, each duplicated

loop_sz_4:
    @mov        r10, #4                     @reduce inc to #4 for 4x4
    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
    vdup.s16    d4, r7                      @src[2nt-1-row]

    vsub.s8     d9, d2, d8                  @[nt-1-col]

    vmull.u8    q6, d5, d0                  @(row+1) * src[nt-1]
    vmlal.u8    q6, d6, d10                 @(nt-1-row) * src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(col+1) * src[3nt+1]
    vmlal.u8    q6, d9, d4                  @(nt-1-col) * src[2nt-1-row]
@   vadd.i16    q6, q6, q8                  @add (nt)
@   vshl.s16    q6, q6, q7                  @shr
@   vmovn.i16   d12, q6
    vrshrn.s16  d12,q6,#3                   @rounding narrow shift by 3 = (x + 4) >> 3 for nt == 4
    vst1.s32    {d12}, [r2], r3             @store 4 pairs, advance dst by dst_strd

    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
    vsub.s8     d6, d6, d7                  @[nt-1-row]--
    subs        r1, r1, #1

    bne         loop_sz_4

end_loop:
    vpop        {d8 - d15}                  @restore callee saved neon registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return