///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_filters_dc.s
//*
//* @brief
//*  contains function definitions for intra prediction dc filtering.
//*  functions are coded in aarch64 (armv8) neon assembly, gnu assembler syntax
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_dc_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*    luma intraprediction filter for dc input
//*
//* @par description:
//*    dc_val = (sum(src[nt .. 2nt-1]) + sum(src[2nt+1 .. 3nt]) + nt)
//*                 >> (log2(nt) + 1)
//*    nt == 32      : every byte of the 32x32 block is set to dc_val
//*    nt == 4,8,16  : the block is set to dc_val except
//*        dst[0]          = (src[2nt-1] + 2*dc_val + src[2nt+1] + 2) >> 2
//*        dst[col]        = (src[2nt+1+col] + 3*dc_val + 2) >> 2   (row 0)
//*        dst[row*strd]   = (src[2nt-1-row] + 3*dc_val + 2) >> 2   (col 0)
//*    only the nt values 4, 8, 16 and 32 have a code path in this file.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this implementation)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  word8 pointer to the planar coefficients
//*  (listed historically; it is not part of the prototype below)
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  type of filtering (not read by this implementation)
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
//                              word32 src_strd,
//                              uword8 *pu1_dst,
//                              word32 dst_strd,
//                              word32 nt,
//                              word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd   (unused)
//x2 => *pu1_dst
//x3 => dst_strd   (32-bit argument, sign-extended on entry)
//x4 => nt         (32-bit argument, sign-extended on entry)
//x5 => mode       (never read; x5 is reused as scratch)
//
//**************register usage inside the function*****************************
//x6        => running pointer into src[nt ..]          (left samples)
//x7        => &src[2nt+1]                              (kept for the filter stage)
//x8        => running pointer into src[2nt+1 ..]       (top samples)
//x9        => &src[2nt-1], later stepped backwards over the left samples
//x10       => loop counter (samples / columns remaining)
//x12       => post-increment that moves from row 7 of a column to row 0 of the next
//x14       => src[2nt+1] + src[2nt-1] + 2
//x19, x20  => saved/restored around the body; only x20 is written
//d7        => -(log2nt + 1), shift count for sshl
//d17       => constant 2
//d18/v16   => dc_val (scalar / replicated to 8 bytes)
//v24       => 3*dc_val + 2 in every 16-bit lane
//v29       => dst[0] value in byte 0
//no callee-saved simd register (v8-v15) is touched.

.text
.align 4
.include "ihevc_neon_macros.s"

.globl ihevc_intra_pred_luma_dc_av8

.type ihevc_intra_pred_luma_dc_av8, %function

ihevc_intra_pred_luma_dc_av8:

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

    stp         x19, x20,[sp,#-16]!

    //dst_strd and nt are 32-bit arguments; the procedure call standard leaves
    //the upper halves of x3/x4 unspecified, and both registers are used below
    //in 64-bit address arithmetic, so widen them first.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt

//********** testing
    //mov        x6, #128
    //b        prologue_cpy_32
//********** testing

    mov         x11, #2                     //mov #2 to x11 (to be used to add to 2dc_val & 3dc_val)
    mov         x9, #0
    mov         v17.s[0], w11
    mov         v17.s[1], w9                //d17 = 2

    clz         w5,w4                       //leading zeros of nt (32-bit)

    add         x6, x0, x4                  //&src[nt]
    sub         x20, x5, #32                //clz(nt) - 32 = -(log2nt + 1)
    neg         x5, x20                     //log2nt + 1
    add         x7, x0, x4, lsl #1          //&src[2nt]

    add         x8, x7, #1                  //&src[2nt+1]
    mvn         x5, x5
    add         x5, x5, #1                  //two's complement: -(log2nt + 1)
    dup         v7.2s,w5

    ldrb        w14, [x8]                   //src[2nt+1]
    sxtw        x14,w14
    shl         d7, d7,#32

    sub         x9, x7, #1                  //&src[2nt-1]
    sshr        d7, d7,#32                  //d7 = sign-extended -(log2nt + 1)

    mov         x7, x8                      //x7 also stores 2nt+1

    ldrb        w12, [x9]                   //src[2nt-1]
    sxtw        x12,w12
    add         x14, x14, x12               //src[2nt+1] + src[2nt-1]
    add         x14, x14, x11               //src[2nt+1] + src[2nt-1] + 2

    cmp         x4, #4
    beq         dc_4                        //4x4 has its own straight-line path

    mov         x10, x4                     //nt

    //sum the left and top reference samples, 8 + 8 bytes per step.
    //each step also pre-loads the next 8 + 8 bytes, so one group beyond the
    //summed range is always fetched.
    //NOTE(review): assumes the reference array is readable that far - confirm.
add_loop:
    ld1         {v0.8b},[x6],#8             //load from src[nt]
    mov         x5, #0                      //
    ld1         {v1.8b},[x8],#8             //load from src[2nt+1]

    uaddlp      v2.4h,  v0.8b

    mov         v6.s[0], w4
    mov         v6.s[1], w5                 //store nt to accumulate

    uaddlp      v3.4h,  v1.8b

    ld1         {v0.8b},[x6],#8             //load from src[nt] (extra load for 8)

    ld1         {v1.8b},[x8],#8             //load from src[2nt+1] (extra load for 8)
    add         v4.4h,  v2.4h ,  v3.4h

    uaddlp      v5.2s,  v4.4h

    uadalp      v6.1d,  v5.2s               //accumulate all inp into d6 (end for nt==8)

    subs        x10, x10,#8
    beq         epil_add_loop

core_loop_add:
    uaddlp      v2.4h,  v0.8b
    subs        x10, x10,#8
    uaddlp      v3.4h,  v1.8b

    add         v4.4h,  v2.4h ,  v3.4h
    ld1         {v0.8b},[x6],#8             //load from src[nt] (extra load for 16)

    uaddlp      v5.2s,  v4.4h
    ld1         {v1.8b},[x8],#8             //load from src[2nt+1] (extra load for 16)

    uadalp      v6.1d,  v5.2s               //accumulate all inp into d6
    bne         core_loop_add

epil_add_loop:

    sshl        d18, d6, d7                 //(dc_val) shr by log2nt+1
    cmp         x4, #32                     //flags stay live down to the csel/beq below

    mov         v28.s[0], w14
    mov         v28.s[1], w5                //src[2nt+1]+2+src[2nt-1] moved to d28 (w5 is 0 here)
    mov         x20,#128
    csel        x6, x20, x6,eq              //x6 is overwritten before any read on the 32x32 path

    dup         v16.8b, v18.b[0]            //dc_val
    shl         d25, d18,#1                 //2*dc

    beq         prologue_cpy_32             //nt == 32: plain fill, no edge filtering

    add         d27,  d25 ,  d28            //src[2nt+1]+2+src[2nt-1]+2dc_val
    mov         x20,#0
    csel        x6, x20, x6,ne              //x6 is not read again on this path

    ushr        v29.4h, v27.4h,#2           //final dst[0]'s value in byte 0 of v29
    csel        x10, x4, x10,ne             //x10 = nt (column counter)

    add         d23,  d25 ,  d18            //3*dc
    sub         x12, x3, x3, lsl #3         //-7*strd

    add         d23,  d23 ,  d17            //3*dc + 2
    add         x12, x12, #8                //offset after one 8x8 block (-7*strd + 8)

    dup         v24.8h, v23.h[0]            //3*dc + 2 (moved to all lanes)
    sub         x0, x3, x4                  //strd - nt

prologue_col:
    //0th column and 0-7 rows done here
    //x8 and x9 (2nt+1+col 2nt-1-row)

    mov         x8, x7                      //&src[2nt+1]

    add         x0, x0, #8                  //strd - nt + 8
    ld1         {v0.8b},[x8],#8             //col 1::7 load (prol)
    sub         x9, x9, #7                  //&src[2nt-1-row]

    ld1         {v1.8b},[x9]                //row 7::1 (0 also) load (prol)
    sub         x9, x9, #8                  //now points at the samples for rows 15::8

    uxtl        v20.8h, v0.8b

    ld1         {v6.8b},[x8]                //col 8::15 load (prol extra)
    add         v20.8h,  v20.8h ,  v24.8h   //col 1::7 add 3dc+2 (prol)

    uxtl        v22.8h, v1.8b
    sqshrun     v2.8b, v20.8h,#2            //columns shx2 movn (prol)

    uxtl        v26.8h, v6.8b
    add         v22.8h,  v22.8h ,  v24.8h   //row 1::7 add 3dc+2 (prol)

    movi        d19, #0x00000000000000ff    //byte mask: take byte 0 from the first bsl source
    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)

    bsl         v19.8b,  v29.8b ,  v2.8b    //first row with dst[0]
    add         v26.8h,  v26.8h ,  v24.8h   //col 8::15 add 3dc+2 (prol extra)

    rev64       v3.8b,  v3.8b               //byte 0 now holds the row-0 value, byte 1 row 1, ...

    st1         {v19.8b},[x2], x3           //store row 0 (prol)
    sshr        d3, d3,#8                   //row 0 shift (prol) (first value to be ignored)

    movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)

    //each step: byte 0 of the row comes from d3 (filtered left sample), the
    //other 7 bytes are dc_val; d3 is then shifted down by one byte.
    //on the second pass (nt == 16) the "row 1..7" comments below mean rows 9..15.
loop_again_col_row:

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 1 (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)
    sshr        d3, d3,#8                   //row 1 shift (prol)

    st1         {v20.8b},[x2], x3           //store row 1 (prol)
    sqshrun     v4.8b, v26.8h,#2            //columns shx2 movn (prol extra)

    bsl         v21.8b,  v3.8b ,  v16.8b    //row 2 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)
    sshr        d3, d3,#8                   //row 2 shift (prol)

    st1         {v21.8b},[x2], x3           //store row 2 (prol)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 3 (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 4 (prol)
    sshr        d3, d3,#8                   //row 3 shift (prol)

    st1         {v20.8b},[x2], x3           //store row 3 (prol)

    bsl         v21.8b,  v3.8b ,  v16.8b    //row 4 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 5 (prol)
    sshr        d3, d3,#8                   //row 4 shift (prol)

    st1         {v21.8b},[x2], x3           //store row 4 (prol)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 5 (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 6 (prol)
    sshr        d3, d3,#8                   //row 5 shift (prol)

    st1         {v20.8b},[x2], x3           //store row 5 (prol)

    ld1         {v1.8b},[x9]                //row 8::15 load (prol extra)

    bsl         v21.8b,  v3.8b ,  v16.8b    //row 6 (prol)

    uxtl        v22.8h, v1.8b

    movi        d20, #0x00000000000000ff    //byte mask row 7 (prol)
    sshr        d3, d3,#8                   //row 6 shift (prol)

    st1         {v21.8b},[x2], x3           //store row 6 (prol)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 7 (prol)
    add         v22.8h,  v22.8h ,  v24.8h   //row 8::15 add 3dc+2 (prol extra)

    sshr        d3, d3,#8                   //row 7 shift (prol)
    st1         {v20.8b},[x2], x12          //store row 7 (prol); x2 moves to row 0 of the next 8 columns

    subs        x10, x10, #8                //counter for cols

    beq         end_func                    //nt == 8: the single 8x8 block is complete
    blt         copy_16                     //second pass of nt == 16 done: fill the last 8x8 block

    //nt == 16, first pass finished: write the top-right 8x8 block, then
    //start the left column of rows 8..15.
    movi        d20, #0x00000000000000ff    //byte mask for row 8
    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)

    rev64       v3.8b,  v3.8b

    st1         {v4.8b},[x2], x3            //store 2nd col (for 16x16): row 0, cols 8..15

    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x0           //go to next row for 16 (row 8, col 0)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 8: byte 0 is the filtered left sample
    subs        x10, x10, #8

    st1         {v20.8b},[x2], x3           //store row 8

    sshr        d3, d3,#8                   //shift row 8 out; byte 0 becomes row 9
    movi        d20, #0x00000000000000ff    //byte mask for row 9

    b           loop_again_col_row

    //bottom-right 8x8 block of a 16x16: plain dc_val
copy_16:
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2]

    b           end_func

    //nt == 32: fill 32 rows of 32 bytes with dc_val, four rows in flight
    //(x2, x5, x8, x10 point at consecutive rows). each row is written as
    //two 16-byte stores; the second store's post-increment (x6) moves the
    //pointer down four rows.
prologue_cpy_32:
    mov         x9, #128                    //counter, decremented by 32 per group of four rows
    //sub        x7, x3, #-24
    add         x5, x2, x3                  //row 1
    add         x8, x5, x3                  //row 2
    add         x10, x8, x3                 //row 3
    dup         v20.16b, v16.b[0]           //dc_val in all 16 bytes
    lsl         x6, x3, #2
    sub         x6, x6, #16                 //4*strd - 16

    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    sub         x9, x9, #32                 //32x32 prol/epil counter dec

kernel_copy:
    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    subs        x9, x9, #32                 //flags consumed by the bne after the next eight stores

    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    bne         kernel_copy

epilogue_copy:
    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2]
    st1         {v20.16b}, [x5]
    st1         {v20.16b}, [x8]
    st1         {v20.16b}, [x10]

    b           end_func

    //nt == 4: both loads fetch 8 bytes; only the first 4 of each contribute
    //to the sum (the upper 32-bit partial sum is masked off below).
dc_4:
    ld1         {v0.8b},[x6],#8             //load from src[nt]
    ld1         {v1.8b},[x8],#8             //load from src[2nt+1]

    uaddlp      v2.4h,  v0.8b
    mov         x5, #0                      //
    mov         v6.s[0], w4
    mov         v6.s[1], w5                 //store nt to accumulate
    uaddlp      v3.4h,  v1.8b

    add         v4.4h,  v2.4h ,  v3.4h

    uaddlp      v5.2s,  v4.4h
    movi        d30, #0x00000000ffffffff

    and         v5.8b,  v5.8b ,  v30.8b     //keep only the sum of the first 4 + 4 samples

    mov         v28.s[0], w14
    mov         v28.s[1], w5                //src[2nt+1]+2+src[2nt-1] moved to d28
    add         d6,  d6 ,  d5               //accumulate all inp into d6 (nt + sum)

    sshl        d18, d6, d7                 //(dc_val) shr by log2nt+1
    mov         x8, x7                      //&src[2nt+1]

    shl         d25, d18,#1                 //2*dc
    sub         x9, x9, #3                  //&src[2nt-1-row]

    dup         v16.8b, v18.b[0]            //dc_val
    add         d27,  d25 ,  d28            //src[2nt+1]+2+src[2nt-1]+2dc_val

    ushr        v29.4h, v27.4h,#2           //final dst[0]'s value in byte 0 of v29
    sub         x12, x3, x3, lsl #2         //-3*strd
    add         d23,  d25 ,  d18            //3*dc

    add         d23,  d23 ,  d17            //3*dc + 2
    add         x12, x12, #4                //offset after one 4x4 block (-3*strd + 4); not read on this path

    dup         v24.8h, v23.h[0]            //3*dc + 2 (moved to all lanes)
    sub         x0, x3, x4                  //strd - nt; not read on this path

    ld1         {v0.8b},[x8]                //col 1::3 load (prol)
    ld1         {v1.8b},[x9]                //row 3::1 (0 also) load (prol)

    uxtl        v20.8h, v0.8b

    uxtl        v22.8h, v1.8b
    add         v20.8h,  v20.8h ,  v24.8h   //col 1::3 add 3dc+2 (prol)

    add         v22.8h,  v22.8h ,  v24.8h   //row 1::3 add 3dc+2 (prol)

    movi        d19, #0x00000000000000ff    //byte mask: take byte 0 from the first bsl source
    sqshrun     v2.8b, v20.8h,#2            //columns shx2 movn (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)
    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)

    bsl         v19.8b,  v29.8b ,  v2.8b    //first row with dst[0]

    rev64       v3.8b,  v3.8b

    st1         {v19.s}[0],[x2], x3         //store row 0 (prol)
    sshr        d3, d3,#40                  //row 0 shift (prol): drops 4 unused bytes + the row-0 byte

    movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 1 (prol)
    sshr        d3, d3,#8                   //row 1 shift (prol)

    st1         {v20.s}[0],[x2], x3         //store row 1 (prol)

    bsl         v21.8b,  v3.8b ,  v16.8b    //row 2 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)

    sshr        d3, d3,#8                   //row 2 shift (prol)
    st1         {v21.s}[0],[x2], x3         //store row 2 (prol)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 3 (prol)
    st1         {v20.s}[0],[x2]             //store row 3 (prol)

epilogue_end:
end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

    ret