/****************************************************************************** * * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ***************************************************************************** * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore */ /** ******************************************************************************* * @file * ihevce_decomp_pre_intra_pass_neon.c * * @brief * Contains functions to perform input scaling * * @author * Ittiam * * @par List of Functions: * * @remarks * None * ******************************************************************************** */ /*****************************************************************************/ /* File Includes */ /*****************************************************************************/ /* System include files */ #include <stdio.h> #include <string.h> #include <assert.h> #include <arm_neon.h> /* User include files */ #include "ihevc_typedefs.h" #include "ihevc_macros.h" #include "ihevc_platform_macros.h" #include "itt_video_api.h" #include "ihevc_defs.h" #include "ihevc_cmn_utils_neon.h" #include "ihevce_ipe_instr_set_router.h" /*****************************************************************************/ /* Function Definitions */ /*****************************************************************************/ void ihevce_scaling_filter_mxn( UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_scrtch, WORD32 scrtch_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 ht, WORD32 wd) { #define FILT_TAP_Q 8 #define N_TAPS 7 const WORD16 i4_ftaps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 }; WORD32 i, j; WORD32 tmp; UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd; UWORD8 *pu1_scrtch_tmp = pu1_scrtch; /* horizontal filtering */ for(i = -3; i < ht + 2; i++) { for(j = 0; j < wd; j += 2) { tmp = (i4_ftaps[3] * pu1_src_tmp[j] + i4_ftaps[2] * (pu1_src_tmp[j - 1] + pu1_src_tmp[j + 1]) + i4_ftaps[1] * (pu1_src_tmp[j + 2] + pu1_src_tmp[j - 2]) + i4_ftaps[0] * (pu1_src_tmp[j + 3] + pu1_src_tmp[j - 3]) + (1 << (FILT_TAP_Q - 1))) >> FILT_TAP_Q; pu1_scrtch_tmp[j >> 1] = CLIP_U8(tmp); } pu1_scrtch_tmp += scrtch_strd; pu1_src_tmp += src_strd; } /* vertical filtering */ pu1_scrtch_tmp = pu1_scrtch + 3 * scrtch_strd; for(i = 0; i < ht; i += 2) { for(j = 0; j < (wd >> 1); j++) { tmp = (i4_ftaps[3] * pu1_scrtch_tmp[j] + i4_ftaps[2] * (pu1_scrtch_tmp[j + scrtch_strd] + pu1_scrtch_tmp[j - scrtch_strd]) + i4_ftaps[1] * (pu1_scrtch_tmp[j + 2 * scrtch_strd] + pu1_scrtch_tmp[j - 2 * scrtch_strd]) + i4_ftaps[0] * (pu1_scrtch_tmp[j + 3 * scrtch_strd] + pu1_scrtch_tmp[j - 3 * scrtch_strd]) + (1 << (FILT_TAP_Q - 1))) >> FILT_TAP_Q; pu1_dst[j] = CLIP_U8(tmp); } pu1_dst += dst_strd; pu1_scrtch_tmp += (scrtch_strd << 1); } } void ihevce_scale_by_2_neon( UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht, UWORD8 *pu1_wkg_mem, WORD32 ht_offset, WORD32 block_ht, WORD32 wd_offset, WORD32 block_wd, FT_COPY_2D *pf_copy_2d) { #define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1)) UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ]; UWORD32 cpy_strd = MAX_BLK_SZ; UWORD8 *pu1_cpy = au1_cpy + cpy_strd * (N_TAPS >> 1) + (N_TAPS >> 1); UWORD8 *pu1_in, *pu1_out; WORD32 in_strd, wkg_mem_strd; WORD32 row_start, row_end; WORD32 col_start, col_end; WORD32 i, fun_select; WORD32 ht_tmp, wd_tmp; FT_SCALING_FILTER_BY_2 *ihevce_scaling_filters[2]; assert((wd & 1) == 0); assert((ht & 1) == 0); assert(block_wd <= MAX_CTB_SIZE); assert(block_ht <= MAX_CTB_SIZE); /* function pointers for filtering different dimensions */ ihevce_scaling_filters[0] = ihevce_scaling_filter_mxn; ihevce_scaling_filters[1] = ihevce_scaling_filter_mxn_neon; /* handle boundary blks */ col_start = (wd_offset < (N_TAPS >> 1)) ? 1 : 0; row_start = (ht_offset < (N_TAPS >> 1)) ? 1 : 0; col_end = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1))) ? 1 : 0; row_end = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1))) ? 1 : 0; if(col_end && (wd % block_wd != 0)) { block_wd = (wd % block_wd); } if(row_end && (ht % block_ht != 0)) { block_ht = (ht % block_ht); } /* boundary blks needs to be padded, copy src to tmp buffer */ if(col_start || col_end || row_end || row_start) { UWORD8 *pu1_src_tmp = pu1_src + wd_offset + ht_offset * src_strd; pu1_cpy -= (3 * (1 - col_start) + cpy_strd * 3 * (1 - row_start)); pu1_src_tmp -= (3 * (1 - col_start) + src_strd * 3 * (1 - row_start)); ht_tmp = block_ht + 3 * (1 - row_start) + 3 * (1 - row_end); wd_tmp = block_wd + 3 * (1 - col_start) + 3 * (1 - col_end); pf_copy_2d(pu1_cpy, cpy_strd, pu1_src_tmp, src_strd, wd_tmp, ht_tmp); pu1_in = au1_cpy + cpy_strd * 3 + 3; in_strd = cpy_strd; } else { pu1_in = pu1_src + wd_offset + ht_offset * src_strd; in_strd = src_strd; } /*top padding*/ if(row_start) { UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3; pu1_cpy = au1_cpy + cpy_strd * (3 - 1); memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); pu1_cpy -= cpy_strd; memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); pu1_cpy -= cpy_strd; memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); } /*bottom padding*/ if(row_end) { UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3 + (block_ht - 1) * cpy_strd; pu1_cpy = pu1_cpy_tmp + cpy_strd; memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); pu1_cpy += cpy_strd; memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); pu1_cpy += cpy_strd; memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); } /*left padding*/ if(col_start) { UWORD8 *pu1_cpy_tmp = au1_cpy + 3; pu1_cpy = au1_cpy; for(i = 0; i < block_ht + 6; i++) { pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0]; pu1_cpy += cpy_strd; pu1_cpy_tmp += cpy_strd; } } /*right padding*/ if(col_end) { UWORD8 *pu1_cpy_tmp = au1_cpy + 3 + block_wd - 1; pu1_cpy = au1_cpy + 3 + block_wd; for(i = 0; i < block_ht + 6; i++) { pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0]; pu1_cpy += cpy_strd; pu1_cpy_tmp += cpy_strd; } } wkg_mem_strd = block_wd >> 1; pu1_out = pu1_dst + (wd_offset >> 1); fun_select = (block_wd % 16 == 0); ihevce_scaling_filters[fun_select]( pu1_in, in_strd, pu1_wkg_mem, wkg_mem_strd, pu1_out, dst_strd, block_ht, block_wd); /* Left padding of 16 for 1st block of every row */ if(wd_offset == 0) { UWORD8 u1_val; WORD32 pad_wd = 16; WORD32 pad_ht = block_ht >> 1; UWORD8 *dst = pu1_dst; for(i = 0; i < pad_ht; i++) { u1_val = dst[0]; memset(&dst[-pad_wd], u1_val, pad_wd); dst += dst_strd; } } if(wd == wd_offset + block_wd) { /* Right padding of (16 + (CEIL16(wd/2))-wd/2) for last block of every row */ /* Right padding is done only after processing of last block of that row is done*/ UWORD8 u1_val; WORD32 pad_wd = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4; WORD32 pad_ht = block_ht >> 1; UWORD8 *dst = pu1_dst + (wd >> 1) - 1; for(i = 0; i < pad_ht; i++) { u1_val = dst[0]; memset(&dst[1], u1_val, pad_wd); dst += dst_strd; } if(ht_offset == 0) { /* Top padding of 16 is done for 1st row only after we reach end of that row */ WORD32 pad_wd = dst_strd; WORD32 pad_ht = 16; UWORD8 *dst = pu1_dst - 16; for(i = 1; i <= pad_ht; i++) { memcpy(dst - (i * dst_strd), dst, pad_wd); } } /* Bottom padding of (16 + (CEIL16(ht/2)) - ht/2) is done only if we have reached end of frame */ if(ht - ht_offset - block_ht == 0) { WORD32 pad_wd = dst_strd; WORD32 pad_ht = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4; UWORD8 *dst = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - 16; for(i = 1; i <= pad_ht; i++) memcpy(dst + (i * dst_strd), dst, pad_wd); } } }