/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ihevce_scan_coeffs_neon.c
*
* @brief
* Contains NEON definitions for scanning the quantized coefficients of a TU
*
* @author
* Ittiam
*
* @par List of Functions:
*
* @remarks
* None
*
********************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <arm_neon.h>
/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevce_api.h"
#include "ihevce_defs.h"
#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevc_deblk.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_sao.h"
#include "ihevc_resi_trans.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevce_function_selector.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_enc_structs.h"
#include "ihevce_global_tables.h"
#include "ihevce_ipe_instr_set_router.h"
#include "ihevce_common_utils.h"
/*****************************************************************************/
/* Function Declarations */
/*****************************************************************************/
FT_SCAN_COEFFS ihevce_scan_coeffs_neon;
/*****************************************************************************/
/* Function Definitions */
/*****************************************************************************/
/**
*******************************************************************************
*
* @brief
*  Packs the most significant bit of each of the 16 byte lanes into a 16 bit
*  mask
*
* @par Description:
*  Bit i of the result is the MSB of byte lane i of input: lanes 0..7 form
*  bits 0..7 and lanes 8..15 form bits 8..15.
*
* @param[in] input
*  16 byte lanes; only bit 7 of every lane is looked at
*
* @returns
*  Mask in the low 16 bits, upper bits are zero
*
*******************************************************************************
*/
static WORD32 movemask_neon(uint8x16_t input)
{
    /* A negative vshl count is a right shift: lane k moves its MSB to bit k */
    static const int8_t ai1_lane_shift[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };

    const uint8x8_t msb_mask = vdup_n_u8(0x80);
    const int8x8_t lane_shift = vld1_s8(ai1_lane_shift);

    /* Isolate the MSB of every lane and move it to the lane's own bit index */
    uint8x8_t lo = vshl_u8(vand_u8(vget_low_u8(input), msb_mask), lane_shift);
    uint8x8_t hi = vshl_u8(vand_u8(vget_high_u8(input), msb_mask), lane_shift);

    /* Every lane holds a distinct bit, so three pairwise adds leave the sum */
    /* (equal to the OR) of all eight lanes in lane 0                        */
    lo = vpadd_u8(lo, lo);
    lo = vpadd_u8(lo, lo);
    lo = vpadd_u8(lo, lo);

    hi = vpadd_u8(hi, hi);
    hi = vpadd_u8(hi, hi);
    hi = vpadd_u8(hi, hi);

    /* vget_lane_u8 is used instead of vector subscripting, which is a */
    /* compiler extension on NEON types                                */
    return (((WORD32)vget_lane_u8(hi, 0) << 8) | (WORD32)vget_lane_u8(lo, 0));
}
/**
*******************************************************************************
*
* @brief
*  Derives the gt0, gt1 and sign flag words of one 4x4 coded sub-block (csb)
*
* @par Description:
*  The four rows of the csb are loaded, saturated to 8 bit and permuted with
*  scan_order. Bit i of every flag word therefore refers to the coefficient
*  selected by entry i of scan_order.
*   - gt0  : coefficient is non-zero
*   - gt1  : coefficient is non-zero and its (saturated) magnitude is not 1.
*            When more than MAX_GT_ONE coefficients are non-zero, every
*            non-zero coefficient from the (MAX_GT_ONE + 1)-th one (counted
*            from the MSB of gt0) downwards is flagged as well.
*   - sign : coefficient is negative
*
* @param[in] pi2_csb_coeffs
*  Top-left coefficient of the csb; rows are trans_size elements apart
*
* @param[in] trans_size
*  Row stride of the coefficient buffer in elements
*
* @param[in] scan_order
*  16 table-lookup indices (0..15) into the row-major 4x4 block
*
* @param[out] pi4_gt0_flags
*  gt0 flag word (low 16 bits)
*
* @param[out] pi4_gt1_flags
*  gt1 flag word (low 16 bits)
*
* @param[out] pi4_sign_flags
*  sign flag word (low 16 bits)
*
*******************************************************************************
*/
static void ihevce_csb_flags_neon(
    const WORD16 *pi2_csb_coeffs,
    WORD32 trans_size,
    int8x16_t scan_order,
    WORD32 *pi4_gt0_flags,
    WORD32 *pi4_gt1_flags,
    WORD32 *pi4_sign_flags)
{
    const int8x16_t zero = vdupq_n_s8(0);
    const int8x16_t one = vdupq_n_s8(1);
    int16x4_t row0, row1, row2, row3;
    int8x8x2_t coeffs_s8;
    int8x16_t scanned, scanned_abs;
    WORD32 gt0_flags, gt1_flags;

    /* Load the four rows of the csb */
    row0 = vld1_s16(pi2_csb_coeffs + 0 * trans_size);
    row1 = vld1_s16(pi2_csb_coeffs + 1 * trans_size);
    row2 = vld1_s16(pi2_csb_coeffs + 2 * trans_size);
    row3 = vld1_s16(pi2_csb_coeffs + 3 * trans_size);

    /* Saturating narrow: sign, ==0 and abs==1 are all preserved in 8 bit */
    coeffs_s8.val[0] = vqmovn_s16(vcombine_s16(row0, row1));
    coeffs_s8.val[1] = vqmovn_s16(vcombine_s16(row2, row3));

    /* Re-order the 16 coefficients as given by scan_order */
    scanned = vcombine_s8(
        vtbl2_s8(coeffs_s8, vget_low_s8(scan_order)),
        vtbl2_s8(coeffs_s8, vget_high_s8(scan_order)));
    scanned_abs = vabsq_s8(scanned);

    /* sign : lanes with (0 > coeff) */
    *pi4_sign_flags = movemask_neon(vcgtq_s8(zero, scanned));

    /* gt0 : complement of the (coeff == 0) mask, restricted to 16 bits */
    gt0_flags = ~movemask_neon(vceqq_s8(scanned, zero));
    gt0_flags = gt0_flags & 0x0000FFFF;

    /* gt1 : complement of the (abs(coeff) == 1) mask, non-zero coeffs only */
    gt1_flags = ~movemask_neon(vceqq_s8(scanned_abs, one));
    gt1_flags = gt1_flags & gt0_flags;

    /* Find the (MAX_GT_ONE + 1)-th one of gt0 from the MSB and flag it and */
    /* every non-zero coefficient below it in gt1                           */
    if(ihevce_num_ones_popcnt(gt0_flags) > MAX_GT_ONE)
    {
        WORD32 gt0_low_byte = gt0_flags & 0xFF;
        WORD32 num_gt0_high_byte = ihevce_num_ones_popcnt(gt0_flags & 0xFF00);
        /* number of ones of the low byte up to and including the cut-off one */
        WORD32 num_ones_to_cutoff = (MAX_GT_ONE + 1) - num_gt0_high_byte;
        WORD32 pos_cutoff = 0;
        WORD32 gt0_from_cutoff;

        while(num_ones_to_cutoff)
        {
            GET_POS_MSB_32(pos_cutoff, gt0_low_byte);
            gt0_low_byte = CLEAR_BIT(gt0_low_byte, pos_cutoff);
            num_ones_to_cutoff--;
        }

        /* The cut-off one was cleared last; put it back */
        gt0_from_cutoff = SET_BIT(gt0_low_byte, pos_cutoff);
        gt1_flags = gt1_flags | gt0_from_cutoff;
    }

    *pi4_gt0_flags = gt0_flags;
    *pi4_gt1_flags = gt1_flags;
}

/**
*******************************************************************************
*
* @brief
*  Writes the gt0, gt1 and sign flag words of one csb followed by one
*  (abs(coeff) - 1) word for every bit set in gt1_flags
*
* @par Description:
*  The remaining-level words are written from the MSB of gt1_flags to its LSB.
*
* @param[in] pi2_csb_coeffs
*  Top-left coefficient of the csb; rows are trans_size elements apart
*
* @param[in] trans_size
*  Row stride of the coefficient buffer in elements
*
* @param[in] pu1_csb_table
*  Maps a bit position of the flag words to the row-major index (y * 4 + x)
*  of the coefficient inside the csb
*
* @param[in] gt0_flags
*  gt0 flag word
*
* @param[in] gt1_flags
*  gt1 flag word
*
* @param[in] sign_flags
*  sign flag word
*
* @param[out] pu2_out
*  Location at which the first 16 bit word is written
*
* @returns
*  Pointer just past the last word written
*
*******************************************************************************
*/
static UWORD16 *ihevce_write_csb_data_neon(
    const WORD16 *pi2_csb_coeffs,
    WORD32 trans_size,
    const UWORD8 *pu1_csb_table,
    WORD32 gt0_flags,
    WORD32 gt1_flags,
    WORD32 sign_flags,
    UWORD16 *pu2_out)
{
    WORD32 i;
    WORD32 num_gt1_flag = ihevce_num_ones_popcnt(gt1_flags);

    /* u2_sig_coeff_abs_gt0_flags, u2_sig_coeff_abs_gt1_flags, u2_sign_flags */
    *pu2_out++ = (UWORD16)gt0_flags;
    *pu2_out++ = (UWORD16)gt1_flags;
    *pu2_out++ = (UWORD16)sign_flags;

    /* u2_abs_coeff_remaining[] */
    for(i = 0; i < num_gt1_flag; i++)
    {
        /* NOTE(review): volatile is kept from the original code; presumably */
        /* a compiler workaround - confirm before removing                   */
        volatile WORD32 bit_pos;
        WORD32 x_pos, y_pos, quant_coeff;

        ASSERT(gt1_flags != 0);
        GET_POS_MSB_32(bit_pos, gt1_flags);
        gt1_flags = CLEAR_BIT(gt1_flags, bit_pos);

        x_pos = (pu1_csb_table[bit_pos] & 0x3);
        y_pos = (pu1_csb_table[bit_pos] >> 2);
        quant_coeff = pi2_csb_coeffs[x_pos + (y_pos * trans_size)];

        *pu2_out++ = (UWORD16)(abs(quant_coeff) - 1);
    }

    return pu2_out;
}

/**
*******************************************************************************
*
* @brief
*  Scans the quantized coefficients of a TU in 4x4 sub-block (csb) units and
*  writes the resulting header, flag words and remaining levels to
*  pu1_out_data
*
* @par Description:
*  Sub-blocks are visited from the last entry of the sub-block scan table
*  towards entry 0.
*   - The first sub-block whose csbf is set produces a 4 byte header
*     (last_x, last_y, scan_idx, sub-block scan position), a csbf word
*     (0xBAD0 | 1) and the gt0 / gt1 / sign words followed by the
*     (abs(coeff) - 1) words.
*   - Every following sub-block produces a csbf word: 0xBAD0 | csbf, with
*     bit 1 set when the right neighbour csbf is set and bit 2 set when the
*     bottom neighbour csbf is set. Bit 0 is forced to 1 for sub-block 0.
*     When bit 0 is set the gt0 / gt1 / sign words and the (abs(coeff) - 1)
*     words follow.
*  When no sub-block has its csbf set nothing is written and 0 is returned.
*
* @param[in] pi2_quant_coeffs
*  Quantized coefficients of the TU, row stride is trans_size
*
* @param[in] pi4_subBlock2csbfId_map
*  Maps a raster sub-block index to an index into pu1_csbf_buf
*
* @param[in] scan_idx
*  Selects the row of the scan tables that is used
*
* @param[in] trans_size
*  Transform size; 4, 8, 16 and 32 are supported
*
* @param[out] pu1_out_data
*  Output buffer. 16 bit words are stored through it at offset 4 onwards
*  (assumes suitable alignment - TODO confirm against the callers)
*
* @param[in] pu1_csbf_buf
*  Coded sub-block flags, assumed to hold only 0 or 1
*
* @param[in] i4_csbf_stride
*  Unused
*
* @returns
*  Number of bytes written to pu1_out_data
*
*******************************************************************************
*/
WORD32 ihevce_scan_coeffs_neon(
    WORD16 *pi2_quant_coeffs,
    WORD32 *pi4_subBlock2csbfId_map,
    WORD32 scan_idx,
    WORD32 trans_size,
    UWORD8 *pu1_out_data,
    UWORD8 *pu1_csbf_buf,
    WORD32 i4_csbf_stride)
{
    WORD32 trans_unit_idx;
    UWORD16 u2_csbf0flags;
    UWORD8 *pu1_trans_table;
    UWORD8 *pu1_csb_table;
    WORD32 shift_value, mask_value;
    WORD32 blk_row, blk_col;
    UWORD8 *pu1_out_data_header = pu1_out_data;
    /* Starts at the buffer base so that 0 bytes are reported when no csb */
    /* is coded at all                                                    */
    UWORD16 *pu2_out_data_coeff = (UWORD16 *)pu1_out_data;
    int8x16_t shuffle;

    (void)i4_csbf_stride;

    /* Scan order of the 16 coefficients inside a csb */
    pu1_csb_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);

    /* shift / mask that split a raster csb index into row and column */
    GETRANGE(shift_value, trans_size);
    shift_value = shift_value - 3;
    mask_value = (trans_size / 4) - 1;

    /* Scan order of the csbs inside the TU */
    switch(trans_size)
    {
    case 32:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_8x8[scan_idx][0]);
        break;
    case 16:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);
        break;
    case 8:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_2x2[scan_idx][0]);
        break;
    case 4:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_1x1[0]);
        break;
    default:
        /* Unsupported transform size: there is no scan table to use */
        ASSERT(0);
        return 0;
    }

    shuffle = vld1q_s8((WORD8 *)pu1_csb_table);

    /* Search the last coded csb in scan order and emit header + its data */
    for(trans_unit_idx = (trans_size * trans_size / 16) - 1; trans_unit_idx >= 0; trans_unit_idx--)
    {
        WORD32 sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags;
        WORD32 sign_flag, pos_last_coded;
        WORD16 *pi2_temp_quant_coeff;

        if(!pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]])
        {
            continue;
        }

        blk_row = pu1_trans_table[trans_unit_idx] >> shift_value;
        blk_col = pu1_trans_table[trans_unit_idx] & mask_value;
        pi2_temp_quant_coeff = pi2_quant_coeffs + (blk_col * 4 + (blk_row * 4) * trans_size);

        ihevce_csb_flags_neon(
            pi2_temp_quant_coeff,
            trans_size,
            shuffle,
            &sig_coeff_abs_gt0_flags,
            &sig_coeff_abs_gt1_flags,
            &sign_flag);

        /* Position (in csb scan order) of the last non-zero coefficient */
        ASSERT(sig_coeff_abs_gt0_flags != 0);
        GET_POS_MSB_32(pos_last_coded, sig_coeff_abs_gt0_flags);

        /* last_x and last_y of the last coded coefficient w.r.t. the TU */
        *pu1_out_data_header++ = (UWORD8)((pu1_csb_table[pos_last_coded] & 0x3) + blk_col * 4);
        *pu1_out_data_header++ = (UWORD8)((pu1_csb_table[pos_last_coded] >> 2) + blk_row * 4);
        /* scan order */
        *pu1_out_data_header++ = (UWORD8)scan_idx;
        /* position of the last coded csb in scan order */
        *pu1_out_data_header++ = (UWORD8)trans_unit_idx;

        /* The 4 header bytes are done; everything that follows is 16 bit */
        pu2_out_data_coeff = (UWORD16 *)pu1_out_data_header;

        /* csbf word: right and bottom csbf are 0 for the last coded csb */
        u2_csbf0flags = 0xBAD0 | 1;
        *pu2_out_data_coeff++ = u2_csbf0flags;

        pu2_out_data_coeff = ihevce_write_csb_data_neon(
            pi2_temp_quant_coeff,
            trans_size,
            pu1_csb_table,
            sig_coeff_abs_gt0_flags,
            sig_coeff_abs_gt1_flags,
            sign_flag,
            pu2_out_data_coeff);

        break; /* This loop only locates the first non-zero csb */
    }

    /* Go through the remaining csbs in scan order. When no coded csb was */
    /* found above, trans_unit_idx is -1 and this loop is not entered     */
    for(trans_unit_idx = trans_unit_idx - 1; trans_unit_idx >= 0; trans_unit_idx--)
    {
        blk_row = pu1_trans_table[trans_unit_idx] >> shift_value; /* row of csb */
        blk_col = pu1_trans_table[trans_unit_idx] & mask_value; /* col of csb */

        /* csbf word, assuming csbf_buf holds only 0 or 1 values */
        u2_csbf0flags =
            0xBAD0 | (pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]]);

        /********************************************************************/
        /* Minor hack: As per HEVC spec csbf is not signalled in stream for */
        /* block0, instead sig coeff map is directly signalled. This is     */
        /* taken care by forcing csbf for block0 to be 1 even if it is 0    */
        /********************************************************************/
        if(0 == trans_unit_idx)
        {
            u2_csbf0flags |= 1;
        }

        /* right neighbour, when inside the TU */
        if((blk_col + 1 < trans_size / 4))
        {
            if(pu1_csbf_buf[pi4_subBlock2csbfId_map[blk_row * trans_size / 4 + blk_col + 1]])
            {
                /* set the 2nd bit of u2_csbf0flags for right csbf */
                u2_csbf0flags = u2_csbf0flags | (1 << 1);
            }
        }

        /* bottom neighbour, when inside the TU */
        if((blk_row + 1 < trans_size / 4))
        {
            if(pu1_csbf_buf[pi4_subBlock2csbfId_map[(blk_row + 1) * trans_size / 4 + blk_col]])
            {
                /* set the 3rd bit of u2_csbf0flags for bottom csbf */
                u2_csbf0flags = u2_csbf0flags | (1 << 2);
            }
        }

        *pu2_out_data_coeff++ = u2_csbf0flags;

        /* Coefficient data follows only when the csb is (treated as) coded */
        if(u2_csbf0flags & 0x1)
        {
            WORD32 sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags;
            WORD32 sign_flag;
            /* x_pos = blk_col * 4, y_pos = blk_row * 4 */
            WORD16 *pi2_temp_quant_coeff =
                pi2_quant_coeffs + blk_col * 4 + (blk_row * 4) * trans_size;

            ihevce_csb_flags_neon(
                pi2_temp_quant_coeff,
                trans_size,
                shuffle,
                &sig_coeff_abs_gt0_flags,
                &sig_coeff_abs_gt1_flags,
                &sign_flag);

            pu2_out_data_coeff = ihevce_write_csb_data_neon(
                pi2_temp_quant_coeff,
                trans_size,
                pu1_csb_table,
                sig_coeff_abs_gt0_flags,
                sig_coeff_abs_gt1_flags,
                sign_flag,
                pu2_out_data_coeff);
        }
    }

    /* Number of bytes written to out_data */
    return (WORD32)((UWORD8 *)pu2_out_data_coeff - pu1_out_data);
}