C++程序  |  2060行  |  88.7 KB

/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_common_utils_neon.c
*
* @brief
*  Contains intrinsic definitions of functions for weighted averaging of two
*  prediction buffers and for sao param
*
* @author
*  ittiam
*
* @par List of Functions:
*  - ihevce_wt_avg_2d_neon()
*  - ihevce_get_luma_eo_sao_params_neon()
*  - ihevce_get_chroma_eo_sao_params_neon()
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevce_api.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevc_structs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_deblk.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_sao.h"
#include "ihevc_resi_trans.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_cabac_tables.h"
#include "ihevc_cmn_utils_neon.h"

#include "ihevce_defs.h"
#include "ihevce_hle_interface.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_had_satd.h"
#include "ihevce_error_codes.h"
#include "ihevce_bitstream.h"
#include "ihevce_cabac.h"
#include "ihevce_rdoq_macros.h"
#include "ihevce_function_selector.h"
#include "ihevce_enc_structs.h"
#include "ihevce_entropy_structs.h"
#include "ihevce_cmn_utils_instr_set_router.h"
#include "ihevce_enc_loop_structs.h"
#include "ihevce_common_utils.h"
#include "ihevce_global_tables.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
********************************************************************************
*
*  @brief  Weighted average of one row of 16 pixels from two predictors
*
*  @par Description
*   For each of the 16 pixels computes
*       ((p0 * w0 + p1 * w1) >> shift) + rnd
*   and stores the result saturated to 8 bits. The sum is shifted in 32 bit
*   precision, narrowed to 16 bits (low half kept) and the rounding term is
*   added in 16 bit precision before the final unsigned saturation.
*
*  @param[in] pu1_pred0 : 16 pixels of predictor 0
*
*  @param[in] pu1_pred1 : 16 pixels of predictor 1
*
*  @param[out] pu1_dst : 16 output pixels
*
*  @param[in] w0 : weight applied to predictor 0
*
*  @param[in] w1 : weight applied to predictor 1
*
*  @param[in] rnd : term added after the downshift (truncated to 16 bits)
*
*  @param[in] shift : right shift applied to the weighted sum
*
*  @return none
*
********************************************************************************
*/
static void ihevce_wt_avg_2d_16x1_neon(
    UWORD8 *pu1_pred0,
    UWORD8 *pu1_pred1,
    UWORD8 *pu1_dst,
    WORD32 w0,
    WORD32 w1,
    WORD32 rnd,
    WORD32 shift)
{
    const int16x8_t rnd_s16 = vdupq_n_s16((WORD16)rnd);
    const int32x4_t wt0_s32 = vdupq_n_s32(w0);
    const int32x4_t wt1_s32 = vdupq_n_s32(w1);
    /* vshlq with a negative count performs a right shift */
    const int32x4_t neg_shift_s32 = vdupq_n_s32(-shift);

    const uint8x16_t pred0_u8 = vld1q_u8(pu1_pred0);
    const uint8x16_t pred1_u8 = vld1q_u8(pu1_pred1);

    int16x8_t pred0_s16[2], pred1_s16[2], res_s16[2];
    WORD32 half;

    /* Zero extend to 16 bit: index 0 holds pixels 0..7, index 1 pixels 8..15 */
    pred0_s16[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred0_u8)));
    pred0_s16[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred0_u8)));
    pred1_s16[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred1_u8)));
    pred1_s16[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred1_u8)));

    for(half = 0; half < 2; half++)
    {
        int32x4_t sum_lo, sum_hi;

        /* p0 * w0 + p1 * w1 in 32 bit precision */
        sum_lo = vaddq_s32(
            vmulq_s32(vmovl_s16(vget_low_s16(pred0_s16[half])), wt0_s32),
            vmulq_s32(vmovl_s16(vget_low_s16(pred1_s16[half])), wt1_s32));
        sum_hi = vaddq_s32(
            vmulq_s32(vmovl_s16(vget_high_s16(pred0_s16[half])), wt0_s32),
            vmulq_s32(vmovl_s16(vget_high_s16(pred1_s16[half])), wt1_s32));

        /* (p0 * w0 + p1 * w1) >> shift */
        sum_lo = vshlq_s32(sum_lo, neg_shift_s32);
        sum_hi = vshlq_s32(sum_hi, neg_shift_s32);

        /* Narrow to 16 bit, then add the rounding term */
        res_s16[half] = vaddq_s16(vcombine_s16(vmovn_s32(sum_lo), vmovn_s32(sum_hi)), rnd_s16);
    }

    /* Saturate to unsigned 8 bit and write all 16 pixels */
    vst1q_u8(pu1_dst, vcombine_u8(vqmovun_s16(res_s16[0]), vqmovun_s16(res_s16[1])));
}

/**
********************************************************************************
*
*  @brief  Weighted average of one row of 8 pixels from two predictors
*
*  @par Description
*   For each of the 8 pixels computes
*       ((p0 * w0 + p1 * w1) >> shift) + rnd
*   and stores the result saturated to 8 bits. The weighted sum and shift are
*   done in 32 bit precision; the rounding term is added after narrowing to
*   16 bits.
*
*  @param[in] pu1_pred0 : 8 pixels of predictor 0
*
*  @param[in] pu1_pred1 : 8 pixels of predictor 1
*
*  @param[out] pu1_dst : 8 output pixels
*
*  @param[in] w0 : weight applied to predictor 0
*
*  @param[in] w1 : weight applied to predictor 1
*
*  @param[in] rnd : term added after the downshift (truncated to 16 bits)
*
*  @param[in] shift : right shift applied to the weighted sum
*
*  @return none
*
********************************************************************************
*/
static void ihevce_wt_avg_2d_8x1_neon(
    UWORD8 *pu1_pred0,
    UWORD8 *pu1_pred1,
    UWORD8 *pu1_dst,
    WORD32 w0,
    WORD32 w1,
    WORD32 rnd,
    WORD32 shift)
{
    const int16x8_t rnd_s16 = vdupq_n_s16((WORD16)rnd);
    const int32x4_t wt0_s32 = vdupq_n_s32(w0);
    const int32x4_t wt1_s32 = vdupq_n_s32(w1);
    /* vshlq with a negative count performs a right shift */
    const int32x4_t neg_shift_s32 = vdupq_n_s32((-shift));

    /* Load 8 pixels of each predictor and zero extend them to 16 bit */
    const int16x8_t pred0_s16 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_pred0)));
    const int16x8_t pred1_s16 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_pred1)));

    int32x4_t sum_lo, sum_hi;
    int16x8_t res_s16;

    /* p0 * w0 + p1 * w1 for pixels 0..3 and 4..7 */
    sum_lo = vaddq_s32(
        vmulq_s32(vmovl_s16(vget_low_s16(pred0_s16)), wt0_s32),
        vmulq_s32(vmovl_s16(vget_low_s16(pred1_s16)), wt1_s32));
    sum_hi = vaddq_s32(
        vmulq_s32(vmovl_s16(vget_high_s16(pred0_s16)), wt0_s32),
        vmulq_s32(vmovl_s16(vget_high_s16(pred1_s16)), wt1_s32));

    /* (p0 * w0 + p1 * w1) >> shift */
    sum_lo = vshlq_s32(sum_lo, neg_shift_s32);
    sum_hi = vshlq_s32(sum_hi, neg_shift_s32);

    /* Narrow, add the rounding term, saturate to unsigned 8 bit and store */
    res_s16 = vcombine_s16(vmovn_s32(sum_lo), vmovn_s32(sum_hi));
    res_s16 = vaddq_s16(res_s16, rnd_s16);
    vst1_u8(pu1_dst, vqmovun_s16(res_s16));
}

/**
********************************************************************************
*
*  @brief  Weighted average of two predictors, processed in 4x4 tiles
*
*  @par Description
*   Walks the block in tiles of 4 columns by 4 rows. For every pixel computes
*       ((p0 * w0 + p1 * w1) >> shift) + rnd
*   saturated to 8 bits. Each tile is loaded through load_unaligned_u8q(); the
*   16 resulting bytes are written back as four rows of four pixels.
*   NOTE(review): load_unaligned_u8q() is defined outside this file; it is
*   presumed to gather 4 bytes from each of 4 consecutive rows - confirm.
*
*   The loops step by 4 in both directions, so a wd or ht that is not a
*   multiple of 4 is still processed in whole 4x4 tiles.
*
*  @param[in] pu1_pred0 : Pred0 buffer
*
*  @param[in] pu1_pred1 : Pred1 buffer
*
*  @param[in] pred0_strd : Stride of pred0 buffer
*
*  @param[in] pred1_strd : Stride of pred1 buffer
*
*  @param[in] wd : Width of pred block
*
*  @param[in] ht : Height of pred block
*
*  @param[out] pu1_dst : Destination buffer that will hold result
*
*  @param[in] dst_strd : Stride of dest buffer
*
*  @param[in] w0 : weight applied to predictor 0
*
*  @param[in] w1 : weight applied to predictor 1
*
*  @param[in] rnd : term added after the downshift (truncated to 16 bits)
*
*  @param[in] shift : right shift applied to the weighted sum
*
*  @return none
*
********************************************************************************
*/
static void ihevce_wt_avg_2d_4xn_neon(
    UWORD8 *pu1_pred0,
    UWORD8 *pu1_pred1,
    WORD32 pred0_strd,
    WORD32 pred1_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 w0,
    WORD32 w1,
    WORD32 rnd,
    WORD32 shift)
{
    WORD32 i, j, k;
    uint8x16_t src0_u8, src1_u8;
    uint16x8_t a0, a1, a2, a3;
    int32x4_t reg0[4], reg1[4];
    int32x4_t a4, a5, a7;
    int16x8_t a8, a9, a6;
    uint32x2_t p0, p1;
    UWORD32 u4_row;

    a6 = vdupq_n_s16((WORD16)rnd);

    a4 = vdupq_n_s32(w0);
    a5 = vdupq_n_s32(w1);
    /* vshlq with a negative count performs a right shift */
    a7 = vdupq_n_s32((-shift));

    for(i = 0; i < ht; i = i + 4)
    {
        for(j = 0; j < wd; j = j + 4)
        {
            src0_u8 = load_unaligned_u8q(pu1_pred0 + ((i * pred0_strd) + j), pred0_strd);
            src1_u8 = load_unaligned_u8q(pu1_pred1 + ((i * pred1_strd) + j), pred1_strd);

            /* Zero extend 8 bit -> 16 bit */
            a0 = vmovl_u8(vget_low_u8(src0_u8));
            a1 = vmovl_u8(vget_high_u8(src0_u8));
            a2 = vmovl_u8(vget_low_u8(src1_u8));
            a3 = vmovl_u8(vget_high_u8(src1_u8));

            /* Extend 16 bit -> 32 bit, 4 pixels per register */
            reg0[0] = vmovl_s16(vreinterpret_s16_u16(vget_low_u16(a0)));
            reg0[1] = vmovl_s16(vreinterpret_s16_u16(vget_high_u16(a0)));
            reg0[2] = vmovl_s16(vreinterpret_s16_u16(vget_low_u16(a1)));
            reg0[3] = vmovl_s16(vreinterpret_s16_u16(vget_high_u16(a1)));

            reg1[0] = vmovl_s16(vreinterpret_s16_u16(vget_low_u16(a2)));
            reg1[1] = vmovl_s16(vreinterpret_s16_u16(vget_high_u16(a2)));
            reg1[2] = vmovl_s16(vreinterpret_s16_u16(vget_low_u16(a3)));
            reg1[3] = vmovl_s16(vreinterpret_s16_u16(vget_high_u16(a3)));

            for(k = 0; k < 4; k++)
            {
                /* (p0 * w0 + p1 * w1) >> shift */
                reg0[k] = vaddq_s32(vmulq_s32(reg0[k], a4), vmulq_s32(reg1[k], a5));
                reg0[k] = vshlq_s32(reg0[k], a7);
            }

            a8 = vcombine_s16(vmovn_s32(reg0[0]), vmovn_s32(reg0[1]));
            a9 = vcombine_s16(vmovn_s32(reg0[2]), vmovn_s32(reg0[3]));

            /* ((p0 * w0 + p1 * w1) >> shift) + rnd */
            a8 = vaddq_s16(a8, a6);
            a9 = vaddq_s16(a9, a6);

            /* Saturate to unsigned 8 bit; each 32 bit lane now holds one row */
            p0 = vreinterpret_u32_u8(vqmovun_s16(a8));
            p1 = vreinterpret_u32_u8(vqmovun_s16(a9));

            /* pu1_dst is a byte pointer with no 4 byte alignment guarantee, */
            /* so the rows are written with memcpy instead of dereferencing  */
            /* a casted UWORD32 pointer (misaligned / aliasing-unsafe store) */
            u4_row = vget_lane_u32(p0, 0);
            memcpy(pu1_dst, &u4_row, sizeof(u4_row));
            u4_row = vget_lane_u32(p0, 1);
            memcpy(pu1_dst + dst_strd, &u4_row, sizeof(u4_row));
            u4_row = vget_lane_u32(p1, 0);
            memcpy(pu1_dst + 2 * dst_strd, &u4_row, sizeof(u4_row));
            u4_row = vget_lane_u32(p1, 1);
            memcpy(pu1_dst + 3 * dst_strd, &u4_row, sizeof(u4_row));

            pu1_dst += 4;
        }
        /* Rewind to the first column and move down by four rows */
        pu1_dst = pu1_dst - wd + 4 * dst_strd;
    }
}

/**
********************************************************************************
*
*  @brief  Weighted average of 2 predictor buffers into a destination block
*
*  @par Description
*   Derives rnd = (o0 + o1 + 1) >> 1 and shift = log_wdc + 1, then dispatches
*   on the block width to one of the NEON kernels of this file:
*     - wd 4, 12           : 4x4 tile kernel (rows handled four at a time)
*     - wd 8, 24           : 8 pixel row kernel
*     - wd 16, 32, 48, 64  : 16 pixel row kernel
*   Any other width triggers assert(0) and leaves the destination untouched.
*   The kernels write ((p0 * w0 + p1 * w1) >> shift) + rnd, saturated to
*   8 bits, for every pixel.
*
*  @param[in] pu1_pred0 : Pred0 buffer
*
*  @param[in] pu1_pred1 : Pred1 buffer
*
*  @param[in] pred0_strd : Stride of pred0 buffer
*
*  @param[in] pred1_strd : Stride of pred1 buffer
*
*  @param[in] wd : Width of pred block
*
*  @param[in] ht : Height of pred block
*
*  @param[out] pu1_dst : Destination buffer that will hold result
*
*  @param[in] dst_strd : Stride of dest buffer
*
*  @param[in] w0 : Weighting factor of Pred0
*
*  @param[in] w1 : Weighting factor of Pred1
*
*  @param[in] o0 : Offset for Pred0
*
*  @param[in] o1 : Offset for Pred1
*
*  @param[in] log_wdc : Shift factor; the applied downshift is log_wdc + 1
*
*  @return none
*
********************************************************************************
*/
void ihevce_wt_avg_2d_neon(
    UWORD8 *pu1_pred0,
    UWORD8 *pu1_pred1,
    WORD32 pred0_strd,
    WORD32 pred1_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 w0,
    WORD32 w1,
    WORD32 o0,
    WORD32 o1,
    WORD32 log_wdc)
{
    /* Combined offset term; the kernels add it after the downshift */
    const WORD32 rnd = (o0 + o1 + 1) >> 1;
    /* Downshift applied to the weighted sum */
    const WORD32 shift = log_wdc + 1;
    /* Row and column counters */
    WORD32 row, col;

    if((4 == wd) || (12 == wd))
    {
        /* Narrow blocks: the tile kernel walks the complete block itself */
        ihevce_wt_avg_2d_4xn_neon(
            pu1_pred0,
            pu1_pred1,
            pred0_strd,
            pred1_strd,
            wd,
            ht,
            pu1_dst,
            dst_strd,
            w0,
            w1,
            rnd,
            shift);
    }
    else if((8 == wd) || (24 == wd))
    {
        /* Widths that are a multiple of 8 but not of 16 */
        for(row = 0; row < ht; row++)
        {
            UWORD8 *pu1_row0 = pu1_pred0 + (row * pred0_strd);
            UWORD8 *pu1_row1 = pu1_pred1 + (row * pred1_strd);
            UWORD8 *pu1_row_dst = pu1_dst + (row * dst_strd);

            for(col = 0; col < wd; col += 8)
            {
                ihevce_wt_avg_2d_8x1_neon(
                    pu1_row0 + col, pu1_row1 + col, pu1_row_dst + col, w0, w1, rnd, shift);
            }
        }
    }
    else if((16 == wd) || (32 == wd) || (48 == wd) || (64 == wd))
    {
        /* Widths that are a multiple of 16 */
        for(row = 0; row < ht; row++)
        {
            UWORD8 *pu1_row0 = pu1_pred0 + (row * pred0_strd);
            UWORD8 *pu1_row1 = pu1_pred1 + (row * pred1_strd);
            UWORD8 *pu1_row_dst = pu1_dst + (row * dst_strd);

            for(col = 0; col < wd; col += 16)
            {
                ihevce_wt_avg_2d_16x1_neon(
                    pu1_row0 + col, pu1_row1 + col, pu1_row_dst + col, w0, w1, rnd, shift);
            }
        }
    }
    else
    {
        /* Unsupported block width */
        assert(0);
    }
    return;
}

/**
********************************************************************************
*
*  @brief  Horizontal reduction of a vector of eight signed 16 bit lanes
*
*  @par Description
*   Pairwise widening adds reduce 8 x s16 to 4 x s32 and then to 2 x s64.
*   The two 64 bit halves are viewed as 32 bit pairs, added, and lane 0 of
*   that sum is returned.
*   NOTE(review): with a little endian lane layout lane 0 is the low word of
*   each 64 bit partial sum, i.e. the result is the sum of all eight input
*   lanes - confirm for other layouts.
*
*  @param[in] temp_reg : vector whose lanes are to be accumulated
*
*  @return lane 0 of the 32 bit sum of both halves
*
********************************************************************************
*/
static INLINE WORD32 sad_cal(int16x8_t temp_reg)
{
    /* 8 x s16 -> 4 x s32 -> 2 x s64 */
    const int32x4_t sum_s32 = vpaddlq_s16(temp_reg);
    const int64x2_t sum_s64 = vpaddlq_s32(sum_s32);

    /* Reinterpret each 64 bit partial sum as a pair of 32 bit lanes */
    const int32x2_t lo_half = vreinterpret_s32_s64(vget_low_s64(sum_s64));
    const int32x2_t hi_half = vreinterpret_s32_s64(vget_high_s64(sum_s64));

    return vget_lane_s32(vadd_s32(lo_half, hi_half), 0);
}

void ihevce_get_luma_eo_sao_params_neon(
    void *pv_sao_ctxt,
    WORD32 eo_sao_class,
    WORD32 *pi4_acc_error_category,
    WORD32 *pi4_category_count)
{
    /*temp var*/
    UWORD8 *pu1_luma_recon_buf, *pu1_luma_src_buf;
    UWORD8 *pu1_luma_src_buf_copy, *pu1_luma_recon_buf_copy;
    WORD32 row_end, col_end, row, col;
    WORD32 row_start = 0, col_start = 0;
    WORD32 wd, rem_wd;
    WORD32 a, b, c, edge_idx, pel_err;

    int16x8_t temp_reg0, temp_reg1, temp_reg2, temp_reg3, temp_reg4;
    int16x8_t edgeidx_reg0, edgeidx_reg1, edgeidx_reg2, edgeidx_reg3, edgeidx_reg4;
    int16x8_t edgeidx_reg5, edgeidx_reg6, edgeidx_reg7;
    int16x8_t pel_error, pel_error1;
    int16x8_t sign_reg0, sign_reg1, sign_reg, sign_reg2, sign_reg3;
    int16x8_t edgeidx, edgeidx1;
    int16x8_t temp_reg5, temp_reg6, temp_reg7;
    uint8x16_t src_buf_8x16, recon_buf_8x16, recon_buf0_8x16, recon_buf1_8x16;
    uint8x8_t src_buf, recon_buf, recon_buf0, recon_buf1;

    sao_ctxt_t *ps_sao_ctxt = (sao_ctxt_t *)pv_sao_ctxt;
    const WORD32 i4_luma_recon_strd = ps_sao_ctxt->i4_cur_luma_recon_stride;
    const WORD32 i4_luma_src_strd = ps_sao_ctxt->i4_cur_luma_src_stride;

    const int16x8_t const_2 = vdupq_n_s16(2);
    const int16x8_t const_0 = vdupq_n_s16(0);
    const int16x8_t const_1 = vdupq_n_s16(1);
    const int16x8_t const_3 = vdupq_n_s16(3);
    const int16x8_t const_4 = vdupq_n_s16(4);

    row_end = ps_sao_ctxt->i4_sao_blk_ht;
    col_end = ps_sao_ctxt->i4_sao_blk_wd;

    if((ps_sao_ctxt->i4_ctb_x == 0) && (eo_sao_class != SAO_EDGE_90_DEG))
    {
        col_start = 1;
    }

    if(((ps_sao_ctxt->i4_ctb_x + 1) == ps_sao_ctxt->ps_sps->i2_pic_wd_in_ctb) &&
       (eo_sao_class != SAO_EDGE_90_DEG))
    {
        col_end = col_end - 1;
    }

    if((ps_sao_ctxt->i4_ctb_y == 0) && (eo_sao_class != SAO_EDGE_0_DEG))
    {
        row_start = 1;
    }

    if(((ps_sao_ctxt->i4_ctb_y + 1) == ps_sao_ctxt->ps_sps->i2_pic_ht_in_ctb) &&
       (eo_sao_class != SAO_EDGE_0_DEG))
    {
        row_end = row_end - 1;
    }
    wd = col_end - col_start;
    rem_wd = wd;
    pu1_luma_recon_buf =
        ps_sao_ctxt->pu1_cur_luma_recon_buf + col_start + (row_start * i4_luma_recon_strd);
    pu1_luma_src_buf =
        ps_sao_ctxt->pu1_cur_luma_src_buf + col_start + (row_start * i4_luma_src_strd);

    switch(eo_sao_class)
    {
    case SAO_EDGE_0_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_luma_src_buf_copy = pu1_luma_src_buf;
            pu1_luma_recon_buf_copy = pu1_luma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load src and recon data*/
                src_buf_8x16 = vld1q_u8(pu1_luma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_luma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_luma_recon_buf - 1);
                recon_buf1_8x16 = vld1q_u8(pu1_luma_recon_buf + 1);

                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));

                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
                /*edgidx*/
                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);

                /*store peel error*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                /*store edgeidx account*/
                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_luma_recon_buf += 16;
                pu1_luma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load data*/
                src_buf = vld1_u8(pu1_luma_src_buf);
                recon_buf = vld1_u8(pu1_luma_recon_buf);
                recon_buf0 = vld1_u8(pu1_luma_recon_buf - 1);
                recon_buf1 = vld1_u8(pu1_luma_recon_buf + 1);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg3 = vabsq_s16(temp_reg3);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store */
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_luma_recon_buf += 8;
                pu1_luma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_luma_recon_buf[col];
                    a = pu1_luma_recon_buf[col - 1];
                    b = pu1_luma_recon_buf[col + 1];
                    pel_err = pu1_luma_src_buf[col] - pu1_luma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);

                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_luma_recon_buf = pu1_luma_recon_buf_copy + i4_luma_recon_strd;
            pu1_luma_src_buf = pu1_luma_src_buf_copy + i4_luma_src_strd;
            rem_wd = wd;
        }
        break;
    case SAO_EDGE_90_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_luma_src_buf_copy = pu1_luma_src_buf;
            pu1_luma_recon_buf_copy = pu1_luma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load src and recon data*/
                src_buf_8x16 = vld1q_u8(pu1_luma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_luma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_luma_recon_buf - i4_luma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_luma_recon_buf + i4_luma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
                /*edgeidx*/
                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
                /* store */
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);
                /*store account*/
                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_luma_recon_buf += 16;
                pu1_luma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load*/
                src_buf = vld1_u8(pu1_luma_src_buf);
                recon_buf = vld1_u8(pu1_luma_recon_buf);
                recon_buf0 = vld1_u8(pu1_luma_recon_buf - i4_luma_recon_strd);
                recon_buf1 = vld1_u8(pu1_luma_recon_buf + i4_luma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg3 = vabsq_s16(temp_reg3);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_luma_recon_buf += 8;
                pu1_luma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_luma_recon_buf[col];
                    a = pu1_luma_recon_buf[col - i4_luma_recon_strd];
                    b = pu1_luma_recon_buf[col + i4_luma_recon_strd];
                    pel_err = pu1_luma_src_buf[col] - pu1_luma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);

                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_luma_recon_buf = pu1_luma_recon_buf_copy + i4_luma_recon_strd;
            pu1_luma_src_buf = pu1_luma_src_buf_copy + i4_luma_src_strd;
            rem_wd = wd;
        }
        break;
    case SAO_EDGE_135_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_luma_src_buf_copy = pu1_luma_src_buf;
            pu1_luma_recon_buf_copy = pu1_luma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load src and recon data*/
                src_buf_8x16 = vld1q_u8(pu1_luma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_luma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_luma_recon_buf - 1 - i4_luma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_luma_recon_buf + 1 + i4_luma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_luma_recon_buf += 16;
                pu1_luma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load data*/
                src_buf = vld1_u8(pu1_luma_src_buf);
                recon_buf = vld1_u8(pu1_luma_recon_buf);
                recon_buf0 = vld1_u8(pu1_luma_recon_buf - 1 - i4_luma_recon_strd);
                recon_buf1 = vld1_u8(pu1_luma_recon_buf + 1 + i4_luma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg4 = vabsq_s16(temp_reg4);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg3);
                pi4_acc_error_category[4] += sad_cal(temp_reg4);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg3);
                pi4_category_count[4] += sad_cal(edgeidx_reg4);
                pu1_luma_recon_buf += 8;
                pu1_luma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_luma_recon_buf[col];
                    a = pu1_luma_recon_buf[col - 1 - i4_luma_recon_strd];
                    b = pu1_luma_recon_buf[col + 1 + i4_luma_recon_strd];
                    pel_err = pu1_luma_src_buf[col] - pu1_luma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);

                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_luma_recon_buf = pu1_luma_recon_buf_copy + i4_luma_recon_strd;
            pu1_luma_src_buf = pu1_luma_src_buf_copy + i4_luma_src_strd;
            rem_wd = wd;
        }
        break;
    case SAO_EDGE_45_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_luma_src_buf_copy = pu1_luma_src_buf;
            pu1_luma_recon_buf_copy = pu1_luma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load data*/
                src_buf_8x16 = vld1q_u8(pu1_luma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_luma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_luma_recon_buf + 1 - i4_luma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_luma_recon_buf - 1 + i4_luma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_luma_recon_buf += 16;
                pu1_luma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load*/
                src_buf = vld1_u8(pu1_luma_src_buf);
                recon_buf = vld1_u8(pu1_luma_recon_buf);
                recon_buf0 = vld1_u8(pu1_luma_recon_buf + 1 - i4_luma_recon_strd);
                recon_buf1 = vld1_u8(pu1_luma_recon_buf - 1 + i4_luma_recon_strd);

                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));

                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg4 = vabsq_s16(temp_reg4);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg3);
                pi4_acc_error_category[4] += sad_cal(temp_reg4);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg3);
                pi4_category_count[4] += sad_cal(edgeidx_reg4);
                pu1_luma_recon_buf += 8;
                pu1_luma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_luma_recon_buf[col];
                    a = pu1_luma_recon_buf[col + 1 - i4_luma_recon_strd];
                    b = pu1_luma_recon_buf[col - 1 + i4_luma_recon_strd];
                    pel_err = pu1_luma_src_buf[col] - pu1_luma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);
                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_luma_recon_buf = pu1_luma_recon_buf_copy + i4_luma_recon_strd;
            pu1_luma_src_buf = pu1_luma_src_buf_copy + i4_luma_src_strd;
            rem_wd = wd;
        }
        break;
    default:
        break;
    }
}

void ihevce_get_chroma_eo_sao_params_neon(
    void *pv_sao_ctxt,
    WORD32 eo_sao_class,
    WORD32 *pi4_acc_error_category,
    WORD32 *pi4_category_count)
{
    /*temp var*/
    UWORD8 *pu1_chroma_recon_buf, *pu1_chroma_src_buf;
    UWORD8 *pu1_chroma_src_buf_copy, *pu1_chroma_recon_buf_copy;
    WORD32 row_end, col_end, row, col;
    WORD32 row_start = 0, col_start = 0;
    WORD32 wd, rem_wd;
    WORD32 a, b, c, edge_idx, pel_err;

    int16x8_t temp_reg0, temp_reg1, temp_reg2, temp_reg3, temp_reg4;
    int16x8_t edgeidx_reg0, edgeidx_reg1, edgeidx_reg2, edgeidx_reg3, edgeidx_reg4;
    int16x8_t edgeidx_reg5, edgeidx_reg6, edgeidx_reg7;
    int16x8_t pel_error, pel_error1;
    int16x8_t sign_reg0, sign_reg1, sign_reg, sign_reg2, sign_reg3;
    int16x8_t edgeidx, edgeidx1;
    int16x8_t temp_reg5, temp_reg6, temp_reg7;
    uint8x16_t src_buf_8x16, recon_buf_8x16, recon_buf0_8x16, recon_buf1_8x16;
    uint8x8_t src_buf, recon_buf, recon_buf0, recon_buf1;

    sao_ctxt_t *ps_sao_ctxt = (sao_ctxt_t *)pv_sao_ctxt;
    const WORD32 i4_chroma_recon_strd = ps_sao_ctxt->i4_cur_chroma_recon_stride;
    const WORD32 i4_chroma_src_strd = ps_sao_ctxt->i4_cur_chroma_src_stride;

    const int16x8_t const_2 = vdupq_n_s16(2);
    const int16x8_t const_0 = vdupq_n_s16(0);
    const int16x8_t const_1 = vdupq_n_s16(1);
    const int16x8_t const_3 = vdupq_n_s16(3);
    const int16x8_t const_4 = vdupq_n_s16(4);

    row_end = ps_sao_ctxt->i4_sao_blk_ht >> 1;
    col_end = ps_sao_ctxt->i4_sao_blk_wd;

    if((ps_sao_ctxt->i4_ctb_x == 0) && (eo_sao_class != SAO_EDGE_90_DEG))
    {
        col_start = 2;
    }

    if(((ps_sao_ctxt->i4_ctb_x + 1) == ps_sao_ctxt->ps_sps->i2_pic_wd_in_ctb) &&
       (eo_sao_class != SAO_EDGE_90_DEG))
    {
        col_end = col_end - 2;
    }

    if((ps_sao_ctxt->i4_ctb_y == 0) && (eo_sao_class != SAO_EDGE_0_DEG))
    {
        row_start = 1;
    }

    if(((ps_sao_ctxt->i4_ctb_y + 1) == ps_sao_ctxt->ps_sps->i2_pic_ht_in_ctb) &&
       (eo_sao_class != SAO_EDGE_0_DEG))
    {
        row_end = row_end - 1;
    }
    wd = col_end - col_start;
    rem_wd = wd;
    pu1_chroma_recon_buf =
        ps_sao_ctxt->pu1_cur_chroma_recon_buf + col_start + (row_start * i4_chroma_recon_strd);
    pu1_chroma_src_buf =
        ps_sao_ctxt->pu1_cur_chroma_src_buf + col_start + (row_start * i4_chroma_src_strd);

    switch(eo_sao_class)
    {
    case SAO_EDGE_0_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_chroma_src_buf_copy = pu1_chroma_src_buf;
            pu1_chroma_recon_buf_copy = pu1_chroma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load src and recon data*/
                src_buf_8x16 = vld1q_u8(pu1_chroma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_chroma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_chroma_recon_buf - 2);
                recon_buf1_8x16 = vld1q_u8(pu1_chroma_recon_buf + 2);

                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));

                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
                /*edgeidx*/
                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);

                /*store pel error*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                /*store edgeidx count*/
                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 16;
                pu1_chroma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load data*/
                src_buf = vld1_u8(pu1_chroma_src_buf);
                recon_buf = vld1_u8(pu1_chroma_recon_buf);
                recon_buf0 = vld1_u8(pu1_chroma_recon_buf - 2);
                recon_buf1 = vld1_u8(pu1_chroma_recon_buf + 2);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg3 = vabsq_s16(temp_reg3);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store */
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 8;
                pu1_chroma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_chroma_recon_buf[col];
                    a = pu1_chroma_recon_buf[col - 2];
                    b = pu1_chroma_recon_buf[col + 2];
                    pel_err = pu1_chroma_src_buf[col] - pu1_chroma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);

                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_chroma_recon_buf = pu1_chroma_recon_buf_copy + i4_chroma_recon_strd;
            pu1_chroma_src_buf = pu1_chroma_src_buf_copy + i4_chroma_src_strd;
            rem_wd = wd;
        }
        break;
    case SAO_EDGE_90_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_chroma_src_buf_copy = pu1_chroma_src_buf;
            pu1_chroma_recon_buf_copy = pu1_chroma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load src and recon data*/
                src_buf_8x16 = vld1q_u8(pu1_chroma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_chroma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_chroma_recon_buf - i4_chroma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_chroma_recon_buf + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
                /*edgeidx*/
                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
                /* store */
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);
                /*store count*/
                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 16;
                pu1_chroma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load*/
                src_buf = vld1_u8(pu1_chroma_src_buf);
                recon_buf = vld1_u8(pu1_chroma_recon_buf);
                recon_buf0 = vld1_u8(pu1_chroma_recon_buf - i4_chroma_recon_strd);
                recon_buf1 = vld1_u8(pu1_chroma_recon_buf + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg3 = vabsq_s16(temp_reg3);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 8;
                pu1_chroma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_chroma_recon_buf[col];
                    a = pu1_chroma_recon_buf[col - i4_chroma_recon_strd];
                    b = pu1_chroma_recon_buf[col + i4_chroma_recon_strd];
                    pel_err = pu1_chroma_src_buf[col] - pu1_chroma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);

                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_chroma_recon_buf = pu1_chroma_recon_buf_copy + i4_chroma_recon_strd;
            pu1_chroma_src_buf = pu1_chroma_src_buf_copy + i4_chroma_src_strd;
            rem_wd = wd;
        }
        break;
    case SAO_EDGE_135_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_chroma_src_buf_copy = pu1_chroma_src_buf;
            pu1_chroma_recon_buf_copy = pu1_chroma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load src and recon data*/
                src_buf_8x16 = vld1q_u8(pu1_chroma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_chroma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_chroma_recon_buf - 2 - i4_chroma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_chroma_recon_buf + 2 + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 16;
                pu1_chroma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load data*/
                src_buf = vld1_u8(pu1_chroma_src_buf);
                recon_buf = vld1_u8(pu1_chroma_recon_buf);
                recon_buf0 = vld1_u8(pu1_chroma_recon_buf - 2 - i4_chroma_recon_strd);
                recon_buf1 = vld1_u8(pu1_chroma_recon_buf + 2 + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg4 = vabsq_s16(temp_reg4);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg3);
                pi4_acc_error_category[4] += sad_cal(temp_reg4);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg3);
                pi4_category_count[4] += sad_cal(edgeidx_reg4);
                pu1_chroma_recon_buf += 8;
                pu1_chroma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_chroma_recon_buf[col];
                    a = pu1_chroma_recon_buf[col - 2 - i4_chroma_recon_strd];
                    b = pu1_chroma_recon_buf[col + 2 + i4_chroma_recon_strd];
                    pel_err = pu1_chroma_src_buf[col] - pu1_chroma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);

                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_chroma_recon_buf = pu1_chroma_recon_buf_copy + i4_chroma_recon_strd;
            pu1_chroma_src_buf = pu1_chroma_src_buf_copy + i4_chroma_src_strd;
            rem_wd = wd;
        }
        break;
    case SAO_EDGE_45_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_chroma_src_buf_copy = pu1_chroma_src_buf;
            pu1_chroma_recon_buf_copy = pu1_chroma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load data*/
                src_buf_8x16 = vld1q_u8(pu1_chroma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_chroma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_chroma_recon_buf + 2 - i4_chroma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_chroma_recon_buf - 2 + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 16;
                pu1_chroma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load*/
                src_buf = vld1_u8(pu1_chroma_src_buf);
                recon_buf = vld1_u8(pu1_chroma_recon_buf);
                recon_buf0 = vld1_u8(pu1_chroma_recon_buf + 2 - i4_chroma_recon_strd);
                recon_buf1 = vld1_u8(pu1_chroma_recon_buf - 2 + i4_chroma_recon_strd);

                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));

                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg4 = vabsq_s16(temp_reg4);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg3);
                pi4_acc_error_category[4] += sad_cal(temp_reg4);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg3);
                pi4_category_count[4] += sad_cal(edgeidx_reg4);
                pu1_chroma_recon_buf += 8;
                pu1_chroma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_chroma_recon_buf[col];
                    a = pu1_chroma_recon_buf[col + 2 - i4_chroma_recon_strd];
                    b = pu1_chroma_recon_buf[col - 2 + i4_chroma_recon_strd];
                    pel_err = pu1_chroma_src_buf[col] - pu1_chroma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);
                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_chroma_recon_buf = pu1_chroma_recon_buf_copy + i4_chroma_recon_strd;
            pu1_chroma_src_buf = pu1_chroma_src_buf_copy + i4_chroma_src_strd;
            rem_wd = wd;
        }
        break;
    default:
        break;
    }
}