///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* @file
//*  ihevc_deblk_chroma_vert.s
//*
//* @brief
//*  contains the function definition for chroma deblocking across a
//*  vertical edge. the function is hand-written armv8 (aarch64) neon
//*  assembly in gnu assembler syntax.
//*
//* @author
//*  anand s
//*
//* @par list of functions:
//*  - ihevc_deblk_chroma_vert_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************/
//void ihevc_deblk_chroma_vert(UWORD8 *pu1_src,
//                             WORD32 src_strd,
//                             WORD32 quant_param_p,
//                             WORD32 quant_param_q,
//                             WORD32 qp_offset_u,
//                             WORD32 qp_offset_v,
//                             WORD32 tc_offset_div2,
//                             WORD32 filter_flag_p,
//                             WORD32 filter_flag_q)

// code goes into the text section; on this target .align takes a power-of-two
// exponent, so 4 requests 16-byte alignment
.text
.align 4
// presumably the source of the push_v_regs / pop_v_regs macros that the
// function below invokes; they are not defined in this file
.include "ihevc_neon_macros.s"



// lookup tables defined elsewhere; the function below reads 32-bit entries
// from each of them through the GOT
.extern gai4_ihevc_qp_table
.extern gai4_ihevc_tc_table
// export the entry point and mark the symbol as a function
.globl ihevc_deblk_chroma_vert_av8

.type ihevc_deblk_chroma_vert_av8, %function

//------------------------------------------------------------------------------
// void ihevc_deblk_chroma_vert_av8(UWORD8 *pu1_src,
//                                  WORD32 src_strd,
//                                  WORD32 quant_param_p,
//                                  WORD32 quant_param_q,
//                                  WORD32 qp_offset_u,
//                                  WORD32 qp_offset_v,
//                                  WORD32 tc_offset_div2,
//                                  WORD32 filter_flag_p,
//                                  WORD32 filter_flag_q)
//
// ABI:   AAPCS64
// In:    x0 = pu1_src          w1 = src_strd        w2 = quant_param_p
//        w3 = quant_param_q    w4 = qp_offset_u     w5 = qp_offset_v
//        w6 = tc_offset_div2   w7 = filter_flag_p   [sp] = filter_flag_q
// Out:   none (memory around pu1_src is updated in place)
// Clobb: x0-x12, x15, v0-v6, v16-v18, v29-v31, flags
//
// Reads bytes [-4, 3] relative to pu1_src in four rows that are src_strd bytes
// apart.  Naming the eight byte columns c0..c7, each row yields
//     delta = CLIP3((((c4,c5 - c2,c3) << 2) + c0,c1 - c6,c7 + 4) >> 3, -tc, tc)
// where even columns use tc_u and odd columns use tc_v.  Bytes -2,-1 get
// +delta (written only when filter_flag_p != 0) and bytes 0,1 get -delta
// (written only when filter_flag_q != 0), both saturated to 0..255.
//
// tc_u / tc_v are looked up as
//     qp_indx   = qp_offset + ((quant_param_p + quant_param_q + 1) >> 1)
//     qp_chroma = qp_indx < 0  ? qp_indx
//               : qp_indx > 57 ? qp_indx - 6
//               : gai4_ihevc_qp_table[qp_indx]
//     tc        = gai4_ihevc_tc_table[CLIP3(qp_chroma + 2 +
//                                           (tc_offset_div2 << 1), 0, 53)]
//------------------------------------------------------------------------------
ihevc_deblk_chroma_vert_av8:
    // Every WORD32 argument arrives in a w register and AAPCS64 leaves bits
    // 63:32 of that register unspecified, so each one is widened (or only its
    // w view is used) before it takes part in 64-bit arithmetic/addressing.
    sxtw        x1,w1                   // src_strd: post-index step of ld1/st1
    sxtw        x2,w2                   // quant_param_p
    sxtw        x3,w3                   // quant_param_q
    mov         w12,w7                  // w12 = filter_flag_p
    sxtw        x7,w4                   // x7  = qp_offset_u
    sxtw        x15,w6                  // x15 = tc_offset_div2 (temporary)
    sxtw        x6,w5                   // x6  = qp_offset_v
    mov         x5,x15                  // x5  = tc_offset_div2
    ldr         w4, [sp]                // w4  = filter_flag_q (read before sp moves)

    push_v_regs
    // x20 is callee-saved and used as scratch below; x19 is not used here and
    // is only stored as the other half of the 16-byte pair.
    stp         x19, x20,[sp,#-16]!

    // ---- load 4 rows of 8 bytes starting at pu1_src - 4 ---------------------
    sub         x8,x0,#4
    add         x2,x2,x3
    ld1         {v5.8b},[x8],x1         // row 0
    add         x2,x2,#1                // x2 = quant_param_p + quant_param_q + 1
    ld1         {v17.8b},[x8],x1        // row 1
    ld1         {v16.8b},[x8],x1        // row 2
    ld1         {v4.8b},[x8]            // row 3

    // ---- 8x4 byte transpose, interleaved with the scalar qp/tc derivation ---
    trn1        v29.8b, v5.8b, v17.8b
    trn2        v17.8b, v5.8b, v17.8b
    mov         v5.d[0], v29.d[0]
    adds        x3,x7,x2,asr #1         // x3 = qp_indx_u, N flag = (qp_indx_u < 0)
    trn1        v29.8b, v16.8b, v4.8b
    trn2        v4.8b, v16.8b, v4.8b
    mov         v16.d[0], v29.d[0]
    adrp        x9, :got:gai4_ihevc_qp_table
    ldr         x9, [x9, #:got_lo12:gai4_ihevc_qp_table]

    // none of the instructions since the adds touches the flags
    b.mi        .Ldcv_qp_u_done         // negative index is used unchanged
    cmp         x3,#57
    b.gt        .Ldcv_qp_u_above
    ldrsw       x3, [x9,x3,lsl #2]      // 0..57: table lookup (flags survive)
.Ldcv_qp_u_above:
    sub         x20,x3,#6
    csel        x3, x20, x3,gt          // > 57: qp_indx_u - 6
.Ldcv_qp_u_done:                        // x3 = qp_chroma_u

    trn1        v29.4h, v5.4h, v16.4h
    trn2        v16.4h, v5.4h, v16.4h
    mov         v5.d[0], v29.d[0]
    adds        x2,x6,x2,asr #1         // x2 = qp_indx_v, N flag = (qp_indx_v < 0)
    trn1        v29.4h, v17.4h, v4.4h
    trn2        v4.4h, v17.4h, v4.4h
    mov         v17.d[0], v29.d[0]

    b.mi        .Ldcv_qp_v_done
    cmp         x2,#57
    b.gt        .Ldcv_qp_v_above
    ldrsw       x2, [x9,x2,lsl #2]
.Ldcv_qp_v_above:
    sub         x20,x2,#6
    csel        x2, x20, x2,gt
.Ldcv_qp_v_done:                        // x2 = qp_chroma_v

    trn1        v29.2s, v5.2s, v17.2s
    trn2        v17.2s, v5.2s, v17.2s
    mov         v5.d[0], v29.d[0]
    trn1        v29.2s, v16.2s, v4.2s
    trn2        v4.2s, v16.2s, v4.2s
    mov         v16.d[0], v29.d[0]
    // Each register now holds one column pair for all four rows, even column
    // in halfword lanes 0-3 and odd column in lanes 4-7 once widened:
    //   v5 = c0|c1, v16 = c2|c3, v17 = c4|c5, v4 = c6|c7

    // ---- tc index = CLIP3(qp_chroma + 2 + (tc_offset_div2 << 1), 0, 53) -----
    mov         x20,#53
    add         x3,x3,x5,lsl #1
    add         x3,x3,#2
    cmp         x3,#53
    csel        x3, x20, x3,gt
    cmp         x3,#0
    csel        x3, xzr, x3,lt

    add         x2,x2,x5,lsl #1
    add         x2,x2,#2
    cmp         x2,#53
    csel        x2, x20, x2,gt
    cmp         x2,#0
    csel        x2, xzr, x2,lt

    adrp        x10, :got:gai4_ihevc_tc_table
    ldr         x10, [x10, #:got_lo12:gai4_ihevc_tc_table]
    ldrsw       x3, [x10,x3,lsl #2]     // x3 = tc_u
    ldrsw       x2, [x10,x2,lsl #2]     // x2 = tc_v

    // ---- delta = (((c4,5 - c2,3) << 2) + c0,1 - c6,7 + 4) >> 3 --------------
    uxtl        v18.8h, v17.8b          // v18 = c4|c5 widened to 16 bit
    usubl       v0.8h, v17.8b, v16.8b
    shl         v0.8h, v0.8h,#2
    uaddw       v0.8h,  v0.8h ,  v5.8b
    usubw       v4.8h,  v0.8h ,  v4.8b
    srshr       v6.8h, v4.8h,#3         // rounding shift supplies the +4

    // ---- clamp delta to [-tc, tc]; tc_u in lanes 0-3, tc_v in lanes 4-7 -----
    dup         v2.4h,w3
    dup         v3.4h,w2
    neg         x3,x3
    neg         x2,x2
    dup         v30.4h,w3
    dup         v31.4h,w2
    mov         v30.d[1],v31.d[0]       // v30 = -tc_u | -tc_v
    mov         v2.d[1],v3.d[0]         // v2  =  tc_u |  tc_v

    smin        v4.8h,  v6.8h ,  v2.8h
    smax        v2.8h,  v30.8h ,  v4.8h // v2 = clamped delta

    // ---- apply delta and narrow with unsigned saturation --------------------
    uxtl        v6.8h, v16.8b           // v6 = c2|c3 widened to 16 bit
    add         v0.8h,  v6.8h ,  v2.8h  // c2,c3 + delta
    sub         v2.8h,  v18.8h ,  v2.8h // c4,c5 - delta
    sqxtun      v0.8b, v0.8h
    sqxtun      v1.8b, v2.8h

    // ---- transpose back so every halfword is one row's byte pair ------------
    //   v0.h = { row0 c2c3, row2 c2c3, row0 c4c5, row2 c4c5 }
    //   v1.h = { row1 c2c3, row3 c2c3, row1 c4c5, row3 c4c5 }
    trn1        v29.2s, v0.2s, v1.2s
    trn2        v1.2s, v0.2s, v1.2s
    mov         v0.d[0], v29.d[0]
    trn1        v29.8b, v0.8b, v1.8b
    trn2        v1.8b, v0.8b, v1.8b
    mov         v0.d[0], v29.d[0]

    // ---- bytes -2,-1 of each row, only when filter_flag_p != 0 --------------
    cbz         w12, .Ldcv_skip_p
    sub         x2,x0,#2
    st1         {v0.h}[0],[x2],x1
    st1         {v1.h}[0],[x2],x1
    st1         {v0.h}[1],[x2],x1
    st1         {v1.h}[1],[x2]
.Ldcv_skip_p:
    // ---- bytes 0,1 of each row, only when filter_flag_q != 0 ----------------
    cbz         w4, .Ldcv_skip_q
    st1         {v0.h}[2],[x0],x1
    st1         {v1.h}[2],[x0],x1
    st1         {v0.h}[3],[x0],x1
    st1         {v1.h}[3],[x0]
.Ldcv_skip_q:
    ldp         x19, x20,[sp],#16
    pop_v_regs
    ret