///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction  interpolation.
//* functions are coded using neon intrinsics and can be compiled using
//* rvct
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
///**
//*******************************************************************************
//*
//* //brief
//*   chroma interprediction filter for vertical input
//*
//* //par description:
//*    applies a 4-tap vertical filter with coefficients pointed to by
//*    'pi1_coeff' to the elements pointed to by 'pu1_src' and writes to the
//*    location pointed to by 'pu1_dst'. the output is rounded, down shifted
//*    by 6 and clipped to 8 bits.
//*    assumptions : the function is optimized considering the fact that width
//*    is a multiple of 2, 4 or 8, and that height is a multiple of 2.
//*    widths of 4 and 8 are optimized further.
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
//                                   uword8 *pu1_dst,
//                                   word32 src_strd,
//                                   word32 dst_strd,
//                                   word8 *pi1_coeff,
//                                   word32 ht,
//                                   word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 =>  src_strd
//x3 =>  dst_strd
//x4 => *pi1_coeff
//x5 =>  ht
//x6 =>  wd
.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_av8

.type ihevc_inter_pred_chroma_vert_av8, %function

//------------------------------------------------------------------------------
// ihevc_inter_pred_chroma_vert_av8
//
// 4-tap vertical chroma interpolation filter, aarch64 neon, leaf function.
//
// in (as passed by the caller):
//   x0 = pu1_src     source pointer
//   x1 = pu1_dst     destination pointer
//   w2 = src_strd    source stride      (32-bit, sign-extended below)
//   w3 = dst_strd    destination stride (32-bit, sign-extended below)
//   x4 = pi1_coeff   signed 8-bit coefficients; 8 bytes are loaded, only the
//                    first 4 are used
//   w5 = ht          number of rows     (32-bit, sign-extended below)
//   w6 = wd          width; every row processed is 2*wd bytes long
//                    (32-bit, sign-extended below)
//
// per output byte, with cN = |pi1_coeff[N]| and s[k] the source byte k rows
// away from the current row:
//   acc = c1*s[0] - c0*s[-1] + c2*s[1] - c3*s[2]      (16-bit lanes)
//   dst = sqrshrun(acc, 6)   i.e. rounded >> 6, saturated to unsigned 8 bits
//
// register use after setup:
//   x0  = source pointer, starts at pu1_src - src_strd (row -1)
//   x1  = destination pointer
//   x4  = rows remaining
//   x10 = 2*wd (bytes per row)
//   v0..v3 = c0..c3 replicated into every byte lane
//
// three code paths:
//   (wd & 3) != 0              -> outer_loop_wd_2 : 4 bytes x 2 rows per pass
//   (wd & 3) == 0, (ht&7) != 0 -> inner_loop_ht_2 : 8 bytes x 2 rows per pass
//   (wd & 3) == 0, (ht&7) == 0 -> core_loop_ht_8  : 8 bytes x 4 rows per pass,
//                                                   software pipelined
//
// clobbers: x0, x1, x4-x12, x14, v0-v7, v16-v18, v24, v26, v28, v30, flags
//           (x19/x20 are saved and restored; only x20 is used, as scratch)
//------------------------------------------------------------------------------
ihevc_inter_pred_chroma_vert_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is scratch below; saving the pair keeps sp 16-byte aligned

    // The 32-bit arguments are used as 64-bit values everywhere below
    // (address arithmetic, compares, mul). AAPCS64 leaves the upper 32 bits
    // of such argument registers unspecified, so widen them explicitly.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    mov         x12,x4                      //pi1_coeff
    sxtw        x4,w5                       //ht
    sxtw        x6,w6                       //wd

    cmp         x4,#0                       //checks ht <= 0
    sub         x0,x0,x2                    //pu1_src - src_strd (row -1 is the first filter tap)
    ld1         {v0.8b},[x12]               //loads pi1_coeff

    ble         end_loops                   //nothing to do when ht <= 0

    cmp         x6,#0                       //checks wd <= 0
    ble         end_loops                   //every loop below stores one column before testing its counter

    tst         x6,#3                       //checks (wd & 3)
    abs         v3.8b, v0.8b                //vabs_s8(coeff)
    lsl         x10,x6,#1                   //2*wd
    dup         v0.8b, v3.b[0]              //coeffabs_0
    dup         v1.8b, v3.b[1]              //coeffabs_1
    dup         v2.8b, v3.b[2]              //coeffabs_2
    dup         v3.8b, v3.b[3]              //coeffabs_3

    bne         outer_loop_wd_2             //(wd & 3) != 0 : 4-byte column path (flags from tst above)

    tst         x4,#7                       //checks ht for mul of 8
    beq         core_loop_ht_8              //when height is multiple of 8

    lsl         x7,x3,#1                    //2*dst_strd
    sub         x9,x7,x10                   //2*dst_strd - 2wd
    lsl         x12,x2,#1                   //2*src_strd
    sub         x8,x12,x10                  //2*src_strd - 2wd
    mov         x5,x10                      //2wd : bytes left in the current pair of rows

inner_loop_ht_2:                            //wd is multiple of 4, ht is not a multiple of 8 : 8 bytes x 2 rows per pass

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v17.8b},[x6],x2            //loads row 0
    subs        x5,x5,#8                    //2wd - 8 (flags consumed by the first bgt below)
    ld1         {v5.8b},[x0],#8             //loads row -1, advances to the next column
    umull       v6.8h, v17.8b, v1.8b        //res0  = row0 * coeffabs_1
    ld1         {v4.8b},[x6],x2             //loads row 1
    umlsl       v6.8h, v5.8b, v0.8b         //res0 -= row-1 * coeffabs_0
    ld1         {v16.8b},[x6],x2            //loads row 2
    umlal       v6.8h, v4.8b, v2.8b         //res0 += row1 * coeffabs_2
    umull       v4.8h, v4.8b, v1.8b         //res1  = row1 * coeffabs_1 (v4 is reused as accumulator)
    umlsl       v6.8h, v16.8b, v3.8b        //res0 -= row2 * coeffabs_3
    umlsl       v4.8h, v17.8b, v0.8b        //res1 -= row0 * coeffabs_0
    ld1         {v18.8b},[x6]               //loads row 3
    umlal       v4.8h, v16.8b, v2.8b        //res1 += row2 * coeffabs_2
    sqrshrun    v6.8b, v6.8h,#6             //rounds, shifts right by 6, saturates to u8
    umlsl       v4.8h, v18.8b, v3.8b        //res1 -= row3 * coeffabs_3
    add         x6,x1,x3                    //pu1_dst + dst_strd
    sqrshrun    v4.8b, v4.8h,#6             //rounds, shifts right by 6, saturates to u8
    st1         {v6.8b},[x1],#8             //stores output row 0

    st1         {v4.8b},[x6]                //stores output row 1

    bgt         inner_loop_ht_2             //more columns in this pair of rows

    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += (2*dst_strd - 2wd)
    mov         x5,x10                      //2wd
    add         x0,x0,x8                    //pu1_src += (2*src_strd - 2wd)

    bgt         inner_loop_ht_2             //more rows

    b           end_loops                   //jumps to end

outer_loop_wd_2:                            //(wd & 3) != 0 : 4 bytes x 2 rows per pass
    lsl         x5,x3,#1                    //2*dst_strd
    mov         x12,x10                     //2wd : bytes left in the current pair of rows
    sub         x9,x5,x10                   //2*dst_strd - 2wd
    lsl         x7,x2,#1                    //2*src_strd
    sub         x8,x7,x10                   //2*src_strd - 2wd

inner_loop_wd_2:

    // Two output rows are computed at once: lane s[0] of every operand
    // belongs to output row 0 and lane s[1] to output row 1.
    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v6.s}[0],[x0]              //v6 = {row -1, ...}
    subs        x12,x12,#4                  //2wd - 4 (flags consumed by the first bgt below)
    add         x0,x0,#4                    //pu1_src + 4
    ld1         {v6.s}[1],[x6],x2           //v6 = {row -1, row 0}
    dup         v7.2s, v6.s[1]
    ld1         {v7.s}[1],[x6],x2           //v7 = {row 0, row 1}
    umull       v4.8h, v7.8b, v1.8b         //res  = {row0,row1} * coeffabs_1
    dup         v7.2s, v7.s[1]
    ld1         {v7.s}[1],[x6],x2           //v7 = {row 1, row 2}
    umlsl       v4.8h, v6.8b, v0.8b         //res -= {row-1,row0} * coeffabs_0
    umlal       v4.8h, v7.8b, v2.8b         //res += {row1,row2} * coeffabs_2
    dup         v7.2s, v7.s[1]
    ld1         {v7.s}[1],[x6]              //v7 = {row 2, row 3}
    add         x6,x1,x3                    //pu1_dst + dst_strd
    umlsl       v4.8h, v7.8b, v3.8b         //res -= {row2,row3} * coeffabs_3
    sqrshrun    v4.8b, v4.8h,#6             //rounds, shifts right by 6, saturates to u8
    st1         {v4.s}[0],[x1]              //stores output row 0
    add         x1,x1,#4                    //pu1_dst += 4
    st1         {v4.s}[1],[x6]              //stores output row 1

    bgt         inner_loop_wd_2             //more columns in this pair of rows

    //inner loop ends
    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x10                     //2wd
    add         x0,x0,x8                    //pu1_src += 2*src_strd - 2*wd

    bgt         inner_loop_wd_2             //more rows

    b           end_loops                   //jumps to end

core_loop_ht_8:                             //wd is multiple of 4 and ht is multiple of 8 : 8 bytes x 4 rows per block

    lsl         x12,x3,#2                   //4*dst_strd
    sub         x8,x12,x10                  //4*dst_strd - 2wd
    lsl         x12,x2,#2                   //4*src_strd
    sub         x9,x12,x10                  //4*src_strd - 2wd

    bic         x5,x10,#7                   //x5 = 2wd rounded down to a multiple of 8 : column counter
    lsr         x14, x10, #3                //2wd / 8 : 8-byte columns per row
    mul         x12, x4 , x14               //ht * columns = 4 * number of blocks
    sub         x12, x12,#4                 //leave one block (counter step is 4) for the epilog

prolog:
    // Computes the first block and preloads the rows of the second one.
    // The 'le' conditions of the csel instructions below all refer to the
    // flags of 'subs x5' : a band of 4 rows has been completed.
    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v5.8b},[x6],x2             //loads row 0
    subs        x5,x5,#8                    //2wd - 8
    ld1         {v4.8b},[x0],#8             //loads row -1, advances to the next column
    ld1         {v6.8b},[x6],x2             //loads row 1
    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
    ld1         {v7.8b},[x6],x2             //loads row 2
    umlsl       v30.8h, v4.8b, v0.8b
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umlal       v30.8h, v6.8b, v2.8b
    umlsl       v30.8h, v7.8b, v3.8b
    ld1         {v16.8b},[x6],x2            //loads row 3

    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
    add         x20,x0,x9                   //pu1_src + 4*src_strd - 2*wd
    csel        x0, x20, x0,le              //move to the next band of rows when the columns are exhausted
    umlsl       v28.8h, v5.8b, v0.8b
    bic         x20,x10,#7                  //2wd rounded down to a multiple of 8
    csel        x5, x20, x5,le              //reload the column counter for the next band
    umlal       v28.8h, v7.8b, v2.8b
    ld1         {v17.8b},[x6],x2            //loads row 4
    umlsl       v28.8h, v16.8b, v3.8b
    sqrshrun    v30.8b, v30.8h,#6

    ld1         {v18.8b},[x6],x2            //loads row 5
    umull       v26.8h, v7.8b, v1.8b
    add         x6,x0,x2                    //pu1_src + src_strd (next block)
    umlsl       v26.8h, v6.8b, v0.8b
    st1         {v30.8b},[x1],#8            //stores output row 0, advances to the next column
    umlal       v26.8h, v16.8b, v2.8b
    ld1         {v4.8b},[x0],#8             //loads row -1 of the next block
    umlsl       v26.8h, v17.8b, v3.8b
    sqrshrun    v28.8b, v28.8h,#6

    add         x20,x1,x8                   //pu1_dst + 4*dst_strd - 2*wd
    csel        x1, x20, x1,le              //move to the next band of rows when the columns are exhausted
    umull       v24.8h, v16.8b, v1.8b
    ld1         {v5.8b},[x6],x2             //loads row 0 of the next block
    umlsl       v24.8h, v7.8b, v0.8b
    subs        x12,x12,#4                  //one block accounted for (flags consumed by ble below)
    ld1         {v6.8b},[x6],x2             //loads row 1 of the next block
    umlal       v24.8h, v17.8b, v2.8b
    ld1         {v7.8b},[x6],x2             //loads row 2 of the next block
    umlsl       v24.8h, v18.8b, v3.8b

    st1         {v28.8b},[x7],x3            //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6
    sub         x20,x2,x2,lsl #3            //-7*src_strd
    neg         x11, x20                    //x11 = 7*src_strd : initial prefetch offset
    add         x14,x2,x2,lsl #1            //3*src_strd
    add         x14,x14,x11                 //x14 = 10*src_strd : prefetch offset limit
    ble         epilog                      //only two blocks in total : skip the steady-state loop

kernel_8:
    // Steady state: finishes the stores of the previous block, computes the
    // current block and preloads the next one. Flags of 'subs x5' stay live
    // until 'cmp x11,x14'; flags of 'subs x12' feed the closing bgt.

    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
    subs        x5,x5,#8                    //2wd - 8
    umlsl       v30.8h, v4.8b, v0.8b
    add         x20,x0,x9                   //pu1_src + 4*src_strd - 2*wd
    csel        x0, x20, x0,le              //move to the next band of rows when the columns are exhausted
    umlal       v30.8h, v6.8b, v2.8b
    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,le              //restart the prefetch offset on a new band
    //rsble        x11,x2,x2,lsl #3
    umlsl       v30.8h, v7.8b, v3.8b
    st1         {v26.8b},[x7],x3            //stores output row 2 of the previous block
    sqrshrun    v24.8b, v24.8h,#6

    ld1         {v16.8b},[x6],x2            //load and increment

    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
    bic         x20,x10,#7                  //2wd rounded down to a multiple of 8
    csel        x5, x20, x5,le              //reload the column counter for the next band
    umlsl       v28.8h, v5.8b, v0.8b
    st1         {v24.8b},[x7],x3            //stores output row 3 of the previous block

    umlal       v28.8h, v7.8b, v2.8b

    ld1         {v17.8b},[x6],x2
    sqrshrun    v30.8b, v30.8h,#6

    umlsl       v28.8h, v16.8b, v3.8b
    ld1         {v18.8b},[x6],x2
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umull       v26.8h, v7.8b, v1.8b
    add         x6,x0,x2                    //pu1_src + src_strd (next block)

    add         x20,x0, x11
    prfm        PLDL1KEEP,[x20]             //prefetch source at offset x11 from the next block


    umlsl       v26.8h, v6.8b, v0.8b
    ld1         {v4.8b},[x0],#8             //loads row -1 of the next block

    umlal       v26.8h, v16.8b, v2.8b
    st1         {v30.8b},[x1],#8            //stores output row 0, advances to the next column

    umlsl       v26.8h, v17.8b, v3.8b
    ld1         {v5.8b},[x6],x2             //loads row 0 of the next block

    add         x11,x11,x2                  //advance the prefetch offset by one row
    sqrshrun    v28.8b, v28.8h,#6

    umull       v24.8h, v16.8b, v1.8b
    ld1         {v6.8b},[x6],x2             //loads row 1 of the next block
    add         x20,x1,x8                   //pu1_dst + 4*dst_strd - 2*wd
    csel        x1, x20, x1,le              //move to the next band of rows when the columns are exhausted

    cmp         x11,x14                     //prefetch offset beyond 10*src_strd ?
    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,gt              //wrap the prefetch offset back to 7*src_strd
    //rsbgt        x11,x2,x2,lsl #3

    umlsl       v24.8h, v7.8b, v0.8b
    subs        x12,x12,#4                  //one block accounted for

    umlal       v24.8h, v17.8b, v2.8b
    ld1         {v7.8b},[x6],x2             //loads row 2 of the next block

    umlsl       v24.8h, v18.8b, v3.8b
    st1         {v28.8b},[x7],x3            //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6

    bgt         kernel_8                    //jumps to kernel_8

epilog:
    // Drains the pipeline: stores the two pending rows of the previous block
    // and computes the last block from the rows that were already preloaded.

    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
    umlsl       v30.8h, v4.8b, v0.8b
    umlal       v30.8h, v6.8b, v2.8b
    umlsl       v30.8h, v7.8b, v3.8b
    st1         {v26.8b},[x7],x3            //stores output row 2 of the previous block
    sqrshrun    v24.8b, v24.8h,#6

    ld1         {v16.8b},[x6],x2            //load and increment
    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
    umlsl       v28.8h, v5.8b, v0.8b
    umlal       v28.8h, v7.8b, v2.8b
    umlsl       v28.8h, v16.8b, v3.8b
    st1         {v24.8b},[x7],x3            //stores output row 3 of the previous block
    sqrshrun    v30.8b, v30.8h,#6

    ld1         {v17.8b},[x6],x2
    umull       v26.8h, v7.8b, v1.8b
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umlsl       v26.8h, v6.8b, v0.8b
    st1         {v30.8b},[x1],#8            //stores output row 0

    sqrshrun    v28.8b, v28.8h,#6
    umlal       v26.8h, v16.8b, v2.8b
    ld1         {v18.8b},[x6],x2
    umlsl       v26.8h, v17.8b, v3.8b

    umull       v24.8h, v16.8b, v1.8b
    sqrshrun    v26.8b, v26.8h,#6
    st1         {v28.8b},[x7],x3            //stores output row 1
    umlsl       v24.8h, v7.8b, v0.8b
    umlal       v24.8h, v17.8b, v2.8b
    st1         {v26.8b},[x7],x3            //stores output row 2
    umlsl       v24.8h, v18.8b, v3.8b

    sqrshrun    v24.8b, v24.8h,#6
    st1         {v24.8b},[x7],x3            //stores output row 3
end_loops:
    ldp         x19, x20,[sp],#16           //restore the callee-saved pair pushed on entry

    ret