///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_filters_dc.s
//*
//* @brief
//*  contains function definitions for intra prediction dc filtering.
//* functions are coded using neon instructions and can be compiled using
//* rvct
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_dc_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*    luma intraprediction filter for dc input
//*
//* @par description:
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  word8 pointer to the planar coefficients (not a parameter of the
//*  prototype shown below)
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  type of filtering
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
//                              word32 src_strd,
//                              uword8 *pu1_dst,
//                              word32 dst_strd,
//                              word32 nt,
//                              word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd

//x4 => nt
//x5 => mode (never read; x5 is reused as a scratch register)
//(pi1_coeff is not part of the prototype above)

.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_luma_dc_av8

.type ihevc_intra_pred_luma_dc_av8, %function

//------------------------------------------------------------------------------
// void ihevc_intra_pred_luma_dc_av8(uword8 *pu1_ref, word32 src_strd,
//                                   uword8 *pu1_dst, word32 dst_strd,
//                                   word32 nt,       word32 mode)
//
// ABI   : AAPCS64, leaf function, does not touch the stack
// In    : x0 = pu1_ref, w1 = src_strd (never read), x2 = pu1_dst,
//         w3 = dst_strd, w4 = nt, w5 = mode (never read)
// Out   : nt rows of nt bytes written at pu1_dst, rows dst_strd bytes apart
// Clobb : x0, x2-x12, x14, v0-v7, v16-v30, flags
//
// dc_val = (sum(ref[nt .. 2nt-1]) + sum(ref[2nt+1 .. 3nt]) + nt) >> (log2nt + 1)
//
// nt == 32      : every output byte is dc_val (prologue_cpy_32).
// nt == 4/8/16  : dst[0][0] = (ref[2nt-1] + 2*dc_val + ref[2nt+1] + 2) >> 2
//                 dst[0][c] = (ref[2nt+1+c] + 3*dc_val + 2) >> 2
//                 dst[r][0] = (ref[2nt-1-r] + 3*dc_val + 2) >> 2
//                 everything else is dc_val.
// Only nt = 4, 8, 16 and 32 have a code path here.
//
// All loads are 8 bytes wide, so some bytes beyond the ranges listed above are
// read and discarded. NOTE(review): assumes pu1_ref is large enough for those
// reads (4*nt + 1 bytes would cover them) - confirm against the caller.
//------------------------------------------------------------------------------
ihevc_intra_pred_luma_dc_av8:

    // dst_strd and nt are 32-bit arguments; AAPCS64 leaves bits 63:32 of their
    // registers unspecified, and both are used below in 64-bit address
    // arithmetic and compares, so widen them first.
    sxtw        x3, w3                      //x3 = (word64)dst_strd
    sxtw        x4, w4                      //x4 = (word64)nt

    mov         x11, #2                     //rounding term, added to 2dc_val & 3dc_val sums
    mov         x9, #0
    mov         v17.s[0], w11
    mov         v17.s[1], w9                //d17 = 2

    clz         w5,w4

    add         x6, x0, x4                  //&src[nt]
    sub         w5, w5, #32                 //clz(nt) - 32 = -(log2nt + 1)
    add         x7, x0, x4, lsl #1          //&src[2nt]

    add         x8, x7, #1                  //&src[2nt+1]
    dup         v7.2s,w5

    ldrb        w14, [x8]                   //src[2nt+1] (ldrb zero-extends into x14)
    shl         d7, d7,#32

    sub         x9, x7, #1                  //&src[2nt-1]
    sshr        d7, d7,#32                  //d7 = sign-extended -(log2nt + 1), shift count for sshl

    mov         x7, x8                      //x7 keeps &src[2nt+1] while x8 is advanced

    ldrb        w12, [x9]                   //src[2nt-1]
    add         x14, x14, x12               //src[2nt+1] + src[2nt-1]
    add         x14, x14, x11               //src[2nt+1] + src[2nt-1] + 2

    cmp         x4, #4
    beq         dc_4

    mov         x10, x4                     //bytes left to sum per neighbour array

    // Sum nt bytes from src[nt] and nt bytes from src[2nt+1], 8 at a time.
    // The loads for the next iteration are issued before the loop test, so
    // one extra 8-byte load from each pointer is always performed.
add_loop:
    ld1         {v0.8b},[x6],#8             //load from src[nt]
    mov         x5, #0                      //
    ld1         {v1.8b},[x8],#8             //load from src[2nt+1]

    uaddlp      v2.4h,  v0.8b

    mov         v6.s[0], w4
    mov         v6.s[1], w5                 //d6 = nt, the rounding term of the dc sum
    uaddlp      v3.4h,  v1.8b

    ld1         {v0.8b},[x6],#8             //load from src[nt] (extra load for 8)

    ld1         {v1.8b},[x8],#8             //load from src[2nt+1] (extra load for 8)
    add         v4.4h,  v2.4h ,  v3.4h


    uaddlp      v5.2s,  v4.4h


    uadalp      v6.1d,  v5.2s               //accumulate all inp into d6 (end for nt==8)

    subs        x10, x10,#8
    beq         epil_add_loop

core_loop_add:
    uaddlp      v2.4h,  v0.8b
    subs        x10, x10,#8
    uaddlp      v3.4h,  v1.8b



    add         v4.4h,  v2.4h ,  v3.4h
    ld1         {v0.8b},[x6],#8             //load from src[nt] (extra load for 16)

    uaddlp      v5.2s,  v4.4h
    ld1         {v1.8b},[x8],#8             //load from src[2nt+1] (extra load for 16)

    uadalp      v6.1d,  v5.2s               //accumulate all inp into d6
    bne         core_loop_add               //flags from the subs above (vector ops leave them alone)

epil_add_loop:

    sshl        d18, d6, d7                 //dc_val = sum >> (log2nt + 1)  (negative count shifts right)
    cmp         x4, #32

    mov         v28.s[0], w14
    mov         v28.s[1], w5                //d28 = src[2nt+1]+2+src[2nt-1]  (x5 is 0 here)

    dup         v16.8b, v18.b[0]            //dc_val in every byte
    shl         d25, d18,#1                 //2*dc

    beq         prologue_cpy_32             //nt == 32: plain fill, no edge filtering

    add         d27,  d25 ,  d28            //src[2nt+1]+2+src[2nt-1]+2dc_val

    ushr        v29.4h, v27.4h,#2           //final dst[0]'s value in v29 lane 0
    mov         x10, x4                     //column counter = nt

    add         d23,  d25 ,  d18            //3*dc
    sub         x12, x3, x3, lsl #3         //-7*strd

    add         d23,  d23 ,  d17            //3*dc + 2
    add         x12, x12, #8                //offset after one 8x8 block (-7*strd + 8)

    dup         v24.8h, v23.h[0]            //3*dc + 2 (moved to all lanes)
    sub         x0, x3, x4                  //strd - nt

prologue_col:
    //0th column and 0-7 rows done here
    //x8 and x9 (2nt+1+col 2nt-1-row)

    mov         x8, x7                      //&src[2nt+1]

    add         x0, x0, #8                  //strd - nt + 8
    ld1         {v0.8b},[x8],#8             //col 0::7 load (prol)
    sub         x9, x9, #7                  //&src[2nt-8]

    ld1         {v1.8b},[x9]                //row 7::0 load (prol)
    sub         x9, x9, #8                  //&src[2nt-16]

    uxtl        v20.8h, v0.8b

    ld1         {v6.8b},[x8]                //col 8::15 load (prol extra)
    add         v20.8h,  v20.8h ,  v24.8h   //col 0::7 add 3dc+2 (prol)

    uxtl        v22.8h, v1.8b
    sqshrun     v2.8b, v20.8h,#2            //columns shx2 movn (prol)

    uxtl        v26.8h, v6.8b
    add         v22.8h,  v22.8h ,  v24.8h   //row 7::0 add 3dc+2 (prol)

    movi        d19, #0x00000000000000ff    //select mask: byte 0 only
    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)

    bsl         v19.8b,  v29.8b ,  v2.8b    //first row: byte 0 = dst[0], rest = filtered top
    add         v26.8h,  v26.8h ,  v24.8h   //col 8::15 add 3dc+2 (prol extra)

    rev64       v3.8b,  v3.8b               //byte k of d3 now belongs to row k

    st1         {v19.8b},[x2], x3           //store row 0 (prol)
    sshr        d3, d3,#8                   //drop row 0's byte (row 0 uses dst[0] instead)

    movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)

    // One pass writes rows 1..7 of the current 8-row band in the left 8
    // columns: byte 0 comes from d3 (filtered left neighbour), the other seven
    // bytes are dc_val. d3 is shifted right by one byte after every row.
    // nt == 8 runs this once; nt == 16 runs it a second time for rows 9..15.
loop_again_col_row:

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 1    (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)
    sshr        d3, d3,#8                   //row 1 shift (prol)

    st1         {v20.8b},[x2], x3           //store row 1 (prol)
    sqshrun     v4.8b, v26.8h,#2            //columns shx2 movn (prol extra)


    bsl         v21.8b,  v3.8b ,  v16.8b    //row 2 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)
    sshr        d3, d3,#8                   //row 2 shift (prol)

    st1         {v21.8b},[x2], x3           //store row 2 (prol)


    bsl         v20.8b,  v3.8b ,  v16.8b    //row 3    (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 4 (prol)
    sshr        d3, d3,#8                   //row 3 shift (prol)

    st1         {v20.8b},[x2], x3           //store row 3 (prol)


    bsl         v21.8b,  v3.8b ,  v16.8b    //row 4 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 5 (prol)
    sshr        d3, d3,#8                   //row 4 shift (prol)

    st1         {v21.8b},[x2], x3           //store row 4 (prol)


    bsl         v20.8b,  v3.8b ,  v16.8b    //row 5 (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 6 (prol)
    sshr        d3, d3,#8                   //row 5 shift (prol)

    st1         {v20.8b},[x2], x3           //store row 5 (prol)

    ld1         {v1.8b},[x9]                //row 15::8 load (prol extra)

    bsl         v21.8b,  v3.8b ,  v16.8b    //row 6 (prol)

    uxtl        v22.8h, v1.8b

    movi        d20, #0x00000000000000ff    //byte mask row 7 (prol)
    sshr        d3, d3,#8                   //row 6 shift (prol)

    st1         {v21.8b},[x2], x3           //store row 6 (prol)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 7 (prol)
    add         v22.8h,  v22.8h ,  v24.8h   //row 15::8 add 3dc+2 (prol extra)

    sshr        d3, d3,#8                   //row 7 shift (prol)
    st1         {v20.8b},[x2], x12          //store row 7, then move to the band's first row, col 8

    subs        x10, x10, #8                //counter for cols

    beq         end_func                    //nt == 8: block complete
    blt         copy_16                     //nt == 16, second pass: only the lower-right 8x8 is left


    // nt == 16, first pass: write the upper-right 8x8, then row 8 of the
    // left half, and go round again for rows 9..15.
    movi        d20, #0x00000000000000ff    //byte mask row 8 (prol)
    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)

    rev64       v3.8b,  v3.8b               //byte k of d3 now belongs to row 8+k

    st1         {v4.8b},[x2], x3            //row 0, cols 8::15 (filtered top neighbours)

    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x0           //row 7 cols 8::15, then on to row 8 col 0 (strd - nt + 8)


    bsl         v20.8b,  v3.8b ,  v16.8b    //row 8    (prol)
    subs        x10, x10, #8

    st1         {v20.8b},[x2], x3           //store row 8 (prol)
    sshr        d3, d3,#8                   //row 8 shift (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 9 (prol)

    b           loop_again_col_row


    // nt == 16: lower-right 8x8, all dc_val
copy_16:
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2]

    b           end_func

    // nt == 32: fill 32 rows of 32 bytes with dc_val. Four row pointers
    // (x2, x5, x8, x10) each write 2 x 16 bytes and then jump four rows down.
    // Prologue writes 4 rows, each kernel_copy pass 8 rows (three passes),
    // epilogue the last 4 rows.
prologue_cpy_32:
    mov         x9, #128                    //loop counter, decremented by 32 per step
    //sub        x7, x3, #-24
    add         x5, x2, x3                  //row 1
    add         x8, x5, x3                  //row 2
    add         x10, x8, x3                 //row 3
    dup         v20.16b, v16.b[0]           //dc_val in all 16 bytes
    lsl         x6, x3, #2
    sub         x6, x6, #16                 //4*strd - 16: from col 16 to col 0 four rows down

    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    sub         x9, x9, #32                 //32x32 prol/epil counter dec

kernel_copy:
    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    subs        x9, x9, #32

    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    bne         kernel_copy                 //flags from the subs above (stores leave them alone)

epilogue_copy:
    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2]
    st1         {v20.16b}, [x5]
    st1         {v20.16b}, [x8]
    st1         {v20.16b}, [x10]

    b           end_func


    // nt == 4: same computation on a single 4x4 block. The 8-byte loads bring
    // in four bytes more than needed; the surplus is masked off or shifted out.
dc_4:
    ld1         {v0.8b},[x6],#8             //load from src[nt]
    ld1         {v1.8b},[x8],#8             //load from src[2nt+1]

    uaddlp      v2.4h,  v0.8b
    mov         x5, #0                      //
    mov         v6.s[0], w4
    mov         v6.s[1], w5                 //d6 = nt, the rounding term of the dc sum
    uaddlp      v3.4h,  v1.8b

    add         v4.4h,  v2.4h ,  v3.4h


    uaddlp      v5.2s,  v4.4h               //lane 0 = sum of the 4 left + 4 top bytes
    movi        d30, #0x00000000ffffffff

    and         v5.8b,  v5.8b ,  v30.8b     //discard lane 1 (bytes beyond the 4 wanted ones)

    mov         v28.s[0], w14
    mov         v28.s[1], w5                //d28 = src[2nt+1]+2+src[2nt-1]
    add         d6,  d6 ,  d5               //accumulate all inp into d6

    sshl        d18, d6, d7                 //dc_val = sum >> (log2nt + 1)  (negative count shifts right)
    mov         x8, x7                      //&src[2nt+1]

    shl         d25, d18,#1                 //2*dc
    sub         x9, x9, #3                  //&src[2nt-4]

    dup         v16.8b, v18.b[0]            //dc_val in every byte
    add         d27,  d25 ,  d28            //src[2nt+1]+2+src[2nt-1]+2dc_val

    ushr        v29.4h, v27.4h,#2           //final dst[0]'s value in v29 lane 0
    add         d23,  d25 ,  d18            //3*dc

    add         d23,  d23 ,  d17            //3*dc + 2

    dup         v24.8h, v23.h[0]            //3*dc + 2 (moved to all lanes)


    ld1         {v0.8b},[x8]                //col 0::3 load (upper 4 bytes unused)
    ld1         {v1.8b},[x9]                //row 3::0 load (upper 4 bytes unused)

    uxtl        v20.8h, v0.8b

    uxtl        v22.8h, v1.8b
    add         v20.8h,  v20.8h ,  v24.8h   //col 0::3 add 3dc+2 (prol)

    add         v22.8h,  v22.8h ,  v24.8h   //row 3::0 add 3dc+2 (prol)

    movi        d19, #0x00000000000000ff    //select mask: byte 0 only
    sqshrun     v2.8b, v20.8h,#2            //columns shx2 movn (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)
    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)


    bsl         v19.8b,  v29.8b ,  v2.8b    //first row: byte 0 = dst[0], rest = filtered top

    rev64       v3.8b,  v3.8b               //rows 0..3 now sit in bytes 4..7 of d3

    st1         {v19.s}[0],[x2], x3         //store row 0 (prol)
    sshr        d3, d3,#40                  //bring row 1's byte down to byte 0

    movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 1    (prol)
    sshr        d3, d3,#8                   //row 1 shift (prol)

    st1         {v20.s}[0],[x2], x3         //store row 1 (prol)

    bsl         v21.8b,  v3.8b ,  v16.8b    //row 2 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)

    sshr        d3, d3,#8                   //row 2 shift (prol)
    st1         {v21.s}[0],[x2], x3         //store row 2 (prol)

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 3    (prol)
    st1         {v20.s}[0],[x2]             //store row 3 (prol)

epilogue_end:
end_func:
    // Nothing to restore: only caller-saved registers were used.
    ret