@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_dc.s
@*
@* @brief
@*  contains function definitions for intra prediction dc filtering.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_dc_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intra prediction filter for dc input
@*
@* @par description:
@*  computes the dc value from the neighbouring reference samples, fills
@*  the block with it and smooths the first row and column (nt < 32)
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
@                              word32 src_strd,
@                              uword8 *pu1_dst,
@                              word32 dst_strd,
@                              word32 nt,
@                              word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40 (sp + 40 after the stmfd of r4-r12, r14)
@   nt
@   mode
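
@for reference, a rough c sketch of the arithmetic implemented below
@(illustrative only, not part of the build; the pu1_ref layout -- left
@samples at src[nt..2nt-1], corner at src[2nt], top at src[2nt+1..3nt] --
@is inferred from the loads in this file):
@
@   sum = nt;                                     /* rounding term      */
@   for (i = 0; i < nt; i++)
@       sum += pu1_ref[nt + i]                    /* left neighbours    */
@            + pu1_ref[2 * nt + 1 + i];           /* top neighbours     */
@   dc = sum >> (log2nt + 1);                     /* sum / (2 * nt)     */
@
@   for (row = 0; row < nt; row++)                /* bulk dc fill       */
@       for (col = 0; col < nt; col++)
@           pu1_dst[row * dst_strd + col] = dc;
@
@   if (nt < 32)                                  /* boundary smoothing */
@   {
@       pu1_dst[0] = (pu1_ref[2*nt - 1] + 2*dc + pu1_ref[2*nt + 1] + 2) >> 2;
@       for (col = 1; col < nt; col++)            /* first row          */
@           pu1_dst[col] = (pu1_ref[2*nt + 1 + col] + 3*dc + 2) >> 2;
@       for (row = 1; row < nt; row++)            /* first column       */
@           pu1_dst[row * dst_strd] = (pu1_ref[2*nt - 1 - row] + 3*dc + 2) >> 2;
@   }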

.text
.align 4




.globl ihevc_intra_pred_luma_dc_a9q

.type ihevc_intra_pred_luma_dc_a9q, %function

ihevc_intra_pred_luma_dc_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt


    mov         r11, #2                     @r11 = 2 (rounding constant, added to 2*dc and 3*dc)
    mov         r9, #0
    vmov        d17, r11, r9

    clz         r5, r4

    add         r6, r0, r4                  @&src[nt]
    rsb         r5, r5, #32                 @r5 = 32 - clz(nt) = log2nt + 1
    add         r7, r0, r4, lsl #1          @&src[2nt]

    add         r8, r7, #1                  @&src[2nt+1]
    mvn         r5, r5
    add         r5, r5, #1                  @r5 = -(log2nt + 1)
    vdup.32     d8, r5

    ldrb        r14, [r8]
    vshl.i64    d8, d8, #32

    sub         r9, r7, #1                  @&src[2nt-1]
    vshr.s64    d8, d8, #32
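    @d8 = -(log2nt + 1) sign extended to 64 bits: neon has no variable
    @right shift, so vshl.s64 with a negative count is used instead; the
    @later vshl.s64 d9, d6, d8 thus computes sum >> (log2nt + 1), i.e.
    @sum / (2*nt). worked example: nt = 8 -> clz = 28 -> 32 - 28 = 4 ->
    @count = -4 -> sum >> 4 divides by 16 = 2*nt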

    mov         r7, r8                      @r7 keeps &src[2nt+1] for later reuse

    ldrb        r12, [r9]
    add         r14, r14, r12               @src[2nt+1] + src[2nt-1]
    add         r14, r14, r11               @src[2nt+1] + src[2nt-1] + 2

    cmp         r4, #4
    beq         dc_4

    mov         r10, r4                     @nt

add_loop:
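    @the per-iteration sum is a pairwise-widening tree: vpaddl.u8 folds
    @8 x u8 into 4 x u16, vpaddl.u16 folds those into 2 x u32, and
    @vpadal.u32 accumulates both u32 lanes into the u64 total in d6;
    @scalar equivalent (illustrative): per chunk of 8,
    @   sum += left[i] + top[i], i = 0..7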
    vld1.s8     d0, [r6]!                   @load from src[nt]
    mov         r5, #0                      @r5 = 0 (upper half of the accumulator seed)
    vld1.s8     d1, [r8]!                   @load from src[2nt+1]

    vpaddl.u8   d2, d0

    vmov        d6, r4, r5                  @seed accumulator d6 with nt (rounding term)
    vpaddl.u8   d3, d1

    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 8)

    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 8)
    vadd.u16    d4, d2, d3


    vpaddl.u16  d5, d4


    vpadal.u32  d6, d5                      @accumulate all inp into d6 (end for nt==8)

    subs        r10, #8
    beq         epil_add_loop

core_loop_add:
    vpaddl.u8   d2, d0
    subs        r10, #8
    vpaddl.u8   d3, d1



    vadd.u16    d4, d2, d3
    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 16)

    vpaddl.u16  d5, d4
    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 16)

    vpadal.u32  d6, d5                      @accumulate all inp into d6
    bne         core_loop_add

epil_add_loop:

    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
    cmp         r4, #32

    vmov        d28, r14, r5                @src[2nt+1] + src[2nt-1] + 2 moved to d28
    moveq       r6, #128

    vdup.8      d16, d9[0]                  @dc_val
    vshl.s64    d13, d9, #1                 @2*dc

    beq         prologue_cpy_32

    vadd.i64    d14, d13, d28               @src[2nt+1] + src[2nt-1] + 2 + 2*dc_val
    movne       r6, #0

    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
    movne       r10, r4                     @r10 = nt (column counter)

    vadd.i64    d11, d13, d9                @3*dc
    sub         r12, r3, r3, lsl #3         @-7*strd

    vadd.i64    d11, d11, d17               @3*dc + 2
    add         r12, r12, #8                @offset after one 8x8 block (-7*strd + 8)

    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4                  @strd - nt

prologue_col:
    @0th column and rows 0-7 are handled here
    @r8 walks the top refs (&src[2nt+1+col]), r9 the left refs (&src[2nt-1-row])
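    @technique: the eight filtered left-column values sit in d3, one
    @byte per row; each output row is formed by vbsl-merging d3's low
    @byte into a row of dc bytes (d16) under the 0x..ff byte mask, then
    @shifting d3 down one byte. per row y (names illustrative):
    @
    @   pu1_dst[y * dst_strd + 0]     = col_filtered[y];
    @   pu1_dst[y * dst_strd + 1..7]  = dc_val;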

    mov         r8, r7                      @&src[2nt+1]

    add         r0, r0, #8                  @strd - nt + 8
    vld1.s8     d0, [r8]!                   @col 1::7 load (prol)
    sub         r9, r9, #7                  @&src[2nt-1-row]

    vld1.s8     d1, [r9]                    @row 7::1 (0 also) load (prol)
    sub         r9, r9, #8

    vmovl.u8    q10, d0

    vld1.s8     d6, [r8]                    @col 8::15 load (prol extra)
    vadd.i16    q10, q10, q12               @col 1::7 add 3dc+2 (prol)

    vmovl.u8    q11, d1
    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)

    vmovl.u8    q13, d6
    vadd.i16    q11, q11, q12               @row 1::7 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff    @byte mask row 0 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vbsl        d19, d15, d2                @first row with dst[0]
    vadd.i16    q13, q13, q12               @col 8::15 add 3dc+2 (prol extra)

    vrev64.8    d3, d3

    vst1.8      d19, [r2], r3               @store row 0 (prol)
    vshr.s64    d3, d3, #8                  @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)

loop_again_col_row:

    vbsl        d20, d3, d16                @row 1  (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)
    vshr.s64    d3, d3, #8                  @row 1 shift (prol)

    vst1.8      d20, [r2], r3               @store row 1 (prol)
    vqshrun.s16 d4, q13, #2                 @columns shr2 movn (prol extra)


    vbsl        d21, d3, d16                @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)
    vshr.s64    d3, d3, #8                  @row 2 shift (prol)

    vst1.8      d21, [r2], r3               @store row 2 (prol)


    vbsl        d20, d3, d16                @row 3  (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 4 (prol)
    vshr.s64    d3, d3, #8                  @row 3 shift (prol)

    vst1.8      d20, [r2], r3               @store row 3 (prol)


    vbsl        d21, d3, d16                @row 4 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 5 (prol)
    vshr.s64    d3, d3, #8                  @row 4 shift (prol)

    vst1.8      d21, [r2], r3               @store row 4 (prol)


    vbsl        d20, d3, d16                @row 5 (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 6 (prol)
    vshr.s64    d3, d3, #8                  @row 5 shift (prol)

    vst1.8      d20, [r2], r3               @store row 5 (prol)

    vld1.s8     d1, [r9]                    @row 8::15 load (prol extra)

    vbsl        d21, d3, d16                @row 6 (prol)

    vmovl.u8    q11, d1

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 7 (prol)
    vshr.s64    d3, d3, #8                  @row 6 shift (prol)

    vst1.8      d21, [r2], r3               @store row 6 (prol)

    vbsl        d20, d3, d16                @row 7 (prol)
    vadd.i16    q11, q11, q12               @row 8::15 add 3dc+2 (prol extra)

    vshr.s64    d3, d3, #8                  @row 7 shift (prol)
    vst1.8      d20, [r2], r12              @store row 7 (prol)
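
    @r10 counts remaining columns (nt at entry): zero -> the 8x8 block
    @is complete; negative (nt == 16) -> every filtered edge has been
    @stored and only the flat dc fill of the lower-right 8x8 block
    @remains (copy_16)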

    subs        r10, r10, #8                @counter for cols

    beq         end_func
    blt         copy_16


    vmov.i64    d20, #0x00000000000000ff    @byte mask row 8 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vrev64.8    d3, d3

    vst1.8      d4, [r2], r3                @store row 0 of cols 8::15 (filtered, 16x16)

    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r0               @go to next row for 16


    vbsl        d20, d3, d16                @row 8 (prol)
    subs        r10, r10, #8

    vst1.8      d20, [r2], r3               @store row 8 (prol)
    vshr.s64    d3, d3, #8                  @row 8 shift (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)

    b           loop_again_col_row


copy_16:
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2]

    b           end_func
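
@nt == 32: hevc skips the boundary smoothing at this size, so the block
@is a flat dc fill. four row pointers (r2, r5, r8, r10) each write two
@16-byte vectors per row and advance four rows at a time via
@r6 = 4*strd - 16; the effect, in rough c:
@
@   for (row = 0; row < 32; row++)
@       memset(pu1_dst + row * dst_strd, dc_val, 32);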

prologue_cpy_32:
    mov         r9, #128
    add         r5, r2, r3
    add         r8, r5, r3
    add         r10, r8, r3
    vdup.8      q10, d16[0]
    lsl         r6, r3, #2                  @r6 = 4*strd
    add         r6, r6, #0xfffffff0         @r6 = 4*strd - 16 (row advance after two 16-byte stores)

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    sub         r9, r9, #32                 @32x32 prol/epil counter dec

kernel_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    subs        r9, r9, #32

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    bne         kernel_copy

epilogue_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2]
    vst1.8      {d20,d21}, [r5]
    vst1.8      {d20,d21}, [r8]
    vst1.8      {d20,d21}, [r10]

    b           end_func
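
@nt == 4: the 4 left and 4 top reference samples arrive in one pair of
@loads, so no core loop is needed; after vpaddl.u16 only the low 32 bits
@of d5 hold a valid partial sum, hence the 0x00000000ffffffff mask.
@in rough c:
@
@   sum = 4 + left[0] + ... + left[3] + top[0] + ... + top[3];
@   dc  = sum >> 3;                     /* log2nt + 1 = 3 */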


dc_4:
    vld1.s8     d0, [r6]!                   @load from src[nt]
    vld1.s8     d1, [r8]!                   @load from src[2nt+1]

    vpaddl.u8   d2, d0
    mov         r5, #0                      @r5 = 0 (upper half of the accumulator seed)
    vmov        d6, r4, r5                  @seed accumulator d6 with nt (rounding term)
    vpaddl.u8   d3, d1

    vadd.u16    d4, d2, d3


    vpaddl.u16  d5, d4
    vmov.i64    d30, #0x00000000ffffffff    @mask for the valid low 32 bits

    vand        d5, d5, d30                 @upper lane of the pairwise sum is invalid for nt == 4

    vmov        d28, r14, r5                @src[2nt+1] + src[2nt-1] + 2 moved to d28
    vadd.i64    d6, d6, d5                  @accumulate all inp into d6 (end for nt==8)

    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
    mov         r8, r7                      @&src[2nt+1]

    vshl.s64    d13, d9, #1                 @2*dc
    sub         r9, r9, #3                  @&src[2nt-1-row]

    vdup.8      d16, d9[0]                  @dc_val
    vadd.i64    d14, d13, d28               @src[2nt+1] + src[2nt-1] + 2 + 2*dc_val

    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
    sub         r12, r3, r3, lsl #2         @-3*strd
    vadd.i64    d11, d13, d9                @3*dc

    vadd.i64    d11, d11, d17               @3*dc + 2
    add         r12, r12, #4                @offset after one 4x4 block (-3*strd + 4)

    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4                  @strd - nt


    vld1.s8     d0, [r8]                    @col 1::3 load (prol)
    vld1.s8     d1, [r9]                    @row 3::1 (0 also) load (prol)

    vmovl.u8    q10, d0

    vmovl.u8    q11, d1
    vadd.i16    q10, q10, q12               @col 1::3 add 3dc+2 (prol)

    vadd.i16    q11, q11, q12               @row 1::3 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff    @byte mask row 0 (prol)
    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)


    vbsl        d19, d15, d2                @first row with dst[0]

    vrev64.8    d3, d3

    vst1.32     d19[0], [r2], r3            @store row 0 (prol)
    vshr.s64    d3, d3, #40                 @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)

    vbsl        d20, d3, d16                @row 1  (prol)
    vshr.s64    d3, d3, #8                  @row 1 shift (prol)

    vst1.32     d20[0], [r2], r3            @store row 1 (prol)

    vbsl        d21, d3, d16                @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)

    vshr.s64    d3, d3, #8                  @row 2 shift (prol)
    vst1.32     d21[0], [r2], r3            @store row 2 (prol)

    vbsl        d20, d3, d16                @row 3  (prol)
    vst1.32     d20[0], [r2]                @store row 3 (prol)

epilogue_end:
end_func:
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)