@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_dc_neon.s
@*
@* @brief
@*  contains function definitions for chroma intra prediction dc filtering.
@*  functions are coded in arm neon assembly. note: the directives used in
@*  this file (.text, .globl, .type) are gnu assembler syntax, not rvct
@*  armasm syntax.
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_dc_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intraprediction filter for dc input
@*
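@* @par description:
@*    fills the nt x nt block of interleaved two-byte chroma samples at
@*    pu1_dst with one dc value per component. each dc value is the rounded
@*    mean of 2*nt reference bytes of that component read from pu1_ref
@*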
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients (not part of the prototype
@*  below and never read by this function)
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
@                                word32 src_strd,
@                                uword8 *pu1_dst,
@                                word32 dst_strd,
@                                word32 nt,
@                                word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

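@stack contents from #40 (offset valid after the 10-register push on entry)
@   nt
@   mode
@   pi1_coeff (listed historically; not in the prototype above, never read)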

.text
.align 4




.globl ihevc_intra_pred_chroma_dc_a9q

.type ihevc_intra_pred_chroma_dc_a9q, %function

@-----------------------------------------------------------------------------
@ ihevc_intra_pred_chroma_dc_a9q
@
@ computes one dc value per chroma component and fills the destination block
@ with it. reference and destination hold two byte components interleaved
@ per sample: "even" bytes and "odd" bytes (presumably cb and cr - confirm
@ against the c reference implementation).
@
@   dc = (sum of 2*nt reference bytes of the component + nt) >> (log2(nt)+1)
@
@ the 2*nt bytes per component come from two runs of 2*nt interleaved bytes:
@   run 1 starts at pu1_ref + 2*nt
@   run 2 starts at pu1_ref + 4*nt + 2
@ (presumably the left and top neighbours, with the skipped pair at 4*nt
@ being the corner sample - confirm against the caller's reference layout)
@
@ inputs:
@   r0          pu1_ref
@   r1          src_strd   never read; r1 is reused as scratch
@   r2          pu1_dst
@   r3          dst_strd   destination stride in bytes
@   [sp,#40]    nt         offset as seen after the register push
@   [sp,#44]    mode       never read
@
@ block sizes handled:
@   nt == 4     dc_4 path: 4 rows of 8 bytes
@   nt == 8     add_loop only, then epilogue_copy: 8 rows of 16 bytes
@   otherwise   add_loop + core_loop_add, then the full copy sequence. this
@               path always sums 32 bytes per run and always writes 16 rows
@               of 32 bytes, so it is only correct for nt == 16.
@
@ despite their names, add_loop and core_loop_add are straight-line code:
@ nothing branches back to either label.
@
@ neon registers written: d0, d2, d3, d16-d18, d26-d31
@ core registers r4-r12 and lr are saved and restored; r1 is overwritten.
@-----------------------------------------------------------------------------
ihevc_intra_pred_chroma_dc_a9q:

    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr: 10 words = 40 bytes

    ldr         r4,[sp,#40]                 @r4 = nt (first stack argument, now above the 40 saved bytes)
    mov         r9, #0
    vmov        d17, r9, r9                 @d17 = 0: accumulator for the even-byte sum

    clz         r5, r4                      @counts leading zeros of nt

    add         r6, r0, r4,lsl #1           @r6 = pu1_ref + 2*nt: start of run 1
    vmov        d18, r9, r9                 @d18 = 0: accumulator for the odd-byte sum
    rsb         r5, r5, #32                 @r5 = 32 - clz(nt) = log2(nt) + 1 for power-of-two nt
    add         r7, r0, r4, lsl #2          @r7 = pu1_ref + 4*nt
    mov         r12,r5                      @r12 = right-shift applied to (sum + nt)
    add         r8, r7, #2                  @r8 = pu1_ref + 4*nt + 2: start of run 2 (one pair skipped)

    cmp         r4, #4
    beq         dc_4                        @nt == 4 uses the dedicated 4x4 path


@ first 16 bytes of each run. vld2 de-interleaves: the first register gets
@ the even bytes, the second the odd bytes. each 8-byte vector is reduced
@ u8 -> u16 -> u32 by pairwise adds, then vpadal.u32 folds the two u32
@ lanes into the 64-bit accumulator.
add_loop:
    vld2.s8     {d30,d31}, [r6]!            @run 1: d30 = even bytes, d31 = odd bytes; r6 += 16
    lsl         r10,r4,#1                   @2nt

    vpaddl.u8   d2, d30
    subs        r10, #0x10                  @r10 = 2*nt - 16; z set when nt == 8, tested by the beq below

    vld2.s8     {d26,d27}, [r8]!            @run 2: d26 = even bytes, d27 = odd bytes; r8 += 16

    vpaddl.u8   d3, d31
    vpaddl.u16  d2, d2
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2                     @even sum += run 1 even bytes

    vpadal.u32  d18, d3                     @odd sum += run 1 odd bytes

    vpaddl.u8   d2, d26
    vpaddl.u8   d3, d27

    vpaddl.u16  d2, d2
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2                     @even sum += run 2 even bytes
    vpadal.u32  d18, d3                     @odd sum += run 2 odd bytes

    beq         epil_add_loop               @nt == 8: each run was only 16 bytes, summation done

@ second 16 bytes of each run (reached by fall-through, executed once).
@ same reduction as above, using d28/d29 as the even-byte temporaries.
core_loop_add:
    vld2.s8     {d30,d31}, [r6]!            @run 1, next 16 bytes: d30 = even, d31 = odd
    vpaddl.u8   d28, d30
    vpaddl.u8   d3, d31

    vld2.s8     {d26,d27}, [r8]!            @run 2, next 16 bytes: d26 = even, d27 = odd

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3                     @odd sum += run 1 odd bytes
    vpadal.u32  d17, d29                    @even sum += run 1 even bytes

    vpaddl.u8   d3, d27
    vpaddl.u8   d28, d26

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3                     @odd sum += run 2 odd bytes
    vpadal.u32  d17, d29                    @even sum += run 2 even bytes


@ turn the two sums into dc bytes: dc = (sum + nt) >> r12
epil_add_loop:

    vmov.32     r1,d18[0]                   @r1 = odd-byte sum (overwrites the unused src_strd)
    vmov.32     r11,d17[0]                  @r11 = even-byte sum

    add         r1,r1,r4                    @+ nt for rounding
    add         r11,r11,r4

    lsr         r1,r1,r12
    lsr         r11,r11,r12

    vdup.8      d17,r1                      @d17 = 8 copies of the odd-byte dc
    vdup.8      d16,r11                     @d16 = 8 copies of the even-byte dc

@ fill the destination. vst2.8 {d16,d17} re-interleaves the two vectors and
@ writes 16 bytes (8 pairs). four row pointers are kept so that four rows
@ are written per group:
@   r2 = row 0, r5 = row 1, r8 = row 2, r10 = row 3 (then advanced together)
prologue_cpy_32:

    add         r5, r2, r3                  @r5 = row 1
    subs        r9, r4, #8                  @z set when nt == 8; only the flags are used afterwards
    lsl         r6, r3, #2                  @r6 = 4 * dst_strd
    moveq       r11,r6                      @nt == 8: epilogue step = 4 rows down
    add         r8, r5, r3                  @r8 = row 2
    add         r10, r8, r3                 @r10 = row 3

    beq         epilogue_copy               @nt == 8: rows are 16 bytes wide, the epilogue alone covers 8 rows

    @ from here on each row is written as two 16-byte halves
    vst2.8      {d16,d17}, [r2]!            @rows 0-3, left half; pointers += 16
    add         r6, r6, #0xfffffff0         @r6 = 4*dst_strd - 16 (0xfffffff0 is -16 modulo 2^32)

    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    movne       r11,#16                     @flags still come from the subs above, so ne holds here; epilogue step = 16 bytes
    vst2.8      {d16,d17}, [r10]!


    vst2.8      {d16,d17}, [r2], r6         @rows 0-3, right half; then move to column 0 four rows down
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

@ two more groups of four rows (rows 4-7 and rows 8-11), executed once
kernel_copy:
    vst2.8      {d16,d17}, [r2]!            @rows 4-7, left half
    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    vst2.8      {d16,d17}, [r10]!

    vst2.8      {d16,d17}, [r2], r6         @rows 4-7, right half; step to the next group
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

    vst2.8      {d16,d17}, [r2]!            @rows 8-11, left half
    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    vst2.8      {d16,d17}, [r10]!

    vst2.8      {d16,d17}, [r2], r6         @rows 8-11, right half; step to the next group
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

@ last eight stores, shared by both sizes; r11 selects the step between them:
@   nt == 8 : r11 = 4*dst_strd -> rows 0-3, then rows 4-7
@   else    : r11 = 16         -> rows 12-15 left half, then right half
epilogue_copy:
    vst2.8      {d16,d17}, [r2],r11
    vst2.8      {d16,d17}, [r5],r11
    vst2.8      {d16,d17}, [r8],r11
    vst2.8      {d16,d17}, [r10],r11

    vst2.8      {d16,d17}, [r2]
    vst2.8      {d16,d17}, [r5]
    vst2.8      {d16,d17}, [r8]
    vst2.8      {d16,d17}, [r10]
    b           end_func

@ nt == 4: each run is only 8 bytes (4 pairs), but vld2 still reads 16 bytes
@ from each pointer, so the caller's buffer must be readable that far.
@ vshl.i64 #32 moves the 4 wanted bytes of each de-interleaved vector into
@ the upper half and zero-fills the lower half, which drops the 4 unwanted
@ bytes before the pairwise adds.
dc_4:
    vld2.s8     {d30,d31},[r6]              @run 1: d30 = even bytes, d31 = odd bytes
    vshl.i64    d3,d30,#32                  @keep even bytes 0-3 only

    vld2.s8     {d26,d27},[r8]              @run 2: d26 = even bytes, d27 = odd bytes
    vshl.i64    d2,d31,#32                  @keep odd bytes 0-3 only

    vpaddl.u8   d3,d3
    vpaddl.u8   d2,d2
    vpaddl.u16  d3,d3
    vpaddl.u16  d2,d2
    vpadal.u32  d17,d3                      @even sum += run 1
    vpadal.u32  d18,d2                      @odd sum += run 1

    vshl.i64    d3,d26,#32                  @keep even bytes 0-3 only
    vshl.i64    d2,d27,#32                  @keep odd bytes 0-3 only
    vpaddl.u8   d3,d3
    vpaddl.u8   d2,d2
    vpaddl.u16  d3,d3
    vpaddl.u16  d2,d2
    vpadal.u32  d17,d3                      @even sum += run 2
    vpadal.u32  d18,d2                      @odd sum += run 2

    vmov.32     r10,d17[0]                  @r10 = even-byte sum
    vmov.32     r11,d18[0]                  @r11 = odd-byte sum

    add         r10,r10,r4                  @+ nt for rounding
    add         r11,r11,r4
    lsr         r10,r10,r12                 @r12 = 3 when nt == 4
    lsr         r11,r11,r12
    orr         r10,r10,r11,lsl #8          @pack one pair: even dc in bits 0-7, odd dc in bits 8-15
    vdup.16     d0,r10                      @d0 = 4 identical pairs = one 8-byte row

    vst1.8      {d0},[r2],r3                @rows 0-2, stepping by dst_strd
    vst1.8      {d0},[r2],r3
    vst1.8      {d0},[r2],r3
    vst1.8      {d0},[r2]                   @row 3

end_func:
    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return by popping the saved lr into pc