///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_inter_pred_chroma_copy.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//* Functions are coded in AArch64 NEON assembly and can be assembled with
//* a GNU-syntax assembler
//*
//* @author
//*  Yogeswaran RS
//*
//* @par List of Functions:
//*  - ihevc_inter_pred_chroma_copy_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*   Chroma inter-prediction filter for the copy case
//*
//* @par Description:
//*    Copies the block of width 'wd' and height 'ht' from the location
//*    pointed to by 'pu1_src' to the location pointed to by 'pu1_dst'. Since
//*    chroma is interleaved (Cb/Cr), 2 * wd bytes are copied per row.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  WORD8 pointer to the filter coefficients (unused in the copy case)
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_copy( UWORD8 *pu1_src,
//                                   UWORD8 *pu1_dst,
//                                   WORD32 src_strd,
//                                   WORD32 dst_strd,
//                                   WORD8 *pi1_coeff,
//                                   WORD32 ht,
//                                   WORD32 wd)
//**************Variables Vs Registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 =>  src_strd
//x3 =>  dst_strd
//x4 => *pi1_coeff
//x5 =>  ht
//x6 =>  wd
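
//**************Reference behaviour (illustrative)*****************************
//A minimal C sketch of what this routine computes, for readers following the
//assembly below. It is not part of the library and the helper name is made
//up. Chroma is stored interleaved (Cb/Cr), so (wd << 1) bytes are copied per
//row, and pi1_coeff is unused in the copy case.
//
//static void chroma_copy_ref(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//                            WORD32 src_strd, WORD32 dst_strd,
//                            WORD32 ht, WORD32 wd)
//{
//    WORD32 row, col;
//    for(row = 0; row < ht; row++)
//    {
//        for(col = 0; col < (wd << 1); col++)    //interleaved Cb and Cr
//            pu1_dst[col] = pu1_src[col];
//        pu1_src += src_strd;                    //advance one row
//        pu1_dst += dst_strd;
//    }
//}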

.text
.align 4

.globl ihevc_inter_pred_chroma_copy_av8

.type ihevc_inter_pred_chroma_copy_av8, %function

ihevc_inter_pred_chroma_copy_av8:

    LSL         x12,x6,#1                   //x12 = wd << 1 (bytes per row, chroma is interleaved)
    CMP         x5,#0                       //checks ht <= 0
    BLE         END_LOOPS
    AND         x8,x5,#3                    //x8 = ht % 4 (rows left over after the 4-row passes)
    SUB         x5,x5,x8                    //x5 = ht rounded down to a multiple of 4
    TST         x12,#15                     //checks (wd << 1) for multiple of 16
    BEQ         CORE_LOOP_WD_16
    TST         x12,#7                      //checks (wd << 1) for multiple of 8
    BEQ         CORE_LOOP_WD_8
    SUB         x11,x12,#4                  //x11 = (wd << 1) - 4, rewind amount per row block
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_4_HT_2        //no full 4-row passes, only the 2-row tail

OUTER_LOOP_WD_4:
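    //4-byte wide path: (wd << 1) is a multiple of 4 but not of 8.
    //Each inner iteration copies 4 bytes (two interleaved Cb/Cr samples)
    //across 4 rows at a time.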
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_INNER_LOOP_WD_4

INNER_LOOP_WD_4:
    LD1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x0,x0,#4                    //pu1_src += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    SUBS        x4,x4,#4                    //wd - 4(Loop condition)
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x1,x1,#4                    //pu1_dst += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    BGT         INNER_LOOP_WD_4

END_INNER_LOOP_WD_4:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = start of next 4 rows
    SUB         x1,x6,x11                   //pu1_dst = start of next 4 rows
    BGT         OUTER_LOOP_WD_4
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_4_HT_2

END_LOOPS:
    RET

OUTER_LOOP_WD_4_HT_2:
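    //4-byte wide path, 2-row tail: copies the ht % 4 leftover rows
    //(two, since chroma block heights are even)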
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_LOOPS

INNER_LOOP_WD_4_HT_2:
    LD1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x0,x0,#4                    //pu1_src += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    SUBS        x4,x4,#4                    //wd - 4(Loop condition)
    ADD         x1,x1,#4                    //pu1_dst += 4
    BGT         INNER_LOOP_WD_4_HT_2
    B           END_LOOPS

CORE_LOOP_WD_8:
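    //8-byte wide path: (wd << 1) is a multiple of 8 but not of 16.
    //Each inner iteration copies one 8-byte column across 4 rows.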
    SUB         x11,x12,#8                  //x11 = (wd << 1) - 8, rewind amount per row block
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_8_HT_2

OUTER_LOOP_WD_8:
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_INNER_LOOP_WD_8


INNER_LOOP_WD_8:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v1.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#8                    //wd - 8(Loop condition)
    LD1         {v2.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v2.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v3.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v3.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    BGT         INNER_LOOP_WD_8

END_INNER_LOOP_WD_8:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = start of next 4 rows
    SUB         x1,x6,x11                   //pu1_dst = start of next 4 rows
    BGT         OUTER_LOOP_WD_8
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_8_HT_2
    B           END_LOOPS

OUTER_LOOP_WD_8_HT_2:
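    //8-byte wide path, 2-row tail for the ht % 4 leftover rows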
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_LOOPS

INNER_LOOP_WD_8_HT_2:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v1.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#8                    //wd - 8(Loop condition)
    BGT         INNER_LOOP_WD_8_HT_2
    B           END_LOOPS

CORE_LOOP_WD_16:
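    //16-byte wide path: (wd << 1) is a multiple of 16.
    //Each inner iteration copies one full 16-byte vector across 4 rows.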
    SUB         x11,x12,#16                 //x11 = (wd << 1) - 16, rewind amount per row block
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_16_HT_2

OUTER_LOOP_WD_16:
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_INNER_LOOP_WD_16

INNER_LOOP_WD_16:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#16                   //wd - 16(Loop condition)
    LD1         {v2.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v2.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v3.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v3.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    BGT         INNER_LOOP_WD_16

END_INNER_LOOP_WD_16:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = start of next 4 rows
    SUB         x1,x6,x11                   //pu1_dst = start of next 4 rows
    BGT         OUTER_LOOP_WD_16
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_16_HT_2
    B           END_LOOPS

OUTER_LOOP_WD_16_HT_2:
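    //16-byte wide path, 2-row tail for the ht % 4 leftover rows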
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_LOOPS

INNER_LOOP_WD_16_HT_2:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#16                   //wd - 16(Loop condition)
    BGT         INNER_LOOP_WD_16_HT_2

    RET