//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_intra_pred_luma_8x8_av8.s
//*
//* @brief
//*  Contains function definitions for intra 8x8 Luma prediction .
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  -ih264_intra_pred_luma_8x8_mode_vert_av8
//*  -ih264_intra_pred_luma_8x8_mode_horz_av8
//*  -ih264_intra_pred_luma_8x8_mode_dc_av8
//*  -ih264_intra_pred_luma_8x8_mode_diag_dl_av8
//*  -ih264_intra_pred_luma_8x8_mode_diag_dr_av8
//*  -ih264_intra_pred_luma_8x8_mode_vert_r_av8
//*  -ih264_intra_pred_luma_8x8_mode_horz_d_av8
//*  -ih264_intra_pred_luma_8x8_mode_vert_l_av8
//*  -ih264_intra_pred_luma_8x8_mode_horz_u_av8
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_intra_pred_filters.c
//

///**
///**
///**

.text
.p2align 2
.include "ih264_neon_macros.s"

.extern ih264_gai1_intrapred_luma_8x8_horz_u



///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_vert
//*
//* @brief
//*   Perform Intra prediction for  luma_8x8 mode:vertical
//*
//* @par Description:
//* Perform Intra prediction for  luma_8x8 mode:vertical ,described in sec 8.3.2.2.2
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//* availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_vert_av8

ih264_intra_pred_luma_8x8_mode_vert_av8:

    // Vertical prediction: each of the 8 destination rows is a copy of the
    // 8 bytes located at pu1_src + 9 (the same bytes the DC routine in this
    // file reads on its "top available" path).
    //
    //   x0 = pu1_src   (advanced by 9 here)
    //   x1 = pu1_dst   (advanced by dst_strd after every row)
    //   w3 = dst_strd  (signed 32-bit, widened to x3)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not read.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file

    add       x0, x0, #9                // x0 -> the 8 bytes to replicate
    sxtw      x3, w3                    // 64-bit stride for the post-index stores
    ld1       {v0.8b}, [x0]

    .rept 8                             // rows 0..7: store 8 bytes, then x1 += dst_strd
    st1       {v0.8b}, [x1], x3
    .endr

    pop_v_regs                          // counterpart of push_v_regs
    ret





///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_horz
//*
//* @brief
//*  Perform Intra prediction for  luma_8x8 mode:horizontal
//*
//* @par Description:
//*  Perform Intra prediction for  luma_8x8 mode:horizontal ,described in sec 8.3.2.2.3
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//* availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
//                                         UWORD8 *pu1_dst,
//                                         WORD32 src_strd,
//                                         WORD32 dst_strd,
//                                         WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_horz_av8

ih264_intra_pred_luma_8x8_mode_horz_av8:

    // Horizontal prediction: destination row r (r = 0..7) is filled with
    // 8 copies of the single byte pu1_src[7 - r].
    //
    //   x0 = pu1_src   (walked downwards from pu1_src + 7)
    //   x1 = pu1_dst   (advanced by dst_strd after every row)
    //   w3 = dst_strd  (signed 32-bit, widened to x3)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not read.
    //
    // Only w5-w8 and v0-v3 are used as scratch. x19/x20 are never touched,
    // so they are not saved (the vertical routine in this file uses the same
    // macros without saving them either).
    // The relative order of loads and stores is kept as it was.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file
    sxtw      x3, w3                    // 64-bit stride for the post-index stores
    add       x0, x0, #7                // start at pu1_src[7]

    ldrb      w5, [x0], #-1             // pu1_src[7]
    ldrb      w6, [x0], #-1             // pu1_src[6]
    dup       v0.8b, w5
    st1       {v0.8b}, [x1], x3         // row 0
    ldrb      w7, [x0], #-1             // pu1_src[5]
    dup       v1.8b, w6
    st1       {v1.8b}, [x1], x3         // row 1
    ldrb      w8, [x0], #-1             // pu1_src[4]
    dup       v2.8b, w7
    st1       {v2.8b}, [x1], x3         // row 2
    ldrb      w5, [x0], #-1             // pu1_src[3]
    dup       v3.8b, w8
    st1       {v3.8b}, [x1], x3         // row 3
    ldrb      w6, [x0], #-1             // pu1_src[2]
    dup       v0.8b, w5
    st1       {v0.8b}, [x1], x3         // row 4
    ldrb      w7, [x0], #-1             // pu1_src[1]
    dup       v1.8b, w6
    st1       {v1.8b}, [x1], x3         // row 5
    ldrb      w8, [x0], #-1             // pu1_src[0]
    dup       v2.8b, w7
    st1       {v2.8b}, [x1], x3         // row 6
    dup       v3.8b, w8
    st1       {v3.8b}, [x1], x3         // row 7

    pop_v_regs                          // counterpart of push_v_regs
    ret







///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_dc
//*
//* @brief
//*  Perform Intra prediction for  luma_8x8 mode:DC
//*
//* @par Description:
//*  Perform Intra prediction for  luma_8x8 mode:DC ,described in sec 8.3.2.2.4
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src,
//                                       UWORD8 *pu1_dst,
//                                       WORD32 src_strd,
//                                       WORD32 dst_strd,
//                                       WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_dc_av8

ih264_intra_pred_luma_8x8_mode_dc_av8:

    // DC prediction: the whole 8x8 block is filled with one value chosen from
    // ui_neighboravailability (w4):
    //
    //   bit 0 (0x01) set -> left column available  : bytes pu1_src[0..7]
    //   bit 2 (0x04) set -> top row available      : bytes pu1_src[9..16]
    //
    //   left and top : (sum(left) + sum(top) + 8) >> 4
    //   top only     : (sum(top)  + 4) >> 3
    //   left only    : (sum(left) + 4) >> 3
    //   neither      : 128
    //
    //   x0 = pu1_src, x1 = pu1_dst (advanced by dst_strd per row),
    //   w3 = dst_strd (signed 32-bit, widened to x3); w2 (src_strd) is not read.
    //
    // Every path leaves the replicated result in v31 and joins the single
    // store sequence at dc_store_rows. x19/x20 are never used, so they are
    // not saved.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file
    sxtw      x3, w3                    // 64-bit stride for the post-index stores

    tst       w4, #0x01
    beq       top_available             // left column NOT available

    // Left available: w5 = pu1_src[7] + pu1_src[6] + ... + pu1_src[0]
    add       x10, x0, #7
    ldrb      w5, [x10], #-1
    ldrb      w6, [x10], #-1
    ldrb      w7, [x10], #-1
    add       w5, w5, w6
    ldrb      w8, [x10], #-1
    add       w5, w5, w7
    ldrb      w6, [x10], #-1
    add       w5, w5, w8
    ldrb      w7, [x10], #-1
    add       w5, w5, w6
    ldrb      w8, [x10], #-1
    add       w5, w5, w7
    ldrb      w6, [x10], #-1
    add       w5, w5, w8
    add       w5, w5, w6

    tst       w4, #0x04
    beq       left_available            // top row NOT available -> left only

    // Both left and top available
    add       x10, x0, #9
    ld1       {v0.8b}, [x10]            // the 8 top bytes
    uaddlp    v1.4h, v0.8b              // pairwise widening adds: 8 -> 4 -> 2 -> 1
    uaddlp    v3.2s, v1.4h
    uaddlp    v2.1d, v3.2s              // v2.h[0] = sum(top)
    dup       v10.8h, w5                // sum(left) in every 16-bit lane
    dup       v8.8h, v2.h[0]            // sum(top) in every 16-bit lane
    add       v12.8h, v8.8h , v10.8h
    sqrshrun  v31.8b, v12.8h, #4        // rounding shift: (sum + 8) >> 4, narrowed to bytes
    b         dc_store_rows

top_available: // left is not available; check whether top is
    tst       w4, #0x04
    beq       none_available            // neither left nor top available

    // Only top available
    add       x10, x0, #9
    ld1       {v10.8b}, [x10]           // the 8 top bytes
    uaddlp    v14.4h, v10.8b
    uaddlp    v13.2s, v14.4h
    uaddlp    v12.1d, v13.2s            // v12.h[0] = sum(top)
    rshrn     v4.8b, v12.8h, #3         // rounding shift: (sum + 4) >> 3
    dup       v31.8b, v4.b[0]
    b         dc_store_rows

left_available: // only left available; w5 = sum(left)
    add       w5, w5, #4
    lsr       w5, w5, #3                // (sum + 4) >> 3
    dup       v31.8b, w5
    b         dc_store_rows

none_available: // neither neighbour available: fixed value 128
    mov       w9, #128
    dup       v31.8b, w9

dc_store_rows: // write the same 8 bytes to all 8 rows
    st1       {v31.8b}, [x1], x3
    st1       {v31.8b}, [x1], x3
    st1       {v31.8b}, [x1], x3
    st1       {v31.8b}, [x1], x3
    st1       {v31.8b}, [x1], x3
    st1       {v31.8b}, [x1], x3
    st1       {v31.8b}, [x1], x3
    st1       {v31.8b}, [x1], x3

end_func:
    pop_v_regs                          // counterpart of push_v_regs
    ret






///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_diag_dl
//*
//* @brief
//*  Perform Intra prediction for  luma_8x8 mode:Diagonal_Down_Left
//*
//* @par Description:
//*  Perform Intra prediction for  luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.5
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
//                                            UWORD8 *pu1_dst,
//                                            WORD32 src_strd,
//                                              WORD32 dst_strd,
//                                              WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability

    .global ih264_intra_pred_luma_8x8_mode_diag_dl_av8

ih264_intra_pred_luma_8x8_mode_diag_dl_av8:

    // Diagonal-down-left prediction.
    //
    // Notation: t[i] = pu1_src[9 + i], i = 0..15 (16 bytes, one load).
    //   f[i]  = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2      for i = 0..13
    //   f[14] = (t[14] + 3*t[15] + 2) >> 2
    // Destination row r (r = 0..7) receives f[r] .. f[r+7].
    //
    //   x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit, widened to x3)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not read.
    //
    // x19/x20 are never used, so they are not saved.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file
    sxtw      x3, w3

    add       x0, x0, #9                // x0 -> t[0]
    sub       x5, x3, #4                // stride minus the 4 bytes already stepped in a split row
    add       x6, x0, #15               // x6 -> t[15]
    ld1       { v0.16b}, [x0]           // v0 = t[0..15]
    mov       v1.d[0], v0.d[1]          // v1 = t[8..15]
    ext       v4.16b, v0.16b , v0.16b , #2      // v4 = t[2..15], t[0], t[1]
    mov       v5.d[0], v4.d[1]          // v5 = t[10..15], t[0], t[1]
    ext       v2.16b, v0.16b , v0.16b , #1      // v2 = t[1..15], t[0]
    mov       v3.d[0], v2.d[1]          // v3 = t[9..15], t[0]
    ld1       {v5.b}[6], [x6]           // v5 lane 6 = t[15], so lane 14 becomes t[14] + 3*t[15]

    uaddl     v20.8h, v0.8b, v2.8b      // t[i]   + t[i+1], i = 0..7
    uaddl     v22.8h, v1.8b, v3.8b      // same for i = 8..15
    uaddl     v24.8h, v2.8b, v4.8b      // t[i+1] + t[i+2], i = 0..7
    uaddl     v26.8h, v3.8b, v5.8b      // same for i = 8..15
    add       v24.8h, v20.8h , v24.8h   // t[i] + 2*t[i+1] + t[i+2]
    add       v26.8h, v22.8h , v26.8h

    sqrshrun  v4.8b, v24.8h, #2         // f[0..7]
    sqrshrun  v5.8b, v26.8h, #2         // f[8..15] (lane 15 is built from wrapped bytes and never stored)
    mov       v4.d[1], v5.d[0]          // v4 = f[0..15]

    st1       {v4.8b}, [x1], x3         // row 0: f[0..7]
    ext       v18.16b, v4.16b , v4.16b , #1     // v18 = f rotated left by 1
    ext       v16.16b, v18.16b , v18.16b , #1   // v16 = f rotated left by 2
    st1       {v18.8b}, [x1], x3        // row 1: f[1..8]
    ext       v14.16b, v16.16b , v16.16b , #1   // v14 = f rotated left by 3
    st1       {v16.8b}, [x1], x3        // row 2: f[2..9]
    st1       {v14.8b}, [x1], x3        // row 3: f[3..10]
    // Rows 4..7 are written as two 4-byte halves taken from the same registers.
    st1       {v4.s}[1], [x1], #4       // row 4: f[4..7]
    st1       {v5.s}[0], [x1], x5       //        f[8..11]
    st1       {v18.s}[1], [x1], #4      // row 5: f[5..8]
    st1       {v18.s}[2], [x1], x5      //        f[9..12]
    st1       {v16.s}[1], [x1], #4      // row 6: f[6..9]
    st1       {v16.s}[2], [x1], x5      //        f[10..13]
    st1       {v14.s}[1], [x1], #4      // row 7: f[7..10]
    st1       {v14.s}[2], [x1], x5      //        f[11..14]


end_func_diag_dl:
    pop_v_regs                          // counterpart of push_v_regs
    ret




///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_diag_dr
//*
//* @brief
//* Perform Intra prediction for  luma_8x8 mode:Diagonal_Down_Right
//*
//* @par Description:
//*  Perform Intra prediction for  luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.6
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
//                                            UWORD8 *pu1_dst,
//                                            WORD32 src_strd,
//                                              WORD32 dst_strd,
//                                              WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_diag_dr_av8

ih264_intra_pred_luma_8x8_mode_diag_dr_av8:

    // Diagonal-down-right prediction.
    //
    // Notation: s[i] = pu1_src[i]; bytes s[0..16] are read.
    //   f[i] = (s[i] + 2*s[i+1] + s[i+2] + 2) >> 2,   i = 0..14
    // Destination row r (r = 0..7) receives f[7-r] .. f[14-r].
    //
    //   x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit, widened to x3)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not read.
    //
    // x19/x20 are never used, so they are not saved.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file
    sxtw      x3, w3


    ld1       { v0.16b}, [x0]           // v0 = s[0..15]
    mov       v1.d[0], v0.d[1]          // v1 = s[8..15]
    add       x0, x0, #1
    ld1       { v2.16b}, [x0]           // v2 = s[1..16]
    mov       v3.d[0], v2.d[1]          // v3 = s[9..16]
    ext       v4.16b, v2.16b , v2.16b , #1      // v4 = s[2..16], s[1]
    mov       v5.d[0], v4.d[1]          // v5 = s[10..16], s[1]

    uaddl     v20.8h, v0.8b, v2.8b      // s[i]   + s[i+1], i = 0..7
    uaddl     v22.8h, v1.8b, v3.8b      // same for i = 8..15
    uaddl     v24.8h, v2.8b, v4.8b      // s[i+1] + s[i+2], i = 0..7
    uaddl     v26.8h, v3.8b, v5.8b      // same for i = 8..15
    add       v24.8h, v20.8h , v24.8h   // s[i] + 2*s[i+1] + s[i+2]
    add       v26.8h, v22.8h , v26.8h
    sqrshrun  v4.8b, v24.8h, #2         // f[0..7]
    sqrshrun  v5.8b, v26.8h, #2         // f[8..15] (lane 15 uses the wrapped byte s[1]; it is never stored)
    mov       v4.d[1], v5.d[0]          // v4 = f[0..15]

    sub       x5, x3, #4                // stride minus the 4 bytes already stepped in a split row
    ext       v18.16b, v4.16b , v4.16b , #15    // v18 = f rotated right by 1
    st1       {v18.d}[1], [x1], x3      // row 0: f[7..14]
    ext       v16.16b, v18.16b , v18.16b , #15  // v16 = f rotated right by 2
    st1       {v16.d}[1], [x1], x3      // row 1: f[6..13]
    ext       v14.16b, v16.16b , v16.16b , #15  // v14 = f rotated right by 3
    st1       {v14.d}[1], [x1], x3      // row 2: f[5..12]
    // Rows 3..6 are written as two 4-byte halves.
    st1       {v4.s}[1], [x1], #4       // row 3: f[4..7]
    st1       {v5.s}[0], [x1], x5       //        f[8..11]
    st1       {v18.s}[1], [x1], #4      // row 4: f[3..6]
    st1       {v18.s}[2], [x1], x5      //        f[7..10]
    st1       {v16.s}[1], [x1], #4      // row 5: f[2..5]
    st1       {v16.s}[2], [x1], x5      //        f[6..9]
    st1       {v14.s}[1], [x1], #4      // row 6: f[1..4]
    st1       {v14.s}[2], [x1], x5      //        f[5..8]
    st1       {v4.8b}, [x1], x3         // row 7: f[0..7]

end_func_diag_dr:
    pop_v_regs                          // counterpart of push_v_regs
    ret




///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_vert_r
//*
//* @brief
//* Perform Intra prediction for  luma_8x8 mode:Vertical_Right
//*
//* @par Description:
//*   Perform Intra prediction for  luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.7
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
//                                            UWORD8 *pu1_dst,
//                                            WORD32 src_strd,
//                                              WORD32 dst_strd,
//                                              WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_vert_r_av8

ih264_intra_pred_luma_8x8_mode_vert_r_av8:

    // Vertical-right prediction.
    //
    // Notation: s[i] = pu1_src[i]; bytes s[0..16] are read.
    //   F11[i]  = (s[i] + s[i+1] + 1) >> 1               (2-tap average)
    //   F121[i] = (s[i] + 2*s[i+1] + s[i+2] + 2) >> 2    (1-2-1 filter)
    // v4 holds F11[0..15] and v6 holds F121[0..15]. Lane 15 of v6 is built
    // from the wrapped byte s[1] and is never stored. The rows are assembled
    // from rotated and de-interleaved copies of these two vectors.
    //
    //   x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit, widened to x3)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not read.
    //
    // x19/x20 are never used, so they are not saved. The shuffle and store
    // sequence below is unchanged.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file
    sxtw      x3, w3

    ld1       { v0.16b}, [x0]           // v0 = s[0..15]
    mov       v1.d[0], v0.d[1]          // v1 = s[8..15]
    add       x0, x0, #1
    ld1       { v2.16b}, [x0]           // v2 = s[1..16]
    mov       v3.d[0], v2.d[1]          // v3 = s[9..16]
    ext       v4.16b, v2.16b , v2.16b , #1      // v4 = s[2..16], s[1]
    mov       v5.d[0], v4.d[1]          // v5 = s[10..16], s[1]

    uaddl     v20.8h, v0.8b, v2.8b      // s[i]   + s[i+1], i = 0..7
    uaddl     v22.8h, v1.8b, v3.8b      // same for i = 8..15
    uaddl     v24.8h, v2.8b, v4.8b      // s[i+1] + s[i+2], i = 0..7
    uaddl     v26.8h, v3.8b, v5.8b      // same for i = 8..15
    add       v24.8h, v20.8h , v24.8h   // s[i] + 2*s[i+1] + s[i+2]
    add       v26.8h, v22.8h , v26.8h

    sqrshrun  v4.8b, v20.8h, #1         // F11[0..7]
    sqrshrun  v5.8b, v22.8h, #1         // F11[8..15]
    mov       v4.d[1], v5.d[0]          // v4 = F11[0..15]
    sqrshrun  v6.8b, v24.8h, #2         // F121[0..7]
    sqrshrun  v7.8b, v26.8h, #2         // F121[8..15]
    mov       v6.d[1], v7.d[0]          // v6 = F121[0..15]

    sub       x5, x3, #6                // stride minus 6 bytes already stepped in a row
    sub       x6, x3, #4                // stride minus 4 bytes already stepped in a row
    st1       {v5.8b}, [x1], x3         // row 0: F11[8..15]
    ext       v18.16b, v6.16b , v6.16b , #15    // v18 = F121[15], F121[0..14]
    mov       v22.16b , v18.16b         // keep that rotation; v18 is overwritten by uzp2 below
    ext       v16.16b, v4.16b , v4.16b , #1     // v16 = F11[1..15], F11[0]
    st1       {v18.d}[1], [x1], x3      // row 1: F121[7..14]
    mov       v14.16b , v16.16b         // keep that rotation; v16 is overwritten below
    ext       v20.16b, v4.16b , v4.16b , #15    // v20 = F11[15], F11[0..14]
    uzp1      v17.16b, v16.16b, v18.16b // even bytes; bytes 8..15 = F121[15,1,3,5,7,9,11,13]
    uzp2      v18.16b, v16.16b, v18.16b // odd bytes;  bytes 8..15 = F121[0,2,4,6,8,10,12,14]
    mov       v16.16b , v17.16b
    ext       v12.16b, v16.16b , v16.16b , #1   // v12 = v16 rotated left by 1
    //row 2: F121[6], F11[8..14]
    st1       {v20.d}[1], [x1]          // F11[7..14]; first byte replaced by the next store
    st1       {v6.b}[6], [x1], x3       // byte 0 = F121[6]
    //row 3: F121[5], F121[7], F121[8..13]
    st1       {v12.h}[5], [x1], #2
    st1       {v6.s}[2], [x1], #4
    st1       {v6.h}[6], [x1], x5
    //row 4: F121[4], F121[6], F11[8..13]
    st1       {v18.h}[5], [x1], #2
    st1       {v4.s}[2], [x1], #4
    st1       {v4.h}[6], [x1], x5
    //row 5: F121[3], F121[5], F121[7..12]
    ext       v26.16b, v18.16b , v18.16b , #1   // v26 = v18 rotated left by 1
    st1       {v16.h}[5], [x1], #2
    st1       {v22.s}[2], [x1], #4
    st1       {v22.h}[6], [x1], x5
    //row 6: F121[2], F121[4], F121[6], F11[8], F11[9..12]
    st1       {v26.h}[4], [x1], #2
    st1       {v26.b}[10], [x1], #1
    st1       {v4.b}[8], [x1], #1
    st1       {v14.s}[2], [x1], x6
    //row 7: F121[1], F121[3], F121[5], F121[7], F121[8..11]
    st1       {v12.s}[2], [x1], #4
    st1       {v6.s}[2], [x1], #4

end_func_vert_r:
    pop_v_regs                          // counterpart of push_v_regs
    ret




///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_horz_d
//*
//* @brief
//* Perform Intra prediction for  luma_8x8 mode:Horizontal_Down
//*
//* @par Description:
//*   Perform Intra prediction for  luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.8
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
//                                            UWORD8 *pu1_dst,
//                                            WORD32 src_strd,
//                                              WORD32 dst_strd,
//                                              WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability

    .global ih264_intra_pred_luma_8x8_mode_horz_d_av8

ih264_intra_pred_luma_8x8_mode_horz_d_av8:

    // Horizontal-down prediction.
    //
    // Notation: s[i] = pu1_src[i]; bytes s[0..16] are read.
    //   F11[i]  = (s[i] + s[i+1] + 1) >> 1               (2-tap average)
    //   F121[i] = (s[i] + 2*s[i+1] + s[i+2] + 2) >> 2    (1-2-1 filter)
    //   P[k]    = the byte pair (F11[k], F121[k])
    // Destination rows:
    //   row 0: P[7], F121[8..13]
    //   row 1: P[6], P[7], F121[8..11]
    //   row 2: P[5], P[6], P[7], F121[8..9]
    //   row r (r = 3..7): P[7-r], P[8-r], P[9-r], P[10-r]
    //
    //   x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit, widened to x3)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not read.
    //
    // x19/x20 are never used, so they are not saved. The shuffle and store
    // sequence below is unchanged.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file
    sxtw      x3, w3

    ld1       { v0.16b}, [x0]           // v0 = s[0..15]
    mov       v1.d[0], v0.d[1]          // v1 = s[8..15]
    add       x0, x0, #1
    ld1       { v2.16b}, [x0]           // v2 = s[1..16]
    mov       v3.d[0], v2.d[1]          // v3 = s[9..16]
    ext       v4.16b, v2.16b , v2.16b , #1      // v4 = s[2..16], s[1]
    mov       v5.d[0], v4.d[1]          // v5 = s[10..16], s[1]

    uaddl     v20.8h, v0.8b, v2.8b      // s[i]   + s[i+1], i = 0..7
    uaddl     v22.8h, v1.8b, v3.8b      // same for i = 8..15
    uaddl     v24.8h, v2.8b, v4.8b      // s[i+1] + s[i+2], i = 0..7
    uaddl     v26.8h, v3.8b, v5.8b      // same for i = 8..15
    add       v24.8h, v20.8h , v24.8h   // s[i] + 2*s[i+1] + s[i+2]
    add       v26.8h, v22.8h , v26.8h

    sqrshrun  v4.8b, v20.8h, #1         // F11[0..7]
    sqrshrun  v5.8b, v22.8h, #1         // F11[8..15]
    mov       v4.d[1], v5.d[0]          // v4 = F11[0..15]
    sqrshrun  v6.8b, v24.8h, #2         // F121[0..7]
    sqrshrun  v7.8b, v26.8h, #2         // F121[8..15]
    mov       v6.d[1], v7.d[0]          // v6 = F121[0..15]

    mov       v8.16b, v4.16b
    mov       v10.16b, v6.16b
    sub       x6, x3, #6                // stride minus 6 bytes already stepped in a row
    trn1      v9.16b, v8.16b, v10.16b   // halfword j = P[2j]
    trn2      v10.16b, v8.16b, v10.16b  // halfword j = P[2j+1]
    mov       v8.16b, v9.16b
    mov       v12.16b, v8.16b
    mov       v14.16b, v10.16b
    sub       x5, x3, #4                // stride minus 4 bytes already stepped in a row
    trn1      v13.8h, v12.8h, v14.8h    // halfwords = P[0], P[1], P[4], P[5], P[8], P[9], P[12], P[13]
    trn2      v14.8h, v12.8h, v14.8h    // halfwords = P[2], P[3], P[6], P[7], P[10], P[11], P[14], P[15]
    mov       v12.16b, v13.16b
    ext       v16.16b, v6.16b , v6.16b , #14    // v16 = F121[14], F121[15], F121[0..13]
    //ROW 0: P[7], F121[8..13]
    st1       {v16.d}[1], [x1]          // F121[6..13]; first two bytes replaced by the next store
    st1       {v10.h}[3], [x1], x3      // bytes 0..1 = P[7]

    //ROW 1: P[6], P[7], F121[8..11]
    st1       {v14.s}[1], [x1], #4
    st1       {v6.s}[2], [x1], x5
    //ROW 2: P[5], P[6], P[7], F121[8..9]
    st1       {v10.h}[2], [x1], #2
    st1       {v14.s}[1], [x1], #4
    st1       {v7.h}[0], [x1], x6
    //ROW 3: P[4], P[5], P[6], P[7]
    st1       {v12.s}[1], [x1], #4
    st1       {v14.s}[1], [x1], x5
    //ROW 4: P[3], P[4], P[5], P[6]
    st1       {v14.h}[1], [x1], #2
    st1       {v12.s}[1], [x1], #4
    st1       {v14.h}[2], [x1], x6
    //ROW 5: P[2], P[3], P[4], P[5]
    st1       {v14.s}[0], [x1], #4
    st1       {v12.s}[1], [x1], x5
    //ROW 6: P[1], P[2], P[3], P[4]
    st1       {v10.h}[0], [x1], #2
    st1       {v8.h}[1], [x1], #2
    st1       {v14.h}[1], [x1], #2
    st1       {v12.h}[2], [x1], x6
    //ROW 7: P[0], P[1], P[2], P[3]
    st1       {v12.s}[0], [x1], #4
    st1       {v14.s}[0], [x1], x5

end_func_horz_d:
    pop_v_regs                          // counterpart of push_v_regs
    ret





///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_vert_l
//*
//* @brief
//*  Perform Intra prediction for  luma_8x8 mode:Vertical_Left
//*
//* @par Description:
//*   Perform Intra prediction for  luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.9
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
//                                            UWORD8 *pu1_dst,
//                                            WORD32 src_strd,
//                                              WORD32 dst_strd,
//                                              WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_vert_l_av8

ih264_intra_pred_luma_8x8_mode_vert_l_av8:

    // Vertical-left prediction.
    //
    // Notation: t[i] = pu1_src[9 + i], i = 0..15 (16 bytes, one load).
    //   F11[i]  = (t[i] + t[i+1] + 1) >> 1               (2-tap average)
    //   F121[i] = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2    (1-2-1 filter)
    // Destination rows:
    //   row 2k   (k = 0..3): F11[k]  .. F11[k+7]
    //   row 2k+1 (k = 0..3): F121[k] .. F121[k+7]
    // so only F11[0..10] and F121[0..10] (inputs t[0..12]) are ever stored.
    //
    //   x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit, widened to x3)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not read.
    //
    // The shifted copies t[i+1] and t[i+2] are derived from the single load
    // with EXT. A second 16-byte load at pu1_src + 10 would touch
    // pu1_src[25], one byte past the last byte (pu1_src[24]) that the
    // diagonal-down-left routine in this file reads. The wrapped-around
    // lanes that EXT produces (lanes 14/15) only feed F11/F121 lanes >= 14,
    // which are never stored.
    // NOTE(review): the size of the pu1_src buffer is not visible here; confirm with the caller.
    //
    // x19/x20 are never used, so they are not saved.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file
    sxtw      x3, w3
    add       x0, x0, #9                // x0 -> t[0]
    ld1       { v0.16b}, [x0]           // v0 = t[0..15]
    mov       v1.d[0], v0.d[1]          // v1 = t[8..15]
    ext       v2.16b, v0.16b , v0.16b , #1      // v2 = t[1..15], t[0]
    mov       v3.d[0], v2.d[1]          // v3 = t[9..15], t[0]
    ext       v4.16b, v2.16b , v2.16b , #1      // v4 = t[2..15], t[0], t[1]
    mov       v5.d[0], v4.d[1]          // v5 = t[10..15], t[0], t[1]
    uaddl     v20.8h, v0.8b, v2.8b      // t[i]   + t[i+1], i = 0..7
    uaddl     v22.8h, v1.8b, v3.8b      // same for i = 8..15
    uaddl     v24.8h, v2.8b, v4.8b      // t[i+1] + t[i+2], i = 0..7
    uaddl     v26.8h, v3.8b, v5.8b      // same for i = 8..15
    add       v24.8h, v20.8h , v24.8h   // t[i] + 2*t[i+1] + t[i+2]
    add       v26.8h, v22.8h , v26.8h

    sqrshrun  v4.8b, v20.8h, #1         // F11[0..7]
    sqrshrun  v5.8b, v22.8h, #1         // F11[8..15]
    mov       v4.d[1], v5.d[0]          // v4 = F11[0..15]
    sqrshrun  v6.8b, v24.8h, #2         // F121[0..7]
    ext       v8.16b, v4.16b , v4.16b , #1      // v8 = F11 rotated left by 1
    sqrshrun  v7.8b, v26.8h, #2         // F121[8..15]
    mov       v6.d[1], v7.d[0]          // v6 = F121[0..15]

    ext       v10.16b, v6.16b , v6.16b , #1     // v10 = F121 rotated left by 1
    //ROW 0,1
    st1       {v4.8b}, [x1], x3         // F11[0..7]
    st1       {v6.8b}, [x1], x3         // F121[0..7]

    ext       v12.16b, v8.16b , v8.16b , #1     // F11 rotated left by 2
    ext       v14.16b, v10.16b , v10.16b , #1   // F121 rotated left by 2
    //ROW 2,3
    st1       {v8.8b}, [x1], x3         // F11[1..8]
    st1       {v10.8b}, [x1], x3        // F121[1..8]

    ext       v16.16b, v12.16b , v12.16b , #1   // F11 rotated left by 3
    ext       v18.16b, v14.16b , v14.16b , #1   // F121 rotated left by 3
    //ROW 4,5
    st1       {v12.8b}, [x1], x3        // F11[2..9]
    st1       {v14.8b}, [x1], x3        // F121[2..9]
    //ROW 6,7
    st1       {v16.8b}, [x1], x3        // F11[3..10]
    st1       {v18.8b}, [x1], x3        // F121[3..10]

end_func_vert_l:
    pop_v_regs                          // counterpart of push_v_regs
    ret





///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_8x8_mode_horz_u
//*
//* @brief
//*     Perform Intra prediction for  luma_8x8 mode:Horizontal_Up
//*
//* @par Description:
//*      Perform Intra prediction for  luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.10
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                             WORD32 dst_strd,
//                                             WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ui_neighboravailability

    .global ih264_intra_pred_luma_8x8_mode_horz_u_av8

ih264_intra_pred_luma_8x8_mode_horz_u_av8:

    // Horizontal-up prediction.
    //
    // Notation: s[i] = pu1_src[i]; only s[0..7] are read.
    //   F11[i]  = (s[i] + s[i+1] + 1) >> 1               valid for i = 0..6
    //   F121[i] = (s[i] + 2*s[i+1] + s[i+2] + 2) >> 2    valid for i = 0..5
    //   F11[15]  = s[0]
    //   F121[15] = (3*s[0] + s[1] + 2) >> 2
    // Bytes 8..14 of v0 come from lanes of v1 that this routine never writes,
    // so the remaining F11/F121 lanes hold unspecified values.
    //
    // A 16-byte index table at ih264_gai1_intrapred_luma_8x8_horz_u (defined
    // outside this file) selects bytes from {F11, F121} via TBL into a
    // 16-byte sequence q[0..15]; q[j] for j >= 16 is s[0].
    // Destination row r (r = 0..7) receives q[2r] .. q[2r+7].
    // NOTE(review): the table is presumed to index only the valid lanes listed above; confirm against its definition.
    //
    //   x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit, widened to x3)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not read.
    //
    // x19/x20 are never used, so they are not saved.

    push_v_regs                         // project macro from ih264_neon_macros.s; body not visible in this file
    sxtw      x3, w3

    ld1       {v0.8b}, [x0]             // v0 low half = s[0..7]
    ld1       {v1.b}[7], [x0]           // v1 lane 7 = s[0]; other lanes of v1 are left as they were
    mov       v0.d[1], v1.d[0]          // v0 lane 15 = s[0]
    ext       v2.16b, v0.16b , v0.16b , #1      // v0 rotated left by 1 (lanes 14 and 15 = s[0])
    mov       v3.d[0], v2.d[1]
    ext       v4.16b, v2.16b , v2.16b , #1      // v0 rotated left by 2 (lane 15 = s[1])
    mov       v5.d[0], v4.d[1]

    // Fetch the table address through the GOT.
    adrp      x12, :got:ih264_gai1_intrapred_luma_8x8_horz_u
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_luma_8x8_horz_u]
    uaddl     v20.8h, v0.8b, v2.8b      // a[i] + a[i+1], lanes 0..7
    uaddl     v22.8h, v1.8b, v3.8b      // same for lanes 8..15
    uaddl     v24.8h, v2.8b, v4.8b      // a[i+1] + a[i+2], lanes 0..7
    uaddl     v26.8h, v3.8b, v5.8b      // same for lanes 8..15
    add       v24.8h, v20.8h , v24.8h   // a[i] + 2*a[i+1] + a[i+2]
    add       v26.8h, v22.8h , v26.8h
    ld1       { v10.16b}, [x12]         // 16 table indices
    mov       v11.d[0], v10.d[1]        // upper 8 indices in their own register
    sqrshrun  v4.8b, v20.8h, #1         // F11 lanes 0..7
    sqrshrun  v5.8b, v22.8h, #1         // F11 lanes 8..15
    mov       v4.d[1], v5.d[0]          // v4 = F11
    sqrshrun  v6.8b, v24.8h, #2         // F121 lanes 0..7
    sqrshrun  v7.8b, v26.8h, #2         // F121 lanes 8..15
    mov       v6.d[1], v7.d[0]          // v6 = F121

    mov       v30.16b, v4.16b           // TBL needs two consecutive table registers
    mov       v31.16b, v6.16b
    tbl       v12.8b, {v30.16b, v31.16b}, v10.8b        // q[0..7]
    dup       v14.16b, v5.b[7]          // 16 copies of F11[15] = s[0]
    tbl       v13.8b, {v30.16b, v31.16b}, v11.8b        // q[8..15]
    mov       v12.d[1], v13.d[0]        // v12 = q[0..15]
    ext       v16.16b, v12.16b , v14.16b , #2   // q[2..15] followed by s[0] x2
    ext       v18.16b, v16.16b , v14.16b , #2   // q[4..15] followed by s[0] x4
    st1       {v12.8b}, [x1], x3        //0: q[0..7]
    ext       v20.16b, v18.16b , v14.16b , #2   // q[6..15] followed by s[0] x6
    st1       {v16.8b}, [x1], x3        //1: q[2..9]
    st1       {v18.8b}, [x1], x3        //2: q[4..11]
    st1       {v20.8b}, [x1], x3        //3: q[6..13]
    st1       {v13.8b}, [x1], x3        //4: q[8..15]
    st1       {v16.d}[1], [x1], x3      //5: q[10..15], s[0] x2
    st1       {v18.d}[1], [x1], x3      //6: q[12..15], s[0] x4
    st1       {v20.d}[1], [x1], x3      //7: q[14..15], s[0] x6


end_func_horz_u:
    pop_v_regs                          // counterpart of push_v_regs
    ret