@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/*
@//----------------------------------------------------------------------------
@// File Name            : impeg2_inter_pred.s
@//
@// Description          : This file has motion compensation related
@//                        interpolation functions on Neon + CortexA-8 platform
@//
@// Reference Document   :
@//
@// Revision History     :
@//      Date            Author                  Detail Description
@//   ------------    ----------------    ----------------------------------
@//   18 jun 2010     S Hamsalekha              Created
@//
@//-------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Include Files
@// ----------------------------------------------------------------------------
@*/
.text
.p2align 2

@/*
@// ----------------------------------------------------------------------------
@// Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Static Global Data section variables
@//
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Exported functions
@// ----------------------------------------------------------------------------
@*/

@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_copy_mb_a9q()
@//
@// Detail Description : Copies one MB worth of data from src to dst:
@//                      16 rows of 16 bytes through the first plane pointer,
@//                      then 8 rows of 8 bytes through each of the second
@//                      and third plane pointers. The two 8x8 planes are
@//                      stepped with half of the strides passed in r2/r3.
@//
@// Inputs             : r0 - pointer to src; the y, u and v plane pointers
@//                           are read from byte offsets 0, 4 and 8
@//                      r1 - pointer to dst; same layout as src
@//                      r2 - source width (row stride of the 16x16 plane)
@//                      r3 - destination width (row stride of the 16x16 plane)
@//
@// Registers Used     : r4, r5, d0, d1 (r2 and r3 are halved in place)
@//
@// Stack Usage        : 12 bytes
@//
@// Outputs            :
@//
@// Return Data        : None
@//
@// Programming Note   : The row copies are fully unrolled at assembly time
@//                      with .rept; no run-time loop counter is used.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_copy_mb_a9q

impeg2_copy_mb_a9q:

    stmfd           r13!, {r4, r5, r14}

    @ ---- y plane: 16 rows x 16 bytes -------------------------------------
    ldr             r4, [r0]                @ r4 = src->y
    ldr             r5, [r1]                @ r5 = dst->y

    .rept           16
    vld1.8          {d0, d1}, [r4], r2      @ fetch one row, step src by r2
    vst1.8          {d0, d1}, [r5], r3      @ write one row, step dst by r3
    .endr

    @ ---- the two 8x8 planes use half the stride --------------------------
    mov             r2, r2, lsr #1          @ src stride / 2
    mov             r3, r3, lsr #1          @ dst stride / 2

    @ ---- u plane: 8 rows x 8 bytes ---------------------------------------
    ldr             r4, [r0, #4]            @ r4 = src->u
    ldr             r5, [r1, #4]            @ r5 = dst->u

    .rept           8
    vld1.8          {d0}, [r4], r2          @ fetch one row, step src by r2
    vst1.8          {d0}, [r5], r3          @ write one row, step dst by r3
    .endr

    @ ---- v plane: 8 rows x 8 bytes ---------------------------------------
    ldr             r4, [r0, #8]            @ r4 = src->v
    ldr             r5, [r1, #8]            @ r5 = dst->v

    .rept           8
    vld1.8          {d0}, [r4], r2          @ fetch one row, step src by r2
    vst1.8          {d0}, [r5], r3          @ write one row, step dst by r3
    .endr

    ldmfd           r13!, {r4, r5, pc}      @ restore and return


@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_fullx_halfy_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution.
@//
@// Inputs             : r0 - out      : Current Block Pointer
@//                      r1 - ref      : Reference Block Pointer
@//                      r2 - ref_wid  : Reference Block Width
@//                      r3 - out_wid  : Current Block Width
@//
@// Registers Used     : r14, d0-d7, d16, d17
@//
@// Stack Usage        : 4 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Every output row is the rounded average
@//                      (a + b + 1) >> 1 of two vertically adjacent
@//                      reference rows, so 8 + 1 reference rows of 8 bytes
@//                      are read. r2 and r3 are doubled in place.
@//                      Only caller-saved NEON registers are used; d8-d15
@//                      are callee-saved under the AAPCS and must not be
@//                      modified without being saved first.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_fullx_halfy_8x8_a9q

impeg2_mc_fullx_halfy_8x8_a9q:

    stmfd           r13!, {r14}             @ r14 is reused as a second pointer

    add             r14, r1, r2             @ r14 -> reference row 2
    mov             r2, r2, lsl #1          @ each pointer now steps two rows

    @ Load 8 + 1 rows from the reference block. r1 walks rows 1,3,5,7,9 and
    @ r14 walks rows 2,4,6,8. vrhadd.u8 is a rounding halving add, i.e.
    @ (a + b + 1) >> 1 per byte.
    vld1.8          {d0}, [r1], r2          @ row 1 -> d0
    vld1.8          {d2}, [r14], r2         @ row 2 -> d2
    vld1.8          {d4}, [r1], r2          @ row 3 -> d4
    vld1.8          {d6}, [r14], r2         @ row 4 -> d6
    vld1.8          {d1}, [r1], r2          @ row 5 -> d1
    vld1.8          {d3}, [r14], r2         @ row 6 -> d3
    vrhadd.u8       d17, d1, d6             @ out row 4 = avg(row 5, row 4); must
                                            @ run before q0 (d1) is overwritten
    vld1.8          {d5}, [r1], r2          @ row 7 -> d5
    vrhadd.u8       q0, q0, q1              @ d0 = out row 1, d1 = out row 5
    vld1.8          {d7}, [r14], r2         @ row 8 -> d7
    vrhadd.u8       q1, q1, q2              @ d2 = out row 2, d3 = out row 6
    vld1.8          {d16}, [r1], r2         @ row 9 -> d16
    vrhadd.u8       q2, q2, q3              @ d4 = out row 3, d5 = out row 7

    add             r14, r0, r3             @ r14 -> output row 2
    mov             r3, r3, lsl #1          @ each pointer now steps two rows

    @ Store the eight rows calculated above: r0 writes the odd rows and
    @ r14 writes the even rows.
    vst1.8          {d2}, [r14], r3         @ out row 2
    vrhadd.u8       d7, d7, d16             @ out row 8 = avg(row 8, row 9)
    vst1.8          {d0}, [r0], r3          @ out row 1
    vst1.8          {d17}, [r14], r3        @ out row 4
    vst1.8          {d4}, [r0], r3          @ out row 3
    vst1.8          {d3}, [r14], r3         @ out row 6
    vst1.8          {d1}, [r0], r3          @ out row 5
    vst1.8          {d7}, [r14], r3         @ out row 8
    vst1.8          {d5}, [r0], r3          @ out row 7

    ldmfd           sp!, {pc}               @ return


@/*
@//---------------------------------------------------------------------------
@// Function Name      :
@//                      impeg2_mc_halfx_fully_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution and VopRoundingType is 0.
@//                      Each output byte is the rounded average
@//                      (a + b + 1) >> 1 of a reference byte and its right
@//                      neighbour.
@//
@// Inputs             : r0 - out      : Current Block Pointer
@//                      r1 - ref      : Reference Block Pointer
@//                      r2 - ref_wid  : Reference Block Width
@//                      r3 - out_wid  : Current Block Width
@//
@// Registers Used     : r12, r14, d0-d7, d16-d18, d20-d22, d24-d26, d28-d30
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : 16 bytes are loaded from each of the 8 reference
@//                      rows although only the first 9 are used.
@//                      Only caller-saved NEON registers are used; d8-d15
@//                      are callee-saved under the AAPCS and must not be
@//                      modified without being saved first.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_halfx_fully_8x8_a9q

impeg2_mc_halfx_fully_8x8_a9q:

    stmfd           sp!, {r12, lr}

    add             r14, r1, r2, lsl #2     @ r14 -> reference row 5
    add             r12, r0, r3, lsl #2     @ r12 -> output row 5

    @ Rows 1, 5, 2, 6: bytes 0..7 stay in the low d register of the pair,
    @ bytes 1..8 are extracted into the low half of a second q register.
    vld1.8          {d0, d1}, [r1], r2      @ row 1, 16 bytes
    vld1.8          {d2, d3}, [r14], r2     @ row 5
    vld1.8          {d4, d5}, [r1], r2      @ row 2
    vld1.8          {d6, d7}, [r14], r2     @ row 6

    vext.8          d16, d0, d1, #1         @ row 1 bytes 1..8 -> q8 low
    vext.8          d20, d2, d3, #1         @ row 5 bytes 1..8 -> q10 low
    vext.8          d24, d4, d5, #1         @ row 2 bytes 1..8 -> q12 low
    vext.8          d28, d6, d7, #1         @ row 6 bytes 1..8 -> q14 low

    @ Rows 3, 7, 4, 8: bytes 0..7 land in the high half of q8/q10/q12/q14,
    @ bytes 1..8 are extracted into the high half of q0..q3. Each q pair then
    @ holds two rows and one vrhadd.u8 averages both.
    vld1.8          {d17, d18}, [r1], r2    @ row 3
    vld1.8          {d21, d22}, [r14], r2   @ row 7
    vld1.8          {d25, d26}, [r1], r2    @ row 4
    vld1.8          {d29, d30}, [r14], r2   @ row 8

    vext.8          d1, d17, d18, #1        @ row 3 bytes 1..8 -> q0 high
    vext.8          d3, d21, d22, #1        @ row 7 bytes 1..8 -> q1 high
    vext.8          d5, d25, d26, #1        @ row 4 bytes 1..8 -> q2 high
    vext.8          d7, d29, d30, #1        @ row 8 bytes 1..8 -> q3 high

    vrhadd.u8       q0, q0, q8              @ d0 = out row 1, d1 = out row 3
    vrhadd.u8       q1, q1, q10             @ d2 = out row 5, d3 = out row 7
    vrhadd.u8       q2, q2, q12             @ d4 = out row 2, d5 = out row 4
    vrhadd.u8       q3, q3, q14             @ d6 = out row 6, d7 = out row 8

    vst1.8          {d0}, [r0], r3          @ out row 1
    vst1.8          {d2}, [r12], r3         @ out row 5
    vst1.8          {d4}, [r0], r3          @ out row 2
    vst1.8          {d6}, [r12], r3         @ out row 6
    vst1.8          {d1}, [r0], r3          @ out row 3
    vst1.8          {d3}, [r12], r3         @ out row 7
    vst1.8          {d5}, [r0], r3          @ out row 4
    vst1.8          {d7}, [r12], r3         @ out row 8

    ldmfd           sp!, {r12, pc}          @ restore and return


@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_halfx_halfy_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution and VopRoundingType is 0.
@//                      Each output byte is (a + b + c + d + 2) >> 2 over
@//                      the 2x2 neighbourhood of reference bytes.
@//
@// Inputs             : r0 - out      : Current Block Pointer
@//                      r1 - ref      : Reference Block Pointer
@//                      r2 - ref_wid  : Reference Block Width
@//                      r3 - out_wid  : Current Block Width
@//
@// Registers Used     : r14, q0-q15
@//
@// Stack Usage        : 68 bytes (4 for r14 + 64 for d8-d15)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : 16 bytes are loaded from each of the 9 reference
@//                      rows although only the first 9 are used.
@//                      All sixteen q registers are needed, so d8-d15
@//                      (q4-q7), which are callee-saved under the AAPCS,
@//                      are saved on entry and restored before returning.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_halfx_halfy_8x8_a9q

impeg2_mc_halfx_halfy_8x8_a9q:

    stmfd           sp!, {r14}              @ r14 is reused as a second pointer
    vpush           {d8-d15}                @ preserve callee-saved q4-q7

    add             r14, r1, r2, lsl #2     @ r14 -> reference row 5

    vld1.8          {d0, d1}, [r1], r2      @ row 1, 16 bytes
    vld1.8          {d2, d3}, [r14], r2     @ row 5
    vld1.8          {d4, d5}, [r1], r2      @ row 2
    vld1.8          {d6, d7}, [r14], r2     @ row 6

    @ After each vext the low d register holds bytes 0..7 of the row and
    @ the high d register holds bytes 1..8.
    vext.8          d1, d0, d1, #1          @ row 1 bytes 1..8
    vext.8          d3, d2, d3, #1          @ row 5 bytes 1..8
    vext.8          d5, d4, d5, #1          @ row 2 bytes 1..8
    vext.8          d7, d6, d7, #1          @ row 6 bytes 1..8

    vld1.8          {d8, d9}, [r1], r2      @ row 3
    vld1.8          {d10, d11}, [r14], r2   @ row 7
    vld1.8          {d12, d13}, [r1], r2    @ row 4
    vld1.8          {d14, d15}, [r14], r2   @ row 8

    vext.8          d9, d8, d9, #1          @ row 3 bytes 1..8
    vld1.8          {d16, d17}, [r14], r2   @ row 9
    vext.8          d11, d10, d11, #1       @ row 7 bytes 1..8
    vext.8          d13, d12, d13, #1       @ row 4 bytes 1..8
    vext.8          d15, d14, d15, #1       @ row 8 bytes 1..8
    vext.8          d17, d16, d17, #1       @ row 9 bytes 1..8

    @ Horizontal step: widen to 16 bit and add each byte to its right
    @ neighbour. qN now holds the eight pair sums of one row.
    vaddl.u8        q0, d0, d1              @ row 1
    vaddl.u8        q1, d2, d3              @ row 5
    vaddl.u8        q2, d4, d5              @ row 2
    vaddl.u8        q3, d6, d7              @ row 6
    vaddl.u8        q4, d8, d9              @ row 3
    vaddl.u8        q5, d10, d11            @ row 7
    vaddl.u8        q6, d12, d13            @ row 4
    vaddl.u8        q7, d14, d15            @ row 8
    vaddl.u8        q8, d16, d17            @ row 9

    @ Vertical step: add the sums of adjacent rows, then narrow with a
    @ rounding shift right by 2.
    add             r14, r0, r3, lsl #2     @ r14 -> output row 5

    vadd.u16        q9, q0, q2              @ rows 1 + 2
    vadd.u16        q13, q1, q3             @ rows 5 + 6
    vadd.u16        q10, q2, q4             @ rows 2 + 3
    vadd.u16        q14, q3, q5             @ rows 6 + 7

    vrshrn.u16      d18, q9, #2             @ out row 1
    vrshrn.u16      d26, q13, #2            @ out row 5
    vrshrn.u16      d20, q10, #2            @ out row 2
    vrshrn.u16      d28, q14, #2            @ out row 6

    vadd.u16        q11, q4, q6             @ rows 3 + 4
    vst1.8          {d18}, [r0], r3         @ store out row 1
    vadd.u16        q15, q5, q7             @ rows 7 + 8
    vst1.8          {d26}, [r14], r3        @ store out row 5
    vadd.u16        q12, q6, q1             @ rows 4 + 5
    vst1.8          {d20}, [r0], r3         @ store out row 2
    vadd.u16        q7, q7, q8              @ rows 8 + 9
    vst1.8          {d28}, [r14], r3        @ store out row 6

    vrshrn.u16      d22, q11, #2            @ out row 3
    vrshrn.u16      d30, q15, #2            @ out row 7
    vrshrn.u16      d24, q12, #2            @ out row 4
    vrshrn.u16      d14, q7, #2             @ out row 8

    vst1.8          {d22}, [r0], r3         @ store out row 3
    vst1.8          {d30}, [r14], r3        @ store out row 7
    vst1.8          {d24}, [r0], r3         @ store out row 4
    vst1.8          {d14}, [r14], r3        @ store out row 8

    vpop            {d8-d15}                @ restore callee-saved q4-q7
    ldmfd           sp!, {pc}               @ return


@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_fullx_fully_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a full pel resolution in both directions.
@//
@// Inputs             : r0 - out      : Current Block Pointer
@//                      r1 - ref      : Reference Block Pointer
@//                      r2 - ref_wid  : Reference Block Width
@//                      r3 - out_wid  : Current Block Width
@//
@// Registers Used     : r12, r14, d0-d3
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Plain 8x8 byte copy. Rows 1-4 go through r1/r0 and
@//                      rows 5-8 through r14/r12, four rows further down.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_fullx_fully_8x8_a9q

impeg2_mc_fullx_fully_8x8_a9q:

    stmfd           sp!, {r12, lr}

    add             r14, r1, r2, lsl #2     @ r14 -> reference row 5
    add             r12, r0, r3, lsl #2     @ r12 -> output row 5

    vld1.8          {d0}, [r1], r2          @ load row 1
    vld1.8          {d1}, [r14], r2         @ load row 5
    vld1.8          {d2}, [r1], r2          @ load row 2
    vld1.8          {d3}, [r14], r2         @ load row 6

    vst1.8          {d0}, [r0], r3          @ store row 1
    vst1.8          {d1}, [r12], r3         @ store row 5
    vst1.8          {d2}, [r0], r3          @ store row 2
    vst1.8          {d3}, [r12], r3         @ store row 6

    vld1.8          {d0}, [r1], r2          @ load row 3
    vld1.8          {d1}, [r14], r2         @ load row 7
    vld1.8          {d2}, [r1], r2          @ load row 4
    vld1.8          {d3}, [r14], r2         @ load row 8

    vst1.8          {d0}, [r0], r3          @ store row 3
    vst1.8          {d1}, [r12], r3         @ store row 7
    vst1.8          {d2}, [r0], r3          @ store row 4
    vst1.8          {d3}, [r12], r3         @ store row 8

    ldmfd           sp!, {r12, pc}          @ restore and return


@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_interpolate_a9q()
@//
@// Detail Description : Averages two source buffers byte by byte with
@//                      rounding, (a + b + 1) >> 1, and writes the result
@//                      to the destination: 16 rows of 16 bytes for the
@//                      first plane, then 8 rows of 8 bytes for each of the
@//                      second and third planes.
@//
@// Inputs             : r0 - pointer to src1; the y, u and v plane pointers
@//                           are read from byte offsets 0, 4 and 8
@//                      r1 - pointer to src2; same layout as src1
@//                      r2 - dest buf; same layout as src1
@//                      r3 - dst stride (halved in place for the two 8x8
@//                           planes)
@//
@// Registers Used     : r4, r5, r7, r12, r14, d0-d7, d16-d23
@//
@// Stack Usage        : 20 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Both sources are read contiguously (post-increment
@//                      by the transfer size); only the destination is
@//                      strided.
@//                      Only caller-saved NEON registers are used; d8-d15
@//                      are callee-saved under the AAPCS and must not be
@//                      modified without being saved first.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_interpolate_a9q

impeg2_interpolate_a9q:

    stmfd           r13!, {r4, r5, r7, r12, r14}

    ldr             r4, [r0, #0]            @ r4 = src1 y pointer
    ldr             r5, [r1, #0]            @ r5 = src2 y pointer
    ldr             r7, [r2, #0]            @ r7 = dst  y pointer
    mov             r12, #4                 @ 4 passes x 4 rows = 16 rows

interp_lumablocks_stride:

    vld1.8          {d0, d1}, [r4]!         @ src1 row n
    vld1.8          {d2, d3}, [r4]!         @ src1 row n + 1
    vld1.8          {d4, d5}, [r4]!         @ src1 row n + 2
    vld1.8          {d6, d7}, [r4]!         @ src1 row n + 3

    vld1.8          {d16, d17}, [r5]!       @ src2 row n
    vld1.8          {d18, d19}, [r5]!       @ src2 row n + 1
    vld1.8          {d20, d21}, [r5]!       @ src2 row n + 2
    vld1.8          {d22, d23}, [r5]!       @ src2 row n + 3

    vrhadd.u8       q0, q0, q8              @ average row n
    vrhadd.u8       q1, q1, q9              @ average row n + 1
    vrhadd.u8       q2, q2, q10             @ average row n + 2
    vrhadd.u8       q3, q3, q11             @ average row n + 3

    vst1.8          {d0, d1}, [r7], r3      @ dst row n
    vst1.8          {d2, d3}, [r7], r3      @ dst row n + 1
    vst1.8          {d4, d5}, [r7], r3      @ dst row n + 2
    vst1.8          {d6, d7}, [r7], r3      @ dst row n + 3

    subs            r12, r12, #1
    bne             interp_lumablocks_stride

    mov             r3, r3, lsr #1          @ stride >> 1 for the 8x8 planes

    ldr             r4, [r0, #4]            @ r4 = src1 u pointer
    ldr             r5, [r1, #4]            @ r5 = src2 u pointer
    ldr             r7, [r2, #4]            @ r7 = dst  u pointer
    mov             r12, #2                 @ two passes: u, then v

    @ One pass handles a whole 8x8 plane. Every 16-byte load covers two
    @ contiguous 8-byte source rows.
interp_chromablocks_stride:

    vld1.8          {d0, d1}, [r4]!         @ src1 rows 1 & 2
    vld1.8          {d2, d3}, [r4]!         @ src1 rows 3 & 4
    vld1.8          {d4, d5}, [r4]!         @ src1 rows 5 & 6
    vld1.8          {d6, d7}, [r4]!         @ src1 rows 7 & 8

    vld1.8          {d16, d17}, [r5]!       @ src2 rows 1 & 2
    vld1.8          {d18, d19}, [r5]!       @ src2 rows 3 & 4
    vld1.8          {d20, d21}, [r5]!       @ src2 rows 5 & 6
    vld1.8          {d22, d23}, [r5]!       @ src2 rows 7 & 8

    vrhadd.u8       q0, q0, q8              @ average rows 1 & 2
    vrhadd.u8       q1, q1, q9              @ average rows 3 & 4
    vrhadd.u8       q2, q2, q10             @ average rows 5 & 6
    vrhadd.u8       q3, q3, q11             @ average rows 7 & 8

    vst1.8          {d0}, [r7], r3          @ dst row 1
    vst1.8          {d1}, [r7], r3          @ dst row 2
    vst1.8          {d2}, [r7], r3          @ dst row 3
    vst1.8          {d3}, [r7], r3          @ dst row 4
    vst1.8          {d4}, [r7], r3          @ dst row 5
    vst1.8          {d5}, [r7], r3          @ dst row 6
    vst1.8          {d6}, [r7], r3          @ dst row 7
    vst1.8          {d7}, [r7], r3          @ dst row 8

    @ Switch to the v pointers for the second pass. They are reloaded
    @ (unused) after the second pass as well.
    ldr             r4, [r0, #8]            @ r4 = src1 v pointer
    ldr             r5, [r1, #8]            @ r5 = src2 v pointer
    ldr             r7, [r2, #8]            @ r7 = dst  v pointer

    subs            r12, r12, #1
    bne             interp_chromablocks_stride

    ldmfd           r13!, {r4, r5, r7, r12, pc} @ restore and return