/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <machine/cpu-features.h>

    .text
    .align

    .global jpeg_idct_ifast
    .func   jpeg_idct_ifast

// -----------------------------------------------------------------------
// jpeg_idct_ifast (j_decompress_ptr cinfo,
//                  jpeg_component_info * compptr,
//                  short* coef_block,
//                  unsigned char* output_buf,
//                  int output_col)
//
// 8x8 inverse DCT with dequantization, done in two passes:
//   pass 1 (VLoop*): walks the 8 columns of coef_block, multiplies each
//                    coefficient by its quantization table entry and writes
//                    32-bit intermediate results into an 8x8 int workspace
//                    on the stack.
//   pass 2 (HLoop*): walks the 8 rows of the workspace, shifts the results
//                    right by 5, adds 128, clamps to 0..255 and stores
//                    8 bytes per row at (*output_buf++ + output_col).
//
// In:    r0 = cinfo, r1 = compptr, r2 = coef_block, r3 = output_buf,
//        [sp, #0] on entry = output_col
// Out:   none in registers; the pixels are written through output_buf
// Saves: r4-r12 and lr are pushed on entry and popped on exit
// Stack: local_SIZE bytes of locals (32 bytes of scalars + 8*8*4 workspace)
//
// Both passes have a shortcut for the case where elements 1..7 of the
// current column/row are all zero (VLoopHeadZero / HLoopTailZero).
//
// The C-like comments inside the loops use the variable names of the C
// routine this code was derived from (presumably libjpeg jidctfst.c --
// TODO confirm).
//
// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15
// -----------------------------------------------------------------------

// Stack frame layout, relative to sp after the prologue.
#define  local_TMP0123       sp
#define  local_TMP0          [sp, #0]
#define  local_TMP1          [sp, #4]
#define  local_TMP2          [sp, #8]
#define  local_TMP3          [sp, #12]
#define  local_RANGE_TABLE   [sp, #16]
#define  local_OUTPUT_COL    [sp, #20]
#define  local_OUTPUT_BUF    [sp, #24]
#define  local_UNUSED        [sp, #28]
#define  off_WORKSPACE       32
// Fixed: this macro used to reference the undefined name offWORKSPACE.
#define  local_WORKSPACE     [sp, #off_WORKSPACE]
#define  local_SIZE          (off_WORKSPACE + 8*8*4)

// Hard-coded byte offsets into *cinfo and *compptr.
// NOTE(review): these must match the struct layouts of the C library this
// file is built with -- verify whenever those structs change.
#define  off_DECOMPRESS_range_limit_base  324
#define  off_COMPINFO_quanttable          80

#define  DCTSIZE   8
#define  VY(x)   ((x)*DCTSIZE*2)    // byte offset of row x in a column of 16-bit coefficients
#define  QY(x)   ((x)*DCTSIZE*4)    // byte offset of row x in a column of 32-bit words
#define  VX(x)   ((x)*2)
#define  QX(x)   ((x)*4)

// Multipliers scaled by 256 (362/256 ~= 1.414, and so on); every product
// below is shifted right by 8. These macros are not referenced by the code:
// each constant is built in a register with mov/add or folded into an mla.
#define  FIX_1_414213562    #362
#define  FIX_1_082392200    #277
#define  FIX_1_847759065    #473
#define  FIX_2_613125930    #669

#define  RANGE_MASK   1023

jpeg_idct_ifast:
    // PLD is not defined in this file; presumably a preload macro supplied
    // by <machine/cpu-features.h>.
    PLD     (r2, #0)
    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    ldr     r4, [sp, #4*10]             // r4 = output_col (5th argument, above the 10 pushed words)
    sub     sp, #local_SIZE

    ldr     r10,[r1, #off_COMPINFO_quanttable]  // r10 = quanttable
    str     r4, local_OUTPUT_COL
    str     r3, local_OUTPUT_BUF
    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
    add     r5, r5, #128                // loaded pointer + 128 (presumably the table centre -- TODO confirm)
    str     r5, local_RANGE_TABLE
    mov     fp, r2                      // fp = coef_block
    add     ip, sp, #off_WORKSPACE      // ip = workspace

// ---------------------------------------------------------------------
// Pass 1: one iteration per column.
//   fp  = current column in coef_block   (+2 bytes per iteration)
//   r10 = current column in quanttable   (+4 bytes per iteration)
//   ip  = current column in workspace    (+4 bytes per iteration)
// ---------------------------------------------------------------------
VLoopTail:
    ldrsh   r0, [fp, #VY(0)]
    ldrsh   r1, [fp, #VY(1)]
    ldrsh   r2, [fp, #VY(2)]
    ldrsh   r3, [fp, #VY(3)]
    ldrsh   r4, [fp, #VY(4)]
    ldrsh   r5, [fp, #VY(5)]
    ldrsh   r6, [fp, #VY(6)]
    ldrsh   r7, [fp, #VY(7)]

    // Z stays set only while every value tested so far is zero, so the
    // branch is taken when r1..r7 are all zero.
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     VLoopHeadZero

VLoopHead:
    // even part
    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])   (r0)
    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])   (r4)
    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])   (r2)
    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])   (r6)
    // tmp10 = tmp0 + tmp2   (r0)
    // tmp11 = tmp0 - tmp2   (r4)

    ldr     r9, [r10, #QY(4)]
    ldr     r8, [r10, #QY(0)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r4, r9, r4
    smlabb  r0, r8, r0, r4
#else
    mul     r4, r9, r4
    mul     r0, r8, r0
    add     r0, r4
#endif
    ldr     r9, [r10, #QY(6)]
    ldr     r8, [r10, #QY(2)]
    sub     r4, r0, r4, lsl #1          // (a + b) - 2*b = a - b
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r6, r9, r6
    smlabb  r2, r8, r2, r6
#else
    mul     r6, r9, r6
    mul     r2, r8, r2
    add     r2, r6
#endif

    // tmp13 = tmp1 + tmp3                                       (r2)
    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
    // FIX_1_4142... = 362 = 45*8 + 2
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r9, r6, r8

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r9, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    stmia   local_TMP0123, {r0, r4, r6, r8}

    // NOTE: be sure to not use r0,r4,r6,r8 soon after stm above

    // odd part
    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
    // z13 = tmp6 + tmp5;  (r0)
    // z10 = tmp6 - tmp5;  (r2)
    // z11 = tmp4 + tmp7;  (r4)
    // z12 = tmp4 - tmp7;  (r6)

    ldr     r2, [r10, #QY(1)]
    ldr     r9, [r10, #QY(5)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r1, r2, r1
#else
    mul     r1, r2, r1
#endif
    ldr     r2, [r10, #QY(3)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r5, r9, r5
#else
    mul     r5, r9, r5
#endif
    ldr     r9, [r10, #QY(7)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smlabb  r0, r2, r3, r5
    smlabb  r4, r9, r7, r1
#else
    mul     r0, r2, r3
    add     r0, r5
    mul     r4, r9, r7
    add     r4, r1
#endif
    rsb     r2, r0, r5, lsl #1          // 2*tmp6 - z13 = z10
    rsb     r6, r4, r1, lsl #1          // 2*tmp4 - z11 = z12

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // Each mla computes x*K + x, i.e. x*(K + 1).
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    rsb     r2, r2, r8

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}     // reload tmp0..tmp3 of the even part

    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);

    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1

    str     r0, [ip, #QY(0)]
    str     r1, [ip, #QY(1)]
    str     r2, [ip, #QY(2)]
    str     r3, [ip, #QY(3)]
    str     r4, [ip, #QY(4)]
    str     r5, [ip, #QY(5)]
    str     r6, [ip, #QY(6)]
    str     r7, [ip, #QY(7)]

    // inptr++;                    /* advance pointers to next column */
    // quantptr++;
    // wsptr++;
    add     fp, fp, #2
    add     r10, r10, #4
    add     ip, ip, #4
    add     r0, sp, #(off_WORKSPACE + 4*8)      // address just past column 7 of workspace row 0
    cmp     ip, r0
    bne     VLoopTail

// ---------------------------------------------------------------------
// Pass 2: one iteration per workspace row.
//   ip = current workspace row (advanced by the ldmia write-back)
//   fp = destination address for the 8 output bytes of this row
// ---------------------------------------------------------------------
HLoopStart:
    // reset pointers
    PLD     (sp, #off_WORKSPACE)
    add     ip, sp, #off_WORKSPACE
    ldr     r10, local_RANGE_TABLE      // loaded, but not read by the row loop below

HLoopTail:
    // output = *output_buf++ + output_col
    ldr     r0, local_OUTPUT_BUF
    ldr     r1, local_OUTPUT_COL
    ldr     r2, [r0], #4
    str     r0, local_OUTPUT_BUF
    add     fp, r2, r1

    PLD     (ip, #32)
    ldmia   ip!, {r0-r7}                // r0..r7 = wsptr[0..7]

    // Same test as in pass 1: branch when wsptr[1..7] are all zero.
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     HLoopTailZero

HLoopHead:
    // even part
    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);   (r0)
    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);   (r4)
    add     r0, r0, r4
    sub     r4, r0, r4, lsl #1

    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);   (r2)
    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6],
    //                  FIX_1_414213562) - tmp13;           (r6)
    // FIX_... = 360 + 2
    add     r2, r2, r6
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r6, r8, r6

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r6, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    stmia   local_TMP0123, {r0, r4, r6, r8}

    // Odd part
    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
    add     r0, r5, r3
    sub     r2, r5, r3
    add     r4, r1, r7
    sub     r6, r1, r7

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    sub     r2, r8, r2

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}     // reload tmp0..tmp3 of the even part

    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
    //
    // The table lookup is replaced here by "(x >> 5) + 128" followed by a
    // clamp to 0..255.

    mov     r8, #128
    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r0, r8, r0, asr #5
    add     r7, r8, r7, asr #5
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r1, r8, r1, asr #5
    add     r6, r8, r6, asr #5
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    add     r2, r8, r2, asr #5
    add     r5, r8, r5, asr #5
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1
    add     r3, r8, r3, asr #5
    add     r4, r8, r4, asr #5

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0
    usat    r1, #8, r1
    usat    r2, #8, r2
    usat    r3, #8, r3
    usat    r4, #8, r4
    usat    r5, #8, r5
    usat    r6, #8, r6
    usat    r7, #8, r7
#else
    // Manual clamp: "hi" is an unsigned compare, so it also catches negative
    // values. mvn of (x asr #31) gives 0 for negative x and all ones for
    // x > 255; the and then reduces all ones to 255.
    // r3 and r7 skip the and on purpose: they are packed with lsl #24
    // below, which discards bits 8..31 anyway.
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, #255
    cmp     r7, #255
    mvnhi   r7, r7, asr #31
    cmp     r1, #255
    mvnhi   r1, r1, asr #31
    andhi   r1, #255
    cmp     r6, #255
    mvnhi   r6, r6, asr #31
    andhi   r6, #255
    cmp     r2, #255
    mvnhi   r2, r2, asr #31
    andhi   r2, #255
    cmp     r5, #255
    mvnhi   r5, r5, asr #31
    andhi   r5, #255
    cmp     r3, #255
    mvnhi   r3, r3, asr #31
    cmp     r4, #255
    mvnhi   r4, r4, asr #31
    andhi   r4, #255
#endif

    // r3 r2 r1 r0
    orr     r0, r0, r1, lsl #8
    orr     r0, r0, r2, lsl #16
    orr     r0, r0, r3, lsl #24

    // r7 r6 r5 r4
    orr     r1, r4, r5, lsl #8
    orr     r1, r1, r6, lsl #16
    orr     r1, r1, r7, lsl #24
    // NOTE(review): two-word store; presumably relies on fp being word
    // aligned -- confirm against the callers.
    stmia   fp, {r0, r1}

    add     r0, sp, #(off_WORKSPACE + 8*8*4)    // end of workspace
    cmp     ip, r0
    bne     HLoopTail

Exit:
    add     sp, sp, #local_SIZE
    ldmia   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    bx      lr

// Pass 1 shortcut: rows 1..7 of this column are zero, so all eight
// workspace entries of the column get in[0] * quant[0].
VLoopHeadZero:
    // ok, all AC coefficients are 0
    ldr     r1, [r10, #QY(0)]
    add     fp, fp, #2
    add     r10, r10, #4
    mul     r0, r1, r0
    str     r0, [ip, #QY(0)]
    str     r0, [ip, #QY(1)]
    str     r0, [ip, #QY(2)]
    str     r0, [ip, #QY(3)]
    str     r0, [ip, #QY(4)]
    str     r0, [ip, #QY(5)]
    str     r0, [ip, #QY(6)]
    str     r0, [ip, #QY(7)]
    add     ip, ip, #4
    add     r0, sp, #(off_WORKSPACE + 4*8)
    cmp     ip, r0
    beq     HLoopStart
    b       VLoopTail

// Pass 2 shortcut: wsptr[1..7] are zero, so all eight output bytes of the
// row get clamp((wsptr[0] >> 5) + 128).
HLoopTailZero:
    mov     r0, r0, asr #5
    add     r0, #128

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0
#else
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, r0, #255
#endif

    // replicate the byte into all four lanes of r0, then copy to r1
    orr     r0, r0, lsl #8
    orr     r0, r0, lsl #16
    mov     r1, r0
    stmia   fp, {r0, r1}

    add     r0, sp, #(off_WORKSPACE + 64*4)
    cmp     ip, r0
    beq     Exit
    b       HLoopTail

    .endfunc