/* scaleopt.c - Android社区 - https://www.androidos.net.cn/ */

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/****************************************************************************
*
*   Module Title :     scaleopt.c
*
*   Description  :     Optimized scaling functions
*
****************************************************************************/
#include "pragmas.h"

/****************************************************************************
*  Module Statics
****************************************************************************/
/*
 * The file-scope multiplier tables below are compiled out.  The scaling
 * routines in this file declare same-valued tables as 16-byte aligned
 * locals instead, so nothing refers to these definitions.
 *
 * The values are weights in 1/256 units: 51 ~ 1/5, 102 ~ 2/5, 154 ~ 3/5,
 * 205 ~ 4/5, and 128 is the rounding term added before the final >> 8.
 */
#if 0
__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
#endif

#include "vpx_scale/vpxscale.h"
#include "vpx_mem/vpx_mem.h"

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_3_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : Input row of pixels; read
 *                                                one DWORD (4 bytes) per
 *                                                group of three pixels.
 *                  unsigned int source_width    : Source pixel count; only
 *                                                used to form the loop end
 *                                                pointer
 *                                                (source + source_width - 3).
 *                  unsigned char *dest         : Output row; five bytes are
 *                                                produced for every three
 *                                                source bytes consumed.
 *                  unsigned int dest_width      : Unused.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : Implemented as inline assembly using MMX registers; this
 *                  routine does not execute emms.
 *                  NOTE(review): presumably the caller clears the MMX state
 *                  afterwards - confirm.
 *
 *                  The loop body runs once before its bound is tested and
 *                  the tail code always processes one further group, so at
 *                  least two groups (6 source / 10 dest bytes) are touched.
 *                  NOTE(review): callers presumably pass a source_width that
 *                  is a multiple of 3 and at least 6 - confirm.
 *
 *                  The tail loads a full DWORD at the last group, i.e. it
 *                  reads one byte beyond that group; the extra byte's value
 *                  is discarded.
 *
 *                  The loop bound is tested with jl, a signed comparison of
 *                  the two addresses.
 *
 ****************************************************************************/
static
void horizontal_line_3_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    // Per-lane weights in 1/256 units (102 ~ 2/5, 154 ~ 3/5, 205 ~ 4/5, 51 ~ 1/5).
    // Lane k of const35_1 weights the left neighbour of output k+1 and lane k
    // of const35_2 weights its right neighbour.
    __declspec(align(16)) unsigned short const35_2[] = { 154,  51, 205, 102 };
    __declspec(align(16)) unsigned short const35_1[] = { 102, 205,  51, 154 };
    // Rounding term added before the final shift right by 8.
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

(void) dest_width;

__asm
    {

// ebx is used as scratch below; saved here and restored at the end of the block.
push ebx

mov         esi,    source
        mov         edi,    dest

// edx = address of the last group of three source pixels (handled by the tail).
mov         ecx,    source_width
        lea         edx,    [esi+ecx-3];

movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx

movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7             // mm7 = 0, used to widen bytes to words

// Main loop: each pass consumes source bytes 00 01 02 (and peeks at 03, the
// first byte of the next group) and emits five destination bytes.
// Byte listings in the comments are in memory order, lowest address first.
horiz_line_3_5_loop:

mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
        mov        ebx,    eax

and         ebx,    0xffff00        // ebx = xx 01 02 xx
        mov         ecx,    eax             // ecx = 00 01 02 03

and         eax,    0xffff0000      // eax = xx xx 02 03
        xor         ecx,    eax             // ecx = 00 01 xx xx

shr         ebx,    8               // ebx = 01 02 xx xx
        or          eax,    ebx             // eax = 01 02 02 03 (right neighbours)

shl         ebx,    16              // ebx = xx xx 01 02
        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx

or          ebx,    ecx             // ebx = 00 01 01 02 (left neighbours)
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx

movd        mm0,    ebx             // mm0 = 00 01 01 02
        pmullw      mm1,    mm6             // right neighbours * const35_2

punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5             // left neighbours * const35_1

// Four bytes are stored but only des[0] = src[0] is final; des[1..3] are
// overwritten by the movd at the bottom of the loop.
mov         [edi],  ebx             // write output 00 xx xx xx
        add         esi,    3

add         edi,    5
        paddw       mm0,    mm1             // left*w1 + right*w2

paddw       mm0,    mm4             // + 128
        psrlw       mm0,    8               // >> 8

cmp         esi,    edx
        packuswb    mm0,    mm7             // words back to bytes

// edi has already advanced by 5, so edi-4 is des[1] of the group just processed.
movd        DWORD Ptr [edi-4], mm0
        jl          horiz_line_3_5_loop

//Exit:
        // Tail: last group of three pixels. There is no right-hand neighbour
        // beyond 02, so 02 is duplicated in place of 03.
        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
        mov         ebx,    eax

and         ebx,    0xffff00        // ebx = xx 01 02 xx
        mov         ecx,    eax             // ecx = 00 01 02 03

and         eax,    0xffff0000      // eax = xx xx 02 03
        xor         ecx,    eax             // ecx = 00 01 xx xx

shr         ebx,    8               // ebx = 01 02 xx xx
        or          eax,    ebx             // eax = 01 02 02 03

shl         eax,    8               // eax = xx 01 02 02
        and         eax,    0xffff0000      // eax = xx xx 02 02

or          eax,    ebx             // eax = 01 02 02 02

shl         ebx,    16              // ebx = xx xx 01 02
        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx

or          ebx,    ecx             // ebx = 00 01 01 02
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx

movd        mm0,    ebx             // mm0 = 00 01 01 02
        pmullw      mm1,    mm6             // right neighbours * const35_2

punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5             // left neighbours * const35_1

mov         [edi],  ebx             // write output 00 xx xx xx
        paddw       mm0,    mm1             // left*w1 + right*w2

paddw       mm0,    mm4             // + 128
        psrlw       mm0,    8               // >> 8

packuswb    mm0,    mm7             // words back to bytes
        movd        DWORD Ptr [edi+1], mm0  // write des[1..4] of the last group

pop ebx

}

/*
    Reference C version of the routine above (not compiled):

    const unsigned char *src = source;
    unsigned char *des = dest;
    unsigned int a, b, c ;
    unsigned int i;
    (void) dest_width;

for ( i=0; i<source_width-3; i+=3 )
    {
        a = src[0];
        b = src[1];
        des [0] = (UINT8) (a);
        // 2 * left + 3 * right /5
        des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
        c = src[2] ;
        // 4 * left + 1 * right /5
        des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
        // 1 * left + 4 * right /5
        des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);

a = src[3];
        // 3 * left + 2 * right /5
        des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8);

src += 3;
        des += 5;
    }

a = src[0];
    b = src[1];
    des [0] = (UINT8) (a);
    // 2 * left + 3 * right /5
    des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
    c = src[2] ;
    // 4 * left + 1 * right /5
    des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
    // 1 * left + 4 * right /5
    des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);

des [4] = (UINT8) (c);
    */
}

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_4_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : Input row of pixels; read
 *                                                eight bytes at a time.
 *                  unsigned int source_width    : Source pixel count; only
 *                                                used to form the loop end
 *                                                pointer
 *                                                (source + source_width - 8).
 *                  unsigned char *dest         : Output row; ten bytes are
 *                                                produced for every eight
 *                                                source bytes consumed.
 *                  unsigned int dest_width      : Unused.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : Implemented as inline assembly using MMX registers; this
 *                  routine does not execute emms.
 *                  NOTE(review): presumably the caller clears the MMX state
 *                  afterwards - confirm.
 *
 *                  Pixels are handled eight at a time (two 4->5 groups per
 *                  pass).  The loop body runs once before its bound is
 *                  tested and the tail code always processes one further
 *                  block of eight, so at least 16 source / 20 dest bytes
 *                  are touched.
 *                  NOTE(review): callers presumably pass a source_width that
 *                  is a multiple of 8 and at least 16 - confirm.
 *
 *                  The loop bound is tested with jl, a signed comparison of
 *                  the two addresses.
 *
 ****************************************************************************/
static
void horizontal_line_4_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    // Rounding term added before the final shift right by 8.
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    // Per-lane weights in 1/256 units (51 ~ 1/5, 102 ~ 2/5, 154 ~ 3/5, 205 ~ 4/5).
    // const45_1 weights the left neighbour, const45_2 the right neighbour.
    __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102,  51 };
    __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 };
    // Selects byte 6 only; used by the tail to duplicate the last source pixel.
    __declspec(align(16)) unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};

(void)dest_width;

__asm
    {

mov         esi,    source
        mov         edi,    dest

// edx = address of the last block of eight source pixels (handled by the tail).
mov         ecx,    source_width
        lea         edx,    [esi+ecx-8];

movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx

movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7             // mm7 = 0, used to widen bytes to words

// Main loop: each pass consumes source bytes 00..07 (and peeks at 08, the
// first byte of the next block) and emits ten destination bytes.
// Byte listings in the comments are in memory order, lowest address first.
horiz_line_4_5_loop:

movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08

movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08

// Only des[0] = src[0] is final here; des[1..3] are overwritten below.
movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx

punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205

pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx

// Only des[5] = src[4] is final here; des[6..8] are overwritten below.
movd        DWORD PTR [edi+5], mm2            // write output 5: 04 xx xx xx
        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205

punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51

paddw       mm0,    mm1             // left*w1 + right*w2
        paddw       mm0,    mm4             // added round values

psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7             // words back to bytes

movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
        add         edi,    10

add         esi,    8
        paddw       mm2,    mm3             // left*w1 + right*w2

paddw       mm2,    mm4             // added round values
        cmp         esi,    edx

psrlw       mm2,    8
        packuswb    mm2,    mm7             // words back to bytes

// edi has already advanced by 10, so edi-4 is des[6] of the block just processed.
movd        DWORD PTR [edi-4], mm2 // write output 06 07 08 09
        jl         horiz_line_4_5_loop

//Exit:
        // Tail: last block of eight pixels. There is no byte 08 to use as the
        // final right-hand neighbour, so 07 is duplicated instead.
        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07

movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00

movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00

psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07

movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 07

movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx

punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205

pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx

movd        DWORD PTR [edi+5], mm2  // write output 5: 04 xx xx xx
        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205

punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51

paddw       mm0,    mm1             // left*w1 + right*w2
        paddw       mm0,    mm4             // added round values

psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx

movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
        paddw       mm2,    mm3             // left*w1 + right*w2

paddw       mm2,    mm4             // added round values
        psrlw       mm2,    8

packuswb    mm2,    mm7             // words back to bytes
        movd        DWORD PTR [edi+6], mm2  // write output 06 07 08 09

}
    /*
        Reference C version of the routine above (not compiled):

        const unsigned char *src = source;
        unsigned char *des = dest;
        unsigned int a, b, c ;
        unsigned i;
        (void) dest_width;

for ( i=0; i<source_width-4; i+=4 )
        {
            a = src[0];
            b = src[1];
            des [0] = (UINT8) a;
            des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
            c = src[2] * 154;
            a = src[3];
            des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
            des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
            b = src[4];
            des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8);

src += 4;
            des += 5;
        }

a = src[0];
        b = src[1];
        des [0] = (UINT8) (a);
        des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
        c = src[2] * 154;
        a = src[3];
        des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
        des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
        des [4] = (UINT8) (a);
    */
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    : First row of the band; the band
 *                                           is filtered in place.
 *                  unsigned int dest_pitch : Byte distance between
 *                                           consecutive rows.
 *                  unsigned int dest_width : Column counter; decremented by
 *                                           8 per pass of the loop.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 *                  Row usage, relative to dest in units of dest_pitch:
 *                  row 0 is read only; rows 1, 2 and 3 are read and then
 *                  overwritten with their filtered values; row 4 is
 *                  written only; row 5 (first line of the next band) is
 *                  read only.
 *
 *                  Columns are processed eight at a time and the loop body
 *                  runs before the counter is tested, so the number of
 *                  columns touched is dest_width rounded up to a multiple
 *                  of 8, and never fewer than 8.
 *
 *                  Implemented as inline assembly using MMX registers; this
 *                  routine does not execute emms.
 *                  NOTE(review): presumably the caller clears the MMX state
 *                  afterwards - confirm.
 *
 ****************************************************************************/
static
void vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{

// Blend weights in 1/256 units, replicated across the four word lanes.
__declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    // Rounding term added before the final shift right by 8.
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

__asm
    {

mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

lea         edi,    [esi+ecx*2]             // two lines below
        add         edi,    ecx                     // three lines below

pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

// Each pass handles an 8-column wide strip. In the comments a, b, c, d are
// rows 0..3 of the band and "an" is row 0 of the next band (row 5 here).
vs_4_5_loop:

// ---- des[1] = (a * 1/5 + b * 4/5 + 128) >> 8 ----
movq        mm0,    QWORD ptr [esi]         // src[0];
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

pmullw      mm0,    mm5                     // a * 1/5

movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths               // constant

movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values             // + 128

paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

// Row 1 is overwritten here; the unfiltered b is still held in mm1/mm3.
movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

// ---- des[2] = (b * 2/5 + c * 3/5 + 128) >> 8 ----
        // mm1, mm3 --- Src[1] (low / high words)
        // mm0 --- Src[2]
        // mm7 for unpacking

movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths

punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values             // + 128

paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8

psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

// Row 2 is overwritten here; the unfiltered c is still held in mm0/mm2.
movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
        movq        mm1,    [edi]                   // mm1=Src[3];

// ---- des[3] = (c * 3/5 + d * 2/5 + 128) >> 8 ----
        // mm0, mm2 --- Src[2] (low / high words)
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths               // mm5 = 2/5

movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128

psrlw       mm0,    8
        psrlw       mm2,    8

// Row 3 is overwritten here; the unfiltered d is still held in mm1/mm3.
packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [edi], mm0            // write des[3]

// ---- des[4] = (d * 4/5 + an * 1/5 + 128) >> 8 ----
        //  mm1, mm3 --- Src[3] (low / high words)
        //  mm7 -- cleared for unpacking

// edi is row 3, so edi + 2 * pitch is row 5.
movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group

movq        mm5,    four_fifths              // mm5 = 4/5
        pmullw      mm1,    mm5                     // d * 4/5

movq        mm6,    one_fifth                // mm6 = 1/5
        movq        mm2,    mm0                     // make a copy

pmullw      mm3,    mm5                     // d * 4/5
        punpcklbw   mm0,    mm7                     // unpack low

pmullw      mm0,    mm6                     // an * 1/5
        punpckhbw   mm2,    mm7                     // unpack high

paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
        pmullw      mm2,    mm6                     // an * 1/5

paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
        paddw       mm1,    round_values             // + 128

paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8

psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[4]

// edi + pitch is row 4, which is only ever written.
movq        QWORD ptr [edi+ecx], mm1        // write des[4]

// Advance to the next strip of eight columns.
add         edi,    8
        add         esi,    8

// jg is a signed test on the remaining column count.
sub         edx,    8
        jg         vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void last_vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

__asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

lea         edi,    [esi+ecx*2]             // tow lines below
        add         edi,    ecx                     // three lines below

pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

last_vs_4_5_loop: