/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef USE_AS_STRNCMP
/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
   if the new counter > the old one or is 0.  */
#define UPDATE_STRNCMP_COUNTER				\
	/* calculate left number to compare */		\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	L(strcmp_exitz);			\
	test	%r9, %r9;				\
	je	L(strcmp_exitz);			\
	mov	%r9, %r11

#else
#define UPDATE_STRNCMP_COUNTER
#ifndef STRCMP
#define STRCMP		strcmp
#endif
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function; 	\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
#define RETURN ret
	.section .text.ssse3,"ax",@progbits
ENTRY (STRCMP)
/*
 * This implementation uses SSE to compare up to 16 bytes at a time.
 */
#ifdef USE_AS_STRNCMP
	test	%rdx, %rdx
	je	L(strcmp_exitz)
	cmp	$1, %rdx
	je	L(Byte0)
	mov	%rdx, %r11
#endif
	mov	%esi, %ecx
	mov	%edi, %eax
/* Use 64bit AND here to avoid long NOP padding.  */
	and	$0x3f, %rcx		/* rsi alignment in cache line */
	and	$0x3f, %rax		/* rdi alignment in cache line */
	cmp	$0x30, %ecx
	ja	L(crosscache)	/* rsi: 16-byte load will cross cache line */
	cmp	$0x30, %eax
	ja	L(crosscache)	/* rdi: 16-byte load will cross cache line */
	movlpd	(%rdi), %xmm1
	movlpd	(%rsi), %xmm2
	movhpd	8(%rdi), %xmm1
	movhpd	8(%rsi), %xmm2
	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
	jnz	L(less16bytes)	/* If not, find different value or null char */
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)	/* finish comparision */
#endif
	add	$16, %rsi		/* prepare to search next 16 bytes */
	add	$16, %rdi		/* prepare to search next 16 bytes */

	/*
	 * Determine source and destination string offsets from 16-byte alignment.
	 * Use relative offset difference between the two to determine which case
	 * below to use.
	 */
	.p2align 4
L(crosscache):
	and	$0xfffffffffffffff0, %rsi	/* force %rsi is 16 byte aligned */
	and	$0xfffffffffffffff0, %rdi	/* force %rdi is 16 byte aligned */
	mov	$0xffff, %edx			/* for equivalent offset */
	xor	%r8d, %r8d
	and	$0xf, %ecx			/* offset of rsi */
	and	$0xf, %eax			/* offset of rdi */
	cmp	%eax, %ecx
	je	L(ashr_0)			/* rsi and rdi relative offset same */
	ja	L(bigger)
	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
	xchg	%ecx, %eax
	xchg	%rsi, %rdi
L(bigger):
	lea	15(%rax), %r9
	sub	%rcx, %r9
	lea	L(unaligned_table)(%rip), %r10
	movslq	(%r10, %r9,4), %r9
	lea	(%r10, %r9), %r10
	jmp	*%r10				/* jump to corresponding case */

/*
 * The following cases will be handled by ashr_0
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
 */
	.p2align 4
L(ashr_0):

	movdqa	(%rsi), %xmm1
	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
	pmovmskb %xmm1, %r9d
	shr	%cl, %edx			/* adjust 0xffff for offset */
	shr	%cl, %r9d			/* adjust for 16-byte offset */
	sub	%r9d, %edx
	/*
	 * edx must be the same with r9d if in left byte (16-rcx) is equal to
	 * the start from (16-rax) and no null char was seen.
	 */
	jne	L(less32bytes)		/* mismatch or null char */
	UPDATE_STRNCMP_COUNTER
	mov	$16, %rcx
	mov	$16, %r9
	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */

	/*
	 * Now both strings are aligned at 16-byte boundary. Loop over strings
	 * checking 32-bytes per iteration.
	 */
	.p2align 4
L(loop_ashr_0):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)		/* mismatch or null char seen */

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	jmp	L(loop_ashr_0)

/*
 * The following cases will be handled by ashr_1
 * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
 *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
 */
	.p2align 4
L(ashr_1):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$15, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads*/
	mov	$1, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	1(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_1):
	add	$16, %r10
	jg	L(nibble_ashr_1)	/* cross page boundary */

L(gobble_ashr_1):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		 /* store for next cycle */

	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_1)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_1)

	/*
	 * Nibble avoids loads across page boundary. This is to avoid a potential
	 * access into unmapped memory.
	 */
	.p2align 4
L(nibble_ashr_1):
	pcmpeqb	%xmm3, %xmm0		 /* check nibble for null char*/
	pmovmskb %xmm0, %edx
	test	$0xfffe, %edx
	jnz	L(ashr_1_exittail)	/* find null char*/

#ifdef USE_AS_STRNCMP
	cmp	$14, %r11
	jbe	L(ashr_1_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* substract 4K from %r10 */
	jmp	L(gobble_ashr_1)

	/*
	 * Once find null char, determine if there is a string mismatch
	 * before the null char.
	 */
	.p2align 4
L(ashr_1_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$1, %xmm0
	psrldq	$1, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_2
 * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
 *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
 */
	.p2align 4
L(ashr_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$14, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$2, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	2(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_2):
	add	$16, %r10
	jg	L(nibble_ashr_2)

L(gobble_ashr_2):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_2)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_2)

	.p2align 4
L(nibble_ashr_2):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfffc, %edx
	jnz	L(ashr_2_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$13, %r11
	jbe	L(ashr_2_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_2)

	.p2align 4
L(ashr_2_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$2, %xmm0
	psrldq	$2, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_3
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
 *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
 */
	.p2align 4
L(ashr_3):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$13, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$3, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	3(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_3):
	add	$16, %r10
	jg	L(nibble_ashr_3)

L(gobble_ashr_3):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_3)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_3)

	.p2align 4
L(nibble_ashr_3):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfff8, %edx
	jnz	L(ashr_3_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$12, %r11
	jbe	L(ashr_3_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_3)

	.p2align 4
L(ashr_3_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$3, %xmm0
	psrldq	$3, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_4
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
 *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
 */
	.p2align 4
L(ashr_4):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$12, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$4, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	4(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_4):
	add	$16, %r10
	jg	L(nibble_ashr_4)

L(gobble_ashr_4):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_4)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_4)

	.p2align 4
L(nibble_ashr_4):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfff0, %edx
	jnz	L(ashr_4_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$11, %r11
	jbe	L(ashr_4_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_4)

	.p2align 4
L(ashr_4_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$4, %xmm0
	psrldq	$4, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_5
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(11~15)          n - 11      	  4(15 +(n-11) - n)         ashr_5
 */
	.p2align 4
L(ashr_5):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$11, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$5, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	5(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_5):
	add	$16, %r10
	jg	L(nibble_ashr_5)

L(gobble_ashr_5):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_5)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_5)

	.p2align 4
L(nibble_ashr_5):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xffe0, %edx
	jnz	L(ashr_5_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$10, %r11
	jbe	L(ashr_5_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_5)

	.p2align 4
L(ashr_5_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$5, %xmm0
	psrldq	$5, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_6
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(10~15)          n - 10      	  5(15 +(n-10) - n)         ashr_6
 */
	.p2align 4
L(ashr_6):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$10, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$6, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	6(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_6):
	add	$16, %r10
	jg	L(nibble_ashr_6)

L(gobble_ashr_6):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_6)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_6)

	.p2align 4
L(nibble_ashr_6):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xffc0, %edx
	jnz	L(ashr_6_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$9, %r11
	jbe	L(ashr_6_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_6)

	.p2align 4
L(ashr_6_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$6, %xmm0
	psrldq	$6, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_7
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(9~15)          n - 9      	        6(15 +(n - 9) - n)         ashr_7
 */
	.p2align 4
L(ashr_7):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$9, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$7, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	7(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_7):
	add	$16, %r10
	jg	L(nibble_ashr_7)

L(gobble_ashr_7):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_7)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_7)

	.p2align 4
L(nibble_ashr_7):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff80, %edx
	jnz	L(ashr_7_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$8, %r11
	jbe	L(ashr_7_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_7)

	.p2align 4
L(ashr_7_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$7, %xmm0
	psrldq	$7, %xmm3
	jmp	L(aftertail)

/*
 *  The following cases will be handled by ashr_8
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(8~15)          n - 8      	        7(15 +(n - 8) - n)         ashr_8
 */
	.p2align 4
L(ashr_8):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$8, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$8, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_8):
	add	$16, %r10
	jg	L(nibble_ashr_8)

L(gobble_ashr_8):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_8)

	.p2align 4
L(nibble_ashr_8):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff00, %edx
	jnz	L(ashr_8_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$7, %r11
	jbe	L(ashr_8_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_8)

	.p2align 4
L(ashr_8_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$8, %xmm0
	psrldq	$8, %xmm3
	jmp	L(aftertail)

/*
 *  The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(7~15)          n - 7      	        8(15 +(n - 7) - n)         ashr_9
 */
	.p2align 4
L(ashr_9):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$7, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_9):
	add	$16, %r10
	jg	L(nibble_ashr_9)

L(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_9)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	L(loop_ashr_9)

	.p2align 4
L(nibble_ashr_9):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfe00, %edx
	jnz	L(ashr_9_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$6, %r11
	jbe	L(ashr_9_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_9)

	.p2align 4
L(ashr_9_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$9, %xmm0
	psrldq	$9, %xmm3
	jmp	L(aftertail)

/*
 *  The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(6~15)          n - 6      	        9(15 +(n - 6) - n)         ashr_10
 */
	.p2align 4
L(ashr_10):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$6, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_10):
	add	$16, %r10
	jg	L(nibble_ashr_10)

L(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_10)

	.p2align 4
L(nibble_ashr_10):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfc00, %edx
	jnz	L(ashr_10_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$5, %r11
	jbe	L(ashr_10_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_10)

	.p2align 4
L(ashr_10_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$10, %xmm0
	psrldq	$10, %xmm3
	jmp	L(aftertail)

/*
 *  The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(5~15)          n - 5      	        10(15 +(n - 5) - n)         ashr_11
 */
	.p2align 4
L(ashr_11):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$5, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_11):
	add	$16, %r10
	jg	L(nibble_ashr_11)

L(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_11)

	.p2align 4
L(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx
	jnz	L(ashr_11_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$4, %r11
	jbe	L(ashr_11_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_11)

	.p2align 4
L(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0
	psrldq	$11, %xmm3
	jmp	L(aftertail)

/*
 *  The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(4~15)          n - 4      	        11(15 +(n - 4) - n)         ashr_12
 */
	.p2align 4
L(ashr_12):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$4, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_12):
	add	$16, %r10
	jg	L(nibble_ashr_12)

L(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_12)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_12)

	.p2align 4
L(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx
	jnz	L(ashr_12_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$3, %r11
	jbe	L(ashr_12_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_12)

	.p2align 4
L(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0
	psrldq	$12, %xmm3
	jmp	L(aftertail)

/*
 *  The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(3~15)          n - 3      	        12(15 +(n - 3) - n)         ashr_13
 */
	.p2align 4
L(ashr_13):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$3, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_13):
	add	$16, %r10
	jg	L(nibble_ashr_13)

L(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_13)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_13)

	.p2align 4
L(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx
	jnz	L(ashr_13_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$2, %r11
	jbe	L(ashr_13_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_13)

	.p2align 4
L(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq  $13, %xmm0
	psrldq  $13, %xmm3
	jmp	L(aftertail)

/*
 *  The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(2~15)          n - 2      	        13(15 +(n - 2) - n)         ashr_14
 */
	.p2align 4
L(ashr_14):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq  $2, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_14):
	add	$16, %r10
	jg	L(nibble_ashr_14)

L(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_14)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_14)

	.p2align 4
L(nibble_ashr_14):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xc000, %edx
	jnz	L(ashr_14_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$1, %r11
	jbe	L(ashr_14_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_14)

	.p2align 4
L(ashr_14_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$14, %xmm0
	psrldq	$14, %xmm3
	jmp	L(aftertail)

/*
 *  The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(1~15)          n - 1      	        14(15 +(n - 1) - n)         ashr_15
 */
	.p2align 4
L(ashr_15):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$1, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)

	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_15):
	add	$16, %r10
	jg	L(nibble_ashr_15)

L(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_15)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_15)

	.p2align 4
L(nibble_ashr_15):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0x8000, %edx
	jnz	L(ashr_15_exittail)

#ifdef USE_AS_STRNCMP
	test	%r11, %r11
	je	L(ashr_15_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_15)

	.p2align 4
L(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3
	psrldq	$15, %xmm0

	.p2align 4
L(aftertail):
	pcmpeqb	%xmm3, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	not	%edx

	.p2align 4
L(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
L(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	L(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
L(ret):
L(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#ifdef USE_AS_STRNCMP
	sub	%rdx, %r11
	jbe	L(strcmp_exitz)
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

	sub	%ecx, %eax
	ret

L(strcmp_exitz):
	xor	%eax, %eax
	ret

	.p2align 4
L(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

	sub	%ecx, %eax
	ret
END (STRCMP)

	.section .rodata,"a",@progbits
	.p2align 3
L(unaligned_table):
	.int	L(ashr_1) - L(unaligned_table)
	.int	L(ashr_2) - L(unaligned_table)
	.int	L(ashr_3) - L(unaligned_table)
	.int	L(ashr_4) - L(unaligned_table)
	.int	L(ashr_5) - L(unaligned_table)
	.int	L(ashr_6) - L(unaligned_table)
	.int	L(ashr_7) - L(unaligned_table)
	.int	L(ashr_8) - L(unaligned_table)
	.int	L(ashr_9) - L(unaligned_table)
	.int	L(ashr_10) - L(unaligned_table)
	.int	L(ashr_11) - L(unaligned_table)
	.int	L(ashr_12) - L(unaligned_table)
	.int	L(ashr_13) - L(unaligned_table)
	.int	L(ashr_14) - L(unaligned_table)
	.int	L(ashr_15) - L(unaligned_table)
	.int	L(ashr_0) - L(unaligned_table)