/*
Copyright (c) 2011 Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Standalone-build preamble.  Everything from here down to the matching
   #endif that follows the entry sequence is skipped when USE_AS_WCSCAT is
   defined.  In that configuration whoever builds this code must already
   provide L(), RETURN and the string pointer in %edx, because the code
   below uses all three outside of this guard.  */
#ifndef USE_AS_WCSCAT

/* L(label): spell a label with the assembler's ".L" prefix.  */
# ifndef L
#  define L(label)	.L##label
# endif

/* cfi_startproc / cfi_endproc: map to the assembler directives of the
   same name unless the build already defined them.  */
# ifndef cfi_startproc
#  define cfi_startproc	.cfi_startproc
# endif

# ifndef cfi_endproc
#  define cfi_endproc	.cfi_endproc
# endif

/* ENTRY(name): declare NAME as a global symbol of type function, align it
   to 2^4 = 16 bytes, emit the label and open its CFI region.  */
# ifndef ENTRY
#  define ENTRY(name)	\
	.type name,  @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
# endif

/* END(name): close the CFI region and set the symbol size of NAME to the
   distance from its label to the current location.  */
# ifndef END
#  define END(name)	\
	cfi_endproc;	\
	.size name, .-name
# endif

/* PARMS / STR: offset from %esp at which the string argument is read on
   entry (presumably the first stack slot past the return address).
   RETURN: how every exit path leaves the routine; a plain "ret" in the
   standalone build.  */
# define PARMS	4
# define STR	PARMS
# define RETURN ret

	.text
/* wcslen: count the 4-byte elements that precede the first all-zero
   4-byte element of a string.

   Standalone build (USE_AS_WCSCAT not defined):
     In:    STR(%esp) = pointer to the string.
     Out:   %eax = number of elements before the terminator.
     Uses:  %eax, %ecx, %edx, %xmm0-%xmm3, %xmm6, flags.

   Fragment build (USE_AS_WCSCAT defined): the ENTRY line and the load of
   the argument are omitted, so the pointer must already be in %edx.  Every
   exit goes through RETURN except the last one (exit_tail7), which falls
   through into whatever code follows this fragment with %eax = 7.

   Outline:
     1. Test the first 8 elements one by one.
     2. Test sixteen 16-byte aligned blocks with pcmpeqd/pmovmskb.
     3. Loop over 64-byte aligned chunks, four blocks per iteration.
   Steps 2 and 3 load whole aligned blocks, so memory after the terminator
   is read up to the end of the enclosing aligned block.

   NOTE(review): the block scans only recognise a zero element that sits on
   a 4-byte boundary inside a 16-byte block, so the string is presumably
   required to be 4-byte aligned; this is not checked here.  */
ENTRY (wcslen)
	mov	STR(%esp), %edx
#endif
	/* Step 1: elements 0..7, each exit returns the matching index.  */
	cmpl	$0, (%edx)
	jz	L(exit_tail0)
	cmpl	$0, 4(%edx)
	jz	L(exit_tail1)
	cmpl	$0, 8(%edx)
	jz	L(exit_tail2)
	cmpl	$0, 12(%edx)
	jz	L(exit_tail3)
	cmpl	$0, 16(%edx)
	jz	L(exit_tail4)
	cmpl	$0, 20(%edx)
	jz	L(exit_tail5)
	cmpl	$0, 24(%edx)
	jz	L(exit_tail6)
	cmpl	$0, 28(%edx)
	jz	L(exit_tail7)

	/* Step 2.  xmm0 = 0 is the comparison pattern for the first block.  */
	pxor	%xmm0, %xmm0

	/* %eax = (str + 32) rounded down to a 16-byte boundary: the first
	   block to scan.  Rounding down can revisit elements that step 1
	   already proved non-zero.
	   %ecx = str + 16.  Each probe leaves %eax 16 bytes past the block
	   it tested, so at L(exit) "%eax - %ecx" is the byte offset of that
	   block from the start of the string.  */
	lea	32(%edx), %eax
	lea	-16(%eax), %ecx
	and	$-16, %eax

	/* One probe: pcmpeqd turns every zero element of the block into an
	   all-ones lane, pmovmskb packs the top bit of each of the 16 bytes
	   into %edx (4 mask bits per element).  From here on %edx holds the
	   mask, the string pointer is no longer needed.  The pxor that
	   clears the next pattern register is interleaved with the probe.  */
	pcmpeqd	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	/* xmm0..xmm3 are reused without being cleared again: a probe that
	   falls through found no zero element, so its result register is
	   all zeros and serves as the pattern for a later probe.  */
	pcmpeqd	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	/* Step 3.  Round %eax down to a 64-byte boundary for the main loop;
	   this can revisit blocks that step 2 already cleared.  xmm3 is all
	   zeros here because the last probe above found nothing.  */
	and	$-0x40, %eax

	.p2align 4
L(aligned_64_loop):
	/* Load four consecutive aligned blocks.  */
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6

	/* Fold the four blocks into xmm2 with a byte-wise unsigned minimum
	   and compare the result against zero per element.  xmm1 and xmm6
	   are only sources and keep blocks 1 and 3 intact.  The minimum is
	   taken per byte, so an element of the folded value can be zero
	   although no single block holds a zero element; a hit is therefore
	   verified block by block below.  */
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqd	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	lea	64(%eax), %eax
	test	%edx, %edx
	jz	L(aligned_64_loop)

	/* Verification.  %eax is now 64 bytes past the chunk.  %ecx is
	   raised by 48 for block 0 and lowered by 16 for each following
	   block so that "%eax - %ecx" at L(exit) is the offset of the block
	   being tested.  After all four blocks the net change is zero.
	   Blocks 0 and 2 are compared from memory because xmm0 and xmm2 were
	   overwritten by the fold.  xmm3 stays all zeros as long as no block
	   matches.  */
	pcmpeqd	-64(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	48(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	-32(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	/* No block really contained a zero element: the loop hit came from
	   the byte-wise fold.  %ecx is back at str + 16 and xmm3 is zero, so
	   carry on with the next chunk.  */
	jmp	L(aligned_64_loop)

	/* Common exit for the block scans.
	   In:  %eax = address 16 bytes past the matching block (as biased
	               by %ecx, see above), %ecx = bias, %edx = 16-bit mask
	               with bits 0-3 / 4-7 / 8-11 / 12-15 set for a zero in
	               element 0 / 1 / 2 / 3 of the block.
	   Out: %eax = index of the first zero element.  */
	.p2align 4
L(exit):
	sub	%ecx, %eax
	/* Byte offset -> element index of the block's first element.  */
	shr	$2, %eax
	/* Low mask byte clear: the first zero is element 2 or 3.  */
	test	%dl, %dl
	jz	L(exit_high)

	/* %cl / %ch are scratch from here on; the bias in %ecx has already
	   been consumed.  Bits 0-3 set: element 0, %eax is the answer.  */
	mov	%dl, %cl
	and	$15, %cl
	jz	L(exit_1)
	RETURN

	.p2align 4
L(exit_high):
	/* Bits 8-11 set: element 2, otherwise element 3.  */
	mov	%dh, %ch
	and	$15, %ch
	jz	L(exit_3)
	add	$2, %eax
	RETURN

	.p2align 4
L(exit_1):
	add	$1, %eax
	RETURN

	.p2align 4
L(exit_3):
	add	$3, %eax
	RETURN

	/* Exits for step 1: the terminator is one of the first 8 elements,
	   return its index as a constant.  */
	.p2align 4
L(exit_tail0):
	xor	%eax, %eax
	RETURN

	.p2align 4
L(exit_tail1):
	mov	$1, %eax
	RETURN

	.p2align 4
L(exit_tail2):
	mov	$2, %eax
	RETURN

	.p2align 4
L(exit_tail3):
	mov	$3, %eax
	RETURN

	.p2align 4
L(exit_tail4):
	mov	$4, %eax
	RETURN

	.p2align 4
L(exit_tail5):
	mov	$5, %eax
	RETURN

	.p2align 4
L(exit_tail6):
	mov	$6, %eax
	RETURN

	.p2align 4
L(exit_tail7):
	mov	$7, %eax
	/* With USE_AS_WCSCAT defined there is no RETURN here: control falls
	   through to the code that follows this fragment, with %eax = 7.  */
#ifndef USE_AS_WCSCAT
	RETURN

END (wcslen)
#endif