/*
 * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

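/* Numerology of the numeric labels used below, read as the digits WXYZ:
 * W: access width in bytes (1, 2, 4 or 8)
 * X: kind of access: 0 = load, 1 = store
 * Y: where the access sits: 0 = preamble, 8 = loop, 9 = epilog
 * Z: 0 = location of the access itself, 9 = its handler
 */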
	.text
/*
 * FUNCNAME - copy `bytes` bytes from `src` to `dst`, using the widest
 * access size (8, 4, 2 or 1 bytes) that the alignment of both pointers
 * allows.
 *
 * FUNCNAME, src, dst, bytes, loopcount, d_dbuf, w_dbuf and src_dst_sav are
 * not defined in this file; presumably whatever includes this template
 * defines them (TODO: confirm against the includer).
 *
 * Returns: r0 = 0 on every exit path visible here.
 * Scratch: r3 = dst | src (joint alignment), r4 = dst ^ src (relative
 * alignment), predicates p0, p1 and p3, and hardware loop 0.
 *
 * Reading notes:
 *  - Everything between { and } is one packet. All instructions of a
 *    packet read the register values from before the packet, and a taken
 *    jump does not cancel the other instructions of its packet. Several
 *    paths below depend on that (p1 and src_dst_sav stay valid at the
 *    jump targets).
 *  - Each copy loop is software pipelined with sp1loop0: p3 is false on
 *    the first pass, so that pass only loads; every later pass stores the
 *    datum loaded by the previous pass while loading the next one; the
 *    packet after the loop stores the final datum.
 *  - The numeric labels follow the "Numerology" comment above and tag the
 *    individual loads and stores. NOTE(review): they are presumably
 *    referenced by fault-handling tables outside this file, so each one
 *    has to stay attached to its packet - verify before moving any.
 */
	.global FUNCNAME
	.type FUNCNAME, @function
	.p2align 5
FUNCNAME:
	/* Nothing to copy -> .Ldone. Also gather the alignment information. */
	{
		p0 = cmp.gtu(bytes,#0)
		if (!p0.new) jump:nt .Ldone
		r3 = or(dst,src)
		r4 = xor(dst,src)
	}
	/*
	 * p1 = more than 15 bytes to copy; p0 = src and dst both 8-byte
	 * aligned. src_dst_sav records the current src/dst pair; it is only
	 * written, never read, in this file (presumably for the handlers -
	 * TODO confirm). Both p1 and src_dst_sav are also set when the jump
	 * is taken.
	 */
	{
		p1 = cmp.gtu(bytes,#15)
		p0 = bitsclr(r3,#7)
		if (!p0.new) jump:nt .Loop_not_aligned_8
		src_dst_sav = combine(src,dst)
	}

	/*
	 * Both pointers 8-byte aligned. With 15 bytes or fewer, use the byte
	 * path; otherwise loopcount = bytes / 8, which is at least 2 here.
	 */
	{
		loopcount = lsr(bytes,#3)
		if (!p1) jump .Lsmall
	}
	p3=sp1loop0(.Loop8,loopcount)
.Loop8:
8080:
8180:
	/* Store the previous doubleword (skipped on the first pass), load the next. */
	{
		if (p3) memd(dst++#8) = d_dbuf
		d_dbuf = memd(src++#8)
	}:endloop0
8190:
	/*
	 * Epilog: store the last doubleword loaded, subtract the bytes moved
	 * by the loop (loopcount * 8) and let .Lsmall handle any remainder.
	 */
	{
		memd(dst++#8) = d_dbuf
		bytes -= asl(loopcount,#3)
		jump .Lsmall
	}

.Loop_not_aligned_8:
	/*
	 * Not both 8-byte aligned. If src and dst have identical low three
	 * address bits they can be brought to 8-byte alignment together:
	 * go to .Lalign (which still relies on p1 from the entry sequence).
	 */
	{
		p0 = bitsclr(r4,#7)
		if (p0.new) jump:nt .Lalign
	}
	/* p0 = both pointers 4-byte aligned; p1 = more than 7 bytes to copy. */
	{
		p0 = bitsclr(r3,#3)
		if (!p0.new) jump:nt .Loop_not_aligned_4
		p1 = cmp.gtu(bytes,#7)
	}

	/* Word copy: loopcount = bytes / 4, at least 2 when the jump is not taken. */
	{
		if (!p1) jump .Lsmall
		loopcount = lsr(bytes,#2)
	}
	p3=sp1loop0(.Loop4,loopcount)
.Loop4:
4080:
4180:
	/* Store the previous word (skipped on the first pass), load the next. */
	{
		if (p3) memw(dst++#4) = w_dbuf
		w_dbuf = memw(src++#4)
	}:endloop0
4190:
	/* Epilog: store the last word, subtract loopcount * 4, finish in .Lsmall. */
	{
		memw(dst++#4) = w_dbuf
		bytes -= asl(loopcount,#2)
		jump .Lsmall
	}

.Loop_not_aligned_4:
	/* p0 = both pointers 2-byte aligned; p1 = more than 3 bytes to copy. */
	{
		p0 = bitsclr(r3,#1)
		if (!p0.new) jump:nt .Loop_not_aligned
		p1 = cmp.gtu(bytes,#3)
	}

	/* Halfword copy: loopcount = bytes / 2, at least 2 when the jump is not taken. */
	{
		if (!p1) jump .Lsmall
		loopcount = lsr(bytes,#1)
	}
	p3=sp1loop0(.Loop2,loopcount)
.Loop2:
2080:
2180:
	/* Store the previous halfword (skipped on the first pass), load the next. */
	{
		if (p3) memh(dst++#2) = w_dbuf
		w_dbuf = memuh(src++#2)
	}:endloop0
2190:
	/* Epilog: store the last halfword, subtract loopcount * 2, finish in .Lsmall. */
	{
		memh(dst++#2) = w_dbuf
		bytes -= asl(loopcount,#1)
		jump .Lsmall
	}

	/*
	 * Byte copy of all remaining `bytes`. Reached directly when at least
	 * one pointer is odd, and from .Lsmall for any leftover tail.
	 */
.Loop_not_aligned: /* Works for as small as one byte */
	p3=sp1loop0(.Loop1,bytes)
.Loop1:
1080:
1180:
	/* Store the previous byte (skipped on the first pass), load the next. */
	{
		if (p3) memb(dst++#1) = w_dbuf
		w_dbuf = memub(src++#1)
	}:endloop0
	/* Done */
1190:
	/* Epilog: store the last byte (dst is not advanced) and return 0. */
	{
		memb(dst) = w_dbuf
		jumpr r31
		r0 = #0
	}

	/*
	 * Common tail of the wide paths: copy whatever is left one byte at a
	 * time, or fall through to .Ldone when nothing is left.
	 */
.Lsmall:
	{
		p0 = cmp.gtu(bytes,#0)
		if (p0.new) jump:nt .Loop_not_aligned
	}
	/* Return 0 to the caller (return address in r31). */
.Ldone:
	{
		r0 = #0
		jumpr r31
	}
	.falign
	/*
	 * Alignment preamble. Only reached when src and dst agree in their
	 * low three address bits, so stepping src up to an 8-byte boundary
	 * aligns dst as well. Bits 0, 1 and 2 of src are tested in turn and,
	 * when set, a byte / halfword / word is copied. p1 still holds
	 * "bytes > 15" from the entry sequence; with 15 bytes or fewer the
	 * byte path is used instead.
	 */
.Lalign:
1000:
	/* p0 = src is odd; if so, load one byte. */
	{
		if (p0.new) w_dbuf = memub(src)
		p0 = tstbit(src,#0)
		if (!p1) jump .Lsmall
	}
1100:
	/* If a byte was loaded: store it and account for it. */
	{
		if (p0) memb(dst++#1) = w_dbuf
		if (p0) bytes = add(bytes,#-1)
		if (p0) src = add(src,#1)
	}
2000:
	/*
	 * p0 = bit 1 of src set; if so, load one halfword. p1 has not been
	 * modified since the test at 1000, so this jump re-tests the same
	 * condition.
	 */
	{
		if (p0.new) w_dbuf = memuh(src)
		p0 = tstbit(src,#1)
		if (!p1) jump .Lsmall
	}
2100:
	/* If a halfword was loaded: store it and account for it. */
	{
		if (p0) memh(dst++#2) = w_dbuf
		if (p0) bytes = add(bytes,#-2)
		if (p0) src = add(src,#2)
	}
4000:
	/* p0 = bit 2 of src set; if so, load one word. p1 is still unchanged. */
	{
		if (p0.new) w_dbuf = memw(src)
		p0 = tstbit(src,#2)
		if (!p1) jump .Lsmall
	}
4100:
	/*
	 * If a word was loaded: store it and account for it. Then restart
	 * from the top, which re-evaluates alignment and the remaining
	 * length with the updated src, dst and bytes.
	 */
	{
		if (p0) memw(dst++#4) = w_dbuf
		if (p0) bytes = add(bytes,#-4)
		if (p0) src = add(src,#4)
		jump FUNCNAME
	}
	.size FUNCNAME,.-FUNCNAME