// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ppc64 ppc64le

#include "textflag.h"

// func memmove(to, from unsafe.Pointer, n uintptr)
//
// memmove copies n bytes from "from" to "to". The two regions may overlap:
// when the destination starts inside the source region the copy runs from
// the high addresses down, otherwise it runs from the low addresses up.
//
// Register use:
//	R3	destination cursor
//	R4	source cursor
//	R5	n, the byte count (never modified)
//	R6	n / 8, doublewords to copy; later reduced to the doublewords
//		left over once the 32-byte chunks have been moved
//	R7	n % 8, tail bytes to copy
//	R8, R9	scratch
//	VS32, VS33	scratch for the 32-byte loops
//	CTR	loop counter
//	CR0[EQ]	after the first ANDCC: there are no tail bytes
//	CR1	R6 compared with 0
//	CR2	(dest - src) compared unsigned with n
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	MOVD	to+0(FP), R3
	MOVD	from+8(FP), R4
	MOVD	n+16(FP), R5

	// Determine if there are doublewords to
	// copy so a more efficient move can be done
check:
	ANDCC	$7, R5, R7	// R7: bytes to copy; CR0[EQ] set if none
	SRD	$3, R5, R6	// R6: double words to copy
	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy

	// Determine overlap by subtracting dest - src and comparing against the
	// length.  This catches the cases where src and dest are in different types
	// of storage such as stack and static to avoid doing backward move when not
	// necessary.  The compare is unsigned, so dest < src wraps around to a
	// large value and takes the forward path.

	SUB	R4, R3, R8	// dest - src
	CMPU	R8, R5, CR2	// < len?
	BC	12, 8, backward // BLT CR2 backward

	// Copying forward if no overlap.

	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
	SRDCC	$2,R6,R8		// 32 byte chunks?
	BNE	forward32setup		//
	MOVD	R6,CTR			// R6 = number of double words

	// Move double words

forward8:
	MOVD    0(R4), R8		// double word
	ADD     $8,R4
	MOVD    R8, 0(R3)		//
	ADD     $8,R3
	BC      16, 0, forward8		// bdnz
	BR	noforwardlarge		// handle remainder

	// Prepare for moves of 32 bytes at a time.

forward32setup:
	DCBTST	(R3)			// cache hint: dest is about to be stored to
	DCBT	(R4)			// cache hint: source is about to be loaded
	MOVD	R8, CTR			// number of 32 byte chunks
	MOVD	$16, R8			// index of the second 16 bytes in a chunk

forward32:
	LXVD2X	(R4+R0), VS32		// load 16 bytes
	LXVD2X	(R4+R8), VS33
	ADD	$32, R4
	STXVD2X	VS32, (R3+R0)		// store 16 bytes
	STXVD2X	VS33, (R3+R8)
	ADD	$32,R3			// bump up for next set
	BC	16, 0, forward32	// continue (bdnz)
	RLDCLCC	$61,R5,$3,R6		// remaining doublewords: (n >> 3) & 3
	BEQ	noforwardlarge
	MOVD	R6,CTR			// set up the CTR
	BR	forward8

noforwardlarge:
	CMP	R7,$0			// any remaining bytes
	BC	4, 1, LR		// ble lr

forwardtail:
	MOVD	R7, CTR			// move tail bytes

forwardtailloop:
	MOVBZ	0(R4), R8		// move single bytes
	ADD	$1,R4
	MOVBZ	R8, 0(R3)
	ADD	$1,R3
	BC	16, 0, forwardtailloop	// bdnz
	RET

backward:
	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
	// R3 and R4 are advanced to the end of the destination/source buffers
	// respectively and moved back as we copy.

	ADD	R5, R4, R4		// end of source
	ADD	R3, R5, R3		// end of dest

	BEQ	nobackwardtail		// CR0 still holds the ANDCC result: no tail bytes

	MOVD	R7, CTR			// bytes to move

backwardtailloop:
	MOVBZ 	-1(R4), R8		// point to last byte
	SUB	$1,R4
	MOVBZ 	R8, -1(R3)
	SUB	$1,R3
	BC	16, 0, backwardtailloop // bdnz

nobackwardtail:
	BC	4, 5, LR		// ble CR1 lr: no doublewords, done

backwardlarge:
	// Both cursors have moved by the same amount, so R3 - R4 is still the
	// original dest - src, which is in [0, n) on this path.  The distance
	// must be taken as dest - src: src - dest is never positive here and
	// would always fail the >= 32 test, leaving the VSX loop unreachable.
	MOVD	R6, CTR
	SUB	R4, R3, R9		// R9 = dest - src; use vsx if moving
	CMP	R9, $32			// at least 32 byte chunks
	BLT	backwardlargeloop	// and distance >= 32
	SRDCC	$2,R6,R8		// 32 byte chunks
	BNE	backward32setup

backwardlargeloop:
	MOVD 	-8(R4), R8
	SUB	$8,R4
	MOVD 	R8, -8(R3)
	SUB	$8,R3
	BC	16, 0, backwardlargeloop // bdnz
	RET

backward32setup:
	MOVD	R8, CTR			// set up loop ctr
	MOVD	$16, R8			// 32 bytes at a time

backward32loop:
	SUB	$32, R4
	SUB	$32, R3
	LXVD2X	(R4+R0), VS32           // load 16 bytes
	LXVD2X	(R4+R8), VS33
	STXVD2X	VS32, (R3+R0)           // store 16 bytes
	STXVD2X	VS33, (R3+R8)
	BC      16, 0, backward32loop   // bdnz

	// Only the doublewords that did not fit in a 32 byte chunk are left.
	// Reloading the full R6 count here would run past the start of both
	// buffers.
	ANDCC	$3, R6, R6		// R6 = remaining doublewords (0-3)
	BC	12, 2, LR		// beq lr: nothing left, done
	MOVD	R6, CTR
	BR	backwardlargeloop