// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ppc64 ppc64le

#include "go_asm.h"
#include "textflag.h"

TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte
	MOVD	$ret+32(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
	MOVD	s_base+0(FP), R3  // R3 = string
	MOVD	s_len+8(FP), R4	  // R4 = length
	MOVBZ	c+16(FP), R5	  // R5 = byte
	MOVD	$ret+24(FP), R14  // R14 = &ret
	BR	indexbytebody<>(SB)

TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	ADD	R4,R3,R7	// Last acceptable address in R7.
	DCBT	(R8)		// Prepare cache line.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
	MOVD	$-1,R9
	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
	RLDIMI	$32,R5,$0,R5
	MOVD	R7,R10		// Save last acceptable address in R10 for later.
	ADD	$-1,R7,R7
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.

	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB  $0,V0		// Replicate 0 across V0
	VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
	MTVRD	  R5,V1
	LVSL	  (R0+R0),V11
	VSLB	  V11,V10,V10
	VSPLTB	  $7,V1,V1	// Replicate byte across V1
	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
	BLE	  tail

	// We will load 4 quardwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	  $63,R8,R11
	BEQ	  CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	ADD	    $-16,R4,R4

	ANDCC	    $63,R8,R11
	BEQ	    CR0,preloop
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	ADD	    $-16,R4,R4

	ANDCC	    $63,R8,R11
	BEQ	    CR0,preloop
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $-16,R4,R4
	ADD	    $16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail	      // If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The last doubleword is in R10, so our loop counter
	// starts at (R10-R8)/64.
	SUB	R8,R10,R6
	SRD	$6,R6,R9      // Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8   // Adjust index for loop entry
	MOVD	$16,R11      // Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop we will load 64 bytes per iteration
loop:
	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
	LVX	    (R8+R11),V3
	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
	VCMPEQUB    V1,V3,V7

	LVX	    (R8+R9),V4
	LVX	    (R8+R7),V5
	VCMPEQUB    V1,V4,V8
	VCMPEQUB    V1,V5,V9

	VOR	    V6,V7,V11	      // Compress the result in a single vector
	VOR	    V8,V9,V12
	VOR	    V11,V12,V13
	VCMPEQUBCC  V0,V13,V14	      // Check for byte
	BGE	    CR6,found
	BC	    16,0,loop	      // bdnz loop

	// Handle the tailing bytes or R4 ≤ 64
	RLDICL	$0,R6,$58,R4
	ADD	$64,R8,R8
tail:
	CMPU	    R4,$0
	BEQ	    notfound
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align

notfound:
	MOVD	$-1,R3
	MOVD	R3,(R14)
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	  V6,V10,V6
	VBPERMQ	  V7,V10,V7
	VBPERMQ	  V8,V10,V8
	VBPERMQ	  V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	  $2,V7,V7,V7
	VSLDOI	  $4,V8,V8,V8
	VSLDOI	  $6,V9,V9,V9
#else
	VSLDOI	  $6,V6,V6,V6
	VSLDOI	  $4,V7,V7,V7
	VSLDOI	  $2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	  $-1,R3,R11
	ANDN	  R3,R11,R11
	POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17,R3
	MOVD	R3,(R14)
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	  V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	  V6,R3
	ADD	  $-1,R3,R11
	ANDN	  R3,R11,R11
	POPCNTD	  R11,R11
#else
	VSLDOI	  $6,V6,V6,V6
	MFVRD	  V6,R3
	CNTLZD	  R3,R11
#endif
	ADD	  R8,R11,R3
	CMPU	  R11,R4
	BLT	  return
	BR	  notfound

done:
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert trailing zeros to bytes.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return
	BLE	CR7,return
	BR	notfound

small_string:
	// We unroll this loop for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound