// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ppc64le ppc64

#include "go_asm.h"
#include "textflag.h"

// Count entry stub: moves the slice base, slice length, and target byte
// into the registers countbytebody<> expects, passes the address of the
// result slot in R14, and branches to the shared body, which stores the
// count and returns on this function's behalf.
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
	MOVD  $ret+32(FP), R14    // R14 = address of the result slot
	MOVBZ c+24(FP), R5        // R5  = byte to count, zero-extended
	MOVD  b_len+8(FP), R4     // R4  = number of bytes in b
	MOVD  b_base+0(FP), R3    // R3  = base pointer of b
	BR    countbytebody<>(SB) // tail branch; the body executes the RET

// CountString entry stub: moves the string base, string length, and target
// byte into the registers countbytebody<> expects, passes the address of
// the result slot in R14, and branches to the shared body, which stores
// the count and returns on this function's behalf.
TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
	MOVD  $ret+24(FP), R14    // R14 = address of the result slot
	MOVBZ c+16(FP), R5        // R5  = byte to count, zero-extended
	MOVD  s_len+8(FP), R4     // R4  = number of bytes in s
	MOVD  s_base+0(FP), R3    // R3  = base pointer of s
	BR    countbytebody<>(SB) // tail branch; the body executes the RET

// countbytebody counts how many of the R4 bytes starting at R3 are equal
// to the byte in R5 and stores the total through R14. Count and
// CountString branch here with:
//   R3:  address of the first byte
//   R4:  number of bytes
//   R5:  byte to count (zero-extended)
//   R14: address where the result is stored
//
// Working registers:
//   R16: bytes remaining
//   R18: running count
//   R6:  R5 replicated into all 8 bytes (for CMPB)
//   V1:  R5 replicated into all 16 bytes (for VCMPEQUB)
//   V4:  shift amount 3 (bits -> bytes)
//   V5:  vector accumulator of matching bits
// R9, R10, R12, R15, R17, V0, V2, V3 and V6 are used as scratch.
//
// Inputs of 32 bytes or more go through a 16-byte vector loop first; what
// is left is handled 8 bytes at a time and then one byte at a time.
// Endianness doesn't matter since we are just counting and order is
// irrelevant.
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	DCBT (R3)    // Prepare cache line.
	MOVD R0, R18 // byte count = 0 (relies on R0 holding zero; Go ppc64 convention)
	MOVD R4, R16 // R16 = bytes remaining

	MOVD   R5, R6
	RLDIMI $8, R6, $48, R6  // low 2 bytes now hold the byte
	RLDIMI $16, R6, $32, R6 // low 4 bytes now hold the byte
	RLDIMI $32, R6, $0, R6  // fill reg with the byte to count

	VSPLTISW $3, V4     // used for shift
	MTVRD    R6, V1     // move compare byte
	VSPLTB   $7, V1, V1 // replicate byte across V1

	CMPU   R4, $32          // Check if it's a small string (<32 bytes)
	BLT    tail             // Jump to the small string case
	XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator

cmploop:
	LXVW4X (R3), VS32 // load 16 bytes from string into V0 (aka VS32)

	// when the bytes match, the corresponding byte contains all 1s
	VCMPEQUB V1, V0, V2     // compare bytes
	VPOPCNTD V2, V3         // each double word contains its count
	VADDUDM  V3, V5, V5     // accumulate bit count in each double word
	ADD      $16, R3, R3    // increment pointer
	SUB      $16, R16, R16  // remaining bytes
	CMP      R16, $16       // at least 16 remaining?
	BGE      cmploop
	VSRD     V5, V4, V5     // shift by 3 to convert bits to bytes
	VSLDOI   $8, V5, V5, V6 // V6 = V5 with its double words swapped
	MFVSRD   V5, R9         // one double word's byte count
	MFVSRD   V6, R10        // the other double word's byte count
	ADD      R9, R10, R9
	ADD      R9, R18, R18   // fold vector total into the byte count

tail:
	CMP R16, $8 // 8 bytes left?
	BLT small

	MOVD    (R3), R12     // load 8 bytes
	CMPB    R12, R6, R17  // compare bytes: matching bytes become 0xFF
	POPCNTD R17, R15      // bit count
	SRD     $3, R15, R15  // byte count
	ADD     R15, R18, R18 // add to byte count

	ADD $8, R3, R3   // advance past the 8 bytes just counted
	SUB $8, R16, R16 // remaining bytes
	BR  tail

small:
	CMP   R16, $0   // any remaining
	BEQ   done
	MOVBZ (R3), R12 // check each remaining byte
	CMP   R12, R5
	BNE   next2
	ADD   $1, R18

next2:
	SUB $1, R16
	ADD $1, R3  // inc address
	BR  small

done:
	MOVD R18, (R14) // return count
	RET