// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
// Unsigned 64x64 -> 128-bit multiply: z1 receives the high doubleword
// of x*y and z0 the low doubleword.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   y+8(FP), R9
	MOVD   x+0(FP), R8
	MULLD  R8, R9, R10     // R10 = low 64 bits of x*y
	MULHDU R8, R9, R11     // R11 = high 64 bits of x*y (unsigned)
	MOVD   R10, z0+24(FP)
	MOVD   R11, z1+16(FP)
	RET

// addVV has no ppc64 assembly body in this file: it transfers control
// straight to addVV_g without touching the arguments or the frame.
// NOTE(review): addVV_g is defined outside this file, presumably the
// generic Go implementation in arith.go - confirm.
TEXT ·addVV(SB), NOSPLIT, $0
	JMP ·addVV_g(SB)

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
//
// The borrow travels from word to word in the XER carry bit (CA) and is
// converted into the returned c (0 or 1) once, after the loop. The loop
// runs for i in [0, len(z)); the lengths of x and y are not read.
//
// Register use:
//   R7  = len(z)    R8 = &x[0]    R9 = &y[0]    R10 = &z[0]
//   R5  = i         R6 = i*8 (byte offset of element i)
//   R4  = 0, turned into c after the loop
//   R28 = 8, R29 = 1 (constants)
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R7
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R10

	MOVD $0, R4  // c = 0
	MOVD $0, R5  // i = 0
	MOVD $1, R29 // work around lack of ADDI
	MOVD $8, R28 // work around lack of scaled addressing

	SUBC R0, R0  // 0 - 0: result is 0 and CA is left SET, which on Power means "no borrow pending"
	JMP  sublend

// amd64 saves and restores CF, but I believe they only have to do that because all of
// their math operations clobber it - we should just be able to recover it at the end.
// Here SUBE is the only instruction in the loop that writes CA; the index
// arithmetic (MULLD, ADD), the loads/stores and CMP leave it alone.
subloop:
	MULLD R5, R28, R6   // R6 = i * 8
	MOVD  (R8)(R6), R11 // x[i]
	MOVD  (R9)(R6), R12 // y[i]

	SUBE R12, R11, R15  // R15 = x[i] - y[i] - incoming borrow; CA updated for the next word
	MOVD R15, (R10)(R6) // z[i] = R15

	ADD R29, R5 // i++

sublend:
	CMP R5, R7
	BLT subloop // continue while i < len(z)

	ADDZE R4       // R4 = 0 + CA
	XOR   R29, R4  // c = CA ^ 1: CA == 0 after the last SUBE means a borrow out
	MOVD  R4, c+72(FP)
	RET

// addVW has no ppc64 assembly body in this file: it transfers control
// straight to addVW_g without touching the arguments or the frame.
// NOTE(review): addVW_g is defined outside this file, presumably the
// generic Go implementation in arith.go - confirm.
TEXT ·addVW(SB), NOSPLIT, $0
	JMP ·addVW_g(SB)

// subVW has no ppc64 assembly body in this file: it transfers control
// straight to subVW_g without touching the arguments or the frame.
// NOTE(review): subVW_g is defined outside this file, presumably the
// generic Go implementation in arith.go - confirm.
TEXT ·subVW(SB), NOSPLIT, $0
	JMP ·subVW_g(SB)

// shlVU has no ppc64 assembly body in this file: it transfers control
// straight to shlVU_g without touching the arguments or the frame.
// NOTE(review): shlVU_g is defined outside this file, presumably the
// generic Go implementation in arith.go - confirm.
TEXT ·shlVU(SB), NOSPLIT, $0
	JMP ·shlVU_g(SB)

// shrVU has no ppc64 assembly body in this file: it transfers control
// straight to shrVU_g without touching the arguments or the frame.
// NOTE(review): shrVU_g is defined outside this file, presumably the
// generic Go implementation in arith.go - confirm.
TEXT ·shrVU(SB), NOSPLIT, $0
	JMP ·shrVU_g(SB)

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each i in [0, len(z)): the 128-bit value x[i]*y + c is formed,
// its low word is stored to z[i] and its high word becomes the next c.
// c starts as r; the final c is returned. The length of x is not read.
//
// Register use:
//   R10 = &z[0]    R8 = &x[0]    R9 = y    R11 = len(z)
//   R3  = i        R5 = i*8 (byte offset of element i)
//   R4  = c        R20 = x[i]    R6 = low word    R7 = high word
//   R18 = 8, R19 = 1 (constants)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4     // c = r
	MOVD z_len+8(FP), R11
	MOVD $0, R3           // i = 0
	MOVD $8, R18
	MOVD $1, R19

	JMP e5                // test the loop condition first so len(z) == 0 does no work

l5:
	MULLD  R18, R3, R5    // R5 = i * 8
	MOVD   (R8)(R5), R20  // R20 = x[i]
	MULLD  R9, R20, R6    // R6 = low word of x[i]*y
	MULHDU R9, R20, R7    // R7 = high word of x[i]*y (unsigned)
	ADDC   R4, R6         // low += c, carry out recorded in CA
	ADDZE  R7             // high += CA
	MOVD   R6, (R10)(R5)  // z[i] = low
	MOVD   R7, R4         // c = high
	ADD    R19, R3        // i++

e5:
	CMP R3, R11
	BLT l5                // continue while i < len(z)

	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// For each i in [0, len(z)): z[i] = low word of (x[i]*y + z[i] + c),
// and the high word of that sum becomes the next c. c starts at 0 and
// the final c is returned. The length of x is not read.
//
// Structure: the "unrolled" loop handles two elements per iteration over
// the first len(z)&^1 elements; the "loop"/"end" tail handles whatever
// is left (at most one element for a non-negative length).
//
// Register use:
//   R10 = &z[0]    R8 = &x[0]    R9 = y
//   R22 = len(z)   R23 = len(z) with bit 0 cleared    R24 = 2
//   R5  = i        R4 = c        R28 = 8
//   R19 = i*8      R25 = (i+1)*8 (byte offsets)
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R22

	MOVD $0, R5   // i = 0
	MOVD $0, R4   // c = 0
	MOVD $8, R28
	MOVD $-2, R23
	AND  R22, R23 // mask the last bit of z.len: R23 = len(z) rounded down to even
	MOVD $2, R24
	CMP  R23, R24
	BGE  unrolled // at least one full pair to process
	JMP  end      // fewer than two elements: go straight to the tail

unrolled:
	MOVD  $8, R19         // no (RA)(RB*8) on power
	MULLD R5, R19         // R19 = i * 8
	MOVD  (R10)(R19), R11 // R11 = z[i]
	MOVD  (R8)(R19), R16  // R16 = x[i]
	ADD   R28, R19, R25   // R25 = (i+1) * 8
	MOVD  (R10)(R25), R17 // R17 = z[i+1]
	MOVD  (R8)(R25), R18  // R18 = x[i+1]

	MULLD  R9, R16, R12    // R12 = low word of x[i]*y
	MULHDU R9, R16, R14    // R14 = high word of x[i]*y
	MULLD  R9, R18, R6     // R6 = low word of x[i+1]*y
	MULHDU R9, R18, R7     // R7 = high word of x[i+1]*y
	ADDC   R4, R12         // low(i) += incoming carry
	ADDZE  R14             // high(i) += CA
	ADDC   R11, R12        // z[i] = (x[i]*y) + z[i] + carry
	ADDZE  R14             // carry = high order bits + add carry
	MOVD   R12, (R10)(R19)
	ADDC   R14, R6         // low(i+1) += carry out of element i
	ADDZE  R7              // high(i+1) += CA
	ADDC   R17, R6         // low(i+1) += z[i+1]
	ADDZE  R7              // high(i+1) += CA
	MOVD   R6, (R10)(R25)  // z[i+1] = low(i+1)
	MOVD   R7, R4          // c = high(i+1)

	ADD R24, R5            // i += 2
	CMP R5, R23
	BLT unrolled           // continue while i < R23 (the even part)
	JMP end

loop:
	MOVD   $8, R19
	MULLD  R5, R19         // R19 = i * 8
	MOVD   (R10)(R19), R11 // R11 = z[i]
	MOVD   (R8)(R19), R16  // R16 = x[i]
	MULLD  R9, R16, R12    // R12 = low word of x[i]*y
	MULHDU R9, R16, R14    // R14 = high word of x[i]*y
	ADDC   R4, R12         // low += incoming carry
	ADDZE  R14             // high += CA
	ADDC   R11, R12        // low += z[i]
	ADDZE  R14             // high += CA
	MOVD   R12, (R10)(R19) // z[i] = low
	MOVD   R14, R4         // c = high

	MOVD $1, R15
	ADD  R15, R5           // i++

end:
	CMP R5, R22
	BLT loop               // continue while i < len(z)

	MOVD R4, c+56(FP)
	RET

// divWVW has no ppc64 assembly body in this file: it transfers control
// straight to divWVW_g without touching the arguments or the frame.
// NOTE(review): divWVW_g is defined outside this file, presumably the
// generic Go implementation in arith.go - confirm.
TEXT ·divWVW(SB), NOSPLIT, $0
	JMP ·divWVW_g(SB)

// func bitLen(x Word) int
// Returns 64 minus the number of leading zero bits in x, i.e. the
// position of the highest set bit counted from 1.
TEXT ·bitLen(SB), NOSPLIT, $0
	MOVD   $64, R8
	MOVD   x+0(FP), R6
	CNTLZD R6, R7       // R7 = count of leading zero bits in x
	SUB    R7, R8, R9   // R9 = 64 - R7
	MOVD   R9, n+8(FP)
	RET