// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.

// func mulWW(x, y Word) (z1, z0 Word)
//
// Full-width unsigned multiply of two words: z1 receives the upper
// 64 bits of x*y and z0 the lower 64 bits.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	y+8(FP), R5
	MOVD	x+0(FP), R4
	UMULH	R4, R5, R6	// R6 = high word of x*y
	MUL	R4, R5, R7	// R7 = low word of x*y
	MOVD	R7, z0+24(FP)
	MOVD	R6, z1+16(FP)
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// No assembly implementation here: control is handed to divWW_g with
// a plain branch, so divWW_g sees this function's arguments and
// returns directly to the original caller.
TEXT ·divWW(SB),NOSPLIT,$0
	B	·divWW_g(SB) // ARM64 has no multiword division


// func addVV(z, x, y []Word) (c Word)
//
// Word-by-word addition with carry propagation. For every index i
// below len(z) it stores z[i] = x[i] + y[i] + carry, and returns the
// carry out of the last word in c (0 or 1). Only z's length is read;
// the lengths of x and y are not consulted.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R8	// R8 = words still to process
	MOVD	z+0(FP), R9	// R9 = write cursor in z
	MOVD	x+24(FP), R10	// R10 = read cursor in x
	MOVD	y+48(FP), R11	// R11 = read cursor in y
	ADDS	$0, R8	// adding zero cannot carry, so C starts at 0
next:
	CBZ	R8, finish	// CBZ does not modify the flags; C stays live
	MOVD.P	8(R10), R12	// R12 = *x, then advance x
	MOVD.P	8(R11), R13	// R13 = *y, then advance y
	ADCS	R12, R13	// R13 = R13 + R12 + C, C = carry out
	MOVD.P	R13, 8(R9)	// *z = R13, then advance z
	SUB	$1, R8	// non-flag-setting SUB keeps C for the next word
	B	next
finish:
	CSET	HS, R8	// R8 = 1 when C is set, otherwise 0
	MOVD	R8, c+72(FP)
	RET


// func subVV(z, x, y []Word) (c Word)
//
// Word-by-word subtraction with borrow propagation. For every index i
// below len(z) it stores z[i] = x[i] - y[i] - borrow, and returns the
// borrow out of the last word in c (0 or 1). Only z's length is read;
// the lengths of x and y are not consulted.
//
// On ARM64 a borrow is represented by a *clear* carry flag, so the
// chain starts with C = 1 and the result is taken from the LO condition.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R8	// R8 = words still to process
	MOVD	z+0(FP), R9	// R9 = write cursor in z
	MOVD	x+24(FP), R10	// R10 = read cursor in x
	MOVD	y+48(FP), R11	// R11 = read cursor in y
	CMP	R8, R8	// equal operands set C, i.e. "no borrow yet"
next:
	CBZ	R8, finish	// CBZ does not modify the flags; C stays live
	MOVD.P	8(R10), R12	// R12 = *x, then advance x
	MOVD.P	8(R11), R13	// R13 = *y, then advance y
	SBCS	R13, R12	// R12 = R12 - R13 - !C, C = no-borrow out
	MOVD.P	R12, 8(R9)	// *z = R12, then advance z
	SUB	$1, R8	// non-flag-setting SUB keeps C for the next word
	B	next
finish:
	CSET	LO, R8	// R8 = 1 when C is clear (a borrow), otherwise 0
	MOVD	R8, c+72(FP)
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to the vector x and writes len(z) result
// words to z. The first word is computed as x[0] + y; each later word
// only absorbs the carry from the previous one. The final carry
// (0 or 1) is returned in c. When z is empty nothing is read or
// written and c is simply y.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R8	// R8 = words still to process
	MOVD	z+0(FP), R9	// R9 = write cursor in z
	MOVD	x+24(FP), R10	// R10 = read cursor in x
	MOVD	y+48(FP), R11	// R11 = the word being added
	CBZ	R8, empty
	MOVD.P	8(R10), R12	// first word: R12 = x[0]
	ADDS	R11, R12	// R12 += y, C = carry out
	MOVD.P	R12, 8(R9)
	SUB	$1, R8
ripple:
	CBZ	R8, finish	// CBZ does not modify the flags; C stays live
	MOVD.P	8(R10), R12	// R12 = next word of x
	ADCS	$0, R12	// R12 += C, C = carry out
	MOVD.P	R12, 8(R9)
	SUB	$1, R8	// non-flag-setting SUB keeps C for the next word
	B	ripple
finish:
	CSET	HS, R8	// R8 = 1 when C is set, otherwise 0
	MOVD	R8, c+56(FP)
	RET
empty: // len(z) == 0: the result carry is y itself
	MOVD	R11, c+56(FP)
	RET


// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the vector x and writes len(z)
// result words to z. The first word is computed as x[0] - y; each
// later word only absorbs the borrow from the previous one. The final
// borrow (0 or 1) is returned in c. When z is empty nothing is read or
// written and c is simply y.
//
// On ARM64 a borrow is represented by a *clear* carry flag, hence the
// LO condition when the result is extracted.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R8	// R8 = words still to process
	MOVD	z+0(FP), R9	// R9 = write cursor in z
	MOVD	x+24(FP), R10	// R10 = read cursor in x
	MOVD	y+48(FP), R11	// R11 = the word being subtracted
	CBZ	R8, empty
	MOVD.P	8(R10), R12	// first word: R12 = x[0]
	SUBS	R11, R12	// R12 -= y, C = no-borrow out
	MOVD.P	R12, 8(R9)
	SUB	$1, R8
ripple:
	CBZ	R8, finish	// CBZ does not modify the flags; C stays live
	MOVD.P	8(R10), R12	// R12 = next word of x
	SBCS	$0, R12	// R12 -= !C, C = no-borrow out
	MOVD.P	R12, 8(R9)
	SUB	$1, R8	// non-flag-setting SUB keeps C for the next word
	B	ripple
finish:
	CSET	LO, R8	// R8 = 1 when C is clear (a borrow), otherwise 0
	MOVD	R8, c+56(FP)
	RET
empty: // len(z) == 0: the result borrow is y itself
	MOVD	R11, c+56(FP)
	RET


// func shlVU(z, x []Word, s uint) (c Word)
//
// No assembly implementation here: control is handed to shlVU_g with
// a plain branch, so shlVU_g sees this function's arguments and
// returns directly to the original caller.
TEXT ·shlVU(SB),NOSPLIT,$0
	B ·shlVU_g(SB)


// func shrVU(z, x []Word, s uint) (c Word)
//
// No assembly implementation here: control is handed to shrVU_g with
// a plain branch, so shrVU_g sees this function's arguments and
// returns directly to the original caller.
TEXT ·shrVU(SB),NOSPLIT,$0
	B ·shrVU_g(SB)


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Multiplies the vector x by the single word y, adding in a running
// carry word that starts out as r. For every index i below len(z):
//
//	hi:lo = x[i]*y + carry
//	z[i]  = lo
//	carry = hi
//
// The carry word left after the last element is returned in c; when
// z is empty that is r unchanged.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R8	// R8 = words still to process
	MOVD	z+0(FP), R9	// R9 = write cursor in z
	MOVD	x+24(FP), R10	// R10 = read cursor in x
	MOVD	y+48(FP), R11	// R11 = multiplier
	MOVD	r+56(FP), R12	// R12 = running carry word
next:
	CBZ	R8, finish
	MOVD.P	8(R10), R13	// R13 = *x, then advance x
	UMULH	R13, R11, R14	// R14 = high word of x[i]*y
	MUL	R13, R11, R15	// R15 = low word of x[i]*y
	ADDS	R12, R15	// low word += carry word, C = overflow
	ADC	$0, R14	// fold that overflow into the high word
	MOVD.P	R15, 8(R9)	// *z = low word, then advance z
	MOVD	R14, R12	// the high word is the next carry word
	SUB	$1, R8
	B	next
finish:
	MOVD	R12, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// No assembly implementation here: control is handed to addMulVVW_g
// with a plain branch, so addMulVVW_g sees this function's arguments
// and returns directly to the original caller.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	B ·addMulVVW_g(SB)


// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// No assembly implementation here: control is handed to divWVW_g with
// a plain branch, so divWVW_g sees this function's arguments and
// returns directly to the original caller.
TEXT ·divWVW(SB),NOSPLIT,$0
	B ·divWVW_g(SB)


// func bitLen(x Word) (n int)
//
// Returns 64 minus the number of leading zero bits in x, i.e. the
// number of bits needed to represent x.
TEXT ·bitLen(SB),NOSPLIT,$0
	MOVD	x+0(FP), R4
	MOVD	$64, R5	// word width in bits
	CLZ	R4, R6	// R6 = count of leading zero bits in x
	SUB	R6, R5, R7	// R7 = 64 - leading zeros
	MOVD	R7, n+8(FP)
	RET