#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================


# July 1999
#
# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x not only because it doesn't support new ABIs but also
# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
# cause illegal instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by MIPSpro
# compiler driver (either as or cc) and therefore (probably?) can't be
# compiled by the GNU assembler. GNU C driver manages fine though...
# I mean as long as -mmips-as is specified or is the default option,
# because then it simply invokes /usr/bin/as which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to have the code looking more regular as all those
# architecture dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
#
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
#					<appro@fy.chalmers.se>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and
# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction 32-bit compiler can't use], one
# has to be content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.

# Command line: [flavour] ... output.s — the first argument is the ABI
# flavour tag (o32/n32/64, optionally "nubi"); anything after it that does
# not look like a filename is skipped, the first filename-like argument
# becomes the output file.
$flavour = shift;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Fail loudly if the output file cannot be created; otherwise all the
# generated assembly would be silently discarded.
open STDOUT,">$output" or die "can't open $output: $!";

# Select mnemonics and word sizes for the target ABI.  The 64-bit ABIs
# (N32/N64) use the doubleword instruction forms with an 8-byte BN_ULONG;
# everything else gets the 32-bit forms, a 4-byte BN_ULONG, and a
# ".set mips2" directive so mult/div result moves are accepted.
if ($flavour =~ /64|n32/i) {
	($LD,$ST,$MULTU,$DIVU,$ADDU,$SUBU,$SRL,$SLL) =
		qw(ld sd dmultu ddivu daddu dsubu dsrl dsll);
	($PTR_ADD,$PTR_SUB) = ("daddu","dsubu");	# pointer arithmetic
	($REG_S,$REG_L)     = ("sd","ld");		# register spill/reload
	$BNSZ  = 8;					# sizeof(BN_ULONG)
	$SZREG = 8;					# stack slot size
} else {
	($LD,$ST,$MULTU,$DIVU,$ADDU,$SUBU,$SRL,$SLL) =
		qw(lw sw multu divu addu subu srl sll);
	($PTR_ADD,$PTR_SUB) = ("addu","subu");		# pointer arithmetic
	($REG_S,$REG_L)     = ("sw","lw");		# register spill/reload
	$BNSZ  = 4;					# sizeof(BN_ULONG)
	$SZREG = 4;					# stack slot size
	$code  = ".set\tmips2\n";	# 32-bit build needs at least MIPS II
}

# Below is the N32/N64 register layout used in the original module:
# symbolic names mapped onto the raw $0..$31 register numbers.
($zero,$at,$v0,$v1)               = ('$0','$1','$2','$3');
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = ('$4','$5','$6','$7','$8','$9','$10','$11');
($t0,$t1,$t2,$t3,$t8,$t9)         = ('$12','$13','$14','$15','$24','$25');
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7) = ('$16','$17','$18','$19','$20','$21','$22','$23');
($gp,$sp,$fp,$ra)                 = ('$28','$29','$30','$31');
# Extra "temporary argument" aliases for the upper argument registers.
($ta0,$ta1,$ta2,$ta3)             = ($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3); under NUBI $gp is
# remapped to $v1.

$gp = $v1 if ($flavour =~ /nubi/i);

$minus4 = $v1;	# scratch register holding the constant -4

# bn_mul_add_words(rp=$a0, ap=$a1, num=$a2, w=$a3):
#	rp[i] += ap[i]*w for i in 0..num-1, propagating carry; the final
#	carry is returned in $v0 (and also copied into $a0 in the return
#	delay slot).  The public entry point filters out num<=0, returning 0.
$code.=<<___;
.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
# NUBI treats $t0..$t3 and $gp as callee-saved, so build a 6-slot frame
# and spill them together with $ra before they are clobbered.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: loop unrolled 4x ($minus4 masks the count down to a multiple
# of four); the remaining 1..3 words fall through to the tail.  The
# branch-likely closer (bgtzl) executes its delay-slot load only when the
# branch is taken.  Multiplications are started early ($MULTU lines offset
# by one space) so mflo/mfhi latency overlaps the carry bookkeeping.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	 $MULTU	$t2,$a3
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	$at
	mfhi	$t2
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	 $MULTU	$ta0,$a3
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	$at
	mfhi	$ta0
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	 $MULTU	$ta2,$a3
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	$at
	mfhi	$ta2
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	$ADDU	$v0,$at
	.set	noreorder
	bgtzl	$ta0,.L_bn_mul_add_words_loop
	$LD	$t0,0($a1)

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the spilled callee-saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return: carry is in $v0; the delay slot mirrors it into $a0.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

# bn_mul_words(rp=$a0, ap=$a1, num=$a2, w=$a3):
#	rp[i] = ap[i]*w with carry propagation (no accumulate, unlike
#	bn_mul_add_words); the final carry is returned in $v0/$a0.
#	The public entry point filters out num<=0, returning 0.
$code.=<<___;
.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
# NUBI frame: spill callee-saved $t0..$t3, $gp and $ra.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4x-unrolled loop plus a 1..3 word tail, same scheme as
# bn_mul_add_words but without reading the destination first.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$MULTU	$t0,$a3
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	 $MULTU	$t2,$a3
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$at
	mfhi	$t2
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	 $MULTU	$ta0,$a3
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	$at
	mfhi	$ta0
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	 $MULTU	$ta2,$a3
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	$at
	mfhi	$ta2
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	$ADDU	$v0,$ta3,$ta2
	.set	noreorder
	bgtzl	$ta0,.L_bn_mul_words_loop
	$LD	$t0,0($a1)

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the spilled callee-saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return carry in $v0, mirrored into $a0.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

# bn_sqr_words(rp=$a0, ap=$a1, num=$a2):
#	rp[2i],rp[2i+1] = lo,hi of ap[i]^2 — independent squarings, no
#	cross-word carries, so rp advances twice as fast as ap.
#	The public entry point filters out num<=0, returning 0.
$code.=<<___;
.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
# NUBI frame: spill callee-saved $t0..$t3, $gp and $ra.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4x-unrolled loop (note rp advances 8*$BNSZ per iteration,
# two result words per input word) plus a 1..3 word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$MULTU	$t0,$t0
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	$t2,$t2
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$t3
	mfhi	$t2
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	$ta0,$ta0
	mflo	$ta1
	mfhi	$ta0
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	$ta2,$ta2
	and	$ta0,$a2,$minus4
	mflo	$ta3
	mfhi	$ta2
	$ST	$ta3,-2*$BNSZ($a0)
	$ST	$ta2,-$BNSZ($a0)

	.set	noreorder
	bgtzl	$ta0,.L_bn_sqr_words_loop
	$LD	$t0,0($a1)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$t0
	mflo	$t1
	mfhi	$t0
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the spilled callee-saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return ($v0 is still 0 from the wrapper), mirrored into $a0.
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

# bn_add_words(rp=$a0, ap=$a1, bp=$a2, num=$a3):
#	rp[i] = ap[i] + bp[i] + carry; the final carry (0/1) is returned
#	in $v0/$a0.  Three pointer arguments, so the count lives in $a3.
#	The public entry point filters out num<=0, returning 0.
$code.=<<___;
.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
# NUBI frame: spill callee-saved $t0..$t3, $gp and $ra.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4x-unrolled loop plus 1..3 word tail.  Per word, two sltu
# comparisons capture both possible carry-outs (a+b and +carry-in);
# their sum is the next carry.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	$LD	$t0,0($a1)
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8
	
	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)
	$ADDU	$v0,$t9
	
	.set	noreorder
	bgtzl	$at,.L_bn_add_words_loop
	$LD	$t0,0($a1)

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the spilled callee-saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return carry in $v0, mirrored into $a0.
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

# bn_sub_words(rp=$a0, ap=$a1, bp=$a2, num=$a3):
#	rp[i] = ap[i] - bp[i] - borrow; the final borrow (0/1) is returned
#	in $v0/$a0.  The wrapper's early exit moves $zero into $a0, which
#	is equivalent to the other functions' "move $a0,$v0" since $v0 was
#	just zeroed in the delay slot.
$code.=<<___;
.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
# NUBI frame: spill callee-saved $t0..$t3, $gp and $ra.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4x-unrolled loop plus 1..3 word tail.  Per word, sltu
# catches the a<b borrow and sgtu catches the borrow-in underflow;
# their sum is the next borrow.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	$LD	$t0,0($a1)
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)
	$ADDU	$v0,$t9

	.set	noreorder
	bgtzl	$at,.L_bn_sub_words_loop
	$LD	$t0,0($a1)

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the spilled callee-saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return borrow in $v0, mirrored into $a0.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

# bn_div_3_words(m=$a0 [pointer to high word], d0=$a1, d1=$a2):
#	estimates a quotient digit for the 3-by-2 word division used by
#	BN_div.  If the top word equals d1 the quotient saturates to -1
#	(all ones).  Relies on bn_div_words preserving $a2/$a3/$ta2/$ta3
#	so arguments and the return address survive the bal call.
$code.=<<___;
.align 5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)
				
	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
# NUBI frame: spill callee-saved $t0..$t3, $gp and $ra.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Divide the top two words by d0 via bn_div_words, then decrement the
# quotient estimate until q*d1 no longer overflows the remainder.
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words
	move	$ra,$ta3
	$MULTU	$ta2,$v0
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	$t1
	mflo	$t0
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqzl	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
# NUBI epilogue: restore the spilled callee-saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return quotient estimate in $v0, mirrored into $a0.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal

# bn_div_words(h=$a0, l=$a1, d=$a2):
#	divides the double word h:l by d, returning the quotient in $v0/$a0
#	(remainder left in $v1/$a1).  d==0 returns -1 rather than trapping.
$code.=<<___;
.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
# NUBI frame: spill callee-saved $t0..$t3, $gp and $ra.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Normalize: count leading zeros of d into $t9 by shifting until the top
# bit is set ("bgtz $a2,.-4" loops back over the preceding sll pair),
# then shift h:l left by the same amount.  A set high bit shifted out of
# h would mean quotient overflow — trapped with 'break 6'.
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	bnezl	$t2,.+8
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
# Scratch registers for the two half-word division rounds below.
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
# Schoolbook two-digit division in half-words: each round estimates a
# half-word quotient digit (skipping the hardware divide when it would
# be all-ones), multiplies back, and corrects the estimate downward in
# the inner loop.
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	bnezl	$at,.+8
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div1:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div2:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9		# restore $a2

	.set	noreorder
	move	$a1,$v1
___
# NUBI epilogue: restore the spilled callee-saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return quotient in $v0, mirrored into $a0.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
# Scratch names from bn_div_words are no longer needed.
undef $HH; undef $QT; undef $DH;

# Register plan for the comba (column-sum) multipliers: all eight a[] and
# b[] words are kept in registers.
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

# The upper words spill into callee-saved $s0-$s5 and, finally, into the
# argument registers themselves once the pointers are no longer needed.
($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2

# mulhi/mullo temporaries and the three rotating column accumulators.
($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);

# bn_mul_comba8(rp=$a0, ap=$a1, bp=$a2): 8x8-word comba multiplication,
# r[0..15] = a[0..7] * b[0..7], computed column by column with three
# rotating carry accumulators (c1,c2,c3).
$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
# NUBI prologue: a 12-slot frame saving $ra, $s0-$s5, $t0-$t3 and $gp
# (NUBI additionally treats $t0-$t3/$gp as callee-saved).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Standard-ABI prologue: only $s0-$s5 are callee-saved here.
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
# Fully unrolled comba body.  Each "mul_add_c(a[i],b[j],cX,cY,cZ)" group
# accumulates the 2-word product into the rotating accumulators; each
# $MULTU is issued as early as possible (one-space indent) so its result
# is ready when the preceding column's carries are settled.  One column
# result r[k] is stored per paragraph.
$code.=<<___;

	.set	reorder
	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# 1ine with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				#		<appro\@fy.chalmers.se>

	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2

	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	$LD	$b_4,4*$BNSZ($a2)
	$LD	$b_5,5*$BNSZ($a2)
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	$LD	$b_6,6*$BNSZ($a2)
	$LD	$b_7,7*$BNSZ($a2)
	$ST	$c_1,0($a0)	# r[0]=c1;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# NUBI-flavour epilogue for bn_mul_comba8: restore $gp, $t0-$t3 and
# $s0-$s5 from the 12-register frame set up by the prologue, then return.
# .set noreorder is in effect, so the stack-pointer restore deliberately
# sits in the jr delay slot.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# Default (non-NUBI) epilogue for bn_mul_comba8: only $s0-$s5 were saved
# in a six-register frame; the delay slot again restores the stack pointer.
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# Close bn_mul_comba8 and emit the entry point of bn_mul_comba4
# (Comba multiplication: r[0..7] = a[0..3] * b[0..3]).
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# NUBI-flavour prologue for bn_mul_comba4: allocate a six-register frame
# and save $ra, $t0-$t3 and $gp as that calling convention requires
# (no $s registers are needed by the 4x4 routine).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_mul_comba4 body: fully unrolled 4x4 Comba multiplication, mirroring
# the mul_add_c() steps of crypto/bn/bn_asm.c.  Each step adds the
# lo/hi halves of one product into the rotating (c1,c2,c3) carry chain,
# with sltu recovering the carry bit after every addition.  A $MULTU for
# the *next* product is issued before the current one is consumed
# (the extra leading space marks those early issues), apparently to hide
# multiplier latency — see the scheduling note in the file header.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# NUBI-flavour epilogue for bn_mul_comba4: restore the temporaries and $gp
# saved by the prologue and pop the six-register frame.  The actual
# return is emitted by the unconditional fragment that follows.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Shared return for bn_mul_comba4 (all flavours); nop fills the jr delay
# slot since .set noreorder is in effect here.
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

# The squaring routines below read a single input vector, so the
# registers that held b[0..3] in the multiplication code are recycled
# here to hold a[4..7] when interpolated into the following heredocs.
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# Entry point of bn_sqr_comba8 (Comba squaring: r[0..15] = a[0..7]^2).
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
# NUBI-flavour prologue for bn_sqr_comba8: six-register frame saving
# $ra, $t0-$t3 and $gp (same shape as the bn_mul_comba4 prologue).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sqr_comba8 body: fully unrolled 8x8 Comba squaring, mirroring the
# sqr_add_c/sqr_add_c2 steps of crypto/bn/bn_asm.c.  Off-diagonal
# products a[i]*a[j] (i != j) must be counted twice: the slt-against-zero
# picks up the bit about to be shifted out of each half, $SLL by 1
# doubles lo/hi in-register, and the result is folded into the rotating
# (c1,c2,c3) carry chain.  Diagonal a[i]*a[i] terms are added only once.
# As in the multiplication code, a leading-space $MULTU issues the next
# product early.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	 $MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	 $MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
	$ADDU	$c_2,$at
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	 $MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	 $MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	 $MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
# NUBI-flavour epilogue for bn_sqr_comba8: restore temporaries and $gp,
# pop the frame; the shared return follows in the next fragment.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return from bn_sqr_comba8 (nop in the delay slot) and emit the entry
# point of bn_sqr_comba4 (r[0..7] = a[0..3]^2).
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
# NUBI-flavour prologue for bn_sqr_comba4: same six-register frame as the
# other Comba routines ($ra, $t0-$t3, $gp saved).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sqr_comba4 body: fully unrolled 4x4 Comba squaring.  Off-diagonal
# a[i]*a[j] (i != j) products are doubled in-register (slt captures the
# bit shifted out of each half, $SLL by 1 doubles lo/hi) before entering
# the (c1,c2,c3) carry chain; diagonal terms are added once.
# Fix: the emitted comment for the a[1]*a[2] step previously read
# "mul_add_c(a2[1],...)" — it is a doubled (mul_add_c2) step on a[1],
# as the surrounding slt/$SLL doubling sequence shows.  Comment-only
# change in the generated assembly; the instructions are untouched.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	 $MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# NUBI-flavour epilogue for bn_sqr_comba4: restore temporaries and $gp,
# pop the frame; the shared return follows in the next fragment.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Shared return for bn_sqr_comba4; nop fills the jr delay slot.
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
# Emit the accumulated assembly on STDOUT.  Check the close explicitly:
# buffered write errors (e.g. a full disk when the driver redirects us
# into the .s file) only surface at close time, and a silently truncated
# assembly file would otherwise be handed to the assembler.
print $code;
close STDOUT or die "error closing STDOUT: $!";