#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaking this effort are multiple. First of all, UltraSPARC is
# not the whole SPARCv9 universe, and other VIS-free implementations
# deserve optimized code just as much. Secondly, the newly introduced
# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
# FPU-intensive paths, such as sparcv9a-mont, will simply sink it.
# Yes, T1 is equipped with several integrated RSA/DSA accelerator
# circuits accessible through a kernel driver [only(*)], but having a
# decent user-land software implementation is important too. Finally,
# there was the desire to experiment with a dedicated squaring
# procedure. Yes, this module implements one, because it was easiest
# to draft it in SPARCv9 instructions...
#
# (*)	An engine accessing the driver in question is on my TODO list.
#	For reference, the accelerator is estimated to give a 6 to 10
#	times improvement on single-threaded RSA sign. It should be
#	noted that a 6-10x improvement coefficient does not actually
#	mean anything extraordinary in terms of absolute
#	[single-threaded] performance, as the SPARCv9 instruction set
#	is by all means the least suitable for high-performance crypto
#	among 64-bit platforms. The 6-10x factor simply places T1 in
#	the same performance domain as, say, AMD64 and IA-64. The
#	improvement of RSA verify doesn't appear impressive at all,
#	but it's the sign operation which is far more
#	critical/interesting.
#
# You might notice that the inner loops are modulo-scheduled:-) This
# has an essentially negligible impact on UltraSPARC performance; it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that the
# sparcv9a module still has hidden potential [see the TODO list
# there], which is estimated to be larger than 20%...

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=128; }

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

$fname="bn_mul_mont_int";

$code=<<___;
.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
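	! ap and bp were compared above; if they are equal this is a
	! squaring and the dedicated .Lbn_sqr_mont path below is taken
	! instead of the generic multiplication code.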
	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	mov	$tp,$ap
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]

	subc	$car2,0,$car2		! handle upmost overflow bit
	and	$tp,$car2,$ap
	andn	$rp,$car2,$np
	or	$ap,$np,$ap
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$ap+%o7],%o0		! copy or in-place refresh
	st	%g0,[$tp+%o7]		! zap tp
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without the following dedicated squaring procedure.
########
$sbit="%i2";		# re-use $bp!
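#
# Roughly speaking, the saving comes from the fact that in a*a every
# cross product shows up twice, so it is computed once and doubled on
# the fly:
#
#	a^2 = sum_j a[j]^2*2^(64*j) + 2*sum_{i<j} a[i]*a[j]*2^(32*(i+j))
#
# which eliminates close to half of the 32x32-bit multiplications. The
# $sbit register above picks up the bit shifted out whenever a partial
# product is doubled.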
$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0	! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0	!prologue!

	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj		!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j			! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp		! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0	! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0	! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0		! ap[1]
	ld	[$ap+8],$apj		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj		! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j			! j++
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp		! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj		! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	or	$sbit,$acc0,$acc0
	add	$j,4,$j			! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp		! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i			! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0		! ap[j]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num		! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0	! recover $car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp

.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print	$code;
close STDOUT;
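
# For reference, below is a plain-Perl model of the word-level algorithm that
# .L1st/.Louter/.Linner above implement: Montgomery multiplication in radix
# 2^32, least-significant word first, with the topmost carry kept as a single
# bit just like $car2, and the two interleaved carry chains mirroring the
# $car0/$car1 register pair. This is an illustrative sketch only -- the sub
# is not used by the build, its name is made up, and it assumes a perl with
# 64-bit integers. Arguments are array references of 32-bit words, the 32-bit
# value n0 = -np[0]^-1 mod 2^32 (the asm loads it from memory instead), and
# the word count num, e.g.:
#
#	my $r = ref_mul_mont(\@a, \@b, \@n, $n0, scalar(@n));
#
sub ref_mul_mont {
	my ($ap,$bp,$np,$n0,$num) = @_;
	my $mask = 0xffffffff;
	my @tp   = (0) x $num;		# accumulator, the on-stack tp[] above
	my $car2 = 0;			# topmost carry bit, as $car2

	for (my $i=0; $i<$num; $i++) {
	    my $mul0 = $bp->[$i];			# bp[i]
	    my $car0 = $ap->[0]*$mul0 + $tp[0];		# ap[]*bp[i] chain
	    my $acc0 = $car0 & $mask;
	    my $mul1 = ($acc0*$n0) & $mask;		# "t[0]"*n0
	    my $car1 = $np->[0]*$mul1 + $acc0;		# np[]*mul1 chain, low word is 0
	    $car0 >>= 32;	$car1 >>= 32;

	    for (my $j=1; $j<$num; $j++) {		# body of .L1st/.Linner
		$car0 += $ap->[$j]*$mul0 + $tp[$j];
		$acc0  = $car0 & $mask;
		$car1 += $np->[$j]*$mul1 + $acc0;
		$tp[$j-1] = $car1 & $mask;
		$car0 >>= 32;	$car1 >>= 32;
	    }
	    $car1 += $car0 + $car2;			# fold both carries into top word
	    $tp[$num-1] = $car1 & $mask;
	    $car2 = $car1 >> 32;			# at most 1
	}

	# conditional final subtraction, as in .Lsub/.Lcopy
	my @sub = ();
	my $brw = 0;
	for (my $j=0; $j<$num; $j++) {
	    my $d = $tp[$j] - $np->[$j] - $brw;
	    $brw = $d<0 ? 1 : 0;
	    push(@sub, $d & $mask);
	}
	return ($car2-$brw)==0 ? \@sub : \@tp;	# 0: t>=n, take t-n; otherwise keep t
}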