summaryrefslogtreecommitdiffstats
path: root/crypto/bn
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2011-11-14 20:55:24 +0000
committerAndy Polyakov <appro@openssl.org>2011-11-14 20:55:24 +0000
commitb66723b23e3a1a261f561751674ddc7c123c015b (patch)
tree8e1578d7ef4f01050517c10eff84c536ab8260fb /crypto/bn
parentcf96d71c220121f93da5b336b480dac1bea69822 (diff)
MIPS assembler pack update from HEAD.
Diffstat (limited to 'crypto/bn')
-rw-r--r--crypto/bn/asm/mips-mont.pl426
-rw-r--r--crypto/bn/asm/mips.pl2585
2 files changed, 3011 insertions, 0 deletions
diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl
new file mode 100644
index 0000000000..b944a12b8e
--- /dev/null
+++ b/crypto/bn/asm/mips-mont.pl
@@ -0,0 +1,426 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module doesn't present direct interest for OpenSSL, because it
+# doesn't provide better performance for longer keys, at least not on
+# in-order-execution cores. While 512-bit RSA sign operations can be
+# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
+# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
+# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
+# verify:-( All comparisons are against bn_mul_mont-free assembler.
+# The module might be of interest to embedded system developers, as
+# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
+# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
+# code.
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $SZREG=4;
+}
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+if ($flavour =~ /64|n32/i) {
+ $LD="ld";
+ $ST="sd";
+ $MULTU="dmultu";
+ $ADDU="daddu";
+ $SUBU="dsubu";
+ $BNSZ=8;
+} else {
+ $LD="lw";
+ $ST="sw";
+ $MULTU="multu";
+ $ADDU="addu";
+ $SUBU="subu";
+ $BNSZ=4;
+}
+
+# int bn_mul_mont(
+$rp=$a0; # BN_ULONG *rp,
+$ap=$a1; # const BN_ULONG *ap,
+$bp=$a2; # const BN_ULONG *bp,
+$np=$a3; # const BN_ULONG *np,
+$n0=$a4; # const BN_ULONG *n0,
+$num=$a5; # int num);
+
+$lo0=$a6;
+$hi0=$a7;
+$lo1=$t1;
+$hi1=$t2;
+$aj=$s0;
+$bi=$s1;
+$nj=$s2;
+$tp=$s3;
+$alo=$s4;
+$ahi=$s5;
+$nlo=$s6;
+$nhi=$s7;
+$tj=$s8;
+$i=$s9;
+$j=$s10;
+$m1=$s11;
+
+$FRAMESIZE=14;
+
+$code=<<___;
+.text
+
+.set noat
+.set noreorder
+
+.align 5
+.globl bn_mul_mont
+.ent bn_mul_mont
+bn_mul_mont:
+___
+$code.=<<___ if ($flavour =~ /o32/i);
+ lw $n0,16($sp)
+ lw $num,20($sp)
+___
+$code.=<<___;
+ slt $at,$num,4
+ bnez $at,1f
+ li $t0,0
+ slt $at,$num,17 # on in-order CPU
+ bnezl $at,bn_mul_mont_internal
+ nop
+1: jr $ra
+ li $a0,0
+.end bn_mul_mont
+
+.align 5
+.ent bn_mul_mont_internal
+bn_mul_mont_internal:
+ .frame $fp,$FRAMESIZE*$SZREG,$ra
+ .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
+ $PTR_SUB $sp,$FRAMESIZE*$SZREG
+ $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
+ $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+ move $fp,$sp
+
+ .set reorder
+ $LD $n0,0($n0)
+ $LD $bi,0($bp) # bp[0]
+ $LD $aj,0($ap) # ap[0]
+ $LD $nj,0($np) # np[0]
+
+ $PTR_SUB $sp,2*$BNSZ # place for two extra words
+ sll $num,`log($BNSZ)/log(2)`
+ li $at,-4096
+ $PTR_SUB $sp,$num
+ and $sp,$at
+
+ $MULTU $aj,$bi
+ $LD $alo,$BNSZ($ap)
+ $LD $nlo,$BNSZ($np)
+ mflo $lo0
+ mfhi $hi0
+ $MULTU $lo0,$n0
+ mflo $m1
+
+ $MULTU $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ $MULTU $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+ $MULTU $nlo,$m1
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,$sp
+ li $j,2*$BNSZ
+.align 4
+.L1st:
+ .set noreorder
+ $PTR_ADD $aj,$ap,$j
+ $PTR_ADD $nj,$np,$j
+ $LD $aj,($aj)
+ $LD $nj,($nj)
+
+ $MULTU $aj,$bi
+ $ADDU $lo0,$alo,$hi0
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo0,$hi0
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi0,$ahi,$at
+ $ADDU $hi1,$nhi,$t0
+ mflo $alo
+ mfhi $ahi
+
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $MULTU $nj,$m1
+ $ADDU $hi1,$at
+ addu $j,$BNSZ
+ $ST $lo1,($tp)
+ sltu $t0,$j,$num
+ mflo $nlo
+ mfhi $nhi
+
+ bnez $t0,.L1st
+ $PTR_ADD $tp,$BNSZ
+ .set reorder
+
+ $ADDU $lo0,$alo,$hi0
+ sltu $at,$lo0,$hi0
+ $ADDU $hi0,$ahi,$at
+
+ $ADDU $lo1,$nlo,$hi1
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi1,$nhi,$t0
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+
+ $ST $lo1,($tp)
+
+ $ADDU $hi1,$hi0
+ sltu $at,$hi1,$hi0
+ $ST $hi1,$BNSZ($tp)
+ $ST $at,2*$BNSZ($tp)
+
+ li $i,$BNSZ
+.align 4
+.Louter:
+ $PTR_ADD $bi,$bp,$i
+ $LD $bi,($bi)
+ $LD $aj,($ap)
+ $LD $alo,$BNSZ($ap)
+ $LD $tj,($sp)
+
+ $MULTU $aj,$bi
+ $LD $nj,($np)
+ $LD $nlo,$BNSZ($np)
+ mflo $lo0
+ mfhi $hi0
+ $ADDU $lo0,$tj
+ $MULTU $lo0,$n0
+ sltu $at,$lo0,$tj
+ $ADDU $hi0,$at
+ mflo $m1
+
+ $MULTU $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ $MULTU $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+
+ $MULTU $nlo,$m1
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,$sp
+ li $j,2*$BNSZ
+ $LD $tj,$BNSZ($tp)
+.align 4
+.Linner:
+ .set noreorder
+ $PTR_ADD $aj,$ap,$j
+ $PTR_ADD $nj,$np,$j
+ $LD $aj,($aj)
+ $LD $nj,($nj)
+
+ $MULTU $aj,$bi
+ $ADDU $lo0,$alo,$hi0
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo0,$hi0
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi0,$ahi,$at
+ $ADDU $hi1,$nhi,$t0
+ mflo $alo
+ mfhi $ahi
+
+ $ADDU $lo0,$tj
+ addu $j,$BNSZ
+ $MULTU $nj,$m1
+ sltu $at,$lo0,$tj
+ $ADDU $lo1,$lo0
+ $ADDU $hi0,$at
+ sltu $t0,$lo1,$lo0
+ $LD $tj,2*$BNSZ($tp)
+ $ADDU $hi1,$t0
+ sltu $at,$j,$num
+ mflo $nlo
+ mfhi $nhi
+ $ST $lo1,($tp)
+ bnez $at,.Linner
+ $PTR_ADD $tp,$BNSZ
+ .set reorder
+
+ $ADDU $lo0,$alo,$hi0
+ sltu $at,$lo0,$hi0
+ $ADDU $hi0,$ahi,$at
+ $ADDU $lo0,$tj
+ sltu $t0,$lo0,$tj
+ $ADDU $hi0,$t0
+
+ $LD $tj,2*$BNSZ($tp)
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo1,$hi1
+ $ADDU $hi1,$nhi,$at
+ $ADDU $lo1,$lo0
+ sltu $t0,$lo1,$lo0
+ $ADDU $hi1,$t0
+ $ST $lo1,($tp)
+
+ $ADDU $lo1,$hi1,$hi0
+ sltu $hi1,$lo1,$hi0
+ $ADDU $lo1,$tj
+ sltu $at,$lo1,$tj
+ $ADDU $hi1,$at
+ $ST $lo1,$BNSZ($tp)
+ $ST $hi1,2*$BNSZ($tp)
+
+ addu $i,$BNSZ
+ sltu $t0,$i,$num
+ bnez $t0,.Louter
+
+ .set noreorder
+ $PTR_ADD $tj,$sp,$num # &tp[num]
+ move $tp,$sp
+ move $ap,$sp
+ li $hi0,0 # clear borrow bit
+
+.align 4
+.Lsub: $LD $lo0,($tp)
+ $LD $lo1,($np)
+ $PTR_ADD $tp,$BNSZ
+ $PTR_ADD $np,$BNSZ
+ $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
+ sgtu $at,$lo1,$lo0
+ $SUBU $lo0,$lo1,$hi0
+ sgtu $hi0,$lo0,$lo1
+ $ST $lo0,($rp)
+ or $hi0,$at
+ sltu $at,$tp,$tj
+ bnez $at,.Lsub
+ $PTR_ADD $rp,$BNSZ
+
+ $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
+ move $tp,$sp
+ $PTR_SUB $rp,$num # restore rp
+ not $hi1,$hi0
+
+ and $ap,$hi0,$sp
+ and $bp,$hi1,$rp
+ or $ap,$ap,$bp # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: $LD $aj,($ap)
+ $PTR_ADD $ap,$BNSZ
+ $ST $zero,($tp)
+ $PTR_ADD $tp,$BNSZ
+ sltu $at,$tp,$tj
+ $ST $aj,($rp)
+ bnez $at,.Lcopy
+ $PTR_ADD $rp,$BNSZ
+
+ li $a0,1
+ li $t0,1
+
+ .set noreorder
+ move $sp,$fp
+ $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
+ $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end bn_mul_mont_internal
+.rdata
+.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
new file mode 100644
index 0000000000..acfd35968e
--- /dev/null
+++ b/crypto/bn/asm/mips.pl
@@ -0,0 +1,2585 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project.
+#
+# Rights for redistribution and usage in source and binary forms are
+# granted according to the OpenSSL license. Warranty of any kind is
+# disclaimed.
+# ====================================================================
+
+
+# July 1999
+#
+# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
+#
+# The module is designed to work with either of the "new" MIPS ABI(5),
+# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
+# IRIX 5.x not only because it doesn't support new ABIs but also
+# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
+# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
+# cause illegal instruction exception:-(
+#
+# In addition the code depends on preprocessor flags set up by MIPSpro
+# compiler driver (either as or cc) and therefore (probably?) can't be
+# compiled by the GNU assembler. GNU C driver manages fine though...
+# I mean as long as -mmips-as is specified or is the default option,
+# because then it simply invokes /usr/bin/as which in turn takes
+# perfect care of the preprocessor definitions. Another neat feature
+# offered by the MIPSpro assembler is an optimization pass. This gave
+# me the opportunity to have the code looking more regular as all those
+# architecture dependent instruction rescheduling details were left to
+# the assembler. Cool, huh?
+#
+# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
+# goes way over 3 times faster!
+#
+# <appro@fy.chalmers.se>
+
+# October 2010
+#
+# Adapt the module even for 32-bit ABIs and other OSes. The former was
+# achieved by mechanical replacement of 64-bit arithmetic instructions
+# such as dmultu, daddu, etc. with their 32-bit counterparts and
+# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
+# >3x performance improvement naturally does not apply to 32-bit code
+# [because there is no instruction 32-bit compiler can't use], one
+# has to content with 40-85% improvement depending on benchmark and
+# key length, more for longer keys.
+
+$flavour = shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+if ($flavour =~ /64|n32/i) {
+ $LD="ld";
+ $ST="sd";
+ $MULTU="dmultu";
+ $DIVU="ddivu";
+ $ADDU="daddu";
+ $SUBU="dsubu";
+ $SRL="dsrl";
+ $SLL="dsll";
+ $BNSZ=8;
+ $PTR_ADD="daddu";
+ $PTR_SUB="dsubu";
+ $SZREG=8;
+ $REG_S="sd";
+ $REG_L="ld";
+} else {
+ $LD="lw";
+ $ST="sw";
+ $MULTU="multu";
+ $DIVU="divu";
+ $ADDU="addu";
+ $SUBU="subu";
+ $SRL="srl";
+ $SLL="sll";
+ $BNSZ=4;
+ $PTR_ADD="addu";
+ $PTR_SUB="subu";
+ $SZREG=4;
+ $REG_S="sw";
+ $REG_L="lw";
+ $code=".set mips2\n";
+}
+
+# Below is N32/64 register layout used in the original module.
+#
+($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
+#
+# No special adaptation is required for O32. NUBI on the other hand
+# is treated by saving/restoring ($v1,$t0..$t3).
+
+$gp=$v1 if ($flavour =~ /nubi/i);
+
+$minus4=$v1;
+
+$code.=<<___;
+.rdata
+.asciiz "mips3.s, Version 1.2"
+.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
+
+.text
+.set noat
+
+.align 5
+.globl bn_mul_add_words
+.ent bn_mul_add_words
+bn_mul_add_words:
+ .set noreorder
+ bgtz $a2,bn_mul_add_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_mul_add_words
+
+.align 5
+.ent bn_mul_add_words_internal
+bn_mul_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ $LD $t0,0($a1)
+ beqz $ta0,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+ $MULTU $t0,$a3
+ $LD $t1,0($a0)
+ $LD $t2,$BNSZ($a1)
+ $LD $t3,$BNSZ($a0)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta1,2*$BNSZ($a0)
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
+ # values", but it seems to work fine
+ # even on 64-bit registers.
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ $MULTU $t2,$a3
+ sltu $at,$t1,$at
+ $ST $t1,0($a0)
+ $ADDU $v0,$at
+
+ $LD $ta2,3*$BNSZ($a1)
+ $LD $ta3,3*$BNSZ($a0)
+ $ADDU $t3,$v0
+ sltu $v0,$t3,$v0
+ mflo $at
+ mfhi $t2
+ $ADDU $t3,$at
+ $ADDU $v0,$t2
+ $MULTU $ta0,$a3
+ sltu $at,$t3,$at
+ $ST $t3,$BNSZ($a0)
+ $ADDU $v0,$at
+
+ subu $a2,4
+ $PTR_ADD $a0,4*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ $ADDU $ta1,$v0
+ sltu $v0,$ta1,$v0
+ mflo $at
+ mfhi $ta0
+ $ADDU $ta1,$at
+ $ADDU $v0,$ta0
+ $MULTU $ta2,$a3
+ sltu $at,$ta1,$at
+ $ST $ta1,-2*$BNSZ($a0)
+ $ADDU $v0,$at
+
+
+ and $ta0,$a2,$minus4
+ $ADDU $ta3,$v0
+ sltu $v0,$ta3,$v0
+ mflo $at
+ mfhi $ta2
+ $ADDU $ta3,$at
+ $ADDU $v0,$ta2
+ sltu $at,$ta3,$at
+ $ST $ta3,-$BNSZ($a0)
+ $ADDU $v0,$at
+ .set noreorder
+ bgtzl $ta0,.L_bn_mul_add_words_loop
+ $LD $t0,0($a1)
+
+ beqz $a2,.L_bn_mul_add_words_return
+ nop
+
+.L_bn_mul_add_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t1,0($a0)
+ subu $a2,1
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,0($a0)
+ $ADDU $v0,$at
+ beqz $a2,.L_bn_mul_add_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$a3
+ $LD $t1,$BNSZ($a0)
+ subu $a2,1
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$at
+ beqz $a2,.L_bn_mul_add_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$a3
+ $LD $t1,2*$BNSZ($a0)
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,2*$BNSZ($a0)
+ $ADDU $v0,$at
+
+.L_bn_mul_add_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_mul_add_words
+
+.align 5
+.globl bn_mul_words
+.ent bn_mul_words
+bn_mul_words:
+ .set noreorder
+ bgtz $a2,bn_mul_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_mul_words
+
+.align 5
+.ent bn_mul_words_internal
+bn_mul_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ $LD $t0,0($a1)
+ beqz $ta0,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+ $MULTU $t0,$a3
+ $LD $t2,$BNSZ($a1)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta2,3*$BNSZ($a1)
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $MULTU $t2,$a3
+ $ST $v0,0($a0)
+ $ADDU $v0,$t1,$t0
+
+ subu $a2,4
+ $PTR_ADD $a0,4*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ mflo $at
+ mfhi $t2
+ $ADDU $v0,$at
+ sltu $t3,$v0,$at
+ $MULTU $ta0,$a3
+ $ST $v0,-3*$BNSZ($a0)
+ $ADDU $v0,$t3,$t2
+
+ mflo $at
+ mfhi $ta0
+ $ADDU $v0,$at
+ sltu $ta1,$v0,$at
+ $MULTU $ta2,$a3
+ $ST $v0,-2*$BNSZ($a0)
+ $ADDU $v0,$ta1,$ta0
+
+ and $ta0,$a2,$minus4
+ mflo $at
+ mfhi $ta2
+ $ADDU $v0,$at
+ sltu $ta3,$v0,$at
+ $ST $v0,-$BNSZ($a0)
+ $ADDU $v0,$ta3,$ta2
+ .set noreorder
+ bgtzl $ta0,.L_bn_mul_words_loop
+ $LD $t0,0($a1)
+
+ beqz $a2,.L_bn_mul_words_return
+ nop
+
+.L_bn_mul_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ subu $a2,1
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,0($a0)
+ $ADDU $v0,$t1,$t0
+ beqz $a2,.L_bn_mul_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$a3
+ subu $a2,1
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,$BNSZ($a0)
+ $ADDU $v0,$t1,$t0
+ beqz $a2,.L_bn_mul_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$a3
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,2*$BNSZ($a0)
+ $ADDU $v0,$t1,$t0
+
+.L_bn_mul_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_mul_words_internal
+
+.align 5
+.globl bn_sqr_words
+.ent bn_sqr_words
+bn_sqr_words:
+ .set noreorder
+ bgtz $a2,bn_sqr_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_sqr_words
+
+.align 5
+.ent bn_sqr_words_internal
+bn_sqr_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ $LD $t0,0($a1)
+ beqz $ta0,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+ $MULTU $t0,$t0
+ $LD $t2,$BNSZ($a1)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta2,3*$BNSZ($a1)
+ mflo $t1
+ mfhi $t0
+ $ST $t1,0($a0)
+ $ST $t0,$BNSZ($a0)
+
+ $MULTU $t2,$t2
+ subu $a2,4
+ $PTR_ADD $a0,8*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ mflo $t3
+ mfhi $t2
+ $ST $t3,-6*$BNSZ($a0)
+ $ST $t2,-5*$BNSZ($a0)
+
+ $MULTU $ta0,$ta0
+ mflo $ta1
+ mfhi $ta0
+ $ST $ta1,-4*$BNSZ($a0)
+ $ST $ta0,-3*$BNSZ($a0)
+
+
+ $MULTU $ta2,$ta2
+ and $ta0,$a2,$minus4
+ mflo $ta3
+ mfhi $ta2
+ $ST $ta3,-2*$BNSZ($a0)
+ $ST $ta2,-$BNSZ($a0)
+
+ .set noreorder
+ bgtzl $ta0,.L_bn_sqr_words_loop
+ $LD $t0,0($a1)
+
+ beqz $a2,.L_bn_sqr_words_return
+ nop
+
+.L_bn_sqr_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$t0
+ subu $a2,1
+ mflo $t1
+ mfhi $t0
+ $ST $t1,0($a0)
+ $ST $t0,$BNSZ($a0)
+ beqz $a2,.L_bn_sqr_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$t0
+ subu $a2,1
+ mflo $t1
+ mfhi $t0
+ $ST $t1,2*$BNSZ($a0)
+ $ST $t0,3*$BNSZ($a0)
+ beqz $a2,.L_bn_sqr_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$t0
+ mflo $t1
+ mfhi $t0
+ $ST $t1,4*$BNSZ($a0)
+ $ST $t0,5*$BNSZ($a0)
+
+.L_bn_sqr_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+
+.end bn_sqr_words_internal
+
+.align 5
+.globl bn_add_words
+.ent bn_add_words
+bn_add_words:
+ .set noreorder
+ bgtz $a3,bn_add_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_add_words
+
+.align 5
+.ent bn_add_words_internal
+bn_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $at,$a3,$minus4
+ $LD $t0,0($a1)
+ beqz $at,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+ $LD $ta0,0($a2)
+ subu $a3,4
+ $LD $t1,$BNSZ($a1)
+ and $at,$a3,$minus4
+ $LD $t2,2*$BNSZ($a1)
+ $PTR_ADD $a2,4*$BNSZ
+ $LD $t3,3*$BNSZ($a1)
+ $PTR_ADD $a0,4*$BNSZ
+ $LD $ta1,-3*$BNSZ($a2)
+ $PTR_ADD $a1,4*$BNSZ
+ $LD $ta2,-2*$BNSZ($a2)
+ $LD $ta3,-$BNSZ($a2)
+ $ADDU $ta0,$t0
+ sltu $t8,$ta0,$t0
+ $ADDU $t0,$ta0,$v0
+ sltu $v0,$t0,$ta0
+ $ST $t0,-4*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ $ADDU $ta1,$t1
+ sltu $t9,$ta1,$t1
+ $ADDU $t1,$ta1,$v0
+ sltu $v0,$t1,$ta1
+ $ST $t1,-3*$BNSZ($a0)
+ $ADDU $v0,$t9
+
+ $ADDU $ta2,$t2
+ sltu $t8,$ta2,$t2
+ $ADDU $t2,$ta2,$v0
+ sltu $v0,$t2,$ta2
+ $ST $t2,-2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ $ADDU $ta3,$t3
+ sltu $t9,$ta3,$t3
+ $ADDU $t3,$ta3,$v0
+ sltu $v0,$t3,$ta3
+ $ST $t3,-$BNSZ($a0)
+ $ADDU $v0,$t9
+
+ .set noreorder
+ bgtzl $at,.L_bn_add_words_loop
+ $LD $t0,0($a1)
+
+ beqz $a3,.L_bn_add_words_return
+ nop
+
+.L_bn_add_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ $ADDU $ta0,$t0
+ subu $a3,1
+ sltu $t8,$ta0,$t0
+ $ADDU $t0,$ta0,$v0
+ sltu $v0,$t0,$ta0
+ $ST $t0,0($a0)
+ $ADDU $v0,$t8
+ beqz $a3,.L_bn_add_words_return
+
+ $LD $t1,$BNSZ($a1)
+ $LD $ta1,$BNSZ($a2)
+ $ADDU $ta1,$t1
+ subu $a3,1
+ sltu $t9,$ta1,$t1
+ $ADDU $t1,$ta1,$v0
+ sltu $v0,$t1,$ta1
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$t9
+ beqz $a3,.L_bn_add_words_return
+
+ $LD $t2,2*$BNSZ($a1)
+ $LD $ta2,2*$BNSZ($a2)
+ $ADDU $ta2,$t2
+ sltu $t8,$ta2,$t2
+ $ADDU $t2,$ta2,$v0
+ sltu $v0,$t2,$ta2
+ $ST $t2,2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+.L_bn_add_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+
+.end bn_add_words_internal
+
+.align 5
+.globl bn_sub_words
+.ent bn_sub_words
+bn_sub_words:
+ .set noreorder
+ bgtz $a3,bn_sub_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$zero
+.end bn_sub_words
+
+.align 5
+.ent bn_sub_words_internal
+bn_sub_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $at,$a3,$minus4
+ $LD $t0,0($a1)
+ beqz $at,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+ $LD $ta0,0($a2)
+ subu $a3,4
+ $LD $t1,$BNSZ($a1)
+ and $at,$a3,$minus4
+ $LD $t2,2*$BNSZ($a1)
+ $PTR_ADD $a2,4*$BNSZ
+ $LD $t3,3*$BNSZ($a1)
+ $PTR_ADD $a0,4*$BNSZ
+ $LD $ta1,-3*$BNSZ($a2)
+ $PTR_ADD $a1,4*$BNSZ
+ $LD $ta2,-2*$BNSZ($a2)
+ $LD $ta3,-$BNSZ($a2)
+ sltu $t8,$t0,$ta0
+ $SUBU $ta0,$t0,$ta0
+ $SUBU $t0,$ta0,$v0
+ sgtu $v0,$t0,$ta0
+ $ST $t0,-4*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ sltu $t9,$t1,$ta1
+ $SUBU $ta1,$t1,$ta1
+ $SUBU $t1,$ta1,$v0
+ sgtu $v0,$t1,$ta1
+ $ST $t1,-3*$BNSZ($a0)
+ $ADDU $v0,$t9
+
+
+ sltu $t8,$t2,$ta2
+ $SUBU $ta2,$t2,$ta2
+ $SUBU $t2,$ta2,$v0
+ sgtu $v0,$t2,$ta2
+ $ST $t2,-2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ sltu $t9,$t3,$ta3
+ $SUBU $ta3,$t3,$ta3
+ $SUBU $t3,$ta3,$v0
+ sgtu $v0,$t3,$ta3
+ $ST $t3,-$BNSZ($a0)
+ $ADDU $v0,$t9
+
+ .set noreorder
+ bgtzl $at,.L_bn_sub_words_loop
+ $LD $t0,0($a1)
+
+ beqz $a3,.L_bn_sub_words_return
+ nop
+
+.L_bn_sub_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,1
+ sltu $t8,$t0,$ta0
+ $SUBU $ta0,$t0,$ta0
+ $SUBU $t0,$ta0,$v0
+ sgtu $v0,$t0,$ta0
+ $ST $t0,0($a0)
+ $ADDU $v0,$t8
+ beqz $a3,.L_bn_sub_words_return
+
+ $LD $t1,$BNSZ($a1)
+ subu $a3,1
+ $LD $ta1,$BNSZ($a2)
+ sltu $t9,$t1,$ta1
+ $SUBU $ta1,$t1,$ta1
+ $SUBU $t1,$ta1,$v0
+ sgtu $v0,$t1,$ta1
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$t9
+ beqz $a3,.L_bn_sub_words_return
+
+ $LD $t2,2*$BNSZ($a1)
+ $LD $ta2,2*$BNSZ($a2)
+ sltu $t8,$t2,$ta2
+ $SUBU $ta2,$t2,$ta2
+ $SUBU $t2,$ta2,$v0
+ sgtu $v0,$t2,$ta2
+ $ST $t2,2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+.L_bn_sub_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_sub_words
+
+.align 5
+.globl bn_div_3_words
+.ent bn_div_3_words
+bn_div_3_words:
+ .set noreorder
+ move $a3,$a0 # we know that bn_div_words does not
+ # touch $a3, $ta2, $ta3 and preserves $a2
+ # so that we can save two arguments
+ # and return address in registers
+ # instead of stack:-)
+
+ $LD $a0,($a3)
+ move $ta2,$a1
+ bne $a0,$a2,bn_div_3_words_internal
+ $LD $a1,-$BNSZ($a3)
+ li $v0,-1
+ jr $ra
+ move $a0,$v0
+.end bn_div_3_words
+
+.align 5
+.ent bn_div_3_words_internal
+bn_div_3_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ move $ta3,$ra
+ bal bn_div_words
+ move $ra,$ta3
+ $MULTU $ta2,$v0
+ $LD $t2,-2*$BNSZ($a3)
+ move $ta0,$zero
+ mfhi $t1
+ mflo $t0
+ sltu $t8,$t1,$a1
+.L_bn_div_3_words_inner_loop:
+ bnez $t8,.L_bn_div_3_words_inner_loop_done
+ sgeu $at,$t2,$t0
+ seq $t9,$t1,$a1
+ and $at,$t9
+ sltu $t3,$t0,$ta2
+ $ADDU $a1,$a2
+ $SUBU $t1,$t3
+ $SUBU $t0,$ta2
+ sltu $t8,$t1,$a1
+ sltu $ta0,$a1,$a2
+ or $t8,$ta0
+ .set noreorder
+ beqzl $at,.L_bn_div_3_words_inner_loop
+ $SUBU $v0,1
+ .set reorder
+.L_bn_div_3_words_inner_loop_done:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_div_3_words_internal
+
+.align 5
+.globl bn_div_words
+.ent bn_div_words
+bn_div_words:
+ .set noreorder
+ bnez $a2,bn_div_words_internal
+ li $v0,-1 # I would rather signal div-by-zero
+ # which can be done with 'break 7'
+ jr $ra
+ move $a0,$v0
+.end bn_div_words
+
+.align 5
+.ent bn_div_words_internal
+bn_div_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*