summaryrefslogtreecommitdiffstats
path: root/crypto/bn/asm/alpha
diff options
context:
space:
mode:
Diffstat (limited to 'crypto/bn/asm/alpha')
-rw-r--r--crypto/bn/asm/alpha/add.pl118
-rw-r--r--crypto/bn/asm/alpha/div.pl144
-rw-r--r--crypto/bn/asm/alpha/mul.pl104
-rw-r--r--crypto/bn/asm/alpha/mul_add.pl123
-rw-r--r--crypto/bn/asm/alpha/mul_c4.pl215
-rw-r--r--crypto/bn/asm/alpha/mul_c4.works.pl98
-rw-r--r--crypto/bn/asm/alpha/mul_c8.pl177
-rw-r--r--crypto/bn/asm/alpha/sqr.pl113
-rw-r--r--crypto/bn/asm/alpha/sqr_c4.pl109
-rw-r--r--crypto/bn/asm/alpha/sqr_c8.pl132
-rw-r--r--crypto/bn/asm/alpha/sub.pl108
11 files changed, 1441 insertions, 0 deletions
diff --git a/crypto/bn/asm/alpha/add.pl b/crypto/bn/asm/alpha/add.pl
new file mode 100644
index 0000000000..13bf516428
--- /dev/null
+++ b/crypto/bn/asm/alpha/add.pl
@@ -0,0 +1,118 @@
+#!/usr/local/bin/perl
+# alpha assember
+
+sub bn_add_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+ $count=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ &blt($count,&label("finish"));
+
+ ($a0,$b0)=&NR(2);
+
+##########################################################
+ &set_label("loop");
+
+ &ld(($a0)=&NR(1),&QWPw(0,$ap));
+ &ld(($b0)=&NR(1),&QWPw(0,$bp));
+ &ld(($a1)=&NR(1),&QWPw(1,$ap));
+ &ld(($b1)=&NR(1),&QWPw(1,$bp));
+
+ ($o0,$t0)=&NR(2);
+ &add($a0,$b0,$o0);
+ &ld(($a2)=&NR(1),&QWPw(2,$ap));
+ &cmpult($o0,$b0,$t0);
+ &add($o0,$cc,$o0);
+ &cmpult($o0,$cc,$cc);
+ &ld(($b2)=&NR(1),&QWPw(2,$bp));
+ &add($cc,$t0,$cc); &FR($t0);
+
+ ($t1,$o1)=&NR(2);
+
+ &add($a1,$b1,$o1); &FR($a1);
+ &cmpult($o1,$b1,$t1); &FR($b1);
+ &add($o1,$cc,$o1);
+ &cmpult($o1,$cc,$cc);
+ &ld(($a3)=&NR(1),&QWPw(3,$ap));
+ &add($cc,$t1,$cc); &FR($t1);
+
+ ($t2,$o2)=&NR(2);
+
+ &add($a2,$b2,$o2); &FR($a2);
+ &cmpult($o2,$b2,$t2); &FR($b2);
+ &add($o2,$cc,$o2);
+ &cmpult($o2,$cc,$cc);
+ &ld(($b3)=&NR(1),&QWPw(3,$bp));
+ &st($o0,&QWPw(0,$rp)); &FR($o0);
+ &add($cc,$t2,$cc); &FR($t2);
+
+ ($t3,$o3)=&NR(2);
+
+ &st($o1,&QWPw(0,$rp)); &FR($o1);
+ &add($a3,$b3,$o3); &FR($a3);
+ &cmpult($o3,$b3,$t3); &FR($b3);
+ &add($o3,$cc,$o3);
+ &st($o2,&QWPw(0,$rp)); &FR($o2);
+ &cmpult($o3,$cc,$cc);
+ &st($o3,&QWPw(0,$rp)); &FR($o3);
+ &add($cc,$t3,$cc); &FR($t3);
+
+
+ &sub($count,4,$count); # count-=4
+ &add($ap,4*$QWS,$ap); # count+=4
+ &add($bp,4*$QWS,$bp); # count+=4
+ &add($rp,4*$QWS,$rp); # count+=4
+
+ ###
+ &bge($count,&label("loop"));
+ ###
+ &br(&label("finish"));
+##################################################
+ # Do the last 0..3 words
+
+ ($t0,$o0)=&NR(2);
+ &set_label("last_loop");
+
+ &ld($a0,&QWPw(0,$ap)); # get a
+ &ld($b0,&QWPw(0,$bp)); # get b
+ &add($ap,$QWS,$ap);
+ &add($bp,$QWS,$bp);
+ &add($a0,$b0,$o0);
+ &sub($count,1,$count);
+ &cmpult($o0,$b0,$t0); # will we borrow?
+ &add($o0,$cc,$o0); # will we borrow?
+ &cmpult($o0,$cc,$cc); # will we borrow?
+ &add($rp,$QWS,$rp);
+ &st($o0,&QWPw(-1,$rp)); # save
+ &add($cc,$t0,$cc); # add the borrows
+
+ ###
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &FR($o0,$t0,$a0,$b0);
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/div.pl b/crypto/bn/asm/alpha/div.pl
new file mode 100644
index 0000000000..e9e680897a
--- /dev/null
+++ b/crypto/bn/asm/alpha/div.pl
@@ -0,0 +1,144 @@
+#!/usr/local/bin/perl
+
+sub bn_div_words
+ {
+ local($data)=<<'EOF';
+ #
+ # What follows was taken directly from the C compiler with a few
+ # hacks to redo the lables.
+ #
+.text
+ .set noreorder
+ .set volatile
+ .align 3
+ .globl bn_div_words
+ .ent bn_div_words
+bn_div_words
+ ldgp $29,0($27)
+bn_div_words.ng:
+ lda $30,-48($30)
+ .frame $30,48,$26,0
+ stq $26,0($30)
+ stq $9,8($30)
+ stq $10,16($30)
+ stq $11,24($30)
+ stq $12,32($30)
+ stq $13,40($30)
+ .mask 0x4003e00,-48
+ .prologue 1
+ bis $16,$16,$9
+ bis $17,$17,$10
+ bis $18,$18,$11
+ bis $31,$31,$13
+ bis $31,2,$12
+ bne $11,$9119
+ lda $0,-1
+ br $31,$9136
+ .align 4
+$9119:
+ bis $11,$11,$16
+ jsr $26,BN_num_bits_word
+ ldgp $29,0($26)
+ subq $0,64,$1
+ beq $1,$9120
+ bis $31,1,$1
+ sll $1,$0,$1
+ cmpule $9,$1,$1
+ bne $1,$9120
+ # lda $16,_IO_stderr_
+ # lda $17,$C32
+ # bis $0,$0,$18
+ # jsr $26,fprintf
+ # ldgp $29,0($26)
+ jsr $26,abort
+ ldgp $29,0($26)
+ .align 4
+$9120:
+ bis $31,64,$3
+ cmpult $9,$11,$2
+ subq $3,$0,$1
+ addl $1,$31,$0
+ subq $9,$11,$1
+ cmoveq $2,$1,$9
+ beq $0,$9122
+ zapnot $0,15,$2
+ subq $3,$0,$1
+ sll $11,$2,$11
+ sll $9,$2,$3
+ srl $10,$1,$1
+ sll $10,$2,$10
+ bis $3,$1,$9
+$9122:
+ srl $11,32,$5
+ zapnot $11,15,$6
+ lda $7,-1
+ .align 5
+$9123:
+ srl $9,32,$1
+ subq $1,$5,$1
+ bne $1,$9126
+ zapnot $7,15,$27
+ br $31,$9127
+ .align 4
+$9126:
+ bis $9,$9,$24
+ bis $5,$5,$25
+ divqu $24,$25,$27
+$9127:
+ srl $10,32,$4
+ .align 5
+$9128:
+ mulq $27,$5,$1
+ subq $9,$1,$3
+ zapnot $3,240,$1
+ bne $1,$9129
+ mulq $6,$27,$2
+ sll $3,32,$1
+ addq $1,$4,$1
+ cmpule $2,$1,$2
+ bne $2,$9129
+ subq $27,1,$27
+ br $31,$9128
+ .align 4
+$9129:
+ mulq $27,$6,$1
+ mulq $27,$5,$4
+ srl $1,32,$3
+ sll $1,32,$1
+ addq $4,$3,$4
+ cmpult $10,$1,$2
+ subq $10,$1,$10
+ addq $2,$4,$2
+ cmpult $9,$2,$1
+ bis $2,$2,$4
+ beq $1,$9134
+ addq $9,$11,$9
+ subq $27,1,$27
+$9134:
+ subl $12,1,$12
+ subq $9,$4,$9
+ beq $12,$9124
+ sll $27,32,$13
+ sll $9,32,$2
+ srl $10,32,$1
+ sll $10,32,$10
+ bis $2,$1,$9
+ br $31,$9123
+ .align 4
+$9124:
+ bis $13,$27,$0
+$9136:
+ ldq $26,0($30)
+ ldq $9,8($30)
+ ldq $10,16($30)
+ ldq $11,24($30)
+ ldq $12,32($30)
+ ldq $13,40($30)
+ addq $30,48,$30
+ ret $31,($26),1
+ .end bn_div_words
+EOF
+ &asm_add($data);
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul.pl b/crypto/bn/asm/alpha/mul.pl
new file mode 100644
index 0000000000..76c926566c
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul.pl
@@ -0,0 +1,104 @@
+#!/usr/local/bin/perl
+# alpha assember
+
+sub bn_mul_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r,$couny);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+ $word=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ ###
+ &blt($count,&label("finish"));
+
+ ($a0)=&NR(1); &ld($a0,&QWPw(0,$ap));
+
+ &set_label("loop");
+
+ ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+ ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+
+ &muh($a0,$word,($h0)=&NR(1)); &FR($a0);
+ ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+ ### wait 8
+ &mul($a0,$word,($l0)=&NR(1)); &FR($a0);
+ ### wait 8
+ &muh($a1,$word,($h1)=&NR(1)); &FR($a1);
+ &add($l0,$cc,$l0); ### wait 8
+ &mul($a1,$word,($l1)=&NR(1)); &FR($a1);
+ &cmpult($l0,$cc,$cc); ### wait 8
+ &muh($a2,$word,($h2)=&NR(1)); &FR($a2);
+ &add($h0,$cc,$cc); &FR($h0); ### wait 8
+ &mul($a2,$word,($l2)=&NR(1)); &FR($a2);
+ &add($l1,$cc,$l1); ### wait 8
+ &st($l0,&QWPw(0,$rp)); &FR($l0);
+ &cmpult($l1,$cc,$cc); ### wait 8
+ &muh($a3,$word,($h3)=&NR(1)); &FR($a3);
+ &add($h1,$cc,$cc); &FR($h1);
+ &mul($a3,$word,($l3)=&NR(1)); &FR($a3);
+ &add($l2,$cc,$l2);
+ &st($l1,&QWPw(1,$rp)); &FR($l1);
+ &cmpult($l2,$cc,$cc);
+ &add($h2,$cc,$cc); &FR($h2);
+ &sub($count,4,$count); # count-=4
+ &st($l2,&QWPw(2,$rp)); &FR($l2);
+ &add($l3,$cc,$l3);
+ &cmpult($l3,$cc,$cc);
+ &add($bp,4*$QWS,$bp); # count+=4
+ &add($h3,$cc,$cc); &FR($h3);
+ &add($ap,4*$QWS,$ap); # count+=4
+ &st($l3,&QWPw(3,$rp)); &FR($l3);
+ &add($rp,4*$QWS,$rp); # count+=4
+ ###
+ &blt($count,&label("finish"));
+ ($a0)=&NR(1); &ld($a0,&QWPw(0,$ap));
+ &br(&label("finish"));
+##################################################
+
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
+ &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a
+ ###
+ ###
+ ###
+ &muh($a0,$word,($h0)=&NR(1));
+ ### Wait 8 for next mul issue
+ &mul($a0,$word,($l0)=&NR(1)); &FR($a0)
+ &add($ap,$QWS,$ap);
+ ### Loose 12 until result is available
+ &add($rp,$QWS,$rp);
+ &sub($count,1,$count);
+ &add($l0,$cc,$l0);
+ ###
+ &st($l0,&QWPw(-1,$rp)); &FR($l0);
+ &cmpult($l0,$cc,$cc);
+ &add($h0,$cc,$cc); &FR($h0);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul_add.pl b/crypto/bn/asm/alpha/mul_add.pl
new file mode 100644
index 0000000000..0d6df69bc4
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul_add.pl
@@ -0,0 +1,123 @@
+#!/usr/local/bin/perl
+# alpha assember
+
+sub bn_mul_add_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r,$couny);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+ $word=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ ###
+ &blt($count,&label("finish"));
+
+ &ld(($a0)=&NR(1),&QWPw(0,$ap));
+
+$a=<<'EOF';
+##########################################################
+ &set_label("loop");
+
+ &ld(($r0)=&NR(1),&QWPw(0,$rp));
+ &ld(($a1)=&NR(1),&QWPw(1,$ap));
+ &muh($a0,$word,($h0)=&NR(1));
+ &ld(($r1)=&NR(1),&QWPw(1,$rp));
+ &ld(($a2)=&NR(1),&QWPw(2,$ap));
+ ###
+ &mul($a0,$word,($l0)=&NR(1)); &FR($a0);
+ &ld(($r2)=&NR(1),&QWPw(2,$rp));
+ &muh($a1,$word,($h1)=&NR(1));
+ &ld(($a3)=&NR(1),&QWPw(3,$ap));
+ &mul($a1,$word,($l1)=&NR(1)); &FR($a1);
+ &ld(($r3)=&NR(1),&QWPw(3,$rp));
+ &add($r0,$l0,$r0);
+ &add($r1,$l1,$r1);
+ &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0);
+ &cmpult($r1,$l1,($t1)=&NR(1)); &FR($l1);
+ &muh($a2,$word,($h2)=&NR(1));
+ &add($r0,$cc,$r0);
+ &add($h0,$t0,$h0); &FR($t0);
+ &cmpult($r0,$cc,$cc);
+ &add($h1,$t1,$h1); &FR($t1);
+ &add($h0,$cc,$cc); &FR($h0);
+ &mul($a2,$word,($l2)=&NR(1)); &FR($a2);
+ &add($r1,$cc,$r1);
+ &cmpult($r1,$cc,$cc);
+ &add($r2,$l2,$r2);
+ &add($h1,$cc,$cc); &FR($h1);
+ &cmpult($r2,$l2,($t2)=&NR(1)); &FR($l2);
+ &muh($a3,$word,($h3)=&NR(1));
+ &add($r2,$cc,$r2);
+ &st($r0,&QWPw(0,$rp)); &FR($r0);
+ &add($h2,$t2,$h2); &FR($t2);
+ &st($r1,&QWPw(1,$rp)); &FR($r1);
+ &cmpult($r2,$cc,$cc);
+ &mul($a3,$word,($l3)=&NR(1)); &FR($a3);
+ &add($h2,$cc,$cc); &FR($h2);
+ &st($r2,&QWPw(2,$rp)); &FR($r2);
+ &sub($count,4,$count); # count-=4
+ &add($rp,4*$QWS,$rp); # count+=4
+ &add($r3,$l3,$r3);
+ &add($ap,4*$QWS,$ap); # count+=4
+ &cmpult($r3,$l3,($t3)=&NR(1)); &FR($l3);
+ &add($r3,$cc,$r3);
+ &add($h3,$t3,$h3); &FR($t3);
+ &cmpult($r3,$cc,$cc);
+ &st($r3,&QWPw(-1,$rp)); &FR($r3);
+ &add($h3,$cc,$cc); &FR($h3);
+
+ ###
+ &blt($count,&label("finish"));
+ &ld(($a0)=&NR(1),&QWPw(0,$ap));
+ &br(&label("loop"));
+EOF
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
+ &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a
+ &ld(($r0)=&NR(1),&QWPw(0,$rp)); # get b
+ ###
+ ###
+ &muh($a0,$word,($h0)=&NR(1)); &FR($a0);
+ ### wait 8
+ &mul($a0,$word,($l0)=&NR(1)); &FR($a0);
+ &add($rp,$QWS,$rp);
+ &add($ap,$QWS,$ap);
+ &sub($count,1,$count);
+ ### wait 3 until l0 is available
+ &add($r0,$l0,$r0);
+ ###
+ &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0);
+ &add($r0,$cc,$r0);
+ &add($h0,$t0,$h0); &FR($t0);
+ &cmpult($r0,$cc,$cc);
+ &add($h0,$cc,$cc); &FR($h0);
+
+ &st($r0,&QWPw(-1,$rp)); &FR($r0);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul_c4.pl b/crypto/bn/asm/alpha/mul_c4.pl
new file mode 100644
index 0000000000..9cc876ded4
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul_c4.pl
@@ -0,0 +1,215 @@
+#!/usr/local/bin/perl
+# alpha assember
+
+# upto
+
+sub mul_add_c
+ {
+ local($a,$b,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+ &mul($a,$b,($l1)=&NR(1));
+ &muh($a,$b,($h1)=&NR(1));
+ &add($c0,$l1,$c0);
+ &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1);
+ &add($t1,$h1,$h1); &FR($t1);
+ &add($c1,$h1,$c1);
+ &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1);
+ &add($c2,$t2,$c2); &FR($t2);
+ }
+
+sub bn_mul_comba4
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &mul($a[0],$b[0],($r00)=&NR(1));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &muh($a[0],$b[0],($r01)=&NR(1));
+ &FR($ap); &ld(($a[3])=&NR(1),&QWPw(3,$ap));
+ &FR($bp); &ld(($b[3])=&NR(1),&QWPw(3,$bp));
+ &mul($a[0],$b[1],($r02)=&NR(1));
+
+ ($R,$H1,$H2)=&NR(3);
+
+ &st($r00,&QWPw(0,$rp)); &FR($r00);
+
+ &mov("zero",$R);
+ &mul($a[1],$b[0],($r03)=&NR(1));
+
+ &mov("zero",$H1);
+ &mov("zero",$H0);
+ &add($R,$r01,$R);
+ &muh($a[0],$b[1],($r04)=&NR(1));
+ &cmpult($R,$r01,($t01)=&NR(1)); &FR($r01);
+ &add($R,$r02,$R);
+ &add($H1,$t01,$H1) &FR($t01);
+ &muh($a[1],$b[0],($r05)=&NR(1));
+ &cmpult($R,$r02,($t02)=&NR(1)); &FR($r02);
+ &add($R,$r03,$R);
+ &add($H2,$t02,$H2) &FR($t02);
+ &mul($a[0],$b[2],($r06)=&NR(1));
+ &cmpult($R,$r03,($t03)=&NR(1)); &FR($r03);
+ &add($H1,$t03,$H1) &FR($t03);
+ &st($R,&QWPw(1,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r04,$R);
+ &mov("zero",$H2);
+ &mul($a[1],$b[1],($r07)=&NR(1));
+ &cmpult($R,$r04,($t04)=&NR(1)); &FR($r04);
+ &add($R,$r05,$R);
+ &add($H1,$t04,$H1) &FR($t04);
+ &mul($a[2],$b[0],($r08)=&NR(1));
+ &cmpult($R,$r05,($t05)=&NR(1)); &FR($r05);
+ &add($R,$r01,$R);
+ &add($H2,$t05,$H2) &FR($t05);
+ &muh($a[0],$b[2],($r09)=&NR(1));
+ &cmpult($R,$r06,($t06)=&NR(1)); &FR($r06);
+ &add($R,$r07,$R);
+ &add($H1,$t06,$H1) &FR($t06);
+ &muh($a[1],$b[1],($r10)=&NR(1));
+ &cmpult($R,$r07,($t07)=&NR(1)); &FR($r07);
+ &add($R,$r08,$R);
+ &add($H2,$t07,$H2) &FR($t07);
+ &muh($a[2],$b[0],($r11)=&NR(1));
+ &cmpult($R,$r08,($t08)=&NR(1)); &FR($r08);
+ &add($H1,$t08,$H1) &FR($t08);
+ &st($R,&QWPw(2,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r09,$R);
+ &mov("zero",$H2);
+ &mul($a[0],$b[3],($r12)=&NR(1));
+ &cmpult($R,$r09,($t09)=&NR(1)); &FR($r09);
+ &add($R,$r10,$R);
+ &add($H1,$t09,$H1) &FR($t09);
+ &mul($a[1],$b[2],($r13)=&NR(1));
+ &cmpult($R,$r10,($t10)=&NR(1)); &FR($r10);
+ &add($R,$r11,$R);
+ &add($H1,$t10,$H1) &FR($t10);
+ &mul($a[2],$b[1],($r14)=&NR(1));
+ &cmpult($R,$r11,($t11)=&NR(1)); &FR($r11);
+ &add($R,$r12,$R);
+ &add($H1,$t11,$H1) &FR($t11);
+ &mul($a[3],$b[0],($r15)=&NR(1));
+ &cmpult($R,$r12,($t12)=&NR(1)); &FR($r12);
+ &add($R,$r13,$R);
+ &add($H1,$t12,$H1) &FR($t12);
+ &muh($a[0],$b[3],($r16)=&NR(1));
+ &cmpult($R,$r13,($t13)=&NR(1)); &FR($r13);
+ &add($R,$r14,$R);
+ &add($H1,$t13,$H1) &FR($t13);
+ &muh($a[1],$b[2],($r17)=&NR(1));
+ &cmpult($R,$r14,($t14)=&NR(1)); &FR($r14);
+ &add($R,$r15,$R);
+ &add($H1,$t14,$H1) &FR($t14);
+ &muh($a[2],$b[1],($r18)=&NR(1));
+ &cmpult($R,$r15,($t15)=&NR(1)); &FR($r15);
+ &add($H1,$t15,$H1) &FR($t15);
+ &st($R,&QWPw(3,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r16,$R);
+ &mov("zero",$H2);
+ &muh($a[3],$b[0],($r19)=&NR(1));
+ &cmpult($R,$r16,($t16)=&NR(1)); &FR($r16);
+ &add($R,$r17,$R);
+ &add($H1,$t16,$H1) &FR($t16);
+ &mul($a[1],$b[3],($r20)=&NR(1));
+ &cmpult($R,$r17,($t17)=&NR(1)); &FR($r17);
+ &add($R,$r18,$R);
+ &add($H1,$t17,$H1) &FR($t17);
+ &mul($a[2],$b[2],($r21)=&NR(1));
+ &cmpult($R,$r18,($t18)=&NR(1)); &FR($r18);
+ &add($R,$r19,$R);
+ &add($H1,$t18,$H1) &FR($t18);
+ &mul($a[3],$b[1],($r22)=&NR(1));
+ &cmpult($R,$r19,($t19)=&NR(1)); &FR($r19);
+ &add($R,$r20,$R);
+ &add($H1,$t19,$H1) &FR($t19);
+ &muh($a[1],$b[3],($r23)=&NR(1));
+ &cmpult($R,$r20,($t20)=&NR(1)); &FR($r20);
+ &add($R,$r21,$R);
+ &add($H1,$t20,$H1) &FR($t20);
+ &muh($a[2],$b[2],($r24)=&NR(1));
+ &cmpult($R,$r21,($t21)=&NR(1)); &FR($r21);
+ &add($R,$r22,$R);
+ &add($H1,$t21,$H1) &FR($t21);
+ &muh($a[3],$b[1],($r25)=&NR(1));
+ &cmpult($R,$r22,($t22)=&NR(1)); &FR($r22);
+ &add($H1,$t22,$H1) &FR($t22);
+ &st($R,&QWPw(4,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r23,$R);
+ &mov("zero",$H2);
+ &mul($a[2],$b[3],($r26)=&NR(1));
+ &cmpult($R,$r23,($t23)=&NR(1)); &FR($r23);
+ &add($R,$r24,$R);
+ &add($H1,$t23,$H1) &FR($t23);
+ &mul($a[3],$b[2],($r27)=&NR(1));
+ &cmpult($R,$r24,($t24)=&NR(1)); &FR($r24);
+ &add($R,$r25,$R);
+ &add($H1,$t24,$H1) &FR($t24);
+ &muh($a[2],$b[3],($r28)=&NR(1));
+ &cmpult($R,$r25,($t25)=&NR(1)); &FR($r25);
+ &add($R,$r26,$R);
+ &add($H1,$t25,$H1) &FR($t25);
+ &muh($a[3],$b[2],($r29)=&NR(1));
+ &cmpult($R,$r26,($t26)=&NR(1)); &FR($r26);
+ &add($R,$r27,$R);
+ &add($H1,$t26,$H1) &FR($t26);
+ &mul($a[3],$b[3],($r30)=&NR(1));
+ &cmpult($R,$r27,($t27)=&NR(1)); &FR($r27);
+ &add($H1,$t27,$H1) &FR($t27);
+ &st($R,&QWPw(5,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r28,$R);
+ &mov("zero",$H2);
+ &muh($a[3],$b[3],($r31)=&NR(1));
+ &cmpult($R,$r28,($t28)=&NR(1)); &FR($r28);
+ &add($R,$r29,$R);
+ &add($H1,$t28,$H1) &FR($t28);
+ ############
+ &cmpult($R,$r29,($t29)=&NR(1)); &FR($r29);
+ &add($R,$r30,$R);
+ &add($H1,$t29,$H1) &FR($t29);
+ ############
+ &cmpult($R,$r30,($t30)=&NR(1)); &FR($r30);
+ &add($H1,$t30,$H1) &FR($t30);
+ &st($R,&QWPw(6,$rp));
+ &add($H1,$H2,$R);
+
+ &add($R,$r31,$R); &FR($r31);
+ &st($R,&QWPw(7,$rp));
+
+ &FR($R,$H1,$H2);
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul_c4.works.pl b/crypto/bn/asm/alpha/mul_c4.works.pl
new file mode 100644
index 0000000000..79d86dd25c
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul_c4.works.pl
@@ -0,0 +1,98 @@
+#!/usr/local/bin/perl
+# alpha assember
+
+sub mul_add_c
+ {
+ local($a,$b,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+print STDERR "count=$cnt\n"; $cnt++;
+ &mul($a,$b,($l1)=&NR(1));
+ &muh($a,$b,($h1)=&NR(1));
+ &add($c0,$l1,$c0);
+ &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1);
+ &add($t1,$h1,$h1); &FR($t1);
+ &add($c1,$h1,$c1);
+ &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1);
+ &add($c2,$t2,$c2); &FR($t2);
+ }
+
+sub bn_mul_comba4
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap);
+ &ld(($b[3])=&NR(1),&QWPw(3,$bp)); &FR($bp);
+
+ ($c0,$c1,$c2)=&NR(3);
+ &mov("zero",$c2);
+ &mul($a[0],$b[0],$c0);
+ &muh($a[0],$b[0],$c1);
+ &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[0],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[3],$c0,$c1,$c2); &FR($a[0]);
+ &mul_add_c($a[1],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[0],$c0,$c1,$c2); &FR($b[0]);
+ &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[3],$c0,$c1,$c2); &FR($a[1]);
+ &mul_add_c($a[2],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[1],$c0,$c1,$c2); &FR($b[1]);
+ &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[2],$b[3],$c0,$c1,$c2); &FR($a[2]);
+ &mul_add_c($a[3],$b[2],$c0,$c1,$c2); &FR($b[2]);
+ &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[3],$b[3],$c0,$c1,$c2); &FR($a[3],$b[3]);
+ &st($c0,&QWPw(6,$rp));
+ &st($c1,&QWPw(7,$rp));
+
+ &FR($c0,$c1,$c2);
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul_c8.pl b/crypto/bn/asm/alpha/mul_c8.pl
new file mode 100644
index 0000000000..525ca7494b
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul_c8.pl
@@ -0,0 +1,177 @@
+#!/usr/local/bin/perl
+# alpha assember
+
+sub bn_mul_comba8
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &stack_push(2);
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &st($reg_s0,&swtmp(0)); &FR($reg_s0);
+ &st($reg_s1,&swtmp(1)); &FR($reg_s1);
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap));
+ &ld(($b[3])=&NR(1),&QWPw(3,$bp));
+ &ld(($a[4])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[4])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[5])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[5])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[6])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[6])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[7])=&NR(1),&QWPw(1,$ap)); &FR($ap);
+ &ld(($b[7])=&NR(1),&QWPw(1,$bp)); &FR($bp);
+
+ ($c0,$c1,$c2)=&NR(3);
+ &mov("zero",$c2);
+ &mul($a[0],$b[0],$c0);
+ &muh($a[0],$b[0],$c1);
+ &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]);
+ &mul_add_c($a[1],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]);
+ &st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]);
+ &mul_add_c($a[2],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]);
+ &st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]);
+ &mul_add_c($a[3],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]);
+ &st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]);
+ &mul_add_c($a[4],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]);
+ &st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]);
+ &mul_add_c($a[5],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]);
+ &st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]);
+ &mul_add_c($a[6],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]);
+ &st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]);
+ &mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]);
+ &st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]);
+ &st($c0,&QWPw(14,$rp));
+ &st($c1,&QWPw(15,$rp));
+
+ &FR($c0,$c1,$c2);
+
+ &ld($reg_s0,&swtmp(0));
+ &ld($reg_s1,&swtmp(1));
+ &stack_pop(2);
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/sqr.pl b/crypto/bn/asm/alpha/sqr.pl
new file mode 100644
index 0000000000..a55b696906
--- /dev/null
+++ b/crypto/bn/asm/alpha/sqr.pl
@@ -0,0 +1,113 @@
+#!/usr/local/bin/perl
+# alpha assember
+
+sub bn_sqr_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r,$couny);
+
+ &init_pool(3);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ &br(&label("finish"));
+ &blt($count,&label("finish"));
+
+ ($a0,$r0)=&NR(2);
+ &ld($a0,&QWPw(0,$ap));
+ &ld($r0,&QWPw(0,$rp));
+
+$a=<<'EOF';
+##########################################################
+ &set_label("loop");
+
+ ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+ ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp));
+ ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+ ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp));
+ ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+ ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp));
+
+ ($o0,$t0)=&NR(2);
+ &add($a0,$b0,$o0);
+ &cmpult($o0,$b0,$t0);
+ &add($o0,$cc,$o0);
+ &cmpult($o0,$cc,$cc);
+ &add($cc,$t0,$cc); &FR($t0);
+
+ ($t1,$o1)=&NR(2);
+
+ &add($a1,$b1,$o1); &FR($a1);
+ &cmpult($o1,$b1,$t1); &FR($b1);
+ &add($o1,$cc,$o1);
+ &cmpult($o1,$cc,$cc);
+ &add($cc,$t1,$cc); &FR($t1);
+
+ ($t2,$o2)=&NR(2);