diff options
Diffstat (limited to 'crypto/bn/asm/alpha.works')
-rw-r--r-- | crypto/bn/asm/alpha.works/add.pl | 119 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/div.pl | 144 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/mul.pl | 116 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/mul_add.pl | 120 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/mul_c4.pl | 213 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/mul_c4.works.pl | 98 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/mul_c8.pl | 177 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/sqr.pl | 113 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/sqr_c4.pl | 109 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/sqr_c8.pl | 132 | ||||
-rw-r--r-- | crypto/bn/asm/alpha.works/sub.pl | 108 |
11 files changed, 1449 insertions, 0 deletions
diff --git a/crypto/bn/asm/alpha.works/add.pl b/crypto/bn/asm/alpha.works/add.pl
new file mode 100644
index 0000000000..4dc76e6b69
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/add.pl
@@ -0,0 +1,119 @@
+#!/usr/local/bin/perl
+# alpha assembler
+# bn_add_words($name): emit a routine that adds two word arrays with carry.
+sub bn_add_words
+	{
+	local($name)=@_;
+	local($cc,$a,$b,$r);
+	# $cc is the running carry; it is pinned to r0 (presumably the return register - TODO confirm).
+	&init_pool(4);
+	($cc)=GR("r0");
+	# Arguments, in order: result pointer, a pointer, b pointer, word count.
+	$rp=&wparam(0);
+	$ap=&wparam(1);
+	$bp=&wparam(2);
+	$count=&wparam(3);
+
+	&function_begin($name,"");
+
+	&comment("");
+	&sub($count,4,$count);		# bias count by -4 for the 4-way unrolled loop
+	&mov("zero",$cc);		# carry = 0
+	# Fewer than 4 words: skip the unrolled loop; "finish" undoes the bias.
+	&blt($count,&label("finish"));
+
+	($a0,$b0)=&NR(2);
+	&ld($a0,&QWPw(0,$ap));
+	&ld($b0,&QWPw(0,$bp));
+
+##########################################################
+	&set_label("loop");
+	# Word 0 is already in $a0/$b0; fetch words 1..3.
+	($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+	($b1)=&NR(1); &ld($b1,&QWPw(1,$bp));
+	($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+	($b2)=&NR(1); &ld($b2,&QWPw(2,$bp));
+	($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+	($b3)=&NR(1); &ld($b3,&QWPw(3,$bp));
+	# Per word: o=a+b; t=(o<b); o+=cc; cc=(o<cc)+t.
+	($o0,$t0)=&NR(2);
+	&add($a0,$b0,$o0);
+	&cmpult($o0,$b0,$t0);
+	&add($o0,$cc,$o0);
+	&cmpult($o0,$cc,$cc);
+	&add($cc,$t0,$cc);	&FR($t0);
+
+	($t1,$o1)=&NR(2);
+
+	&add($a1,$b1,$o1);	&FR($a1);
+	&cmpult($o1,$b1,$t1);	&FR($b1);
+	&add($o1,$cc,$o1);
+	&cmpult($o1,$cc,$cc);
+	&add($cc,$t1,$cc);	&FR($t1);
+
+	($t2,$o2)=&NR(2);
+
+	&add($a2,$b2,$o2);	&FR($a2);
+	&cmpult($o2,$b2,$t2);	&FR($b2);
+	&add($o2,$cc,$o2);
+	&cmpult($o2,$cc,$cc);
+	&add($cc,$t2,$cc);	&FR($t2);
+
+	($t3,$o3)=&NR(2);
+
+	&add($a3,$b3,$o3);	&FR($a3);
+	&cmpult($o3,$b3,$t3);	&FR($b3);
+	&add($o3,$cc,$o3);
+	&cmpult($o3,$cc,$cc);
+	&add($cc,$t3,$cc);	&FR($t3);
+	# Each sum goes to its own result word (previously all four hit word 0).
+	&st($o0,&QWPw(0,$rp)); &FR($o0);
+	&st($o1,&QWPw(1,$rp)); &FR($o1);
+	&st($o2,&QWPw(2,$rp)); &FR($o2);
+	&st($o3,&QWPw(3,$rp)); &FR($o3);
+
+	&sub($count,4,$count);	# count-=4
+	&add($ap,4*$QWS,$ap);	# ap+=4 words
+	&add($bp,4*$QWS,$bp);	# bp+=4 words
+	&add($rp,4*$QWS,$rp);	# rp+=4 words
+
+	&blt($count,&label("finish"));
+	&ld($a0,&QWPw(0,$ap));
+	&ld($b0,&QWPw(0,$bp));
+	&br(&label("loop"));
+##################################################
+	# Do the last 0..3 words
+
+	($t0,$o0)=&NR(2);
+	&set_label("last_loop");
+
+	&ld($a0,&QWPw(0,$ap));	# get a
+	&ld($b0,&QWPw(0,$bp));	# get b
+
+	&add($a0,$b0,$o0);
+	&cmpult($o0,$b0,$t0);	# carry out of a+b?
+	&add($o0,$cc,$o0);	# add the incoming carry
+	&cmpult($o0,$cc,$cc);	# carry out of that addition?
+	&add($cc,$t0,$cc);	# combine the two carries
+	&st($o0,&QWPw(0,$rp));	# save
+
+	&add($ap,$QWS,$ap);
+	&add($bp,$QWS,$bp);
+	&add($rp,$QWS,$rp);
+	&sub($count,1,$count);
+	&bgt($count,&label("last_loop"));
+	&function_end_A($name);
+
+######################################################
+	&set_label("finish");
+	&add($count,4,$count);	# undo the -4 bias: count = words still to do
+	&bgt($count,&label("last_loop"));
+
+	&FR($o0,$t0,$a0,$b0);
+	&set_label("end");
+	&function_end($name);
+
+	&fin_pool;
+	}
+
+1;
diff --git a/crypto/bn/asm/alpha.works/div.pl b/crypto/bn/asm/alpha.works/div.pl new file mode 100644 index 0000000000..7ec144377f --- /dev/null +++ b/crypto/bn/asm/alpha.works/div.pl @@ -0,0 +1,144 @@ +#!/usr/local/bin/perl + +sub bn_div64 + { + local($data)=<<'EOF'; + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. 
+ # +.text + .set noreorder + .set volatile + .align 3 + .globl bn_div64 + .ent bn_div64 +bn_div64: + ldgp $29,0($27) +bn_div64..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$9119 + lda $0,-1 + br $31,$9136 + .align 4 +$9119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$9120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$9120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$9120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$9122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$9122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$9123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$9126 + zapnot $7,15,$27 + br $31,$9127 + .align 4 +$9126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$9127: + srl $10,32,$4 + .align 5 +$9128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$9129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$9129 + subq $27,1,$27 + br $31,$9128 + .align 4 +$9129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$9134 + addq $9,$11,$9 + subq $27,1,$27 +$9134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$9124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$9123 + .align 4 +$9124: + bis $13,$27,$0 +$9136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq 
$30,48,$30 + ret $31,($26),1 + .end bn_div64 +EOF + &asm_add($data); + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul.pl b/crypto/bn/asm/alpha.works/mul.pl new file mode 100644 index 0000000000..b182bae452 --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul.pl @@ -0,0 +1,116 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + 
&blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &mul($a0,$word,($l0)=&NR(1)); + &add($ap,$QWS,$ap); + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + &add($l0,$cc,$l0); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &cmpult($l0,$cc,$cc); + &st($l0,&QWPw(-1,$rp)); &FR($l0); + &add($h0,$cc,$cc); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_add.pl b/crypto/bn/asm/alpha.works/mul_add.pl new file mode 100644 index 0000000000..e37f6315fb --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_add.pl @@ -0,0 +1,120 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_add_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + 
&cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &ld(($r0)=&NR(1),&QWPw(0,$rp)); # get b + &mul($a0,$word,($l0)=&NR(1)); + &sub($count,1,$count); + &add($ap,$QWS,$ap); + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + &add($r0,$l0,$r0); + &add($rp,$QWS,$rp); + &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0); + &add($r0,$cc,$r0); + &add($h0,$t0,$h0); &FR($t0); + &cmpult($r0,$cc,$cc); + &st($r0,&QWPw(-1,$rp)); &FR($r0); + &add($h0,$cc,$cc); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_c4.pl b/crypto/bn/asm/alpha.works/mul_c4.pl new file mode 100644 index 0000000000..5efd201281 --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_c4.pl @@ -0,0 +1,213 @@ +#!/usr/local/bin/perl +# alpha assember + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + 
&mul($a,$b,($l1)=&NR(1));
+	&muh($a,$b,($h1)=&NR(1));
+	&add($c0,$l1,$c0);
+	&cmpult($c0,$l1,($t1)=&NR(1));	&FR($l1);
+	&add($t1,$h1,$h1);		&FR($t1);
+	&add($c1,$h1,$c1);
+	&cmpult($c1,$h1,($t2)=&NR(1));	&FR($h1);
+	&add($c2,$t2,$c2);		&FR($t2);
+	}
+# bn_mul_comba4($name): emit a 4x4-word multiply writing result words 0..7 through $rp.
+sub bn_mul_comba4
+	{
+	local($name)=@_;
+	local(@a,@b,$r,$c0,$c1,$c2);
+	# $R accumulates the current result word; $H1/$H2 count the carries out of it.
+	$cnt=1;
+	&init_pool(3);
+	# Arguments, in order: result pointer, a pointer, b pointer.
+	$rp=&wparam(0);
+	$ap=&wparam(1);
+	$bp=&wparam(2);
+
+	&function_begin($name,"");
+
+	&comment("");
+	# Loads are interleaved with the first multiplies.
+	&ld(($a[0])=&NR(1),&QWPw(0,$ap));
+	&ld(($b[0])=&NR(1),&QWPw(0,$bp));
+	&ld(($a[1])=&NR(1),&QWPw(1,$ap));
+	&ld(($b[1])=&NR(1),&QWPw(1,$bp));
+	&mul($a[0],$b[0],($r00)=&NR(1));
+	&ld(($a[2])=&NR(1),&QWPw(2,$ap));
+	&ld(($b[2])=&NR(1),&QWPw(2,$bp));
+	&muh($a[0],$b[0],($r01)=&NR(1));
+	&FR($ap); &ld(($a[3])=&NR(1),&QWPw(3,$ap));	# pointer released before its last load
+	&FR($bp); &ld(($b[3])=&NR(1),&QWPw(3,$bp));	# (presumably so the load can reuse it)
+	&mul($a[0],$b[1],($r02)=&NR(1));
+
+	($R,$H1,$H2)=&NR(3);
+	# Result word 0 is just the low half of a[0]*b[0].
+	&st($r00,&QWPw(0,$rp));	&FR($r00);
+	# Result word 1; products needed for later words are issued early.
+	&mov("zero",$R);
+	&mul($a[1],$b[0],($r03)=&NR(1));
+
+	&mov("zero",$H1);
+	&mov("zero",$H2);	# was $H0, which is never allocated
+	&add($R,$r01,$R);
+	&muh($a[0],$b[1],($r04)=&NR(1));
+	&cmpult($R,$r01,($t01)=&NR(1));	&FR($r01);
+	&add($R,$r02,$R);
+	&add($H1,$t01,$H1);		&FR($t01);
+	&muh($a[1],$b[0],($r05)=&NR(1));
+	&cmpult($R,$r02,($t02)=&NR(1));	&FR($r02);
+	&add($R,$r03,$R);
+	&add($H2,$t02,$H2);		&FR($t02);
+	&mul($a[0],$b[2],($r06)=&NR(1));
+	&cmpult($R,$r03,($t03)=&NR(1));	&FR($r03);
+	&add($H1,$t03,$H1);		&FR($t03);
+	&st($R,&QWPw(1,$rp));
+	&add($H1,$H2,$R);
+	# Result word 2.
+	&mov("zero",$H1);
+	&add($R,$r04,$R);
+	&mov("zero",$H2);
+	&mul($a[1],$b[1],($r07)=&NR(1));
+	&cmpult($R,$r04,($t04)=&NR(1));	&FR($r04);
+	&add($R,$r05,$R);
+	&add($H1,$t04,$H1);		&FR($t04);
+	&mul($a[2],$b[0],($r08)=&NR(1));
+	&cmpult($R,$r05,($t05)=&NR(1));	&FR($r05);
+	&add($R,$r06,$R);	# was $r01 (already freed); $r06 is the term checked below
+	&add($H2,$t05,$H2);		&FR($t05);
+	&muh($a[0],$b[2],($r09)=&NR(1));
+	&cmpult($R,$r06,($t06)=&NR(1));	&FR($r06);
+	&add($R,$r07,$R);
+	&add($H1,$t06,$H1);		&FR($t06);
+	&muh($a[1],$b[1],($r10)=&NR(1));
+	&cmpult($R,$r07,($t07)=&NR(1));	&FR($r07);
+	&add($R,$r08,$R);
+	&add($H2,$t07,$H2);		&FR($t07);
+	&muh($a[2],$b[0],($r11)=&NR(1));
+	&cmpult($R,$r08,($t08)=&NR(1));	&FR($r08);
+	&add($H1,$t08,$H1);		&FR($t08);
+	&st($R,&QWPw(2,$rp));
+	&add($H1,$H2,$R);
+	# Result word 3.
+	&mov("zero",$H1);
+	&add($R,$r09,$R);
+	&mov("zero",$H2);
+	&mul($a[0],$b[3],($r12)=&NR(1));
+	&cmpult($R,$r09,($t09)=&NR(1));	&FR($r09);
+	&add($R,$r10,$R);
+	&add($H1,$t09,$H1);		&FR($t09);
+	&mul($a[1],$b[2],($r13)=&NR(1));
+	&cmpult($R,$r10,($t10)=&NR(1));	&FR($r10);
+	&add($R,$r11,$R);
+	&add($H1,$t10,$H1);		&FR($t10);
+	&mul($a[2],$b[1],($r14)=&NR(1));
+	&cmpult($R,$r11,($t11)=&NR(1));	&FR($r11);
+	&add($R,$r12,$R);
+	&add($H1,$t11,$H1);		&FR($t11);
+	&mul($a[3],$b[0],($r15)=&NR(1));
+	&cmpult($R,$r12,($t12)=&NR(1));	&FR($r12);
+	&add($R,$r13,$R);
+	&add($H1,$t12,$H1);		&FR($t12);
+	&muh($a[0],$b[3],($r16)=&NR(1));
+	&cmpult($R,$r13,($t13)=&NR(1));	&FR($r13);
+	&add($R,$r14,$R);
+	&add($H1,$t13,$H1);		&FR($t13);
+	&muh($a[1],$b[2],($r17)=&NR(1));
+	&cmpult($R,$r14,($t14)=&NR(1));	&FR($r14);
+	&add($R,$r15,$R);
+	&add($H1,$t14,$H1);		&FR($t14);
+	&muh($a[2],$b[1],($r18)=&NR(1));
+	&cmpult($R,$r15,($t15)=&NR(1));	&FR($r15);
+	&add($H1,$t15,$H1);		&FR($t15);
+	&st($R,&QWPw(3,$rp));
+	&add($H1,$H2,$R);
+	# Result word 4.
+	&mov("zero",$H1);
+	&add($R,$r16,$R);
+	&mov("zero",$H2);
+	&muh($a[3],$b[0],($r19)=&NR(1));
+	&cmpult($R,$r16,($t16)=&NR(1));	&FR($r16);
+	&add($R,$r17,$R);
+	&add($H1,$t16,$H1);		&FR($t16);
+	&mul($a[1],$b[3],($r20)=&NR(1));
+	&cmpult($R,$r17,($t17)=&NR(1));	&FR($r17);
+	&add($R,$r18,$R);
+	&add($H1,$t17,$H1);		&FR($t17);
+	&mul($a[2],$b[2],($r21)=&NR(1));
+	&cmpult($R,$r18,($t18)=&NR(1));	&FR($r18);
+	&add($R,$r19,$R);
+	&add($H1,$t18,$H1);		&FR($t18);
+	&mul($a[3],$b[1],($r22)=&NR(1));
+	&cmpult($R,$r19,($t19)=&NR(1));	&FR($r19);
+	&add($R,$r20,$R);
+	&add($H1,$t19,$H1);		&FR($t19);
+	&muh($a[1],$b[3],($r23)=&NR(1));
+	&cmpult($R,$r20,($t20)=&NR(1));	&FR($r20);
+	&add($R,$r21,$R);
+	&add($H1,$t20,$H1);		&FR($t20);
+	&muh($a[2],$b[2],($r24)=&NR(1));
+	&cmpult($R,$r21,($t21)=&NR(1));	&FR($r21);
+	&add($R,$r22,$R);
+	&add($H1,$t21,$H1);		&FR($t21);
+	&muh($a[3],$b[1],($r25)=&NR(1));
+	&cmpult($R,$r22,($t22)=&NR(1));	&FR($r22);
+	&add($H1,$t22,$H1);		&FR($t22);
+	&st($R,&QWPw(4,$rp));
+	&add($H1,$H2,$R);
+	# Result word 5.
+	&mov("zero",$H1);
+	&add($R,$r23,$R);
+	&mov("zero",$H2);
+	&mul($a[2],$b[3],($r26)=&NR(1));
+	&cmpult($R,$r23,($t23)=&NR(1));	&FR($r23);
+	&add($R,$r24,$R);
+	&add($H1,$t23,$H1);		&FR($t23);
+	&mul($a[3],$b[2],($r27)=&NR(1));
+	&cmpult($R,$r24,($t24)=&NR(1));	&FR($r24);
+	&add($R,$r25,$R);
+	&add($H1,$t24,$H1);		&FR($t24);
+	&muh($a[2],$b[3],($r28)=&NR(1));
+	&cmpult($R,$r25,($t25)=&NR(1));	&FR($r25);
+	&add($R,$r26,$R);
+	&add($H1,$t25,$H1);		&FR($t25);
+	&muh($a[3],$b[2],($r29)=&NR(1));
+	&cmpult($R,$r26,($t26)=&NR(1));	&FR($r26);
+	&add($R,$r27,$R);
+	&add($H1,$t26,$H1);		&FR($t26);
+	&mul($a[3],$b[3],($r30)=&NR(1));
+	&cmpult($R,$r27,($t27)=&NR(1));	&FR($r27);
+	&add($H1,$t27,$H1);		&FR($t27);
+	&st($R,&QWPw(5,$rp));
+	&add($H1,$H2,$R);
+	# Result word 6.
+	&mov("zero",$H1);
+	&add($R,$r28,$R);
+	&mov("zero",$H2);
+	&muh($a[3],$b[3],($r31)=&NR(1));
+	&cmpult($R,$r28,($t28)=&NR(1));	&FR($r28);
+	&add($R,$r29,$R);
+	&add($H1,$t28,$H1);		&FR($t28);
+	############
+	&cmpult($R,$r29,($t29)=&NR(1));	&FR($r29);
+	&add($R,$r30,$R);
+	&add($H1,$t29,$H1);		&FR($t29);
+	############
+	&cmpult($R,$r30,($t30)=&NR(1));	&FR($r30);
+	&add($H1,$t30,$H1);		&FR($t30);
+	&st($R,&QWPw(6,$rp));
+	&add($H1,$H2,$R);
+	# Result word 7: carries so far plus the high half of a[3]*b[3].
+	&add($R,$r31,$R);	&FR($r31);
+	&st($R,&QWPw(7,$rp));
+
+	&FR($R,$H1,$H2);
+	&function_end($name);
+
+	&fin_pool;
+	}
+
+1;
diff --git a/crypto/bn/asm/alpha.works/mul_c4.works.pl b/crypto/bn/asm/alpha.works/mul_c4.works.pl new file mode 100644 index 0000000000..79d86dd25c --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_c4.works.pl @@ -0,0 +1,98 @@ +#!/usr/local/bin/perl +# alpha assember + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + +print STDERR "count=$cnt\n"; $cnt++; + 
&mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); 
&FR($b[2]); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); &FR($a[3],$b[3]); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &FR($c0,$c1,$c2); + + &function_end($name); + + &fin_pool; + } + +1;
diff --git a/crypto/bn/asm/alpha.works/mul_c8.pl b/crypto/bn/asm/alpha.works/mul_c8.pl
new file mode 100644
index 0000000000..525ca7494b
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/mul_c8.pl
@@ -0,0 +1,177 @@
+#!/usr/local/bin/perl
+# alpha assembler
+# bn_mul_comba8($name): emit an 8x8-word multiply writing result words 0..15 through $rp.
+sub bn_mul_comba8
+	{
+	local($name)=@_;
+	local(@a,@b,$r,$c0,$c1,$c2);
+	# NOTE: calls &mul_add_c, which is not defined in this file (the mul_c4 files define one).
+	$cnt=1;
+	&init_pool(3);
+	# Arguments, in order: result pointer, a pointer, b pointer.
+	$rp=&wparam(0);
+	$ap=&wparam(1);
+	$bp=&wparam(2);
+
+	&function_begin($name,"");
+
+	&comment("");
+	# Spill $reg_s0/$reg_s1 to two stack slots and hand them to the register pool.
+	&stack_push(2);
+	&ld(($a[0])=&NR(1),&QWPw(0,$ap));
+	&ld(($b[0])=&NR(1),&QWPw(0,$bp));
+	&st($reg_s0,&swtmp(0)); &FR($reg_s0);
+	&st($reg_s1,&swtmp(1)); &FR($reg_s1);
+	&ld(($a[1])=&NR(1),&QWPw(1,$ap));
+	&ld(($b[1])=&NR(1),&QWPw(1,$bp));
+	&ld(($a[2])=&NR(1),&QWPw(2,$ap));
+	&ld(($b[2])=&NR(1),&QWPw(2,$bp));
+	&ld(($a[3])=&NR(1),&QWPw(3,$ap));
+	&ld(($b[3])=&NR(1),&QWPw(3,$bp));
+	&ld(($a[4])=&NR(1),&QWPw(4,$ap));	# words 4..7 were all read from offset 1
+	&ld(($b[4])=&NR(1),&QWPw(4,$bp));
+	&ld(($a[5])=&NR(1),&QWPw(5,$ap));
+	&ld(($b[5])=&NR(1),&QWPw(5,$bp));
+	&ld(($a[6])=&NR(1),&QWPw(6,$ap));
+	&ld(($b[6])=&NR(1),&QWPw(6,$bp));
+	&ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap);
+	&ld(($b[7])=&NR(1),&QWPw(7,$bp)); &FR($bp);
+	# ($c0,$c1,$c2) is the 3-word column accumulator, low word first.
+	($c0,$c1,$c2)=&NR(3);
+	&mov("zero",$c2);
+	&mul($a[0],$b[0],$c0);
+	&muh($a[0],$b[0],$c1);
+	&st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);	# after each store: rotate names, clear the new top word
+	&mov("zero",$c2);
+	# Result word 1.
+	&mul_add_c($a[0],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 2.
+	&mul_add_c($a[0],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 3.
+	&mul_add_c($a[0],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 4.
+	&mul_add_c($a[0],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 5.
+	&mul_add_c($a[0],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 6.
+	&mul_add_c($a[0],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 7; from here each operand is freed after its last use.
+	&mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]);
+	&mul_add_c($a[1],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]);
+	&st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 8.
+	&mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]);
+	&mul_add_c($a[2],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]);
+	&st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 9.
+	&mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]);
+	&mul_add_c($a[3],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]);
+	&st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 10.
+	&mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]);
+	&mul_add_c($a[4],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]);
+	&st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 11.
+	&mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]);
+	&mul_add_c($a[5],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]);
+	&st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 12.
+	&mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]);
+	&mul_add_c($a[6],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]);
+	&st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result word 13.
+	&mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]);
+	&mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]);
+	&st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+	# Result words 14 and 15 come from the last product and its carry word.
+	&mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]);
+	&st($c0,&QWPw(14,$rp));
+	&st($c1,&QWPw(15,$rp));
+
+	&FR($c0,$c1,$c2);
+	# Reload the two registers spilled at entry and drop the stack slots.
+	&ld($reg_s0,&swtmp(0));
+	&ld($reg_s1,&swtmp(1));
+	&stack_pop(2);
+
+	&function_end($name);
+
+	&fin_pool;
+	}
+
+1;
diff --git a/crypto/bn/asm/alpha.works/sqr.pl b/crypto/bn/asm/alpha.works/sqr.pl new file mode 100644 index 0000000000..a55b696906 --- 
/dev/null +++ b/crypto/bn/asm/alpha.works/sqr.pl @@ -0,0 +1,113 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(3); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zer |