diff options
author | Andy Polyakov <appro@openssl.org> | 2007-06-17 17:10:03 +0000 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2007-06-17 17:10:03 +0000 |
commit | 7d9cf7c0bbc17a2c00339e660c83ebf1a4f9061a (patch) | |
tree | 004fe317e4795d576c92746d7e954c2db6a5d3af /crypto/bn/asm/via-mont.pl | |
parent | 55525742f4c2bf416013fc3a75ec642775d97f80 (diff) |
Eliminate conditional final subtraction in Montgomery assembler modules.
Diffstat (limited to 'crypto/bn/asm/via-mont.pl')
-rw-r--r-- | crypto/bn/asm/via-mont.pl | 94 |
1 files changed, 57 insertions, 37 deletions
diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl index e149941987..ce3cd61eb3 100644 --- a/crypto/bn/asm/via-mont.pl +++ b/crypto/bn/asm/via-mont.pl @@ -77,7 +77,8 @@ # - in terms of absolute performance it delivers approximately as much # as modern out-of-order 32-bit cores [again, for longer keys]. -push(@INC,".","../../perlasm"); +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"via-mont.pl"); @@ -100,7 +101,7 @@ $sp=&DWP(28,"esp"); # &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] # &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] # &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] -# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of np[num] +# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] # Note that SDK suggests to unconditionally allocate 2K per vector. This # has quite an impact on performance. It naturally depends on key length, # but to give an example 1024 bit private RSA key operations suffer >30% @@ -115,7 +116,7 @@ $sp=&DWP(28,"esp"); &jnz (&label("leave")); # num % 4 != 0 &cmp ("ecx",8); &jb (&label("leave")); # num < 8 - &cmp ("ecx",256); + &cmp ("ecx",1024); &ja (&label("leave")); # num > 1024 &pushf (); @@ -148,74 +149,91 @@ $sp=&DWP(28,"esp"); &lea ("ebp",&DWP(-$pad,"ecx")); &shr ("ebp",2); # restore original num value in ebp - &add ("ecx",32/4); # (4 vectors + 32 byte scratch)/4 &xor ("eax","eax"); + + &mov ("ecx","ebp"); + &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch &data_byte(0xf3,0xab); # rep stosl, bzero &mov ("ecx","ebp"); &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy &mov ($A,"edi"); &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded ap copy... - # edi points at the end of ap copy... &mov ("ecx","ebp"); - &add ("edi",$pad); # skip padding to point at bp copy &mov ("esi","ebx"); &mov ($B,"edi"); &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded bp copy... - # edi points at the end of bp copy... &mov ("ecx","ebp"); - &add ("edi",$pad); # skip padding to point at np copy &mov ("esi","edx"); &mov ($M,"edi"); &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded np copy... # let magic happen... &mov ("ecx","ebp"); &mov ("esi","esp"); - &xor ("eax","eax"); &shl ("ecx",5); # convert word counter to bit counter &align (4); &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul &mov ("ecx","ebp"); - &xor ("edx","edx"); # i=0 - &lea ("esi",&DWP(64,"esp")); # tp - # edi still points at the end of np copy... + &xor ("edx","edx"); # i=0 + &lea ("esi",&DWP(64,"esp")); # tp + # edi still points at the end of padded np copy... + &mov ("eax",&DWP(-4-$pad,"edi")); # np[num-1] &neg ("ebp"); - &lea ("ebp",&DWP(0,"edi","ebp",4)); # so just "rewind" - &mov ("edi",$rp); # restore rp - - &mov ("ebx",&DWP(0,"esi","ecx",4)); # upmost overflow bit - &cmp ("ebx",0); # clears CF unconfitionally - &jnz (&label("sub")); - &mov ("eax",&DWP(-4,"esi","ecx",4)); - &cmp ("eax",&DWP(-4,"ebp","ecx",4)); # tp[num-1]-np[num-1]? - &jae (&label("sub")); # if taken CF is cleared - -&set_label("copy",4); - &mov ("ebx","ecx"); - &data_byte(0xf3,0xa5); # rep movsl - &mov ("ecx","ebx"); - &jmp (&label("zap")); - -&set_label("sub",16); + &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" + &mov ("edi",$rp); # restore rp + + &shr ("eax",30); # boundary condition... + &jz (&label("copy")); # ... is met + &xor ("edx","edx"); # clear CF + +&set_label("sub",8); &mov ("eax",&DWP(0,"esi","edx",4)); &sbb ("eax",&DWP(0,"ebp","edx",4)); &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] &lea ("edx",&DWP(1,"edx")); # i++ - &dec ("ecx"); # doesn't affect CF! - &jg (&label("sub")); - &sbb ("ebx",0); # upmost overflow is still there - &mov ("ecx","edx"); - &jc (&label("copy")); + &loop (&label("sub")); # doesn't affect CF! + + &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit + &sbb ("eax",0); + &and ("esi","eax"); + ¬ ("eax"); + &mov ("ebp","edi"); + &and ("ebp","eax"); + &or ("esi","ebp"); # tp=carry?tp:rp + + &mov ("ecx","edx"); # num + &xor ("edx","edx"); # i=0 + +&set_label("copy",8); + &mov ("eax",&DWP(0,"esi","edx",4)); + &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp + &mov (&DWP(0,"edi","edx",4),"eax"); + &lea ("edx",&DWP(1,"edx")); # i++ + &loop (&label("copy")); -&set_label("zap",4); &mov ("ebp",$sp); &xor ("eax","eax"); - &lea ("ecx",&DWP(64/4+$pad,"","ecx",4));# size of frame divided by 4 - &mov ("edi","esp"); + + &mov ("ecx",64/4); + &mov ("edi","esp"); # zap frame including scratch area + &data_byte(0xf3,0xab); # rep stosl, bzero + + # zap copies of ap, bp and np + &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap + &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); &data_byte(0xf3,0xab); # rep stosl, bzero &mov ("esp","ebp"); @@ -224,4 +242,6 @@ $sp=&DWP(28,"esp"); &set_label("leave"); &function_end($func); +&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); + &asm_finish(); |