diff options
author | Andy Polyakov <appro@openssl.org> | 2011-06-28 13:53:50 +0000 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2011-06-28 13:53:50 +0000 |
commit | 84968e25f31c76b0e9043002b43bdcc6cad96fc2 (patch) | |
tree | 5e681c3fc83e77f7e2a5fcec90e49e2098b39560 /crypto | |
parent | 10fd0b7b55b2d1ec80c17a97bcb96ab353ced152 (diff) |
x86[_64] assembler pack: back-port SHA1 and RC4 from HEAD.
Diffstat (limited to 'crypto')
-rw-r--r-- | crypto/rc4/asm/rc4-586.pl | 166 | ||||
-rwxr-xr-x | crypto/rc4/asm/rc4-x86_64.pl | 282 | ||||
-rw-r--r-- | crypto/sha/asm/sha1-586.pl | 1107 | ||||
-rwxr-xr-x | crypto/sha/asm/sha1-x86_64.pl | 1180 |
4 files changed, 2480 insertions, 255 deletions
diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index ec82c35b88..84f1a798cb 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -24,10 +24,38 @@ # For reference! This code delivers ~80% of rc4-amd64.pl # performance on the same Opteron machine. # (**) This number requires compressed key schedule set up by -# private_RC4_set_key [see commentary below for further details]. +# RC4_set_key [see commentary below for further details]. # # <appro@fy.chalmers.se> +# May 2011 +# +# Optimize for Core2 and Westmere [and incidentally Opteron]. Current +# performance in cycles per processed byte (less is better) and +# improvement relative to previous version of this module is: +# +# Pentium 10.2 # original numbers +# Pentium III 7.8(*) +# Intel P4 7.5 +# +# Opteron 6.1/+20% # new MMX numbers +# Core2 5.3/+67%(**) +# Westmere 5.1/+94%(**) +# Sandy Bridge 5.0/+8% +# Atom 12.6/+6% +# +# (*) PIII can actually deliver 6.6 cycles per byte with MMX code, +# but this specific code performs poorly on Core2. And vice +# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs +# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU +# [anymore], I chose to discard PIII-specific code path and opt +# for original IALU-only code, which is why MMX/SSE code path +# is guarded by SSE2 bit (see below), not MMX/SSE. +# (**) Performance vs. block size on Core2 and Westmere had a maximum +# at ... 64 bytes block size. And it was quite a maximum, 40-60% +# in comparison to largest 8KB block size. Above improvement +# coefficients are for the largest block size. + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; @@ -62,6 +90,68 @@ sub RC4_loop { &$func ($out,&DWP(0,$dat,$ty,4)); } +if ($alt=0) { + # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron, + # but ~40% slower on Core2 and Westmere... Attempt to add movz + # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet + # on Core2 with movz it's almost 20% slower than below alternative + # code... Yes, it's a total mess... + my @XX=($xx,$out); + $RC4_loop_mmx = sub { # SSE actually... + my $i=shift; + my $j=$i<=0?0:$i>>1; + my $mm=$i<=0?"mm0":"mm".($i&1); + + &add (&LB($yy),&LB($tx)); + &lea (@XX[1],&DWP(1,@XX[0])); + &pxor ("mm2","mm0") if ($i==0); + &psllq ("mm1",8) if ($i==0); + &and (@XX[1],0xff); + &pxor ("mm0","mm0") if ($i<=0); + &mov ($ty,&DWP(0,$dat,$yy,4)); + &mov (&DWP(0,$dat,$yy,4),$tx); + &pxor ("mm1","mm2") if ($i==0); + &mov (&DWP(0,$dat,$XX[0],4),$ty); + &add (&LB($ty),&LB($tx)); + &movd (@XX[0],"mm7") if ($i==0); + &mov ($tx,&DWP(0,$dat,@XX[1],4)); + &pxor ("mm1","mm1") if ($i==1); + &movq ("mm2",&QWP(0,$inp)) if ($i==1); + &movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0); + &pinsrw ($mm,&DWP(0,$dat,$ty,4),$j); + + push (@XX,shift(@XX)) if ($i>=0); + } +} else { + # Using pinsrw here improves performane on Intel CPUs by 2-3%, but + # brings down AMD by 7%... + $RC4_loop_mmx = sub { + my $i=shift; + + &add (&LB($yy),&LB($tx)); + &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); + &mov ($ty,&DWP(0,$dat,$yy,4)); + &mov (&DWP(0,$dat,$yy,4),$tx); + &mov (&DWP(0,$dat,$xx,4),$ty); + &inc ($xx); + &add ($ty,$tx); + &movz ($xx,&LB($xx)); # (*) + &movz ($ty,&LB($ty)); # (*) + &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0); + &movq ("mm0",&QWP(0,$inp)) if ($i<=0); + &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0); + &mov ($tx,&DWP(0,$dat,$xx,4)); + &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); + + # (*) This is the key to Core2 and Westmere performance. + # Whithout movz out-of-order execution logic confuses + # itself and fails to reorder loads and stores. Problem + # appears to be fixed in Sandy Bridge... + } +} + +&external_label("OPENSSL_ia32cap_P"); + # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); &function_begin("RC4"); &mov ($dat,&wparam(0)); # load key schedule pointer @@ -94,11 +184,56 @@ sub RC4_loop { &and ($ty,-4); # how many 4-byte chunks? &jz (&label("loop1")); + &test ($ty,-8); + &mov (&wparam(3),$out); # $out as accumulator in these loops + &jz (&label("go4loop4")); + + &picmeup($out,"OPENSSL_ia32cap_P"); + &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX] + &jnc (&label("go4loop4")); + + &mov ($out,&wparam(3)) if (!$alt); + &movd ("mm7",&wparam(3)) if ($alt); + &and ($ty,-8); + &lea ($ty,&DWP(-8,$inp,$ty)); + &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8 + + &$RC4_loop_mmx(-1); + &jmp(&label("loop_mmx_enter")); + + &set_label("loop_mmx",16); + &$RC4_loop_mmx(0); + &set_label("loop_mmx_enter"); + for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); } + &mov ($ty,$yy); + &xor ($yy,$yy); # this is second key to Core2 + &mov (&LB($yy),&LB($ty)); # and Westmere performance... + &cmp ($inp,&DWP(-4,$dat)); + &lea ($inp,&DWP(8,$inp)); + &jb (&label("loop_mmx")); + + if ($alt) { + &movd ($out,"mm7"); + &pxor ("mm2","mm0"); + &psllq ("mm1",8); + &pxor ("mm1","mm2"); + &movq (&QWP(-8,$out,$inp),"mm1"); + } else { + &psllq ("mm1",56); + &pxor ("mm2","mm1"); + &movq (&QWP(-8,$out,$inp),"mm2"); + } + &emms (); + + &cmp ($inp,&wparam(1)); # compare to input+len + &je (&label("done")); + &jmp (&label("loop1")); + +&set_label("go4loop4",16); &lea ($ty,&DWP(-4,$inp,$ty)); &mov (&wparam(2),$ty); # save input+(len/4)*4-4 - &mov (&wparam(3),$out); # $out as accumulator in this loop - &set_label("loop4",16); + &set_label("loop4"); for ($i=0;$i<4;$i++) { RC4_loop($i); } &ror ($out,8); &xor ($out,&DWP(0,$inp)); @@ -151,7 +286,7 @@ sub RC4_loop { &set_label("done"); &dec (&LB($xx)); - &mov (&BP(-4,$dat),&LB($yy)); # save key->y + &mov (&DWP(-4,$dat),$yy); # save key->y &mov (&BP(-8,$dat),&LB($xx)); # save key->x &set_label("abort"); &function_end("RC4"); @@ -164,10 +299,8 @@ $idi="ebp"; $ido="ecx"; $idx="edx"; -&external_label("OPENSSL_ia32cap_P"); - -# void private_RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); -&function_begin("private_RC4_set_key"); +# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); +&function_begin("RC4_set_key"); &mov ($out,&wparam(0)); # load key &mov ($idi,&wparam(1)); # load len &mov ($inp,&wparam(2)); # load data @@ -245,7 +378,7 @@ $idx="edx"; &xor ("eax","eax"); &mov (&DWP(-8,$out),"eax"); # key->x=0; &mov (&DWP(-4,$out),"eax"); # key->y=0; -&function_end("private_RC4_set_key"); +&function_end("RC4_set_key"); # const char *RC4_options(void); &function_begin_B("RC4_options"); @@ -254,14 +387,21 @@ $idx="edx"; &blindpop("eax"); &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); &picmeup("edx","OPENSSL_ia32cap_P"); - &bt (&DWP(0,"edx"),20); - &jnc (&label("skip")); - &add ("eax",12); - &set_label("skip"); + &mov ("edx",&DWP(0,"edx")); + &bt ("edx",20); + &jc (&label("1xchar")); + &bt ("edx",26); + &jnc (&label("ret")); + &add ("eax",25); + &ret (); +&set_label("1xchar"); + &add ("eax",12); +&set_label("ret"); &ret (); &set_label("opts",64); &asciz ("rc4(4x,int)"); &asciz ("rc4(1x,char)"); +&asciz ("rc4(8x,mmx)"); &asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>"); &align (64); &function_end_B("RC4_options"); diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl index b04eb1a72a..e18e8a0008 100755 --- a/crypto/rc4/asm/rc4-x86_64.pl +++ b/crypto/rc4/asm/rc4-x86_64.pl @@ -7,6 +7,8 @@ # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # +# July 2004 +# # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in # "hand-coded assembler"] doesn't stand for the whole improvement # coefficient. It turned out that eliminating RC4_CHAR from config @@ -19,6 +21,8 @@ # to operate on partial registers, it turned out to be the best bet. # At least for AMD... How IA32E would perform remains to be seen... +# November 2004 +# # As was shown by Marc Bevand reordering of couple of load operations # results in even higher performance gain of 3.3x:-) At least on # Opteron... For reference, 1x in this case is RC4_CHAR C-code @@ -26,6 +30,8 @@ # Latter means that if you want to *estimate* what to expect from # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz. +# November 2004 +# # Intel P4 EM64T core was found to run the AMD64 code really slow... # The only way to achieve comparable performance on P4 was to keep # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to @@ -33,10 +39,14 @@ # on either AMD and Intel platforms, I implement both cases. See # rc4_skey.c for further details... +# April 2005 +# # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing # those with add/sub results in 50% performance improvement of folded # loop... +# May 2005 +# # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T # performance by >30% [unlike P4 32-bit case that is]. But this is # provided that loads are reordered even more aggressively! Both code @@ -50,6 +60,8 @@ # is not implemented, then this final RC4_CHAR code-path should be # preferred, as it provides better *all-round* performance]. +# March 2007 +# # Intel Core2 was observed to perform poorly on both code paths:-( It # apparently suffers from some kind of partial register stall, which # occurs in 64-bit mode only [as virtually identical 32-bit loop was @@ -58,6 +70,37 @@ # fit for Core2 and therefore the code was modified to skip cloop8 on # this CPU. +# May 2010 +# +# Intel Westmere was observed to perform suboptimally. Adding yet +# another movzb to cloop1 improved performance by almost 50%! Core2 +# performance is improved too, but nominally... + +# May 2011 +# +# The only code path that was not modified is P4-specific one. Non-P4 +# Intel code path optimization is heavily based on submission by Maxim +# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used +# some of the ideas even in attempt to optmize the original RC4_INT +# code path... Current performance in cycles per processed byte (less +# is better) and improvement coefficients relative to previous +# version of this module are: +# +# Opteron 5.3/+0%(*) +# P4 6.5 +# Core2 6.2/+15%(**) +# Westmere 4.2/+60% +# Sandy Bridge 4.2/+120% +# Atom 9.3/+80% +# +# (*) But corresponding loop has less instructions, which should have +# positive effect on upcoming Bulldozer, which has one less ALU. +# For reference, Intel code runs at 6.8 cpb rate on Opteron. +# (**) Note that Core2 result is ~15% lower than corresponding result +# for 32-bit code, meaning that it's possible to improve it, +# but more than likely at the cost of the others (see rc4-586.pl +# to get the idea)... + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -76,13 +119,10 @@ $len="%rsi"; # arg2 $inp="%rdx"; # arg3 $out="%rcx"; # arg4 -@XX=("%r8","%r10"); -@TX=("%r9","%r11"); -$YY="%r12"; -$TY="%r13"; - +{ $code=<<___; .text +.extern OPENSSL_ia32cap_P .globl RC4 .type RC4,\@function,4 @@ -95,48 +135,173 @@ RC4: or $len,$len push %r12 push %r13 .Lprologue: + mov $len,%r11 + mov $inp,%r12 + mov $out,%r13 +___ +my $len="%r11"; # reassign input arguments +my $inp="%r12"; +my $out="%r13"; - add \$8,$dat - movl -8($dat),$XX[0]#d - movl -4($dat),$YY#d +my @XX=("%r10","%rsi"); +my @TX=("%rax","%rbx"); +my $YY="%rcx"; +my $TY="%rdx"; + +$code.=<<___; + xor $XX[0],$XX[0] + xor $YY,$YY + + lea 8($dat),$dat + mov -8($dat),$XX[0]#b + mov -4($dat),$YY#b cmpl \$-1,256($dat) je .LRC4_CHAR + mov OPENSSL_ia32cap_P(%rip),%r8d + xor $TX[1],$TX[1] inc $XX[0]#b + sub $XX[0],$TX[1] + sub $inp,$out movl ($dat,$XX[0],4),$TX[0]#d - test \$-8,$len + test \$-16,$len jz .Lloop1 - jmp .Lloop8 + bt \$30,%r8d # Intel CPU? + jc .Lintel + and \$7,$TX[1] + lea 1($XX[0]),$XX[1] + jz .Loop8 + sub $TX[1],$len +.Loop8_warmup: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($inp),$TY#b + movb $TY#b,($out,$inp) + lea 1($inp),$inp + dec $TX[1] + jnz .Loop8_warmup + + lea 1($XX[0]),$XX[1] + jmp .Loop8 .align 16 -.Lloop8: +.Loop8: ___ for ($i=0;$i<8;$i++) { +$code.=<<___ if ($i==7); + add \$8,$XX[1]#b +___ $code.=<<___; add $TX[0]#b,$YY#b - mov $XX[0],$XX[1] movl ($dat,$YY,4),$TY#d - ror \$8,%rax # ror is redundant when $i=0 - inc $XX[1]#b - movl ($dat,$XX[1],4),$TX[1]#d - cmp $XX[1],$YY movl $TX[0]#d,($dat,$YY,4) - cmove $TX[0],$TX[1] - movl $TY#d,($dat,$XX[0],4) + movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d + ror \$8,%r8 # ror is redundant when $i=0 + movl $TY#d,4*$i($dat,$XX[0],4) add $TX[0]#b,$TY#b - movb ($dat,$TY,4),%al + movb ($dat,$TY,4),%r8b ___ -push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers +push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers } $code.=<<___; - ror \$8,%rax + add \$8,$XX[0]#b + ror \$8,%r8 sub \$8,$len - xor ($inp),%rax - add \$8,$inp - mov %rax,($out) - add \$8,$out + xor ($inp),%r8 + mov %r8,($out,$inp) + lea 8($inp),$inp test \$-8,$len - jnz .Lloop8 + jnz .Loop8 + cmp \$0,$len + jne .Lloop1 + jmp .Lexit + +.align 16 +.Lintel: + test \$-32,$len + jz .Lloop1 + and \$15,$TX[1] + jz .Loop16_is_hot + sub $TX[1],$len +.Loop16_warmup: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($inp),$TY#b + movb $TY#b,($out,$inp) + lea 1($inp),$inp + dec $TX[1] + jnz .Loop16_warmup + + mov $YY,$TX[1] + xor $YY,$YY + mov $TX[1]#b,$YY#b + +.Loop16_is_hot: + lea ($dat,$XX[0],4),$XX[1] +___ +sub RC4_loop { + my $i=shift; + my $j=$i<0?0:$i; + my $xmm="%xmm".($j&1); + + $code.=" add \$16,$XX[0]#b\n" if ($i==15); + $code.=" movdqu ($inp),%xmm2\n" if ($i==15); + $code.=" add $TX[0]#b,$YY#b\n" if ($i<=0); + $code.=" movl ($dat,$YY,4),$TY#d\n"; + $code.=" pxor %xmm0,%xmm2\n" if ($i==0); + $code.=" psllq \$8,%xmm1\n" if ($i==0); + $code.=" pxor $xmm,$xmm\n" if ($i<=1); + $code.=" movl $TX[0]#d,($dat,$YY,4)\n"; + $code.=" add $TY#b,$TX[0]#b\n"; + $code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15); + $code.=" movz $TX[0]#b,$TX[0]#d\n"; + $code.=" movl $TY#d,4*$j($XX[1])\n"; + $code.=" pxor %xmm1,%xmm2\n" if ($i==0); + $code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15); + $code.=" add $TX[1]#b,$YY#b\n" if ($i<15); + $code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n"; + $code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0); + $code.=" lea 16($inp),$inp\n" if ($i==0); + $code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15); +} + RC4_loop(-1); +$code.=<<___; + jmp .Loop16_enter +.align 16 +.Loop16: +___ + +for ($i=0;$i<16;$i++) { + $code.=".Loop16_enter:\n" if ($i==1); + RC4_loop($i); + push(@TX,shift(@TX)); # "rotate" registers +} +$code.=<<___; + mov $YY,$TX[1] + xor $YY,$YY # keyword to partial register + sub \$16,$len + mov $TX[1]#b,$YY#b + test \$-16,$len + jnz .Loop16 + + psllq \$8,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm1,%xmm2 + movdqu %xmm2,($out,$inp) + lea 16($inp),$inp + cmp \$0,$len jne .Lloop1 jmp .Lexit @@ -152,9 +317,8 @@ $code.=<<___; movl ($dat,$TX[0],4),$TY#d movl ($dat,$XX[0],4),$TX[0]#d xorb ($inp),$TY#b - inc $inp - movb $TY#b,($out) - inc $out + movb $TY#b,($out,$inp) + lea 1($inp),$inp dec $len jnz .Lloop1 jmp .Lexit @@ -165,13 +329,11 @@ $code.=<<___; movzb ($dat,$XX[0]),$TX[0]#d test \$-8,$len jz .Lcloop1 - cmpl \$0,260($dat) - jnz .Lcloop1 jmp .Lcloop8 .align 16 .Lcloop8: - mov ($inp),%eax - mov 4($inp),%ebx + mov ($inp),%r8d + mov 4($inp),%r9d ___ # unroll 2x4-wise, because 64-bit rotates kill Intel P4... for ($i=0;$i<4;$i++) { @@ -188,8 +350,8 @@ $code.=<<___; mov $TX[0],$TX[1] .Lcmov$i: add $TX[0]#b,$TY#b - xor ($dat,$TY),%al - ror \$8,%eax + xor ($dat,$TY),%r8b + ror \$8,%r8d ___ push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers } @@ -207,16 +369,16 @@ $code.=<<___; mov $TX[0],$TX[1] .Lcmov$i: add $TX[0]#b,$TY#b - xor ($dat,$TY),%bl - ror \$8,%ebx + xor ($dat,$TY),%r9b + ror \$8,%r9d ___ push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers } $code.=<<___; lea -8($len),$len - mov %eax,($out) + mov %r8d,($out) lea 8($inp),$inp - mov %ebx,4($out) + mov %r9d,4($out) lea 8($out),$out test \$-8,$len @@ -229,6 +391,7 @@ $code.=<<___; .align 16 .Lcloop1: add $TX[0]#b,$YY#b + movzb $YY#b,$YY#d movzb ($dat,$YY),$TY#d movb $TX[0]#b,($dat,$YY) movb $TY#b,($dat,$XX[0]) @@ -260,16 +423,16 @@ $code.=<<___; ret .size RC4,.-RC4 ___ +} $idx="%r8"; $ido="%r9"; $code.=<<___; -.extern OPENSSL_ia32cap_P -.globl private_RC4_set_key -.type private_RC4_set_key,\@function,3 +.globl RC4_set_key +.type RC4_set_key,\@function,3 .align 16 -private_RC4_set_key: +RC4_set_key: lea 8($dat),$dat lea ($inp,$len),$inp neg $len @@ -280,12 +443,9 @@ private_RC4_set_key: xor %r11,%r11 mov OPENSSL_ia32cap_P(%rip),$idx#d - bt \$20,$idx#d - jnc .Lw1stloop - bt \$30,$idx#d - setc $ido#b - mov $ido#d,260($dat) - jmp .Lc1stloop + bt \$20,$idx#d # RC4_CHAR? + jc .Lc1stloop + jmp .Lw1stloop .align 16 .Lw1stloop: @@ -339,7 +499,7 @@ private_RC4_set_key: mov %eax,-8($dat) mov %eax,-4($dat) ret -.size private_RC4_set_key,.-private_RC4_set_key +.size RC4_set_key,.-RC4_set_key .globl RC4_options .type RC4_options,\@abi-omnipotent @@ -348,18 +508,20 @@ RC4_options: lea .Lopts(%rip),%rax mov OPENSSL_ia32cap_P(%rip),%edx bt \$20,%edx - jnc .Ldone - add \$12,%rax + jc .L8xchar bt \$30,%edx jnc .Ldone - add \$13,%rax + add \$25,%rax + ret +.L8xchar: + add \$12,%rax .Ldone: ret .align 64 .Lopts: .asciz "rc4(8x,int)" .asciz "rc4(8x,char)" -.asciz "rc4(1x,char)" +.asciz "rc4(16x,int)" .asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 64 .size RC4_options,.-RC4_options @@ -497,7 +659,17 @@ key_se_handler: ___ } -$code =~ s/#([bwd])/$1/gm; +sub reg_part { +my ($reg,$conv)=@_; + if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } + elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } + elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } + elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } + return $reg; +} + +$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; +$code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl index a1f876281a..1084d227fe 100644 --- a/crypto/sha/asm/sha1-586.pl +++ b/crypto/sha/asm/sha1-586.pl @@ -12,6 +12,8 @@ # commentary below], and in 2006 the rest was rewritten in order to # gain freedom to liberate licensing terms. +# January, September 2004. +# # It was noted that Intel IA-32 C compiler generates code which # performs ~30% *faster* on P4 CPU than original *hand-coded* # SHA1 assembler implementation. To address this problem (and @@ -31,12 +33,92 @@ # ---------------------------------------------------------------- # <appro@fy.chalmers.se> +# August 2009. +# +# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as +# '(c&d) + (b&(c^d))', which allows to accumulate partial results +# and lighten "pressure" on scratch registers. This resulted in +# >12% performance improvement on contemporary AMD cores (with no +# degradation on other CPUs:-). Also, the code was revised to maximize +# "distance" between instructions producing input to 'lea' instruction +# and the 'lea' instruction itself, which is essential for Intel Atom +# core and resulted in ~15% improvement. + +# October 2010. +# +# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it +# is to offload message schedule denoted by Wt in NIST specification, +# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel, +# and in SSE2 context was first explored by Dean Gaudet in 2004, see +# http://arctic.org/~dean/crypto/sha1.html. Since then several things +# have changed that made it interesting again: +# +# a) XMM units became faster and wider; +# b) instruction set became more versatile; +# c) an important observation was made by Max Locktykhin, which made +# it possible to reduce amount of instructions required to perform +# the operation in question, for further details see +# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/. + +# April 2011. +# +# Add AVX code path, probably most controversial... The thing is that +# switch to AVX alone improves performance by as little as 4% in +# comparison to SSSE3 code path. But below result doesn't look like +# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as +# pair of µ-ops, and it's the additional µ-ops, two per round, that +# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded +# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with +# equivalent 'sh[rl]d' that is responsible for the impressive 5.1 +# cycles per processed byte. But 'sh[rl]d' is not something that used +# to be fast, nor does it appear to be fast in upcoming Bulldozer +# [according to its optimization manual]. Which is why AVX code path +# is guarded by *both* AVX and synthetic bit denoting Intel CPUs. +# One can argue that it's unfair to AMD, but without 'sh[rl]d' it +# makes no sense to keep the AVX code path. If somebody feels that +# strongly, it's probably more appropriate to discuss possibility of +# using vector rotate XOP on AMD... + +###################################################################### +# Current performance is summarized in following table. Numbers are +# CPU clock cycles spent to process single byte (less is better). +# +# x86 SSSE3 AVX +# Pentium 15.7 - +# PIII 11.5 - +# P4 10.6 - +# AMD K8 7.1 - +# Core2 7.3 6.1/+20% - +# Atom 12.5 9.5(*)/+32% - +# Westmere 7.3 5.6/+30% - +# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70% +# +# (*) Loop is 1056 instructions long and expected result is ~8.25. +# It remains mystery [to me] why ILP is limited to 1.7. +# +# (**) As per above comment, the result is for AVX *plus* sh[rl]d. + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); +$xmm=$ymm=0; +for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } + +$ymm=1 if ($xmm && + `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/ && + $1>=2.19); # first version supporting AVX + +$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && + $1>=2.03); # first version supporting AVX + +&external_label("OPENSSL_ia32cap_P") if ($xmm); + + $A="eax"; $B="ebx"; $C="ecx"; @@ -47,6 +129,10 @@ $tmp1="ebp"; @V=($A,$B,$C,$D,$E,$T); +$alt=0; # 1 denotes alternative IALU implementation, which performs + # 8% *worse* on P4, same on Westmere and Atom, 2% better on + # Sandy Bridge... + sub BODY_00_15 { local($n,$a,$b,$c,$d,$e,$f)=@_; @@ -59,16 +145,18 @@ sub BODY_00_15 &rotl($tmp1,5); # tmp1=ROTATE(a,5) &xor($f,$d); &add($tmp1,$e); # tmp1+=e; - &and($f,$b); - &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded + &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded # with xi, also note that e becomes # f in next round... - &xor($f,$d); # f holds F_00_19(b,c,d) + &and($f,$b); &rotr($b,2); # b=ROTATE(b,30) - &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi + &xor($f,$d); # f holds F_00_19(b,c,d) + &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi - if ($n==15) { &add($f,$tmp1); } # f+=tmp1 + if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round + &add($f,$tmp1); } # f+=tmp1 else { &add($tmp1,$f); } # f becomes a in next round + &mov($tmp1,$a) if ($alt && $n==15); } sub BODY_16_19 @@ -77,22 +165,41 @@ sub BODY_16_19 &comment("16_19 $n"); - &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) - &xor($f,&swtmp(($n+2)%16)); - &xor($tmp1,$d); - &xor($f,&swtmp(($n+8)%16)); - &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d) - &rotr($b,2); # b=ROTATE(b,30) +if ($alt) { + &xor($c,$d); + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d + &xor($f,&swtmp(($n+8)%16)); + &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &add($e,$tmp1); # e+=F_00_19(b,c,d) + &xor($c,$d); # restore $c + &mov($tmp1,$a); # b in next round + &rotr($b,$n==16?2:7); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f); # xi=f + &rotl($a,5); # ROTATE(a,5) + &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) +} else { + &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) - &mov(&swtmp($n%16),$f); # xi=f - &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e - &mov($e,$a); # e becomes volatile - &rotl($e,5); # e=ROTATE(a,5) - &add($f,$tmp1); # f+=F_00_19(b,c,d) - &add($f,$e); # f+=ROTATE(a,5) + &add($e,$tmp1); # e+=F_00_19(b,c,d) + &mov($tmp1,$a); + &rotr($b,2); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f); # xi=f + &rotl($tmp1,5); # ROTATE(a,5) + &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$tmp1); # f+=ROTATE(a,5) +} } sub BODY_20_39 @@ -102,21 +209,41 @@ sub BODY_20_39 &comment("20_39 $n"); +if ($alt) { + &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) + &xor($f,&swtmp(($n+8)%16)); + &add($e,$tmp1); # e+=F_20_39(b,c,d) + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &mov($tmp1,$a); # b in next round + &rotr($b,7); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f) if($n<77);# xi=f + &rotl($a,5); # ROTATE(a,5) + &xor($b,$c) if($n==39);# warm up for BODY_40_59 + &and($tmp1,$b) if($n==39); + &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY + &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) + &rotr($a,5) if ($n==79); +} else { &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) - &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &rotr($b,2); # b=ROTATE(b,30) - &xor($f,&swtmp(($n+2)%16)); + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$c); &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) - &add($tmp1,$e); - &mov(&swtmp($n%16),$f); # xi=f - &mov($e,$a); # e becomes volatile - &rotl($e,5); # e=ROTATE(a,5) - &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e - &add($f,$e); # f+=ROTATE(a,5) + &add($e,$tmp1); # e+=F_20_39(b,c,d) + &rotr($b,2); # b=ROTATE(b,30) + &mov($tmp1,$a); + &rotl($tmp1,5); # ROTATE(a,5) + &mov(&swtmp($n%16),$f) if($n<77);# xi=f + &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY + &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round + &add($f,$tmp1); # f+=ROTATE(a,5) +} } sub BODY_40_59 @@ -125,41 +252,86 @@ sub BODY_40_59 &comment("40_59 $n"); - &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &mov($tmp1,&swtmp(($n+2)%16)); - &xor($f,$tmp1); - &mov($tmp1,&swtmp(($n+8)%16)); - &xor($f,$tmp1); - &mov($tmp1,&swtmp(($n+13)%16)); - &xor($f,$tmp1); # f holds xa^xb^xc^xd - &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d) +if ($alt) { + &add($e,$tmp1); # e+=b&(c^d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &mov($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &xor($c,$d); # restore $c + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) - &or($tmp1,$c); - &mov(&swtmp($n%16),$f); # xi=f - &and($tmp1,$d); - &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e - &mov($e,$b); # e becomes volatile and is used - # to calculate F_40_59(b,c,d) + &and($tmp1,$c); + &rotr($b,7); # b=ROTATE(b,30) + &add($e,$tmp1); # e+=c&d + &mov($tmp1,$a); # b in next round + &mov(&swtmp($n%16),$f); # xi=f + &rotl($a,5); # ROTATE(a,5) + &xor($b,$c) if ($n<59); + &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d) + &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d)) + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) +} else { + &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &and($tmp1,$b); + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &add($tmp1,$e); # b&(c^d)+=e &rotr($b,2); # b=ROTATE(b,30) - &and($e,$c); - &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d) - &mov($e,$a); - &rotl($e,5); # e=ROTATE(a,5) - &add($f,$tmp1); # f+=tmp1; + &mov($e,$a); # e becomes volatile + &rotl($e,5); # ROTATE(a,5) + &mov(&swtmp($n%16),$f); # xi=f + &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) + &mov($tmp1,$c); &add($f,$e); # f+=ROTATE(a,5) + &and($tmp1,$d); + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$tmp1); # f+=c&d +} } &function_begin("sha1_block_data_order"); +if ($xmm) { + &static_label("ssse3_shortcut"); + &static_label("avx_shortcut") if ($ymm); + &static_label("K_XX_XX"); + + &call (&label("pic_point")); # make it PIC! + &set_label("pic_point"); + &blindpop($tmp1); + &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point")); + &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); + + &mov ($A,&DWP(0,$T)); + &mov ($D,&DWP(4,$T)); + &test ($D,1<<9); # check SSSE3 bit + &jz (&label("x86")); + &test ($A,1<<24); # check FXSR bit + &jz (&label("x86")); + if ($ymm) { + &and ($D,1<<28); # mask AVX bit + &a |