diff options
author | Andy Polyakov <appro@openssl.org> | 2007-04-02 09:50:14 +0000 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2007-04-02 09:50:14 +0000 |
commit | 9babf3929bf1f546aa646d9e1e2a934ccfe0b333 (patch) | |
tree | 9b31da5ec08ad7c267b081a3c3a3ff340d9c1fc0 /crypto | |
parent | 2ec0be9e778b7603494f8b9b1ccfc12b9a269760 (diff) |
RC4_set_key for x86_64 and Core2 optimization.
PR: 1447
Diffstat (limited to 'crypto')
-rwxr-xr-x | crypto/rc4/asm/rc4-x86_64.pl | 115 | ||||
-rw-r--r-- | crypto/x86_64cpuid.pl | 58 |
2 files changed, 173 insertions, 0 deletions
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 5236afec12..36a9429ef7 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -49,6 +49,14 @@
 # is not implemented, then this final RC4_CHAR code-path should be
 # preferred, as it provides better *all-round* performance].
 
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
+
 $output=shift;
 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
 
@@ -152,6 +160,8 @@ $code.=<<___;
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
+	cmp	\$0,260($dat)
+	jnz	.Lcloop1
 	push	%rbx
 	jmp	.Lcloop8
 .align	16
@@ -235,6 +245,111 @@ $code.=<<___;
 .size	RC4,.-RC4
 ___
 
+$idx="%r8";
+$ido="%r9";
+
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat
+	lea	($inp,$len),$inp
+	neg	$len
+	mov	$len,%rcx
+	xor	%eax,%eax
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
+
+	mov	OPENSSL_ia32cap_P(%rip),$idx#d
+	bt	\$20,$idx#d
+	jnc	.Lw1stloop
+	bt	\$30,$idx#d
+	setc	$ido#b
+	mov	$ido#d,260($dat)
+	jmp	.Lc1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)
+	add	\$1,%al
+	jnc	.Lw1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d
+	add	($inp,$len,1),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx,4),%r11d
+	cmovz	%rcx,$len
+	mov	%r10d,($dat,$idx,4)
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop
+	jmp	.Lexit_key
+
+.align	16
+.Lc1stloop:
+	mov	%al,($dat,%rax)
+	add	\$1,%al
+	jnc	.Lc1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lc2ndloop:
+	mov	($dat,$ido),%r10b
+	add	($inp,$len),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx),%r11b
+	jnz	.Lcnowrap
+	mov	%rcx,$len
+.Lcnowrap:
+	mov	%r10b,($dat,$idx)
+	mov	%r11b,($dat,$ido)
+	add	\$1,$ido#b
+	jnc	.Lc2ndloop
+	movl	\$-1,256($dat)
+
+.align	16
+.Lexit_key:
+	xor	%eax,%eax
+	mov	%eax,-8($dat)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@function,0
+.align	16
+RC4_options:
+	.picmeup %rax
+	lea	.Lopts-.(%rax),%rax
+	mov	OPENSSL_ia32cap_P(%rip),%edx
+	bt	\$20,%edx
+	jnc	.Ldone
+	add	\$12,%rax
+	bt	\$30,%edx
+	jnc	.Ldone
+	add	\$13,%rax
+.Ldone:
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(8x,int)"
+.asciz	"rc4(8x,char)"
+.asciz	"rc4(1x,char)"
+.asciz	"RC4 for x86_64, OpenSSL project"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+
 $code =~ s/#([bwd])/$1/gm;
 
 print $code;
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index 4d88ad191b..f9f2827636 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -48,8 +48,37 @@ OPENSSL_wipe_cpu	ENDP
 
 OPENSSL_ia32_cpuid	PROC
 	mov	r8,rbx
+
+	xor	eax,eax
+	cpuid
+	xor	eax,eax
+	cmp	ebx,0756e6547h
+	setne	al
+	mov	r9d,eax
+	cmp	edx,049656e69h
+	setne	al
+	or	r9d,eax
+	cmp	ecx,06c65746eh
+	setne	al
+	or	r9d,eax
+
 	mov	eax,1
 	cpuid
+	bt	edx,28
+	jnc	\$Ldone
+	cmp	r9,0
+	jne	\$Lnotintel
+	or	edx,000100000h
+	and	ah,15
+	cmp	ah,15
+	je	\$Lnotintel
+	or	edx,040000000h
+\$Lnotintel:
+	shr	ebx,16
+	cmp	bl,1
+	ja	\$Ldone
+	and	edx,0efffffffh
+\$Ldone:
 	shl	rcx,32
 	mov	eax,edx
 	mov	rbx,r8
@@ -124,8 +153,37 @@ OPENSSL_wipe_cpu:
 .align	16
 OPENSSL_ia32_cpuid:
 	movq	%rbx,%r8
+
+	xor	%eax,%eax
+	cpuid
+	xor	%eax,%eax
+	cmp	\$0x756e6547,%ebx	# "Genu"
+	setne	%al
+	mov	%eax,%r9d
+	cmp	\$0x49656e69,%edx	# "ineI"
+	setne	%al
+	or	%eax,%r9d
+	cmp	\$0x6c65746e,%ecx	# "ntel"
+	setne	%al
+	or	%eax,%r9d
+
 	movl	\$1,%eax
 	cpuid
+	bt	\$28,%edx		# test hyper-threading bit
+	jnc	.Ldone
+	cmp	\$0,%r9
+	jne	.Lnotintel
+	or	\$1<<20,%edx		# use reserved bit to engage RC4_CHAR
+	and	\$15,%ah
+	cmp	\$15,%ah		# examine Family ID
+	je	.Lnotintel
+	or	\$1<<30,%edx		# use reserved bit to skip unrolled loop
+.Lnotintel:
+	shr	\$16,%ebx
+	cmp	\$1,%bl			# see if cache is shared
+	ja	.Ldone
+	and	\$~(1<<28),%edx
+.Ldone:
 	shlq	\$32,%rcx
 	movl	%edx,%eax
 	movq	%r8,%rbx