RC4_set_key for x86_64 and Core2 optimization.

PR: 1447
author: Andy Polyakov <appro@openssl.org> 2007-04-02 09:50:14 +0000
committer: Andy Polyakov <appro@openssl.org> 2007-04-02 09:50:14 +0000
commit: 9babf3929bf1f546aa646d9e1e2a934ccfe0b333 (patch)
tree: 9b31da5ec08ad7c267b081a3c3a3ff340d9c1fc0 /crypto
parent: 2ec0be9e778b7603494f8b9b1ccfc12b9a269760 (diff)
2 files changed, 173 insertions, 0 deletions
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 5236afec12..36a9429ef7 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -49,6 +49,14 @@
 # is not implemented, then this final RC4_CHAR code-path should be
 # preferred, as it provides better *all-round* performance].
 
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as a virtually identical 32-bit loop was
+# observed to outperform the 64-bit one by almost 50%]. Adding two movzb
+# to cloop1 boosts its performance by 80%! This loop appears to be the
+# optimal fit for Core2 and therefore the code was modified to skip
+# cloop8 on this CPU.
+
 $output=shift;
 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
 
@@ -152,6 +160,8 @@ $code.=<<___;
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
+	cmp	\$0,260($dat)
+	jnz	.Lcloop1
 	push	%rbx
 	jmp	.Lcloop8
 .align	16
@@ -235,6 +245,111 @@ $code.=<<___;
 .size	RC4,.-RC4
 ___
 
+$idx="%r8";
+$ido="%r9";
+
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat	# dat now points 8 bytes in; the two dwords before it are zeroed at .Lexit_key
+	lea	($inp,$len),$inp	# inp = one past the last key byte
+	neg	$len	# len = -keylen, so (inp,len) addresses key byte 0 and counts up to zero
+	mov	$len,%rcx	# rcx keeps -keylen to rewind len when the key is exhausted
+	xor	%eax,%eax	# rax = 0: counter for the table-fill loop (only al is incremented)
+	xor	$ido,$ido	# cleared so that setc below yields a clean 0/1 dword
+	xor	%r10,%r10	# r10/r11 hold the two table elements being swapped
+	xor	%r11,%r11
+
+	mov	OPENSSL_ia32cap_P(%rip),$idx#d	# capability word; bits 20 and 30 are set by OPENSSL_ia32_cpuid
+	bt	\$20,$idx#d	# bit 20 requests byte-sized table elements
+	jnc	.Lw1stloop	# bit clear: build a table of 32-bit elements
+	bt	\$30,$idx#d	# bit 30 requests skipping the unrolled loop
+	setc	$ido#b
+	mov	$ido#d,260($dat)	# 0/1 flag at offset 260; RC4 branches to .Lcloop1 when it is nonzero
+	jmp	.Lc1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)	# table[i] = i, one dword per element
+	add	\$1,%al
+	jnc	.Lw1stloop	# carry out of al after 256 iterations ends the loop
+
+	xor	$ido,$ido	# ido = outer index, 0..255
+	xor	$idx,$idx	# idx = running 8-bit index; only its low byte is ever updated
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d	# r10d = table[ido]
+	add	($inp,$len,1),$idx#b	# idx += current key byte (mod 256)
+	add	%r10b,$idx#b	# idx += table[ido] (mod 256)
+	add	\$1,$len	# next key byte; ZF set when the end of the key is reached
+	mov	($dat,$idx,4),%r11d	# r11d = table[idx]; mov leaves ZF intact for cmovz
+	cmovz	%rcx,$len	# wrap to the first key byte without a branch
+	mov	%r10d,($dat,$idx,4)	# swap table[ido] and table[idx]
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop	# 256 iterations, then carry out of the low byte
+	jmp	.Lexit_key
+
+.align	16
+.Lc1stloop:
+	mov	%al,($dat,%rax)	# table[i] = i, one byte per element
+	add	\$1,%al
+	jnc	.Lc1stloop	# carry out of al after 256 iterations ends the loop
+
+	xor	$ido,$ido	# ido = outer index, 0..255
+	xor	$idx,$idx	# idx = running 8-bit index; only its low byte is ever updated
+.align	16
+.Lc2ndloop:
+	mov	($dat,$ido),%r10b	# r10b = table[ido]
+	add	($inp,$len),$idx#b	# idx += current key byte (mod 256)
+	add	%r10b,$idx#b	# idx += table[ido] (mod 256)
+	add	\$1,$len	# next key byte; ZF set when the end of the key is reached
+	mov	($dat,$idx),%r11b	# r11b = table[idx]; mov leaves ZF intact for jnz
+	jnz	.Lcnowrap
+	mov	%rcx,$len	# wrap to the first key byte
+.Lcnowrap:
+	mov	%r10b,($dat,$idx)	# swap table[ido] and table[idx]
+	mov	%r11b,($dat,$ido)
+	add	\$1,$ido#b
+	jnc	.Lc2ndloop	# 256 iterations, then carry out of the low byte
+	movl	\$-1,256($dat)	# dword just past the 256-byte table; presumably the byte-layout marker RC4 checks - confirm in RC4
+
+.align	16
+.Lexit_key:
+	xor	%eax,%eax
+	mov	%eax,-8($dat)	# zero the two dwords skipped at entry (presumably the x and y indices - confirm against RC4_KEY)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@function,0
+.align	16
+RC4_options:
+	.picmeup %rax	# translator pseudo-op; presumably loads the current address into rax for the lea below - confirm in x86_64-xlate.pl
+	lea	.Lopts-.(%rax),%rax	# rax = address of the first string at .Lopts
+	mov	OPENSSL_ia32cap_P(%rip),%edx	# same capability word that RC4_set_key tests
+	bt	\$20,%edx	# bit 20 clear: return the first string, "rc4(8x,int)"
+	jnc	.Ldone
+	add	\$12,%rax	# skip "rc4(8x,int)" and its NUL (11 + 1 bytes)
+	bt	\$30,%edx	# bit 30 clear: return the second string, "rc4(8x,char)"
+	jnc	.Ldone
+	add	\$13,%rax	# skip "rc4(8x,char)" and its NUL (12 + 1 bytes)
+.Ldone:
+	ret	# rax = pointer to the selected NUL-terminated string
+.align	64
+.Lopts:
+.asciz	"rc4(8x,int)"
+.asciz	"rc4(8x,char)"
+.asciz	"rc4(1x,char)"
+.asciz	"RC4 for x86_64, OpenSSL project"	# never selected by the code above
+.align	64
+.size	RC4_options,.-RC4_options
+___
+
 $code =~ s/#([bwd])/$1/gm;
 
 print $code;
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index 4d88ad191b..f9f2827636 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -48,8 +48,37 @@ OPENSSL_wipe_cpu	ENDP
 
 OPENSSL_ia32_cpuid	PROC
 	mov	r8,rbx
+
+	xor	eax,eax
+	cpuid
+	xor	eax,eax
+	cmp	ebx,0756e6547h
+	setne	al
+	mov	r9d,eax
+	cmp	edx,049656e69h
+	setne	al
+	or	r9d,eax
+	cmp	ecx,06c65746eh
+	setne	al
+	or	r9d,eax
+
 	mov	eax,1
 	cpuid
+	bt	edx,28
+	jnc	\$Ldone
+	cmp	r9,0
+	jne	\$Lnotintel
+	or	edx,000100000h
+	and	ah,15
+	cmp	ah,15
+	je	\$Lnotintel
+	or	edx,040000000h
+\$Lnotintel:
+	shr	ebx,16
+	cmp	bl,1
+	ja	\$Ldone
+	and	edx,0efffffffh
+\$Ldone:
 	shl	rcx,32
 	mov	eax,edx
 	mov	rbx,r8
@@ -124,8 +153,37 @@ OPENSSL_wipe_cpu:
 .align	16
 OPENSSL_ia32_cpuid:
 	movq	%rbx,%r8
+
+	xor	%eax,%eax
+	cpuid
+	xor	%eax,%eax
+	cmp	\$0x756e6547,%ebx	# "Genu"
+	setne	%al
+	mov	%eax,%r9d
+	cmp	\$0x49656e69,%edx	# "ineI"
+	setne	%al
+	or	%eax,%r9d
+	cmp	\$0x6c65746e,%ecx	# "ntel"
+	setne	%al
+	or	%eax,%r9d
+
 	movl	\$1,%eax
 	cpuid
+	bt	\$28,%edx		# test hyper-threading bit
+	jnc	.Ldone
+	cmp	\$0,%r9
+	jne	.Lnotintel
+	or	\$1<<20,%edx		# use reserved bit to engage RC4_CHAR
+	and	\$15,%ah
+	cmp	\$15,%ah		# examine Family ID
+	je	.Lnotintel
+	or	\$1<<30,%edx		# use reserved bit to skip unrolled loop
+.Lnotintel:
+	shr	\$16,%ebx
+	cmp	\$1,%bl			# see if cache is shared
+	ja	.Ldone
+	and	\$~(1<<28),%edx
+.Ldone:
 	shlq	\$32,%rcx
 	movl	%edx,%eax
 	movq	%r8,%rbx
author	Andy Polyakov <appro@openssl.org>	2007-04-02 09:50:14 +0000
committer	Andy Polyakov <appro@openssl.org>	2007-04-02 09:50:14 +0000
commit	9babf3929bf1f546aa646d9e1e2a934ccfe0b333 (patch)
tree	9b31da5ec08ad7c267b081a3c3a3ff340d9c1fc0 /crypto
parent	2ec0be9e778b7603494f8b9b1ccfc12b9a269760 (diff)