chacha/asm/chacha-x86_64.pl: add dedicated path for 128-byte inputs.

128-byte vectors are used extensively in chacha20_poly1305_tls_cipher, and the dedicated code path is ~30-50% faster on most platforms. Reviewed-by: Rich Salz <rsalz@openssl.org> (Merged from https://github.com/openssl/openssl/pull/6626)
author: Andy Polyakov <appro@openssl.org> 2018-07-02 13:16:33 +0200
committer: Andy Polyakov <appro@openssl.org> 2018-07-03 19:02:02 +0200
commit: d5487a454c485eb6f9aef7fb0cb1c0681a06fd25 (patch)
tree: 55b50bcd779367e575718e8f960e01e86c6b879d /crypto/chacha
parent: b068a9b914887af5cc99895754412582fbb0e10b (diff)
1 files changed, 221 insertions, 74 deletions
diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl
index 51bb6a965c..b54f3b1525 100755
--- a/crypto/chacha/asm/chacha-x86_64.pl
+++ b/crypto/chacha/asm/chacha-x86_64.pl
@@ -1,5 +1,5 @@
 #! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
@@ -28,33 +28,32 @@
 #
 # Performance in cycles per byte out of large buffer.
 #
-#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    NxAVX(v)
+#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
 #
-# P4		9.48/+99%	-/22.7(ii)	-
-# Core2		7.83/+55%	7.90/8.08	4.35
-# Westmere	7.19/+50%	5.60/6.70	3.00
-# Sandy Bridge	8.31/+42%	5.45/6.76	2.72
-# Ivy Bridge	6.71/+46%	5.40/6.49	2.41
-# Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
-# Skylake[-X]	5.87/+39%	4.70/-		2.31	    1.19[0.80(vi)]
-# Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
-# Knights L	11.7/-		-		9.60(iii)   0.80
-# Goldmont	10.6/+17%	5.10/-		3.28
-# Sledgehammer	7.28/+52%	-/14.2(ii)	-
-# Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
-# Ryzen		5.96/+50%	5.19/-		2.40        2.09
-# VIA Nano	10.5/+46%	6.72/8.60	6.05
+# P4		9.48/+99%	-		-
+# Core2		7.83/+55%	7.90/5.76	4.35
+# Westmere	7.19/+50%	5.60/4.50	3.00
+# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
+# Ivy Bridge	6.71/+46%	5.40/?		2.41
+# Haswell	5.92/+43%	5.20/3.45	2.42        1.23
+# Skylake[-X]	5.87/+39%	4.70/3.22	2.31        1.19[0.80(vi)]
+# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
+# Knights L	11.7/-		?		9.60(iii)   0.80
+# Goldmont	10.6/+17%	5.10/3.52	3.28
+# Sledgehammer	7.28/+52%	-		-
+# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
+# Ryzen		5.96/+50%	5.19/3.00	2.40        2.09
+# VIA Nano	10.5/+46%	6.72/6.88	6.05
 #
 # (i)	compared to older gcc 3.x one can observe >2x improvement on
 #	most platforms;
-# (ii)	as it can be seen, SSE2 performance is too low on legacy
-#	processors; NxSSE2 results are naturally better, but not
-#	impressively better than IALU ones, which is why you won't
-#	find SSE2 code below;
+# (ii)	2xSSSE3 is a code path optimized specifically for the 128-byte
+#	inputs used by chacha20_poly1305_tls_cipher; results are EVP-free;
 # (iii)	this is not optimal result for Atom because of MSROM
 #	limitations, SSE2 can do better, but gain is considered too
 #	low to justify the [maintenance] effort;
-# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20;
+# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20
+#	and 4.85 for 128-byte inputs;
 # (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
 # (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
 #	cpb in single thread, the corresponding capability is suppressed;
@@ -489,6 +488,7 @@ $code.=<<___	if ($avx);
 ___
 $code.=<<___;
 	cmp	\$128,$len		# we might throw away some data,
+	je	.LChaCha20_128
 	ja	.LChaCha20_4x		# but overall it won't be slower
 
 .Ldo_sse3_after_all:
@@ -606,6 +606,172 @@ ___
 }
 
 ########################################################################
+# SSSE3 code path that handles 128-byte inputs
+{
+my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
+my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
+
+sub SSSE3ROUND_2x {			# four add/xor/rotate stages on two register sets, interleaved
+	&paddd	($a,$b);		# a += b
+	&pxor	($d,$a);		# d ^= a
+	 &paddd	($a1,$b1);		# extra-indented lines repeat the step on the $a1..$d1 set
+	 &pxor	($d1,$a1);
+	&pshufb	($d,$rot16);		# byte shuffle by the $rot16 mask; presumably a 16-bit rotate of each dword - confirm against .Lrot16
+	 &pshufb($d1,$rot16);
+
+	&paddd	($c,$d);		# c += d
+	 &paddd	($c1,$d1);
+	&pxor	($b,$c);		# b ^= c
+	 &pxor	($b1,$c1);
+	&movdqa	($t,$b);		# keep a copy of b: SSE has no dword rotate, so use two shifts and an OR
+	&psrld	($b,20);		# b >>= 20 ...
+	 &movdqa($t1,$b1);
+	&pslld	($t,12);		# ... copy <<= 12 ...
+	 &psrld	($b1,20);
+	&por	($b,$t);		# ... OR-ed together: b rotated left by 12
+	 &pslld	($t1,12);
+	 &por	($b1,$t1);
+
+	&paddd	($a,$b);		# a += b
+	&pxor	($d,$a);		# d ^= a
+	 &paddd	($a1,$b1);
+	 &pxor	($d1,$a1);
+	&pshufb	($d,$rot24);		# byte shuffle by the $rot24 mask; presumably a byte-granular rotate - confirm against .Lrot24
+	 &pshufb($d1,$rot24);
+
+	&paddd	($c,$d);		# c += d
+	 &paddd	($c1,$d1);
+	&pxor	($b,$c);		# b ^= c
+	 &pxor	($b1,$c1);
+	&movdqa	($t,$b);		# same shift/shift/OR pattern as above
+	&psrld	($b,25);		# b >>= 25 ...
+	 &movdqa($t1,$b1);
+	&pslld	($t,7);			# ... copy <<= 7 ...
+	 &psrld	($b1,25);
+	&por	($b,$t);		# ... OR-ed together: b rotated left by 7
+	 &pslld	($t1,7);
+	 &por	($b1,$t1);
+}
+
+my $xframe = $win64 ? 0x68 : 8;	# extra frame bytes; on Win64 0x68 covers the xmm6-xmm11 save slots at -0x68..-0x18(%r9)
+
+$code.=<<___;
+.type	ChaCha20_128,\@function,5
+.align	32
+ChaCha20_128:
+.cfi_startproc
+.LChaCha20_128:
+	mov	%rsp,%r9		# frame pointer
+.cfi_def_cfa_register	%r9
+	sub	\$64+$xframe,%rsp
+___
+$code.=<<___	if ($win64);
+	movaps	%xmm6,-0x68(%r9)
+	movaps	%xmm7,-0x58(%r9)
+	movaps	%xmm8,-0x48(%r9)
+	movaps	%xmm9,-0x38(%r9)
+	movaps	%xmm10,-0x28(%r9)
+	movaps	%xmm11,-0x18(%r9)
+.L128_body:
+___
+$code.=<<___;
+	movdqa	.Lsigma(%rip),$a
+	movdqu	($key),$b
+	movdqu	16($key),$c
+	movdqu	($counter),$d
+	movdqa	.Lone(%rip),$d1
+	movdqa	.Lrot16(%rip),$rot16
+	movdqa	.Lrot24(%rip),$rot24
+
+	movdqa	$a,$a1
+	movdqa	$a,0x00(%rsp)
+	movdqa	$b,$b1
+	movdqa	$b,0x10(%rsp)
+	movdqa	$c,$c1
+	movdqa	$c,0x20(%rsp)
+	paddd	$d,$d1
+	movdqa	$d,0x30(%rsp)
+	mov	\$10,$counter		# reuse $counter
+	jmp	.Loop_128
+
+.align	32
+.Loop_128:
+___
+	&SSSE3ROUND_2x();		# first round of the iteration, both register sets
+	&pshufd	($c,$c,0b01001110);	# c: swap the two 64-bit halves
+	&pshufd	($b,$b,0b00111001);	# b: dword lane i takes source lane i+1 (mod 4)
+	&pshufd	($d,$d,0b10010011);	# d: dword lane i takes source lane i-1 (mod 4)
+	&pshufd	($c1,$c1,0b01001110);	# same three shuffles for the second register set
+	&pshufd	($b1,$b1,0b00111001);
+	&pshufd	($d1,$d1,0b10010011);
+
+	&SSSE3ROUND_2x();		# second round, run on the re-arranged lanes
+	&pshufd	($c,$c,0b01001110);	# c: swap halves back
+	&pshufd	($b,$b,0b10010011);	# b: inverse of the shuffle applied above
+	&pshufd	($d,$d,0b00111001);	# d: inverse of the shuffle applied above
+	&pshufd	($c1,$c1,0b01001110);	# and again for the second register set
+	&pshufd	($b1,$b1,0b10010011);
+	&pshufd	($d1,$d1,0b00111001);
+
+	&dec	($counter);		# $counter was loaded with 10 before the loop: 10 iterations of two rounds
+	&jnz	(".Loop_128");
+
+$code.=<<___;
+	paddd	0x00(%rsp),$a
+	paddd	0x10(%rsp),$b
+	paddd	0x20(%rsp),$c
+	paddd	0x30(%rsp),$d
+	paddd	.Lone(%rip),$d1
+	paddd	0x00(%rsp),$a1
+	paddd	0x10(%rsp),$b1
+	paddd	0x20(%rsp),$c1
+	paddd	0x30(%rsp),$d1
+
+	movdqu	0x00($inp),$t
+	movdqu	0x10($inp),$t1
+	pxor	$t,$a			# xor with input
+	movdqu	0x20($inp),$t
+	pxor	$t1,$b
+	movdqu	0x30($inp),$t1
+	pxor	$t,$c
+	movdqu	0x40($inp),$t
+	pxor	$t1,$d
+	movdqu	0x50($inp),$t1
+	pxor	$t,$a1
+	movdqu	0x60($inp),$t
+	pxor	$t1,$b1
+	movdqu	0x70($inp),$t1
+	pxor	$t,$c1
+	pxor	$t1,$d1
+
+	movdqu	$a,0x00($out)		# write output
+	movdqu	$b,0x10($out)
+	movdqu	$c,0x20($out)
+	movdqu	$d,0x30($out)
+	movdqu	$a1,0x40($out)
+	movdqu	$b1,0x50($out)
+	movdqu	$c1,0x60($out)
+	movdqu	$d1,0x70($out)
+___
+$code.=<<___	if ($win64);
+	movaps	-0x68(%r9),%xmm6
+	movaps	-0x58(%r9),%xmm7
+	movaps	-0x48(%r9),%xmm8
+	movaps	-0x38(%r9),%xmm9
+	movaps	-0x28(%r9),%xmm10
+	movaps	-0x18(%r9),%xmm11
+___
+$code.=<<___;
+	lea	(%r9),%rsp
+.cfi_def_cfa_register	%rsp
+.L128_epilogue:
+	ret
+.cfi_endproc
+.size	ChaCha20_128,.-ChaCha20_128
+___
+}
+
+########################################################################
 # SSSE3 code path that handles longer messages.
 {
 # assign variables to favor Atom front-end
@@ -3674,9 +3840,9 @@ se_handler:
 	ret
 .size	se_handler,.-se_handler
 
-.type	ssse3_handler,\@abi-omnipotent
+.type	simd_handler,\@abi-omnipotent
 .align	16
-ssse3_handler:
+simd_handler:
 	push	%rsi
 	push	%rdi
 	push	%rbx
@@ -3702,57 +3868,20 @@ ssse3_handler:
 	mov	192($context),%rax	# pull context->R9
 
 	mov	4(%r11),%r10d		# HandlerData[1]
+	mov	8(%r11),%ecx		# HandlerData[2]
 	lea	(%rsi,%r10),%r10	# epilogue label
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lcommon_seh_tail
 
-	lea	-0x28(%rax),%rsi
+	neg	%rcx
+	lea	-8(%rax,%rcx),%rsi
 	lea	512($context),%rdi	# &context.Xmm6
-	mov	\$4,%ecx
+	neg	%ecx
+	shr	\$3,%ecx
 	.long	0xa548f3fc		# cld; rep movsq
 
 	jmp	.Lcommon_seh_tail
-.size	ssse3_handler,.-ssse3_handler
-
-.type	full_handler,\@abi-omnipotent
-.align	16
-full_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lcommon_seh_tail
-
-	mov	192($context),%rax	# pull context->R9
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lcommon_seh_tail
-
-	lea	-0xa8(%rax),%rsi
-	lea	512($context),%rdi	# &context.Xmm6
-	mov	\$20,%ecx
-	.long	0xa548f3fc		# cld; rep movsq
-
-	jmp	.Lcommon_seh_tail
-.size	full_handler,.-full_handler
+.size	simd_handler,.-simd_handler
 
 .section	.pdata
 .align	4
@@ -3764,6 +3893,10 @@ full_handler:
 	.rva	.LSEH_end_ChaCha20_ssse3
 	.rva	.LSEH_info_ChaCha20_ssse3
 
+	.rva	.LSEH_begin_ChaCha20_128
+	.rva	.LSEH_end_ChaCha20_128
+	.rva	.LSEH_info_ChaCha20_128
+
 	.rva	.LSEH_begin_ChaCha20_4x
 	.rva	.LSEH_end_ChaCha20_4x
 	.rva	.LSEH_info_ChaCha20_4x
@@ -3804,46 +3937,60 @@ $code.=<<___;
 
 .LSEH_info_ChaCha20_ssse3:
 	.byte	9,0,0,0
-	.rva	ssse3_handler
+	.rva	simd_handler
 	.rva	.Lssse3_body,.Lssse3_epilogue
+	.long	0x20,0
+
+.LSEH_info_ChaCha20_128:
+	.byte	9,0,0,0
+	.rva	simd_handler
+	.rva	.L128_body,.L128_epilogue
+	.long	0x60,0
 
 .LSEH_info_ChaCha20_4x:
 	.byte	9,0,0,0
-	.rva	full_handler
+	.rva	simd_handler
 	.rva	.L4x_body,.L4x_epilogue
+	.long	0xa0,0
 ___
 $code.=<<___ if ($avx);
 .LSEH_info_ChaCha20_4xop:
 	.byte	9,0,0,0
-	.rva	full_handler
+	.rva	simd_handler
 	.rva	.L4xop_body,.L4xop_epilogue		# HandlerData[]
+	.long	0xa0,0
 ___
 $code.=<<___ if ($avx>1);
 .LSEH_info_ChaCha20_8x:
 	.byte	9,0,0,0
-	.rva	full_handler
+	.rva	simd_handler
 	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
+	.long	0xa0,0
 ___
 $code.=<<___ if ($avx>2);
 .LSEH_info_ChaCha20_avx512:
 	.byte	9,0,0,0
-	.rva	ssse3_handler
+	.rva	simd_handler
 	.rva	.Lavx512_body,.Lavx512_epilogue		# HandlerData[]
+	.long	0x20,0
 
 .LSEH_info_ChaCha20_avx512vl:
 	.byte	9,0,0,0
-	.rva	ssse3_handler
+	.rva	simd_handler
 	.rva	.Lavx512vl_body,.Lavx512vl_epilogue	# HandlerData[]
+	.long	0x20,0
 
 .LSEH_info_ChaCha20_16x:
 	.byte	9,0,0,0
-	.rva	full_handler
+	.rva	simd_handler
 	.rva	.L16x_body,.L16x_epilogue		# HandlerData[]
+	.long	0xa0,0
 
 .LSEH_info_ChaCha20_8xvl:
 	.byte	9,0,0,0
-	.rva	full_handler
+	.rva	simd_handler
 	.rva	.L8xvl_body,.L8xvl_epilogue		# HandlerData[]
+	.long	0xa0,0
 ___
 }
author	Andy Polyakov <appro@openssl.org>	2018-07-02 13:16:33 +0200
committer	Andy Polyakov <appro@openssl.org>	2018-07-03 19:02:02 +0200
commit	d5487a454c485eb6f9aef7fb0cb1c0681a06fd25 (patch)
tree	55b50bcd779367e575718e8f960e01e86c6b879d /crypto/chacha
parent	b068a9b914887af5cc99895754412582fbb0e10b (diff)