summary refs log tree commit diff stats
path: root/crypto/chacha
diff options
context:
space:
mode:
author Andy Polyakov <appro@openssl.org> 2018-07-02 13:16:33 +0200
committer Andy Polyakov <appro@openssl.org> 2018-07-03 19:02:02 +0200
commit d5487a454c485eb6f9aef7fb0cb1c0681a06fd25 (patch)
tree 55b50bcd779367e575718e8f960e01e86c6b879d /crypto/chacha
parent b068a9b914887af5cc99895754412582fbb0e10b (diff)
chacha/asm/chacha-x86_64.pl: add dedicated path for 128-byte inputs.
The 128-byte vectors are extensively used in chacha20_poly1305_tls_cipher, and the dedicated code path is ~30-50% faster on most platforms. Reviewed-by: Rich Salz <rsalz@openssl.org> (Merged from https://github.com/openssl/openssl/pull/6626)
Diffstat (limited to 'crypto/chacha')
-rwxr-xr-x crypto/chacha/asm/chacha-x86_64.pl 295
1 files changed, 221 insertions, 74 deletions
diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl
index 51bb6a965c..b54f3b1525 100755
--- a/crypto/chacha/asm/chacha-x86_64.pl
+++ b/crypto/chacha/asm/chacha-x86_64.pl
@@ -1,5 +1,5 @@
#! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
@@ -28,33 +28,32 @@
#
# Performance in cycles per byte out of large buffer.
#
-# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v)
+# IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v)
#
-# P4 9.48/+99% -/22.7(ii) -
-# Core2 7.83/+55% 7.90/8.08 4.35
-# Westmere 7.19/+50% 5.60/6.70 3.00
-# Sandy Bridge 8.31/+42% 5.45/6.76 2.72
-# Ivy Bridge 6.71/+46% 5.40/6.49 2.41
-# Haswell 5.92/+43% 5.20/6.45 2.42 1.23
-# Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.80(vi)]
-# Silvermont 12.0/+33% 7.75/7.40 7.03(iii)
-# Knights L 11.7/- - 9.60(iii) 0.80
-# Goldmont 10.6/+17% 5.10/- 3.28
-# Sledgehammer 7.28/+52% -/14.2(ii) -
-# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
-# Ryzen 5.96/+50% 5.19/- 2.40 2.09
-# VIA Nano 10.5/+46% 6.72/8.60 6.05
+# P4 9.48/+99% - -
+# Core2 7.83/+55% 7.90/5.76 4.35
+# Westmere 7.19/+50% 5.60/4.50 3.00
+# Sandy Bridge 8.31/+42% 5.45/4.00 2.72
+# Ivy Bridge 6.71/+46% 5.40/? 2.41
+# Haswell 5.92/+43% 5.20/3.45 2.42 1.23
+# Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)]
+# Silvermont 12.0/+33% 7.75/6.90 7.03(iii)
+# Knights L 11.7/- ? 9.60(iii) 0.80
+# Goldmont 10.6/+17% 5.10/3.52 3.28
+# Sledgehammer 7.28/+52% - -
+# Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv)
+# Ryzen 5.96/+50% 5.19/3.00 2.40 2.09
+# VIA Nano 10.5/+46% 6.72/6.88 6.05
#
# (i) compared to older gcc 3.x one can observe >2x improvement on
# most platforms;
-# (ii) as it can be seen, SSE2 performance is too low on legacy
-# processors; NxSSE2 results are naturally better, but not
-# impressively better than IALU ones, which is why you won't
-# find SSE2 code below;
+# (ii) 2xSSSE3 is a code path optimized specifically for the 128-byte
+# inputs used by chacha20_poly1305_tls_cipher; results are EVP-free;
# (iii) this is not optimal result for Atom because of MSROM
# limitations, SSE2 can do better, but gain is considered too
# low to justify the [maintenance] effort;
-# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
+# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
+# and 4.85 for 128-byte inputs;
# (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
# (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
# cpb in single thread, the corresponding capability is suppressed;
@@ -489,6 +488,7 @@ $code.=<<___ if ($avx);
___
$code.=<<___;
cmp \$128,$len # we might throw away some data,
+ je .LChaCha20_128
ja .LChaCha20_4x # but overall it won't be slower
.Ldo_sse3_after_all:
@@ -606,6 +606,172 @@ ___
}
########################################################################
+# SSSE3 code path that handles 128-byte inputs
+{
+my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
+my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
+
+sub SSSE3ROUND_2x { # emits one round for two states, ($a,$b,$c,$d) and ($a1,$b1,$c1,$d1), interleaved
+ &paddd ($a,$b); # presumably destination-first operands (perlasm convention - confirm): a += b
+ &pxor ($d,$a); # d ^= a
+ &paddd ($a1,$b1); # same two steps for the second state
+ &pxor ($d1,$a1);
+ &pshufb ($d,$rot16); # byte shuffle by the .Lrot16 mask; presumably a 16-bit rotate of each dword
+ &pshufb($d1,$rot16);
+
+ &paddd ($c,$d); # c += d
+ &paddd ($c1,$d1);
+ &pxor ($b,$c); # b ^= c
+ &pxor ($b1,$c1);
+ &movdqa ($t,$b); # copy b so both shifted halves can be formed
+ &psrld ($b,20); # b >>= 20 per dword
+ &movdqa($t1,$b1);
+ &pslld ($t,12); # copy <<= 12 per dword
+ &psrld ($b1,20);
+ &por ($b,$t); # combined: each dword of b rotated left by 12
+ &pslld ($t1,12);
+ &por ($b1,$t1);
+
+ &paddd ($a,$b); # a += b
+ &pxor ($d,$a); # d ^= a
+ &paddd ($a1,$b1);
+ &pxor ($d1,$a1);
+ &pshufb ($d,$rot24); # byte shuffle by the .Lrot24 mask; presumably an 8-bit left rotate of each dword
+ &pshufb($d1,$rot24);
+
+ &paddd ($c,$d); # c += d
+ &paddd ($c1,$d1);
+ &pxor ($b,$c); # b ^= c
+ &pxor ($b1,$c1);
+ &movdqa ($t,$b);
+ &psrld ($b,25); # b >>= 25 per dword
+ &movdqa($t1,$b1);
+ &pslld ($t,7); # copy <<= 7 per dword
+ &psrld ($b1,25);
+ &por ($b,$t); # combined: each dword of b rotated left by 7
+ &pslld ($t1,7);
+ &por ($b1,$t1);
+}
+
+my $xframe = $win64 ? 0x68 : 8; # extra frame bytes: on Win64, 0x68 spans the %xmm6-%xmm11 save slots at -0x68..-0x18(%r9); otherwise 8
+
+$code.=<<___;
+.type ChaCha20_128,\@function,5
+.align 32
+ChaCha20_128:
+.cfi_startproc
+.LChaCha20_128:
+ mov %rsp,%r9 # frame pointer
+.cfi_def_cfa_register %r9
+ sub \$64+$xframe,%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0x68(%r9)
+ movaps %xmm7,-0x58(%r9)
+ movaps %xmm8,-0x48(%r9)
+ movaps %xmm9,-0x38(%r9)
+ movaps %xmm10,-0x28(%r9)
+ movaps %xmm11,-0x18(%r9)
+.L128_body:
+___
+$code.=<<___;
+ movdqa .Lsigma(%rip),$a
+ movdqu ($key),$b
+ movdqu 16($key),$c
+ movdqu ($counter),$d
+ movdqa .Lone(%rip),$d1
+ movdqa .Lrot16(%rip),$rot16
+ movdqa .Lrot24(%rip),$rot24
+
+ movdqa $a,$a1
+ movdqa $a,0x00(%rsp)
+ movdqa $b,$b1
+ movdqa $b,0x10(%rsp)
+ movdqa $c,$c1
+ movdqa $c,0x20(%rsp)
+ paddd $d,$d1
+ movdqa $d,0x30(%rsp)
+ mov \$10,$counter # reuse $counter
+ jmp .Loop_128
+
+.align 32
+.Loop_128:
+___
+ &SSSE3ROUND_2x(); # first round of the iteration, on the rows as loaded
+ &pshufd ($c,$c,0b01001110); # c: dwords reordered to [2,3,0,1] (64-bit halves swapped)
+ &pshufd ($b,$b,0b00111001); # b: dwords reordered to [1,2,3,0]
+ &pshufd ($d,$d,0b10010011); # d: dwords reordered to [3,0,1,2]
+ &pshufd ($c1,$c1,0b01001110); # same three shuffles for the second state
+ &pshufd ($b1,$b1,0b00111001);
+ &pshufd ($d1,$d1,0b10010011);
+
+ &SSSE3ROUND_2x(); # second round, on the shuffled rows (presumably the ChaCha diagonal round)
+ &pshufd ($c,$c,0b01001110); # c: halves swapped back
+ &pshufd ($b,$b,0b10010011); # b: [3,0,1,2], undoing the earlier [1,2,3,0]
+ &pshufd ($d,$d,0b00111001); # d: [1,2,3,0], undoing the earlier [3,0,1,2]
+ &pshufd ($c1,$c1,0b01001110);
+ &pshufd ($b1,$b1,0b10010011);
+ &pshufd ($d1,$d1,0b00111001);
+
+ &dec ($counter); # $counter was set to 10 before the loop: 10 iterations of two rounds each
+ &jnz (".Loop_128");
+
+$code.=<<___;
+ paddd 0x00(%rsp),$a
+ paddd 0x10(%rsp),$b
+ paddd 0x20(%rsp),$c
+ paddd 0x30(%rsp),$d
+ paddd .Lone(%rip),$d1
+ paddd 0x00(%rsp),$a1
+ paddd 0x10(%rsp),$b1
+ paddd 0x20(%rsp),$c1
+ paddd 0x30(%rsp),$d1
+
+ movdqu 0x00($inp),$t
+ movdqu 0x10($inp),$t1
+ pxor $t,$a # xor with input
+ movdqu 0x20($inp),$t
+ pxor $t1,$b
+ movdqu 0x30($inp),$t1
+ pxor $t,$c
+ movdqu 0x40($inp),$t
+ pxor $t1,$d
+ movdqu 0x50($inp),$t1
+ pxor $t,$a1
+ movdqu 0x60($inp),$t
+ pxor $t1,$b1
+ movdqu 0x70($inp),$t1
+ pxor $t,$c1
+ pxor $t1,$d1
+
+ movdqu $a,0x00($out) # write output
+ movdqu $b,0x10($out)
+ movdqu $c,0x20($out)
+ movdqu $d,0x30($out)
+ movdqu $a1,0x40($out)
+ movdqu $b1,0x50($out)
+ movdqu $c1,0x60($out)
+ movdqu $d1,0x70($out)
+___
+$code.=<<___ if ($win64);
+ movaps -0x68(%r9),%xmm6
+ movaps -0x58(%r9),%xmm7
+ movaps -0x48(%r9),%xmm8
+ movaps -0x38(%r9),%xmm9
+ movaps -0x28(%r9),%xmm10
+ movaps -0x18(%r9),%xmm11
+___
+$code.=<<___;
+ lea (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L128_epilogue:
+ ret
+.cfi_endproc
+.size ChaCha20_128,.-ChaCha20_128
+___
+}
+
+########################################################################
# SSSE3 code path that handles longer messages.
{
# assign variables to favor Atom front-end
@@ -3674,9 +3840,9 @@ se_handler:
ret
.size se_handler,.-se_handler
-.type ssse3_handler,\@abi-omnipotent
+.type simd_handler,\@abi-omnipotent
.align 16
-ssse3_handler:
+simd_handler:
push %rsi
push %rdi
push %rbx
@@ -3702,57 +3868,20 @@ ssse3_handler:
mov 192($context),%rax # pull context->R9
mov 4(%r11),%r10d # HandlerData[1]
+ mov 8(%r11),%ecx # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- lea -0x28(%rax),%rsi
+ neg %rcx
+ lea -8(%rax,%rcx),%rsi
lea 512($context),%rdi # &context.Xmm6
- mov \$4,%ecx
+ neg %ecx
+ shr \$3,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lcommon_seh_tail
-.size ssse3_handler,.-ssse3_handler
-
-.type full_handler,\@abi-omnipotent
-.align 16
-full_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 192($context),%rax # pull context->R9
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- lea -0xa8(%rax),%rsi
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
- jmp .Lcommon_seh_tail
-.size full_handler,.-full_handler
+.size simd_handler,.-simd_handler
.section .pdata
.align 4
@@ -3764,6 +3893,10 @@ full_handler:
.rva .LSEH_end_ChaCha20_ssse3
.rva .LSEH_info_ChaCha20_ssse3
+ .rva .LSEH_begin_ChaCha20_128
+ .rva .LSEH_end_ChaCha20_128
+ .rva .LSEH_info_ChaCha20_128
+
.rva .LSEH_begin_ChaCha20_4x
.rva .LSEH_end_ChaCha20_4x
.rva .LSEH_info_ChaCha20_4x
@@ -3804,46 +3937,60 @@ $code.=<<___;
.LSEH_info_ChaCha20_ssse3:
.byte 9,0,0,0
- .rva ssse3_handler
+ .rva simd_handler
.rva .Lssse3_body,.Lssse3_epilogue
+ .long 0x20,0
+
+.LSEH_info_ChaCha20_128:
+ .byte 9,0,0,0
+ .rva simd_handler
+ .rva .L128_body,.L128_epilogue
+ .long 0x60,0
.LSEH_info_ChaCha20_4x:
.byte 9,0,0,0
- .rva full_handler
+ .rva simd_handler
.rva .L4x_body,.L4x_epilogue
+ .long 0xa0,0
___
$code.=<<___ if ($avx);
.LSEH_info_ChaCha20_4xop:
.byte 9,0,0,0
- .rva full_handler
+ .rva simd_handler
.rva .L4xop_body,.L4xop_epilogue # HandlerData[]
+ .long 0xa0,0
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
.byte 9,0,0,0
- .rva full_handler
+ .rva simd_handler
.rva .L8x_body,.L8x_epilogue # HandlerData[]
+ .long 0xa0,0
___
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
.byte 9,0,0,0
- .rva ssse3_handler
+ .rva simd_handler
.rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
+ .long 0x20,0
.LSEH_info_ChaCha20_avx512vl:
.byte 9,0,0,0
- .rva ssse3_handler
+ .rva simd_handler
.rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[]
+ .long 0x20,0
.LSEH_info_ChaCha20_16x:
.byte 9,0,0,0
- .rva full_handler
+ .rva simd_handler
.rva .L16x_body,.L16x_epilogue # HandlerData[]
+ .long 0xa0,0
.LSEH_info_ChaCha20_8xvl:
.byte 9,0,0,0
- .rva full_handler
+ .rva simd_handler
.rva .L8xvl_body,.L8xvl_epilogue # HandlerData[]
+ .long 0xa0,0
___
}