diff options
author | Andy Polyakov <appro@openssl.org> | 2014-10-23 16:08:44 +0200 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2014-10-23 16:08:44 +0200 |
commit | 3ff08e1dde56747011a702a9a5aae06cfa8ae5fc (patch) | |
tree | e141a2b9454de50db73c2ef5499c819fd6a8539b /crypto/ec/asm | |
parent | b06f7d9ac0752083e7443dddc9e5ac3e198063d4 (diff) |
ecp_nistz256 update.
Facilitate the switch to custom scatter-gather routines. This modification
does not change any algorithms; it only makes it possible to implement
alternatives. This is achieved by a) moving the precomputed table to assembly
(perlasm parses ecp_nistz256_table.c and is free to rearrange the data to
match the gathering algorithm); b) adhering to an explicit scatter subroutine
(which for now is simply a memcpy). The first implementations that will use
this option are the 32-bit assembly implementations, ARMv4 and x86, where
the equivalent of the current read-whole-table-select-single-value algorithm
is too time-consuming. [As a side note, switching to scatter-gather on
x86_64 would make it possible to improve server-side ECDSA performance by ~5%.]
Reviewed-by: Bodo Moeller <bodo@openssl.org>
Diffstat (limited to 'crypto/ec/asm')
-rwxr-xr-x | crypto/ec/asm/ecp_nistz256-avx2.pl | 18 | ||||
-rwxr-xr-x | crypto/ec/asm/ecp_nistz256-x86_64.pl | 143 |
2 files changed, 117 insertions, 44 deletions
diff --git a/crypto/ec/asm/ecp_nistz256-avx2.pl b/crypto/ec/asm/ecp_nistz256-avx2.pl index 4c220aa645..9dc5cc6aa3 100755 --- a/crypto/ec/asm/ecp_nistz256-avx2.pl +++ b/crypto/ec/asm/ecp_nistz256-avx2.pl @@ -1909,7 +1909,7 @@ ___ } { ################################################################################ -# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in, +# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in, # int index0, int index1, int index2, int index3); ################################################################################ @@ -1919,10 +1919,10 @@ my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11)); my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15)); $code.=<<___; -.globl ecp_nistz256_avx2_multi_select_w7 -.type ecp_nistz256_avx2_multi_select_w7,\@function,6 +.globl ecp_nistz256_avx2_multi_gather_w7 +.type ecp_nistz256_avx2_multi_gather_w7,\@function,6 .align 32 -ecp_nistz256_avx2_multi_select_w7: +ecp_nistz256_avx2_multi_gather_w7: vzeroupper ___ $code.=<<___ if ($win64); @@ -2036,7 +2036,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; ret -.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7 +.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 .extern OPENSSL_ia32cap_P .globl ecp_nistz_avx2_eligible @@ -2061,8 +2061,8 @@ $code.=<<___; .globl ecp_nistz256_avx2_to_mont .globl ecp_nistz256_avx2_from_mont .globl ecp_nistz256_avx2_set1 -.globl ecp_nistz256_avx2_multi_select_w7 -.type ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent +.globl ecp_nistz256_avx2_multi_gather_w7 +.type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent ecp_nistz256_avx2_transpose_convert: ecp_nistz256_avx2_convert_transpose_back: ecp_nistz256_avx2_point_add_affine_x4: @@ -2070,10 +2070,10 @@ ecp_nistz256_avx2_point_add_affines_x4: ecp_nistz256_avx2_to_mont: ecp_nistz256_avx2_from_mont: ecp_nistz256_avx2_set1: -ecp_nistz256_avx2_multi_select_w7: +ecp_nistz256_avx2_multi_gather_w7: 
.byte 0x0f,0x0b # ud2 ret -.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7 +.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 .globl ecp_nistz_avx2_eligible .type ecp_nistz_avx2_eligible,\@abi-omnipotent diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl index c4b6d0f741..946b734922 100755 --- a/crypto/ec/asm/ecp_nistz256-x86_64.pl +++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl @@ -1465,20 +1465,44 @@ my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); $code.=<<___; ################################################################################ -# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); -.globl ecp_nistz256_select_w5 -.type ecp_nistz256_select_w5,\@abi-omnipotent +# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_scatter_w5 +.type ecp_nistz256_scatter_w5,\@abi-omnipotent .align 32 -ecp_nistz256_select_w5: +ecp_nistz256_scatter_w5: + lea -3($index,$index,2), $index + movdqa 0x00($in_t), %xmm0 + shl \$5, $index + movdqa 0x10($in_t), %xmm1 + movdqa 0x20($in_t), %xmm2 + movdqa 0x30($in_t), %xmm3 + movdqa 0x40($in_t), %xmm4 + movdqa 0x50($in_t), %xmm5 + movdqa %xmm0, 0x00($val,$index) + movdqa %xmm1, 0x10($val,$index) + movdqa %xmm2, 0x20($val,$index) + movdqa %xmm3, 0x30($val,$index) + movdqa %xmm4, 0x40($val,$index) + movdqa %xmm5, 0x50($val,$index) + + ret +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +################################################################################ +# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_gather_w5 +.type ecp_nistz256_gather_w5,\@abi-omnipotent +.align 32 +ecp_nistz256_gather_w5: ___ $code.=<<___ if ($avx>1); mov OPENSSL_ia32cap_P+8(%rip), %eax test \$`1<<5`, %eax - jnz .Lavx2_select_w5 + jnz .Lavx2_gather_w5 ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax 
-.LSEH_begin_ecp_nistz256_select_w5: +.LSEH_begin_ecp_nistz256_gather_w5: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) @@ -1555,27 +1579,46 @@ $code.=<<___ if ($win64); movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_select_w5: +.LSEH_end_ecp_nistz256_gather_w5: ___ $code.=<<___; ret -.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 ################################################################################ -# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); -.globl ecp_nistz256_select_w7 -.type ecp_nistz256_select_w7,\@abi-omnipotent +# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_scatter_w7 +.type ecp_nistz256_scatter_w7,\@abi-omnipotent .align 32 -ecp_nistz256_select_w7: +ecp_nistz256_scatter_w7: + movdqu 0x00($in_t), %xmm0 + shl \$6, $index + movdqu 0x10($in_t), %xmm1 + movdqu 0x20($in_t), %xmm2 + movdqu 0x30($in_t), %xmm3 + movdqa %xmm0, 0x00($val,$index) + movdqa %xmm1, 0x10($val,$index) + movdqa %xmm2, 0x20($val,$index) + movdqa %xmm3, 0x30($val,$index) + + ret +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +################################################################################ +# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_gather_w7 +.type ecp_nistz256_gather_w7,\@abi-omnipotent +.align 32 +ecp_nistz256_gather_w7: ___ $code.=<<___ if ($avx>1); mov OPENSSL_ia32cap_P+8(%rip), %eax test \$`1<<5`, %eax - jnz .Lavx2_select_w7 + jnz .Lavx2_gather_w7 ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax -.LSEH_begin_ecp_nistz256_select_w7: +.LSEH_begin_ecp_nistz256_gather_w7: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps 
%xmm7, -0x10(%rax) @@ -1641,11 +1684,11 @@ $code.=<<___ if ($win64); movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_select_w7: +.LSEH_end_ecp_nistz256_gather_w7: ___ $code.=<<___; ret -.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 ___ } if ($avx>1) { @@ -1656,16 +1699,16 @@ my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); $code.=<<___; ################################################################################ -# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); -.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent +# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index); +.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent .align 32 -ecp_nistz256_avx2_select_w5: -.Lavx2_select_w5: +ecp_nistz256_avx2_gather_w5: +.Lavx2_gather_w5: vzeroupper ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax -.LSEH_begin_ecp_nistz256_avx2_select_w5: +.LSEH_begin_ecp_nistz256_avx2_gather_w5: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) @@ -1743,11 +1786,11 @@ $code.=<<___ if ($win64); movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_avx2_select_w5: +.LSEH_end_ecp_nistz256_avx2_gather_w5: ___ $code.=<<___; ret -.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 +.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 ___ } if ($avx>1) { @@ -1760,17 +1803,17 @@ my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); $code.=<<___; ################################################################################ -# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); -.globl ecp_nistz256_avx2_select_w7 -.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent +# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, 
int index); +.globl ecp_nistz256_avx2_gather_w7 +.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent .align 32 -ecp_nistz256_avx2_select_w7: -.Lavx2_select_w7: +ecp_nistz256_avx2_gather_w7: +.Lavx2_gather_w7: vzeroupper ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax -.LSEH_begin_ecp_nistz256_avx2_select_w7: +.LSEH_begin_ecp_nistz256_avx2_gather_w7: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) @@ -1863,21 +1906,21 @@ $code.=<<___ if ($win64); movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_avx2_select_w7: +.LSEH_end_ecp_nistz256_avx2_gather_w7: ___ $code.=<<___; ret -.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 ___ } else { $code.=<<___; -.globl ecp_nistz256_avx2_select_w7 -.type ecp_nistz256_avx2_select_w7,\@function,3 +.globl ecp_nistz256_avx2_gather_w7 +.type ecp_nistz256_avx2_gather_w7,\@function,3 .align 32 -ecp_nistz256_avx2_select_w7: +ecp_nistz256_avx2_gather_w7: .byte 0x0f,0x0b # ud2 ret -.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 ___ } {{{ @@ -3087,6 +3130,36 @@ ___ } }}} +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}/../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +die "insane number of elements" if ($#arr != 64*16*37-1); + +print <<___; +.text +.globl ecp_nistz256_precomputed +.type ecp_nistz256_precomputed,\@object +.align 4096 +ecp_nistz256_precomputed: +___ +while (@line=splice(@arr,0,16)) 
{ + print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n"; +} +print <<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +___ + $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT; |