summaryrefslogtreecommitdiffstats
path: root/crypto/ec/asm
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2014-10-23 16:08:44 +0200
committerAndy Polyakov <appro@openssl.org>2014-10-23 16:08:44 +0200
commit3ff08e1dde56747011a702a9a5aae06cfa8ae5fc (patch)
treee141a2b9454de50db73c2ef5499c819fd6a8539b /crypto/ec/asm
parentb06f7d9ac0752083e7443dddc9e5ac3e198063d4 (diff)
ecp_nistz256 update.
Facilitate switch to custom scatter-gather routines. This modification does not change algorithms, only makes it possible to implement alternative. This is achieved by a) moving precompute table to assembly (perlasm parses ecp_nistz256_table.c and is free to rearrange data to match gathering algorithm); b) adhering to explicit scatter subroutine (which for now is simply a memcpy). First implementations that will use this option are 32-bit assembly implementations, ARMv4 and x86, where equivalent of current read-whole-table-select-single-value algorithm is too time-consuming. [On side note, switching to scatter-gather on x86_64 would allow to improve server-side ECDSA performance by ~5%]. Reviewed-by: Bodo Moeller <bodo@openssl.org>
Diffstat (limited to 'crypto/ec/asm')
-rwxr-xr-xcrypto/ec/asm/ecp_nistz256-avx2.pl18
-rwxr-xr-xcrypto/ec/asm/ecp_nistz256-x86_64.pl143
2 files changed, 117 insertions, 44 deletions
diff --git a/crypto/ec/asm/ecp_nistz256-avx2.pl b/crypto/ec/asm/ecp_nistz256-avx2.pl
index 4c220aa645..9dc5cc6aa3 100755
--- a/crypto/ec/asm/ecp_nistz256-avx2.pl
+++ b/crypto/ec/asm/ecp_nistz256-avx2.pl
@@ -1909,7 +1909,7 @@ ___
}
{
################################################################################
-# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in,
+# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
# int index0, int index1, int index2, int index3);
################################################################################
@@ -1919,10 +1919,10 @@ my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
$code.=<<___;
-.globl ecp_nistz256_avx2_multi_select_w7
-.type ecp_nistz256_avx2_multi_select_w7,\@function,6
+.globl ecp_nistz256_avx2_multi_gather_w7
+.type ecp_nistz256_avx2_multi_gather_w7,\@function,6
.align 32
-ecp_nistz256_avx2_multi_select_w7:
+ecp_nistz256_avx2_multi_gather_w7:
vzeroupper
___
$code.=<<___ if ($win64);
@@ -2036,7 +2036,7 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
ret
-.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
+.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
.extern OPENSSL_ia32cap_P
.globl ecp_nistz_avx2_eligible
@@ -2061,8 +2061,8 @@ $code.=<<___;
.globl ecp_nistz256_avx2_to_mont
.globl ecp_nistz256_avx2_from_mont
.globl ecp_nistz256_avx2_set1
-.globl ecp_nistz256_avx2_multi_select_w7
-.type ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent
+.globl ecp_nistz256_avx2_multi_gather_w7
+.type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
ecp_nistz256_avx2_transpose_convert:
ecp_nistz256_avx2_convert_transpose_back:
ecp_nistz256_avx2_point_add_affine_x4:
@@ -2070,10 +2070,10 @@ ecp_nistz256_avx2_point_add_affines_x4:
ecp_nistz256_avx2_to_mont:
ecp_nistz256_avx2_from_mont:
ecp_nistz256_avx2_set1:
-ecp_nistz256_avx2_multi_select_w7:
+ecp_nistz256_avx2_multi_gather_w7:
.byte 0x0f,0x0b # ud2
ret
-.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
+.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
.globl ecp_nistz_avx2_eligible
.type ecp_nistz_avx2_eligible,\@abi-omnipotent
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index c4b6d0f741..946b734922 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -1465,20 +1465,44 @@ my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
$code.=<<___;
################################################################################
-# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
-.globl ecp_nistz256_select_w5
-.type ecp_nistz256_select_w5,\@abi-omnipotent
+# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_scatter_w5
+.type ecp_nistz256_scatter_w5,\@abi-omnipotent
.align 32
-ecp_nistz256_select_w5:
+ecp_nistz256_scatter_w5:
+ lea -3($index,$index,2), $index
+ movdqa 0x00($in_t), %xmm0
+ shl \$5, $index
+ movdqa 0x10($in_t), %xmm1
+ movdqa 0x20($in_t), %xmm2
+ movdqa 0x30($in_t), %xmm3
+ movdqa 0x40($in_t), %xmm4
+ movdqa 0x50($in_t), %xmm5
+ movdqa %xmm0, 0x00($val,$index)
+ movdqa %xmm1, 0x10($val,$index)
+ movdqa %xmm2, 0x20($val,$index)
+ movdqa %xmm3, 0x30($val,$index)
+ movdqa %xmm4, 0x40($val,$index)
+ movdqa %xmm5, 0x50($val,$index)
+
+ ret
+.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
+
+################################################################################
+# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_gather_w5
+.type ecp_nistz256_gather_w5,\@abi-omnipotent
+.align 32
+ecp_nistz256_gather_w5:
___
$code.=<<___ if ($avx>1);
mov OPENSSL_ia32cap_P+8(%rip), %eax
test \$`1<<5`, %eax
- jnz .Lavx2_select_w5
+ jnz .Lavx2_gather_w5
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
-.LSEH_begin_ecp_nistz256_select_w5:
+.LSEH_begin_ecp_nistz256_gather_w5:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
@@ -1555,27 +1579,46 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_select_w5:
+.LSEH_end_ecp_nistz256_gather_w5:
___
$code.=<<___;
ret
-.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
################################################################################
-# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
-.globl ecp_nistz256_select_w7
-.type ecp_nistz256_select_w7,\@abi-omnipotent
+# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_scatter_w7
+.type ecp_nistz256_scatter_w7,\@abi-omnipotent
.align 32
-ecp_nistz256_select_w7:
+ecp_nistz256_scatter_w7:
+ movdqu 0x00($in_t), %xmm0
+ shl \$6, $index
+ movdqu 0x10($in_t), %xmm1
+ movdqu 0x20($in_t), %xmm2
+ movdqu 0x30($in_t), %xmm3
+ movdqa %xmm0, 0x00($val,$index)
+ movdqa %xmm1, 0x10($val,$index)
+ movdqa %xmm2, 0x20($val,$index)
+ movdqa %xmm3, 0x30($val,$index)
+
+ ret
+.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
+
+################################################################################
+# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_gather_w7
+.type ecp_nistz256_gather_w7,\@abi-omnipotent
+.align 32
+ecp_nistz256_gather_w7:
___
$code.=<<___ if ($avx>1);
mov OPENSSL_ia32cap_P+8(%rip), %eax
test \$`1<<5`, %eax
- jnz .Lavx2_select_w7
+ jnz .Lavx2_gather_w7
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
-.LSEH_begin_ecp_nistz256_select_w7:
+.LSEH_begin_ecp_nistz256_gather_w7:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
@@ -1641,11 +1684,11 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_select_w7:
+.LSEH_end_ecp_nistz256_gather_w7:
___
$code.=<<___;
ret
-.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if ($avx>1) {
@@ -1656,16 +1699,16 @@ my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
$code.=<<___;
################################################################################
-# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
-.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
+# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
+.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
.align 32
-ecp_nistz256_avx2_select_w5:
-.Lavx2_select_w5:
+ecp_nistz256_avx2_gather_w5:
+.Lavx2_gather_w5:
vzeroupper
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
-.LSEH_begin_ecp_nistz256_avx2_select_w5:
+.LSEH_begin_ecp_nistz256_avx2_gather_w5:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
@@ -1743,11 +1786,11 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_avx2_select_w5:
+.LSEH_end_ecp_nistz256_avx2_gather_w5:
___
$code.=<<___;
ret
-.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
___
}
if ($avx>1) {
@@ -1760,17 +1803,17 @@ my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
$code.=<<___;
################################################################################
-# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
-.globl ecp_nistz256_avx2_select_w7
-.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
+# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_avx2_gather_w7
+.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
.align 32
-ecp_nistz256_avx2_select_w7:
-.Lavx2_select_w7:
+ecp_nistz256_avx2_gather_w7:
+.Lavx2_gather_w7:
vzeroupper
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
-.LSEH_begin_ecp_nistz256_avx2_select_w7:
+.LSEH_begin_ecp_nistz256_avx2_gather_w7:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
@@ -1863,21 +1906,21 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_avx2_select_w7:
+.LSEH_end_ecp_nistz256_avx2_gather_w7:
___
$code.=<<___;
ret
-.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
___
} else {
$code.=<<___;
-.globl ecp_nistz256_avx2_select_w7
-.type ecp_nistz256_avx2_select_w7,\@function,3
+.globl ecp_nistz256_avx2_gather_w7
+.type ecp_nistz256_avx2_gather_w7,\@function,3
.align 32
-ecp_nistz256_avx2_select_w7:
+ecp_nistz256_avx2_gather_w7:
.byte 0x0f,0x0b # ud2
ret
-.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
___
}
{{{
@@ -3087,6 +3130,36 @@ ___
}
}}}
+########################################################################
+# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
+#
+open TABLE,"<ecp_nistz256_table.c" or
+open TABLE,"<${dir}/../ecp_nistz256_table.c" or
+die "failed to open ecp_nistz256_table.c:",$!;
+
+use integer;
+
+foreach(<TABLE>) {
+ s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
+}
+close TABLE;
+
+die "insane number of elements" if ($#arr != 64*16*37-1);
+
+print <<___;
+.text
+.globl ecp_nistz256_precomputed
+.type ecp_nistz256_precomputed,\@object
+.align 4096
+ecp_nistz256_precomputed:
+___
+while (@line=splice(@arr,0,16)) {
+ print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
+}
+print <<___;
+.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
+___
+
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;