author    Andrey Matyukov <andrey.matyukov@intel.com>  2020-12-08 22:53:39 +0300
committer Tomas Mraz <tomas@openssl.org>               2022-11-09 15:29:59 +0100
commit    3d2b47bcdf8cf407ef1e459d54d4501cc19f0227
tree      bf85f45e7ae5af4f40cf392fa86cd55072527499
parent    0bed814750c62c738d509cd0a9655789ae69e99b
Dual 1536/2048-bit exponentiation optimization for Intel IceLake CPU
It uses the AVX512_IFMA + AVX512_VL ISA (with 256-bit wide registers) to keep a lower power license.

Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14908)
(cherry picked from commit f87b4c4ea67393c9269663ed40a7ea3463cc59d3)
-rw-r--r--  CHANGES.md                                                                    |   5
-rw-r--r--  crypto/bn/asm/rsaz-2k-avx512.pl (renamed from crypto/bn/asm/rsaz-avx512.pl)  | 310
-rw-r--r--  crypto/bn/asm/rsaz-3k-avx512.pl                                               | 874
-rw-r--r--  crypto/bn/asm/rsaz-4k-avx512.pl                                               | 930
-rw-r--r--  crypto/bn/bn_exp.c                                                            |  24
-rw-r--r--  crypto/bn/build.info                                                          |   6
-rw-r--r--  crypto/bn/rsaz_exp_x2.c                                                       | 410
-rw-r--r--  test/exptest.c                                                                |   9
8 files changed, 2234 insertions(+), 334 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index 0a15cba87d..700238fe66 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -35,6 +35,11 @@ OpenSSL 3.1
*Paul Dale*
+ * Parallel dual-prime 1536/2048-bit modular exponentiation for
+ AVX512_IFMA capable processors.
+
+ *Sergey Kirillov, Andrey Matyukov (Intel Corp)*
+
OpenSSL 3.0
-----------
diff --git a/crypto/bn/asm/rsaz-avx512.pl b/crypto/bn/asm/rsaz-2k-avx512.pl
index 8d1d19f6c7..80bc4a51b2 100644
--- a/crypto/bn/asm/rsaz-avx512.pl
+++ b/crypto/bn/asm/rsaz-2k-avx512.pl
@@ -7,7 +7,8 @@
# https://www.openssl.org/source/license.html
#
#
-# Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov
+# Originally written by Sergey Kirillov and Andrey Matyukov.
+# Special thanks to Ilya Albrekht for his valuable hints.
# Intel Corporation
#
# December 2020
@@ -86,26 +87,29 @@ ___
###############################################################################
# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52.
#
-# AMM is defined as presented in the paper
-# "Efficient Software Implementations of Modular Exponentiation" by Shay Gueron.
+# AMM is defined as presented in the paper [1].
#
# The input and output are presented in 2^52 radix domain, i.e.
# |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed.
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
-# (note, the implementation counts only 52 bits from it).
#
-# NB: the AMM implementation does not perform "conditional" subtraction step as
-# specified in the original algorithm as according to the paper "Enhanced Montgomery
-# Multiplication" by Shay Gueron (see Lemma 1), the result will be always < 2*2^1024
-# and can be used as a direct input to the next AMM iteration.
-# This post-condition is true, provided the correct parameter |s| is choosen, i.e.
-# s >= n + 2 * k, which matches our case: 1040 > 1024 + 2 * 1.
+# NB: the AMM implementation does not perform the "conditional" subtraction step
+# specified in the original algorithm because, according to Lemma 1 from paper
+# [2], the result will always be < 2*m and can be used as a direct input to
+# the next AMM iteration. This post-condition is true provided the correct
+# parameter |s| (in the notation of Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k,
+# which matches our case: 1040 > 1024 + 2 * 1.
#
-# void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res,
-# const BN_ULONG *a,
-# const BN_ULONG *b,
-# const BN_ULONG *m,
-# BN_ULONG k0);
+# [1] Gueron, S. Efficient software implementations of modular exponentiation.
+# DOI: 10.1007/s13389-012-0031-5
+# [2] Gueron, S. Enhanced Montgomery Multiplication.
+# DOI: 10.1007/3-540-36400-5_5
+#
+# void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
+# const BN_ULONG *a,
+# const BN_ULONG *b,
+# const BN_ULONG *m,
+# BN_ULONG k0);
###############################################################################
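As an aside for readers of this diff (illustration only, not part of the patch): the word-serial AMM described above can be modelled on whole integers, folding in one 52-bit digit of |b| per iteration and never doing a final conditional subtraction. The sketch below is a minimal Perl/Math::BigInt rendition under those assumptions; the amm() helper and the k0 derivation in the usage comment are invented for the example, while the real code works on 20 x 52-bit vector limbs with AVX512_IFMA.

    # Illustration only: word-serial AMM over the 52-bit digits of |b|.
    # Result r satisfies r * 2^(52*n) == a*b (mod m).
    use strict;
    use warnings;
    use Math::BigInt;

    sub amm {
        my ($a, $b, $m, $k0, $n) = @_;              # $k0 == -1/m mod 2^52
        my $mask52 = (Math::BigInt->bone << 52) - 1;
        my $acc = Math::BigInt->bzero;
        for my $i (0 .. $n - 1) {
            my $bi = ($b >> (52 * $i)) & $mask52;   # i-th 52-bit digit of b
            $acc += $a * $bi;
            my $y = ($acc * $k0) & $mask52;         # makes the low digit of acc vanish
            $acc = ($acc + $y * $m) >> 52;          # exact shift: low 52 bits are zero
        }
        return $acc;                                # < 2*m under the s >= n + 2*k condition from [2]
    }

    # Usage sketch for an odd 1024-bit modulus $m (20 digits):
    #   my $two52 = Math::BigInt->bone << 52;
    #   my $k0    = $m->copy->bmodinv($two52)->bneg->bmod($two52);
    #   my $r     = amm($a, $b, $m, $k0, 20);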
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
@@ -121,16 +125,13 @@ my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
-my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm1", map("%ymm$_",(16..19)));
-my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm2", map("%ymm$_",(20..23)));
-my $Bi = "%ymm3";
-my $Yi = "%ymm4";
+my $Bi = "%ymm1";
+my $Yi = "%ymm2";
+my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm3",map("%ymm$_",(16..19)));
+my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm4",map("%ymm$_",(20..23)));
# Registers mapping for normalization.
-# We can reuse Bi, Yi registers here.
-my $TMP = $Bi;
-my $mask52x4 = $Yi;
-my ($T0,$T0h,$T1,$T1h,$T2) = map("%ymm$_", (24..28));
+my ($T0,$T0h,$T1,$T1h,$T2) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (25..26)));
sub amm52x20_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
@@ -199,16 +200,16 @@ $code.=<<___;
___
}
-# Normalization routine: handles carry bits in R0..R2 QWs and
-# gets R0..R2 back to normalized 2^52 representation.
+# Normalization routine: handles carry bits and brings the bignum qwords back
+# to the normalized 2^52 representation.
#
# Uses %r8-14,%e[bcd]x
sub amm52x20_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
- vpbroadcastq $_acc, $TMP
- vpblendd \$3, $TMP, $_R0, $_R0
+ vpbroadcastq $_acc, $T0
+ vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of R0..R2
# Save them to LSB of QWs in T0..T2
@@ -223,14 +224,14 @@ $code.=<<___;
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
- valignq \$3, $zero, $T0, $T0
+ valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..R2 QWs
- vpandq $mask52x4, $_R0, $_R0
- vpandq $mask52x4, $_R0h, $_R0h
- vpandq $mask52x4, $_R1, $_R1
- vpandq $mask52x4, $_R1h, $_R1h
- vpandq $mask52x4, $_R2, $_R2
+ vpandq .Lmask52x4(%rip), $_R0, $_R0
+ vpandq .Lmask52x4(%rip), $_R0h, $_R0h
+ vpandq .Lmask52x4(%rip), $_R1, $_R1
+ vpandq .Lmask52x4(%rip), $_R1h, $_R1h
+ vpandq .Lmask52x4(%rip), $_R2, $_R2
# Sum R0..R2 with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
@@ -241,11 +242,11 @@ $code.=<<___;
# Now handle carry bits from this addition
# Get mask of QWs which 52-bit parts overflow...
- vpcmpuq \$1, $_R0, $mask52x4, %k1 # OP=lt
- vpcmpuq \$1, $_R0h, $mask52x4, %k2
- vpcmpuq \$1, $_R1, $mask52x4, %k3
- vpcmpuq \$1, $_R1h, $mask52x4, %k4
- vpcmpuq \$1, $_R2, $mask52x4, %k5
+ vpcmpuq \$6, .Lmask52x4(%rip), $_R0, %k1 # OP=nle (i.e. gt)
+ vpcmpuq \$6, .Lmask52x4(%rip), $_R0h, %k2
+ vpcmpuq \$6, .Lmask52x4(%rip), $_R1, %k3
+ vpcmpuq \$6, .Lmask52x4(%rip), $_R1h, %k4
+ vpcmpuq \$6, .Lmask52x4(%rip), $_R2, %k5
kmovb %k1, %r14d # k1
kmovb %k2, %r13d # k1h
kmovb %k3, %r12d # k2
@@ -253,11 +254,11 @@ $code.=<<___;
kmovb %k5, %r10d # k3
# ...or saturated
- vpcmpuq \$0, $_R0, $mask52x4, %k1 # OP=eq
- vpcmpuq \$0, $_R0h, $mask52x4, %k2
- vpcmpuq \$0, $_R1, $mask52x4, %k3
- vpcmpuq \$0, $_R1h, $mask52x4, %k4
- vpcmpuq \$0, $_R2, $mask52x4, %k5
+ vpcmpuq \$0, .Lmask52x4(%rip), $_R0, %k1 # OP=eq
+ vpcmpuq \$0, .Lmask52x4(%rip), $_R0h, %k2
+ vpcmpuq \$0, .Lmask52x4(%rip), $_R1, %k3
+ vpcmpuq \$0, .Lmask52x4(%rip), $_R1h, %k4
+ vpcmpuq \$0, .Lmask52x4(%rip), $_R2, %k5
kmovb %k1, %r9d # k4
kmovb %k2, %r8d # k4h
kmovb %k3, %ebx # k5
@@ -297,27 +298,27 @@ $code.=<<___;
kmovb %r10d, %k5
# Add carries according to the obtained mask
- vpsubq $mask52x4, $_R0, ${_R0}{%k1}
- vpsubq $mask52x4, $_R0h, ${_R0h}{%k2}
- vpsubq $mask52x4, $_R1, ${_R1}{%k3}
- vpsubq $mask52x4, $_R1h, ${_R1h}{%k4}
- vpsubq $mask52x4, $_R2, ${_R2}{%k5}
-
- vpandq $mask52x4, $_R0, $_R0
- vpandq $mask52x4, $_R0h, $_R0h
- vpandq $mask52x4, $_R1, $_R1
- vpandq $mask52x4, $_R1h, $_R1h
- vpandq $mask52x4, $_R2, $_R2
+ vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
+ vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
+ vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
+ vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
+ vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
+
+ vpandq .Lmask52x4(%rip), $_R0, $_R0
+ vpandq .Lmask52x4(%rip), $_R0h, $_R0h
+ vpandq .Lmask52x4(%rip), $_R1, $_R1
+ vpandq .Lmask52x4(%rip), $_R1h, $_R1h
+ vpandq .Lmask52x4(%rip), $_R2, $_R2
___
}
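For intuition (again illustration only): the normalization above is the branch-free, lane-parallel counterpart of a plain serial carry sweep over 52-bit digits. A minimal sketch, assuming the digits arrive as little-endian Math::BigInt values:

    # Serial model of the normalization: fold the 12 "carry" bits of every
    # qword into the next digit so each digit is again < 2^52.
    use Math::BigInt;

    sub normalize52 {
        my @d = @_;
        my $mask52 = (Math::BigInt->bone << 52) - 1;
        my $carry  = Math::BigInt->bzero;
        for my $i (0 .. $#d) {
            my $t = $d[$i] + $carry;     # digit plus incoming carry
            $d[$i] = $t & $mask52;       # canonical 52-bit digit
            $carry = $t >> 52;           # high bits ripple into the next digit
        }
        return @d;                       # carry out of the top digit is zero for the bounded AMM outputs here
    }

The "overflow" and "saturated" masks produced by the vpcmpuq instructions above feed a small amount of scalar byte arithmetic that propagates those ripples across lanes, so the vector code reaches the same result without a data-dependent per-digit loop.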
$code.=<<___;
.text
-.globl ossl_rsaz_amm52x20_x1_256
-.type ossl_rsaz_amm52x20_x1_256,\@function,5
+.globl ossl_rsaz_amm52x20_x1_ifma256
+.type ossl_rsaz_amm52x20_x1_ifma256,\@function,5
.align 32
-ossl_rsaz_amm52x20_x1_256:
+ossl_rsaz_amm52x20_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
@@ -332,7 +333,7 @@ ossl_rsaz_amm52x20_x1_256:
.cfi_push %r14
push %r15
.cfi_push %r15
-.Lrsaz_amm52x20_x1_256_body:
+.Lossl_rsaz_amm52x20_x1_ifma256_body:
# Zeroing accumulators
vpxord $zero, $zero, $zero
@@ -360,17 +361,15 @@ $code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop5
-
- vmovdqa64 .Lmask52x4(%rip), $mask52x4
___
&amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
$code.=<<___;
- vmovdqu64 $R0_0, ($res)
- vmovdqu64 $R0_0h, 32($res)
- vmovdqu64 $R1_0, 64($res)
- vmovdqu64 $R1_0h, 96($res)
- vmovdqu64 $R2_0, 128($res)
+ vmovdqu64 $R0_0, `0*32`($res)
+ vmovdqu64 $R0_0h, `1*32`($res)
+ vmovdqu64 $R1_0, `2*32`($res)
+ vmovdqu64 $R1_0h, `3*32`($res)
+ vmovdqu64 $R2_0, `4*32`($res)
vzeroupper
mov 0(%rsp),%r15
@@ -387,10 +386,10 @@ $code.=<<___;
.cfi_restore %rbx
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
-.Lrsaz_amm52x20_x1_256_epilogue:
+.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
ret
.cfi_endproc
-.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
+.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
___
$code.=<<___;
@@ -406,25 +405,25 @@ ___
###############################################################################
# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52
#
-# See description of ossl_rsaz_amm52x20_x1_256() above for details about Almost
+# See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost
# Montgomery Multiplication algorithm and function input parameters description.
#
# This function does two AMMs for two independent inputs, hence dual.
#
-# void ossl_rsaz_amm52x20_x2_256(BN_ULONG out[2][20],
-# const BN_ULONG a[2][20],
-# const BN_ULONG b[2][20],
-# const BN_ULONG m[2][20],
-# const BN_ULONG k0[2]);
+# void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
+# const BN_ULONG a[2][20],
+# const BN_ULONG b[2][20],
+# const BN_ULONG m[2][20],
+# const BN_ULONG k0[2]);
###############################################################################
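Functionally the dual routine is just two independent single AMMs over the packed [2][20] operands; interleaving them in one function presumably exposes more instruction-level parallelism for the IFMA dependency chains. A sketch of the interface only, reusing the hypothetical amm() model from the earlier illustration:

    # Illustration only: the x2 interface as two independent AMMs.
    sub amm_x2 {
        my ($a, $b, $m, $k0, $n) = @_;   # each of $a, $b, $m, $k0 is a 2-element array ref
        return [ amm($a->[0], $b->[0], $m->[0], $k0->[0], $n),
                 amm($a->[1], $b->[1], $m->[1], $k0->[1], $n) ];
    }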
$code.=<<___;
.text
-.globl ossl_rsaz_amm52x20_x2_256
-.type ossl_rsaz_amm52x20_x2_256,\@function,5
+.globl ossl_rsaz_amm52x20_x2_ifma256
+.type ossl_rsaz_amm52x20_x2_ifma256,\@function,5
.align 32
-ossl_rsaz_amm52x20_x2_256:
+ossl_rsaz_amm52x20_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
@@ -439,7 +438,7 @@ ossl_rsaz_amm52x20_x2_256:
.cfi_push %r14
push %r15
.cfi_push %r15
-.Lrsaz_amm52x20_x2_256_body:
+.Lossl_rsaz_amm52x20_x2_ifma256_body:
# Zeroing accumulators
vpxord $zero, $zero, $zero
@@ -472,24 +471,22 @@ $code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop20
-
- vmovdqa64 .Lmask52x4(%rip), $mask52x4
___
&amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
&amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1);
$code.=<<___;
- vmovdqu64 $R0_0, ($res)
- vmovdqu64 $R0_0h, 32($res)
- vmovdqu64 $R1_0, 64($res)
- vmovdqu64 $R1_0h, 96($res)
- vmovdqu64 $R2_0, 128($res)
+ vmovdqu64 $R0_0, `0*32`($res)
+ vmovdqu64 $R0_0h, `1*32`($res)
+ vmovdqu64 $R1_0, `2*32`($res)
+ vmovdqu64 $R1_0h, `3*32`($res)
+ vmovdqu64 $R2_0, `4*32`($res)
- vmovdqu64 $R0_1, 160($res)
- vmovdqu64 $R0_1h, 192($res)
- vmovdqu64 $R1_1, 224($res)
- vmovdqu64 $R1_1h, 256($res)
- vmovdqu64 $R2_1, 288($res)
+ vmovdqu64 $R0_1, `5*32`($res)
+ vmovdqu64 $R0_1h, `6*32`($res)
+ vmovdqu64 $R1_1, `7*32`($res)
+ vmovdqu64 $R1_1h, `8*32`($res)
+ vmovdqu64 $R2_1, `9*32`($res)
vzeroupper
mov 0(%rsp),%r15
@@ -506,10 +503,10 @@ $code.=<<___;
.cfi_restore %rbx
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
-.Lrsaz_amm52x20_x2_256_epilogue:
+.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
ret
.cfi_endproc
-.size ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
+.size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256
___
}
@@ -517,77 +514,76 @@ ___
# Constant time extraction from the precomputed table of powers base^i, where
# i = 0..2^EXP_WIN_SIZE-1
#
-# The input |red_table| contains precomputations for two independent base values,
-# so the |tbl_idx| indicates for which base shall we extract the value.
-# |red_table_idx| is a power index.
+# The input |red_table| contains precomputations for two independent base values.
+# |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
#
-# Extracted value (output) is 20 digit number in 2^52 radix.
+# Extracted values (output) are two 20-digit numbers in 2^52 radix.
#
# void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
-# int red_table_idx,
-# int tbl_idx); # 0 or 1
+# int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
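A rough scalar model of this extraction (illustration only): every one of the 2^5 table entries is read exactly once and merged under an equality mask, so the memory access pattern is independent of the secret indexes. In the sketch the ternary merely stands in for the mask that vpcmpq produces in hardware; the function and variable names are invented for the example.

    # Illustration only: constant-time selection of entries idx1 and idx2 from
    # red_table[1<<5][2][20] (digits as plain 52-bit integers).
    sub extract_two {
        my ($table, $idx1, $idx2) = @_;
        my @y1 = (0) x 20;
        my @y2 = (0) x 20;
        for my $i (0 .. $#{$table}) {
            my $m1 = ($i == $idx1) ? ~0 : 0;            # all-ones / all-zeros lane mask
            my $m2 = ($i == $idx2) ? ~0 : 0;
            for my $j (0 .. 19) {
                $y1[$j] |= $table->[$i][0][$j] & $m1;   # blend in when the mask is set
                $y2[$j] |= $table->[$i][1][$j] & $m2;
            }
        }
        return (\@y1, \@y2);
    }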
{
# input parameters
-my ($out,$red_tbl,$red_tbl_idx,$tbl_idx) = @_6_args_universal_ABI;
+my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-my ($t0,$t1,$t2,$t3,$t4) = map("%ymm$_", (0..4));
-my $t4xmm = $t4;
-$t4xmm =~ s/%y/%x/;
-my ($tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = map("%ymm$_", (16..20));
-my ($cur_idx,$idx,$ones) = map("%ymm$_", (21..23));
+my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
+my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
+my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
+
+my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
+my $t0xmm = $t0;
+$t0xmm =~ s/%y/%x/;
$code.=<<___;
.text
.align 32
.globl ossl_extract_multiplier_2x20_win5
-.type ossl_extract_multiplier_2x20_win5,\@function,4
+.type ossl_extract_multiplier_2x20_win5,\@abi-omnipotent
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
endbranch
- leaq ($tbl_idx,$tbl_idx,4), %rax
- salq \$5, %rax
- addq %rax, $red_tbl
-
vmovdqa64 .Lones(%rip), $ones # broadcast ones
- vpbroadcastq $red_tbl_idx, $idx
+ vpbroadcastq $red_tbl_idx1, $idx1
+ vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*20*8`($red_tbl), %rax # holds end of the tbl
- vpxor $t4xmm, $t4xmm, $t4xmm
- vmovdqa64 $t4, $t3 # zeroing t0..4, cur_idx
- vmovdqa64 $t4, $t2
- vmovdqa64 $t4, $t1
- vmovdqa64 $t4, $t0
- vmovdqa64 $t4, $cur_idx
+ # zeroing t0..n, cur_idx
+ vpxor $t0xmm, $t0xmm, $t0xmm
+ vmovdqa64 $t0, $cur_idx
+___
+foreach (1..9) {
+ $code.="vmovdqa64 $t0, $t[$_] \n";
+}
+$code.=<<___;
.align 32
.Lloop:
- vpcmpq \$0, $cur_idx, $idx, %k1 # mask of (idx == cur_idx)
- addq \$320, $red_tbl # 320 = 2 * 20 digits * 8 bytes
- vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
- vmovdqu64 -320($red_tbl), $tmp0 # load data from red_tbl
- vmovdqu64 -288($red_tbl), $tmp1
- vmovdqu64 -256($red_tbl), $tmp2
- vmovdqu64 -224($red_tbl), $tmp3
- vmovdqu64 -192($red_tbl), $tmp4
- vpblendmq $tmp0, $t0, ${t0}{%k1} # extract data when mask is not zero
- vpblendmq $tmp1, $t1, ${t1}{%k1}
- vpblendmq $tmp2, $t2, ${t2}{%k1}
- vpblendmq $tmp3, $t3, ${t3}{%k1}
- vpblendmq $tmp4, $t4, ${t4}{%k1}
+ vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx)
+ vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx)
+___
+foreach (0..9) {
+ my $mask = $_<5?"%k1":"%k2";
+$code.=<<___;
+ vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl
+ vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
+___
+}
+$code.=<<___;
+ vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
+ addq \$`2*20*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop
-
- vmovdqu64 $t0, ($out) # store t0..4
- vmovdqu64 $t1, 32($out)
- vmovdqu64 $t2, 64($out)
- vmovdqu64 $t3, 96($out)
- vmovdqu64 $t4, 128($out)
-
+___
+# store t0..n
+foreach (0..9) {
+ $code.="vmovdqu64 $t[$_], `${_}*32`($out) \n";
+}
+$code.=<<___;
ret
.cfi_endproc
.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
@@ -597,6 +593,8 @@ $code.=<<___;
.align 32
.Lones:
.quad 1,1,1,1
+.Lzeros:
+ .quad 0,0,0,0
___
}
@@ -606,7 +604,7 @@ $frame="%rdx";
$context="%r8";
$disp="%r9";
-$code.=<<___
+$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_def_handler,\@abi-omnipotent
.align 16
@@ -697,32 +695,24 @@ rsaz_def_handler:
.section .pdata
.align 4
- .rva .LSEH_begin_ossl_rsaz_amm52x20_x1_256
- .rva .LSEH_end_ossl_rsaz_amm52x20_x1_256
- .rva .LSEH_info_ossl_rsaz_amm52x20_x1_256
-
- .rva .LSEH_begin_ossl_rsaz_amm52x20_x2_256
- .rva .LSEH_end_ossl_rsaz_amm52x20_x2_256
- .rva .LSEH_info_ossl_rsaz_amm52x20_x2_256
+ .rva .LSEH_begin_ossl_rsaz_amm52x20_x1_ifma256
+ .rva .LSEH_end_ossl_rsaz_amm52x20_x1_ifma256
+ .rva .LSEH_info_ossl_rsaz_amm52x20_x1_ifma256
- .rva .LSEH_begin_ossl_extract_multiplier_2x20_win5
- .rva .LSEH_end_ossl_extract_multiplier_2x20_win5
- .rva .LSEH_info_ossl_extract_multiplier_2x20_win5
+ .rva .LSEH_begin_ossl_rsaz_amm52x20_x2_ifma256
+ .rva .LSEH_end_ossl_rsaz_amm52x20_x2_ifma256
+ .rva .LSEH_info_ossl_rsaz_amm52x20_x2_ifma256
.section .xdata
.align 8
-.LSEH_info_ossl_rsaz_amm52x20_x1_256:
- .byte 9,0,0,0
- .rva rsaz_def_handler
- .rva .Lrsaz_amm52x20_x1_256_body,.Lrsaz_amm52x20_x1_256_epilogue
-.LSEH_info_ossl_rsaz_amm52x20_x2_256:
+.LSEH_info_ossl_rsaz_amm52x20_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_def_handler
- .rva .Lrsaz_amm52x20_x2_256_body,.Lrsaz_amm52x20_x2_256_epilogue
-.LSEH_info_ossl_extract_multiplier_2x20_win5:
+ .rva .Lossl_rsaz_amm52x20_x1_ifma256_body,.Lossl_rsaz_amm52x20_x1_ifma256_epilogue
+.LSEH_info_ossl_rsaz_amm52x20_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_def_handler
- .rva .LSEH_begin_ossl_extract_multiplier_2x20_win5,.LSEH_begin_ossl_extract_multiplier_2x20_win5
+ .rva .Lossl_rsaz_amm52x20_x2_ifma256_body,.Lossl_rsaz_amm52x20_x2_ifma256_epilogue
___
}
}}} else {{{ # fallback for old assembler
@@ -736,16 +726,16 @@ ossl_rsaz_avx512ifma_eligible:
ret
.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
-.globl ossl_rsaz_amm52x20_x1_256
-.globl ossl_rsaz_amm52x20_x2_256
+.globl ossl_rsaz_amm52x20_x1_ifma256
+.globl ossl_rsaz_amm52x20_x2_ifma256
.globl ossl_extract_multiplier_2x20_win5
-.type ossl_rsaz_amm52x20_x1_256,\@abi-omnipotent
-ossl_rsaz_amm52x20_x1_256:
-ossl_rsaz_amm52x20_x2_256:
+.type ossl_rsaz_amm52x20_x1_ifma256,\@abi-omnipotent
+ossl_rsaz_amm52x20_x1_ifma256:
+ossl_rsaz_amm52x20_x2_ifma256:
ossl_extract_multiplier_2x20_win5:
.byte 0x0f,0x0b # ud2
ret
-.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
+.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
___
}}}
diff --git a/crypto/bn/asm/rsaz-3k-avx512.pl b/crypto/bn/asm/rsaz-3k-avx512.pl
new file mode 100644
index 0000000000..e294afd294
--- /dev/null
+++ b/crypto/bn/asm/rsaz-3k-avx512.pl
@@ -0,0 +1,874 @@
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# Originally written by Sergey Kirillov and Andrey Matyukov
+# Intel Corporation
+#
+# March 2021
+#
+# Initial release.
+#
+# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
+#
+# IceLake-Client @ 1.3GHz
+# |---------+-----------------------+---------------+-------------|
+# | | OpenSSL 3.0.0-alpha15 | this | Unit |
+# |---------+-----------------------+---------------+-------------|
+# | rsa3072 | 6 397 637 | 2 866 593 | cycles/sign |
+# | | 203.2 | 453.5 / +123% | sign/s |
+# |---------+-----------------------+---------------+-------------|
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+$avx512ifma=0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx512ifma = ($1>=2.26);
+}
+
+if (!$avx512 && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+ $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
+}
+
+if (!$avx512 && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+ $avx512ifma = ($2>=7.0);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+if ($avx512ifma>0) {{{
+@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+###############################################################################
+# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52.
+#
+# AMM is defined as presented in the paper [1].
+#
+# The input and output are presented in 2^52 radix domain, i.e.
+# |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
+#
+# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
+#
+# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
+#
+# NB: the AMM implementation does not perform the "conditional" subtraction step
+# specified in the original algorithm because, according to Lemma 1 from paper
+# [2], the result will always be < 2*m and can be used as a direct input to
+# the next AMM iteration. This post-condition is true provided the correct
+# parameter |s| (in the notation of Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k,
+# which matches our case: 1560 > 1536 + 2 * 1.
+#
+# [1] Gueron, S. Efficient software implementations of modular exponentiation.
+# DOI: 10.1007/s13389-012-0031-5
+# [2] Gueron, S. Enhanced Montgomery Multiplication.
+# DOI: 10.1007/3-540-36400-5_5
+#
+# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
+# const BN_ULONG *a,
+# const BN_ULONG *b,
+# const BN_ULONG *m,
+# BN_ULONG k0);
+###############################################################################
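A quick sanity check of the layout stated above (illustration only): 1536-bit operands need ceil(1536/52) = 30 digits of 52 bits, which the code pads with 2 zero qwords to 32 so the data splits into whole 4-qword ymm lanes, and s = 52*30 = 1560 indeed satisfies the s >= n + 2*k condition from [2].

    use POSIX qw(ceil);
    my $digits = ceil(1536 / 52);              # 30 digits of 52 bits
    my $padded = 4 * ceil($digits / 4);        # 32 qwords, 2 of them zero padding
    printf "digits=%d padded=%d s=%d\n", $digits, $padded, 52 * $digits;
    # -> digits=30 padded=32 s=1560, and 1560 > 1536 + 2*1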
+{
+# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
+my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
+
+my $mask52 = "%rax";
+my $acc0_0 = "%r9";
+my $acc0_0_low = "%r9d";
+my $acc0_1 = "%r15";
+my $acc0_1_low = "%r15d";
+my $b_ptr = "%r11";
+
+my $iter = "%ebx";
+
+my $zero = "%ymm0";
+my $Bi = "%ymm1";
+my $Yi = "%ymm2";
+my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
+my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));
+
+# Registers mapping for normalization
+my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));
+
+sub amm52x30_x1() {
+# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
+# of data for corresponding AMM operation;
+# _b_offset - offset in the |b| array pointing to the next qword digit;
+my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
+my $_R0_xmm = $_R0;
+$_R0_xmm =~ s/%y/%x/;
+$code.=<<___;
+ movq $_b_offset($b_ptr), %r13 # b[i]
+
+ vpbroadcastq %r13, $Bi # broadcast b[i]
+ movq $_data_offset($a), %rdx
+ mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
+ addq %r13, $_acc # acc += t0
+ movq %r12, %r10
+ adcq \$0, %r10 # t2 += CF
+
+ movq $_k0, %r13
+ imulq $_acc, %r13 # acc * k0
+ andq $mask52, %r13 # yi = (acc * k0) & mask52
+
+ vpbroadcastq %r13, $Yi # broadcast y[i]
+ movq $_data_offset($m), %rdx
+ mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
+ addq %r13, $_acc # acc += t0
+ adcq %r12, %r10 # t2 += (t1 + CF)
+
+ shrq \$52, $_acc
+ salq \$12, %r10
+ or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
+
+ vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
+ vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
+ vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
+ vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
+ vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
+ vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
+ vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
+ vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
+
+ vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
+ vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
+ vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
+ vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
+ vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
+ vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
+ vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
+ vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
+
+ # Shift accumulators right by 1 qword, zero extending the highest one
+ valignq \$1, $_R0, $_R0h, $_R0
+ valignq \$1, $_R0h, $_R1, $_R0h
+ valignq \$1, $_R1, $_R1h, $_R1
+ valignq \$1, $_R1h, $_R2, $_R1h
+ valignq \$1, $_R2, $_R2h, $_R2
+ valignq \$1, $_R2h, $_R3, $_R2h
+ valignq \$1, $_R3, $_R3h, $_R3
+ valignq \$1, $_R3h, $zero, $_R3h
+
+ vmovq $_R0_xmm, %r13
+ addq %r13, $_acc # acc += R0[0]
+
+ vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
+ vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
+ vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
+ vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
+ vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
+ vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
+ vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
+ vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
+
+ vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
+ vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
+ vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
+ vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
+ vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
+ vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
+ vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
+ vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
+___
+}
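The scalar line acc = ((acc >> 52) | (t2 << 12)) in the routine above realizes a 52-bit right shift of a value held in two 64-bit halves (acc low, t2 high). A tiny, illustration-only check of that identity with arbitrary sample values; it is exact over unbounded integers, and in the assembly t2 stays small enough that the 64-bit truncation of t2 << 12 does not appear to matter:

    use Math::BigInt;
    my $lo   = Math::BigInt->new("0x0123456789abcdef");
    my $hi   = Math::BigInt->new("0x7ff");               # sample high half
    my $wide = $lo + ($hi << 64);                        # 128-bit value (lo, hi)
    print(($wide >> 52) == (($lo >> 52) | ($hi << 12)) ? "ok\n" : "mismatch\n");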
+
+# Normalization routine: handles carry bits and brings the bignum qwords back
+# to the normalized 2^52 representation.
+#
+# Uses %r8-14,%e[abcd]x
+sub amm52x30_x1_norm {
+my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
+$code.=<<___;
+ # Put accumulator to low qword in R0
+ vpbroadcastq $_acc, $T0
+ vpblendd \$3, $T0, $_R0, $_R0
+
+ # Extract "carries" (12 high bits) from each QW of the bignum
+ # Save them to LSB of QWs in T0..Tn
+ vpsrlq \$52, $_R0, $T0
+ vpsrlq \$52, $_R0h, $T0h
+ vpsrlq \$52, $_R1, $T1
+ vpsrlq \$52, $_R1h, $T1h
+ vpsrlq \$52, $_R2, $T2
+ vpsrlq \$52, $_R2h, $T2h
+ vpsrlq \$52, $_R3, $T3
+ vpsrlq \$52, $_R3h, $T3h
+
+ # "Shift left" T0..Tn by 1 QW
+ valignq \$3, $T3, $T3h, $T3h
+ valignq \$3, $T2h, $T3, $T3
+ valignq \$3, $T2, $T2h, $T2h
+ valignq \$3, $T1h, $T2, $T2
+ valignq \$3, $T1, $T1h, $T1h
+ valignq \$3, $T0h, $T1, $T1
+ valignq \$3, $T0, $T0h, $T0h
+ valignq \$3, .Lzeros(%rip), $T0, $T0
+
+ # Drop "carries" from R0..Rn QWs
+ vpandq .Lmask52x4(%rip), $_R0, $_R0
+ vpandq .Lmask52x4(%rip), $_R0h, $_R0h
+ vpandq .Lmask52x4(%rip), $_R1, $_R1
+ vpandq .Lmask52x4(%rip), $_R1h, $_R1h
+ vpandq .Lmask52x4(%rip), $_R2, $_R2
+ vpandq .Lmask52x4(%rip), $_R2h, $_R2h
+ vpandq .Lmask52x4(%rip), $_R3, $_R3
+ vpandq .Lmask52x4(%rip), $_R3h, $_R3h
+
+ # Sum R0..Rn with corresponding adjusted carries
+ vpaddq $T0, $_R0, $_R0
+ vpaddq $T0h, $_R0h, $_R0h
+ vpaddq $T1, $_R1, $_R1
+ vpaddq $T1h, $_R1h, $_R1h
+ vpaddq $T2, $_R2, $_R2
+ vpaddq $T2h, $_R2h, $_R2h
+ vpaddq $T3, $_R3, $_R3
+ vpaddq $T3h, $_R3h, $_R3h
+
+ # Now handle carry bits from this addition
+ # Get mask of QWs whose 52-bit parts overflow
+ vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. gt)
+ vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
+ kmovb %k1,%r14d
+ kmovb %k2,%r13d
+ shl \$4,%r13b
+ or %r13b,%r14b
+
+ vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
+ vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
+ kmovb %k1,%r13d
+ kmovb %k2,%r12d
+ shl \$4,%r12b
+ or %r12b,%r13b
+
+ vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
+ vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
+ kmovb %k1,%r12d
+ kmovb %k2,%r11d
+ shl \$4,%r11b
+ or %r11b,%r12b
+
+ vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
+ vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
+ kmovb %k1,%r11d
+ kmovb %k2,%r10d
+ shl \$4,%r10b
+ or %r10b,%r11b
+
+ addb %r14b,%r14b
+ adcb %r13b,%r13b
+ adcb %r12b,%r12b
+ adcb %r11b,%r11b
+
+ # Get mask of QWs whose 52-bit parts saturated
+ vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq
+ vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
+ kmovb %k1,%r9d
+ kmovb %k2,%r8d
+ shl \$4,%r8b
+ or %r8b,%r9b
+
+ vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
+ vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
+ kmovb %k1,%r8d
+ kmovb %k2,%edx
+ shl \$4,%dl
+ or %dl,%r8b
+
+ vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
+ vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
+ kmovb %k1,%edx
+ kmovb %k2,%ecx
+ shl \$4,%cl
+ or %cl,%dl
+
+ vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
+ vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
+ kmovb %k1,%ecx
+ kmovb %k2,%ebx
+ shl \$4,%bl
+ or %bl,%cl
+
+ addb %r9b,%r14b
+ adcb %r8b,%r13b
+ adcb %dl,%r12b
+ adcb %cl,%r11b
+
+ xor %r9b,%r14b
+ xor %r8b,%r13b
+ xor %dl,%r12b
+ xor %cl,%r11b
+
+ kmovb %r14d,%k1
+ shr \$4,%r14b
+ kmovb %r14d,%k2
+ kmovb %r13d,%k3
+ shr \$4,%r13b
+ kmovb %r13d,%k4
+ kmovb %r12d,%k5
+ shr \$4,%r12b
+ kmovb %r12d,%k6
+ kmovb %r11d,%k7
+
+ vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
+ vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
+ vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
+ vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
+ vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
+ vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
+ vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7}
+
+ vpandq .Lmask52x4(%rip), $_R0, $_R0
+ vpandq .Lmask52x4(%rip), $_R0h, $_R0h
+ vpandq .Lmask52x4(%rip), $_R1, $_R1
+ vpandq .Lmask52x4(%rip), $_R1h, $_R1h
+ vpandq .Lmask52x4(%rip), $_R2, $_R2
+ vpandq .Lmask52x4(%rip), $_R2h, $_R2h
+ vpandq .Lmask52x4(%rip), $_R3, $_R3
+
+ shr \$4,%r11b
+ kmovb %r11d,%k1
+
+ vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
+
+ vpandq .Lmask52x4(%rip), $_R3h, $_R3h
+___
+}
+
+$code.=<<___;