author     Andrey Matyukov <andrey.matyukov@intel.com>  2020-12-08 22:53:39 +0300
committer  Tomas Mraz <tomas@openssl.org>               2022-11-09 15:29:59 +0100
commit     3d2b47bcdf8cf407ef1e459d54d4501cc19f0227 (patch)
tree       bf85f45e7ae5af4f40cf392fa86cd55072527499
parent     0bed814750c62c738d509cd0a9655789ae69e99b (diff)
Dual 1536/2048-bit exponentiation optimization for Intel IceLake CPU
It uses the AVX512_IFMA + AVX512_VL ISA (with 256-bit wide registers) to
stay within a lower CPU power license.
Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14908)
(cherry picked from commit f87b4c4ea67393c9269663ed40a7ea3463cc59d3)
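The optimization is built around the AVX512_IFMA52 multiply-accumulate
instructions. As a reader's aid, here is a scalar C model of what a single
64-bit lane of VPMADD52LUQ/VPMADD52HUQ computes, per the documented
instruction semantics; this is an illustration, not code from this patch:

    #include <stdint.h>

    #define MASK52 ((1ULL << 52) - 1)

    /* dst += low 52 bits of a*b, where a and b are 52-bit digits */
    static uint64_t madd52lo(uint64_t dst, uint64_t a, uint64_t b)
    {
        unsigned __int128 p = (unsigned __int128)(a & MASK52) * (b & MASK52);
        return dst + ((uint64_t)p & MASK52);
    }

    /* dst += high 52 bits of a*b */
    static uint64_t madd52hi(uint64_t dst, uint64_t a, uint64_t b)
    {
        unsigned __int128 p = (unsigned __int128)(a & MASK52) * (b & MASK52);
        return dst + (uint64_t)(p >> 52);
    }

The sum is added into the full 64-bit accumulator, so each qword has 12 bits
of headroom before carries have to be resolved; that headroom is what the
normalization routines below exploit.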
-rw-r--r--  CHANGES.md                                                                   |   5
-rw-r--r--  crypto/bn/asm/rsaz-2k-avx512.pl (renamed from crypto/bn/asm/rsaz-avx512.pl)  | 310
-rw-r--r--  crypto/bn/asm/rsaz-3k-avx512.pl                                              | 874
-rw-r--r--  crypto/bn/asm/rsaz-4k-avx512.pl                                              | 930
-rw-r--r--  crypto/bn/bn_exp.c                                                           |  24
-rw-r--r--  crypto/bn/build.info                                                         |   6
-rw-r--r--  crypto/bn/rsaz_exp_x2.c                                                      | 410
-rw-r--r--  test/exptest.c                                                               |   9
8 files changed, 2234 insertions, 334 deletions
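All operands in the diff below live in the "radix 2^52" domain: each 64-bit
qword carries one 52-bit digit, with the 12 high bits zeroed. A hedged sketch
of the packing step (the names here are illustrative; the real conversion
helpers live in crypto/bn/rsaz_exp_x2.c):

    #include <stdint.h>

    #define DIGIT_SIZE 52
    #define MASK52 ((1ULL << 52) - 1)

    /* Repack a little-endian array of 64-bit limbs into 52-bit digits,
     * one digit per 64-bit qword, 12 high bits zero. */
    static void to_radix52(uint64_t *out, int ndigits,
                           const uint64_t *in, int nlimbs)
    {
        unsigned __int128 acc = 0;  /* bit buffer */
        int bits = 0, i = 0;

        for (int d = 0; d < ndigits; d++) {
            if (bits < DIGIT_SIZE && i < nlimbs) {
                acc |= (unsigned __int128)in[i++] << bits;
                bits += 64;
            }
            out[d] = (uint64_t)acc & MASK52;
            acc >>= DIGIT_SIZE;
            bits -= DIGIT_SIZE;
        }
    }

For the 1024-bit halves used by RSA-2048 this is to_radix52(dgt, 20, limbs, 16),
since 20 * 52 = 1040 >= 1024; the new 3k file works on 30 digits zero-padded to
32 qwords so that whole ymm registers can be loaded, as its header comment notes.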
diff --git a/CHANGES.md b/CHANGES.md
index 0a15cba87d..700238fe66 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -35,6 +35,11 @@ OpenSSL 3.1
 
    *Paul Dale*
 
+ * Parallel dual-prime 1536/2048-bit modular exponentiation for
+   AVX512_IFMA capable processors.
+
+   *Sergey Kirillov, Andrey Matyukov (Intel Corp)*
+
 OpenSSL 3.0
 -----------
 
diff --git a/crypto/bn/asm/rsaz-avx512.pl b/crypto/bn/asm/rsaz-2k-avx512.pl
index 8d1d19f6c7..80bc4a51b2 100644
--- a/crypto/bn/asm/rsaz-avx512.pl
+++ b/crypto/bn/asm/rsaz-2k-avx512.pl
@@ -7,7 +7,8 @@
 # https://www.openssl.org/source/license.html
 #
 #
-# Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov
+# Originally written by Sergey Kirillov and Andrey Matyukov.
+# Special thanks to Ilya Albrekht for his valuable hints.
 # Intel Corporation
 #
 # December 2020
@@ -86,26 +87,29 @@ ___
 ###############################################################################
 # Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52.
 #
-# AMM is defined as presented in the paper
-# "Efficient Software Implementations of Modular Exponentiation" by Shay Gueron.
+# AMM is defined as presented in the paper [1].
 #
 # The input and output are presented in 2^52 radix domain, i.e.
 #   |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed.
 # |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
-# (note, the implementation counts only 52 bits from it).
 #
-# NB: the AMM implementation does not perform "conditional" subtraction step as
-# specified in the original algorithm as according to the paper "Enhanced Montgomery
-# Multiplication" by Shay Gueron (see Lemma 1), the result will be always < 2*2^1024
-# and can be used as a direct input to the next AMM iteration.
-# This post-condition is true, provided the correct parameter |s| is choosen, i.e.
-# s >= n + 2 * k, which matches our case: 1040 > 1024 + 2 * 1.
+# NB: the AMM implementation does not perform "conditional" subtraction step
+# specified in the original algorithm as according to the Lemma 1 from the paper
+# [2], the result will be always < 2*m and can be used as a direct input to
+# the next AMM iteration. This post-condition is true, provided the correct
+# parameter |s| (notion of the Lemma 1 from [2]) is choosen, i.e. s >= n + 2 * k,
+# which matches our case: 1040 > 1024 + 2 * 1.
 #
-# void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res,
-#                                const BN_ULONG *a,
-#                                const BN_ULONG *b,
-#                                const BN_ULONG *m,
-#                                BN_ULONG k0);
+# [1] Gueron, S. Efficient software implementations of modular exponentiation.
+#     DOI: 10.1007/s13389-012-0031-5
+# [2] Gueron, S. Enhanced Montgomery Multiplication.
+#     DOI: 10.1007/3-540-36400-5_5
+#
+# void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
+#                                    const BN_ULONG *a,
+#                                    const BN_ULONG *b,
+#                                    const BN_ULONG *m,
+#                                    BN_ULONG k0);
 ###############################################################################
 {
 # input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
@@ -121,16 +125,13 @@ my $b_ptr = "%r11";
 my $iter = "%ebx";
 
 my $zero = "%ymm0";
-my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm1", map("%ymm$_",(16..19)));
-my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm2", map("%ymm$_",(20..23)));
-my $Bi = "%ymm3";
-my $Yi = "%ymm4";
+my $Bi = "%ymm1";
+my $Yi = "%ymm2";
+my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm3",map("%ymm$_",(16..19)));
+my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm4",map("%ymm$_",(20..23)));
 
 # Registers mapping for normalization.
-# We can reuse Bi, Yi registers here.
-my $TMP = $Bi;
-my $mask52x4 = $Yi;
-my ($T0,$T0h,$T1,$T1h,$T2) = map("%ymm$_", (24..28));
+my ($T0,$T0h,$T1,$T1h,$T2) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (25..26)));
 
 sub amm52x20_x1() {
 # _data_offset - offset in the |a| or |m| arrays pointing to the beginning
@@ -199,16 +200,16 @@ $code.=<<___;
 ___
 }
 
-# Normalization routine: handles carry bits in R0..R2 QWs and
-# gets R0..R2 back to normalized 2^52 representation.
+# Normalization routine: handles carry bits and gets bignum qwords to normalized
+# 2^52 representation.
 #
 # Uses %r8-14,%e[bcd]x
 sub amm52x20_x1_norm {
 my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_;
 $code.=<<___;
     # Put accumulator to low qword in R0
-    vpbroadcastq    $_acc, $TMP
-    vpblendd        \$3, $TMP, $_R0, $_R0
+    vpbroadcastq    $_acc, $T0
+    vpblendd        \$3, $T0, $_R0, $_R0
 
     # Extract "carries" (12 high bits) from each QW of R0..R2
     # Save them to LSB of QWs in T0..T2
@@ -223,14 +224,14 @@ $code.=<<___;
     valignq         \$3, $T1, $T1h, $T1h
    valignq         \$3, $T0h, $T1, $T1
     valignq         \$3, $T0, $T0h, $T0h
-    valignq         \$3, $zero, $T0, $T0
+    valignq         \$3, .Lzeros(%rip), $T0, $T0
 
     # Drop "carries" from R0..R2 QWs
-    vpandq          $mask52x4, $_R0, $_R0
-    vpandq          $mask52x4, $_R0h, $_R0h
-    vpandq          $mask52x4, $_R1, $_R1
-    vpandq          $mask52x4, $_R1h, $_R1h
-    vpandq          $mask52x4, $_R2, $_R2
+    vpandq          .Lmask52x4(%rip), $_R0, $_R0
+    vpandq          .Lmask52x4(%rip), $_R0h, $_R0h
+    vpandq          .Lmask52x4(%rip), $_R1, $_R1
+    vpandq          .Lmask52x4(%rip), $_R1h, $_R1h
+    vpandq          .Lmask52x4(%rip), $_R2, $_R2
 
     # Sum R0..R2 with corresponding adjusted carries
     vpaddq          $T0, $_R0, $_R0
@@ -241,11 +242,11 @@ $code.=<<___;
 
     # Now handle carry bits from this addition
     # Get mask of QWs which 52-bit parts overflow...
-    vpcmpuq         \$1, $_R0, $mask52x4, %k1   # OP=lt
-    vpcmpuq         \$1, $_R0h, $mask52x4, %k2
-    vpcmpuq         \$1, $_R1, $mask52x4, %k3
-    vpcmpuq         \$1, $_R1h, $mask52x4, %k4
-    vpcmpuq         \$1, $_R2, $mask52x4, %k5
+    vpcmpuq         \$6, .Lmask52x4(%rip), $_R0, %k1    # OP=nle (i.e. gt)
+    vpcmpuq         \$6, .Lmask52x4(%rip), $_R0h, %k2
+    vpcmpuq         \$6, .Lmask52x4(%rip), $_R1, %k3
+    vpcmpuq         \$6, .Lmask52x4(%rip), $_R1h, %k4
+    vpcmpuq         \$6, .Lmask52x4(%rip), $_R2, %k5
     kmovb           %k1, %r14d                  # k1
     kmovb           %k2, %r13d                  # k1h
     kmovb           %k3, %r12d                  # k2
@@ -253,11 +254,11 @@ $code.=<<___;
     kmovb           %k5, %r10d                  # k3
 
     # ...or saturated
-    vpcmpuq         \$0, $_R0, $mask52x4, %k1   # OP=eq
-    vpcmpuq         \$0, $_R0h, $mask52x4, %k2
-    vpcmpuq         \$0, $_R1, $mask52x4, %k3
-    vpcmpuq         \$0, $_R1h, $mask52x4, %k4
-    vpcmpuq         \$0, $_R2, $mask52x4, %k5
+    vpcmpuq         \$0, .Lmask52x4(%rip), $_R0, %k1    # OP=eq
+    vpcmpuq         \$0, .Lmask52x4(%rip), $_R0h, %k2
+    vpcmpuq         \$0, .Lmask52x4(%rip), $_R1, %k3
+    vpcmpuq         \$0, .Lmask52x4(%rip), $_R1h, %k4
+    vpcmpuq         \$0, .Lmask52x4(%rip), $_R2, %k5
     kmovb           %k1, %r9d                   # k4
     kmovb           %k2, %r8d                   # k4h
     kmovb           %k3, %ebx                   # k5
@@ -297,27 +298,27 @@ $code.=<<___;
     kmovb           %r10d, %k5
 
     # Add carries according to the obtained mask
-    vpsubq          $mask52x4, $_R0, ${_R0}{%k1}
-    vpsubq          $mask52x4, $_R0h, ${_R0h}{%k2}
-    vpsubq          $mask52x4, $_R1, ${_R1}{%k3}
-    vpsubq          $mask52x4, $_R1h, ${_R1h}{%k4}
-    vpsubq          $mask52x4, $_R2, ${_R2}{%k5}
-
-    vpandq          $mask52x4, $_R0, $_R0
-    vpandq          $mask52x4, $_R0h, $_R0h
-    vpandq          $mask52x4, $_R1, $_R1
-    vpandq          $mask52x4, $_R1h, $_R1h
-    vpandq          $mask52x4, $_R2, $_R2
+    vpsubq          .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
+    vpsubq          .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
+    vpsubq          .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
+    vpsubq          .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
+    vpsubq          .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
+
+    vpandq          .Lmask52x4(%rip), $_R0, $_R0
+    vpandq          .Lmask52x4(%rip), $_R0h, $_R0h
+    vpandq          .Lmask52x4(%rip), $_R1, $_R1
+    vpandq          .Lmask52x4(%rip), $_R1h, $_R1h
+    vpandq          .Lmask52x4(%rip), $_R2, $_R2
 ___
 }
 
 $code.=<<___;
 .text
 
-.globl  ossl_rsaz_amm52x20_x1_256
-.type   ossl_rsaz_amm52x20_x1_256,\@function,5
+.globl  ossl_rsaz_amm52x20_x1_ifma256
+.type   ossl_rsaz_amm52x20_x1_ifma256,\@function,5
 .align 32
-ossl_rsaz_amm52x20_x1_256:
+ossl_rsaz_amm52x20_x1_ifma256:
 .cfi_startproc
     endbranch
     push    %rbx
@@ -332,7 +333,7 @@ ossl_rsaz_amm52x20_x1_256:
 .cfi_push   %r14
     push    %r15
 .cfi_push   %r15
-.Lrsaz_amm52x20_x1_256_body:
+.Lossl_rsaz_amm52x20_x1_ifma256_body:
 
     # Zeroing accumulators
     vpxord  $zero, $zero, $zero
@@ -360,17 +361,15 @@ $code.=<<___;
     lea     `4*8`($b_ptr), $b_ptr
     dec     $iter
     jne     .Lloop5
-
-    vmovdqa64   .Lmask52x4(%rip), $mask52x4
 ___
 &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
 $code.=<<___;
-    vmovdqu64   $R0_0, ($res)
-    vmovdqu64   $R0_0h, 32($res)
-    vmovdqu64   $R1_0, 64($res)
-    vmovdqu64   $R1_0h, 96($res)
-    vmovdqu64   $R2_0, 128($res)
+    vmovdqu64   $R0_0, `0*32`($res)
+    vmovdqu64   $R0_0h, `1*32`($res)
+    vmovdqu64   $R1_0, `2*32`($res)
+    vmovdqu64   $R1_0h, `3*32`($res)
+    vmovdqu64   $R2_0, `4*32`($res)
 
     vzeroupper
     mov  0(%rsp),%r15
@@ -387,10 +386,10 @@ $code.=<<___;
 .cfi_restore    %rbx
     lea     48(%rsp),%rsp
 .cfi_adjust_cfa_offset  -48
-.Lrsaz_amm52x20_x1_256_epilogue:
+.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
     ret
 .cfi_endproc
-.size   ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
+.size   ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
 ___
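For orientation, a scalar C reference of the digit-serial AMM loop that
amm52x20_x1 vectorizes; a sketch under the conventions documented above
(radix-2^52 digits, only the low 52 bits of k0 used, no final conditional
subtraction), not the patch's code:

    #include <stdint.h>
    #include <string.h>

    #define MASK52 ((1ULL << 52) - 1)

    /* r = a * b * 2^(-52*n) mod m, result possibly in [0, 2m) */
    static void amm52_ref(uint64_t *r, const uint64_t *a, const uint64_t *b,
                          const uint64_t *m, uint64_t k0, int n)
    {
        memset(r, 0, (size_t)n * sizeof(r[0]));

        for (int i = 0; i < n; i++) {
            unsigned __int128 t;
            uint64_t c1 = 0, c2 = 0, yi;

            /* r += a * b[i] */
            for (int j = 0; j < n; j++) {
                t = (unsigned __int128)a[j] * b[i] + r[j] + c1;
                r[j] = (uint64_t)t & MASK52;
                c1 = (uint64_t)(t >> 52);
            }

            /* only the low 52 bits of k0 = -1/m mod 2^64 matter here */
            yi = (r[0] * k0) & MASK52;

            /* r += m * yi, which forces r[0] to 0 mod 2^52 ... */
            for (int j = 0; j < n; j++) {
                t = (unsigned __int128)m[j] * yi + r[j] + c2;
                r[j] = (uint64_t)t & MASK52;
                c2 = (uint64_t)(t >> 52);
            }

            /* ... so dividing by 2^52 is a one-digit shift */
            for (int j = 0; j < n - 1; j++)
                r[j] = r[j + 1];
            r[n - 1] = c1 + c2;
        }
        /* no conditional final subtraction, per Lemma 1 of [2] */
    }

The one-digit shift corresponds to the valignq chains in the assembly, and
the deferred per-digit carries are what the normalization routine resolves
once, after all iterations.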
 
 $code.=<<___;
@@ -406,25 +405,25 @@ ___
 ###############################################################################
 # Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52
 #
-# See description of ossl_rsaz_amm52x20_x1_256() above for details about Almost
+# See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost
 # Montgomery Multiplication algorithm and function input parameters description.
 #
 # This function does two AMMs for two independent inputs, hence dual.
 #
-# void ossl_rsaz_amm52x20_x2_256(BN_ULONG out[2][20],
-#                                const BN_ULONG a[2][20],
-#                                const BN_ULONG b[2][20],
-#                                const BN_ULONG m[2][20],
-#                                const BN_ULONG k0[2]);
+# void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
+#                                    const BN_ULONG a[2][20],
+#                                    const BN_ULONG b[2][20],
+#                                    const BN_ULONG m[2][20],
+#                                    const BN_ULONG k0[2]);
 ###############################################################################
 $code.=<<___;
 .text
 
-.globl  ossl_rsaz_amm52x20_x2_256
-.type   ossl_rsaz_amm52x20_x2_256,\@function,5
+.globl  ossl_rsaz_amm52x20_x2_ifma256
+.type   ossl_rsaz_amm52x20_x2_ifma256,\@function,5
 .align 32
-ossl_rsaz_amm52x20_x2_256:
+ossl_rsaz_amm52x20_x2_ifma256:
 .cfi_startproc
     endbranch
     push    %rbx
@@ -439,7 +438,7 @@ ossl_rsaz_amm52x20_x2_256:
 .cfi_push   %r14
     push    %r15
 .cfi_push   %r15
-.Lrsaz_amm52x20_x2_256_body:
+.Lossl_rsaz_amm52x20_x2_ifma256_body:
 
     # Zeroing accumulators
     vpxord  $zero, $zero, $zero
@@ -472,24 +471,22 @@ $code.=<<___;
     lea     8($b_ptr), $b_ptr
     dec     $iter
     jne     .Lloop20
-
-    vmovdqa64   .Lmask52x4(%rip), $mask52x4
 ___
 &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
 &amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1);
 $code.=<<___;
-    vmovdqu64   $R0_0, ($res)
-    vmovdqu64   $R0_0h, 32($res)
-    vmovdqu64   $R1_0, 64($res)
-    vmovdqu64   $R1_0h, 96($res)
-    vmovdqu64   $R2_0, 128($res)
+    vmovdqu64   $R0_0, `0*32`($res)
+    vmovdqu64   $R0_0h, `1*32`($res)
+    vmovdqu64   $R1_0, `2*32`($res)
+    vmovdqu64   $R1_0h, `3*32`($res)
+    vmovdqu64   $R2_0, `4*32`($res)
 
-    vmovdqu64   $R0_1, 160($res)
-    vmovdqu64   $R0_1h, 192($res)
-    vmovdqu64   $R1_1, 224($res)
-    vmovdqu64   $R1_1h, 256($res)
-    vmovdqu64   $R2_1, 288($res)
+    vmovdqu64   $R0_1, `5*32`($res)
+    vmovdqu64   $R0_1h, `6*32`($res)
+    vmovdqu64   $R1_1, `7*32`($res)
+    vmovdqu64   $R1_1h, `8*32`($res)
+    vmovdqu64   $R2_1, `9*32`($res)
 
     vzeroupper
     mov  0(%rsp),%r15
@@ -506,10 +503,10 @@ $code.=<<___;
 .cfi_restore    %rbx
     lea     48(%rsp),%rsp
 .cfi_adjust_cfa_offset  -48
-.Lrsaz_amm52x20_x2_256_epilogue:
+.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
     ret
 .cfi_endproc
-.size   ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
+.size   ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256
 ___
 }
 
@@ -517,77 +514,76 @@ ___
 ###############################################################################
 # Constant time extraction from the precomputed table of powers base^i, where
 #    i = 0..2^EXP_WIN_SIZE-1
 #
-# The input |red_table| contains precomputations for two independent base values,
-# so the |tbl_idx| indicates for which base shall we extract the value.
-# |red_table_idx| is a power index.
+# The input |red_table| contains precomputations for two independent base values.
+# |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
 #
-# Extracted value (output) is 20 digit number in 2^52 radix.
+# Extracted value (output) is 2 20 digit numbers in 2^52 radix.
 #
 # void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
 #                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
-#                                        int red_table_idx,
-#                                        int tbl_idx);           # 0 or 1
+#                                        int red_table_idx1, int red_table_idx2);
 #
 # EXP_WIN_SIZE = 5
 ###############################################################################
 {
 # input parameters
-my ($out,$red_tbl,$red_tbl_idx,$tbl_idx) = @_6_args_universal_ABI;
+my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
+                                                        ("%rdi","%rsi","%rdx","%rcx");  # Unix order
 
-my ($t0,$t1,$t2,$t3,$t4) = map("%ymm$_", (0..4));
-my $t4xmm = $t4;
-$t4xmm =~ s/%y/%x/;
-my ($tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = map("%ymm$_", (16..20));
-my ($cur_idx,$idx,$ones) = map("%ymm$_", (21..23));
+my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
+my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
+my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
+
+my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
+my $t0xmm = $t0;
+$t0xmm =~ s/%y/%x/;
 
 $code.=<<___;
 .text
 
 .align 32
 .globl  ossl_extract_multiplier_2x20_win5
-.type   ossl_extract_multiplier_2x20_win5,\@function,4
+.type   ossl_extract_multiplier_2x20_win5,\@abi-omnipotent
 ossl_extract_multiplier_2x20_win5:
 .cfi_startproc
     endbranch
-    leaq    ($tbl_idx,$tbl_idx,4), %rax
-    salq    \$5, %rax
-    addq    %rax, $red_tbl
     vmovdqa64   .Lones(%rip), $ones         # broadcast ones
-    vpbroadcastq    $red_tbl_idx, $idx
+    vpbroadcastq    $red_tbl_idx1, $idx1
+    vpbroadcastq    $red_tbl_idx2, $idx2
    leaq    `(1<<5)*2*20*8`($red_tbl), %rax  # holds end of the tbl
 
-    vpxor   $t4xmm, $t4xmm, $t4xmm
-    vmovdqa64   $t4, $t3                    # zeroing t0..4, cur_idx
-    vmovdqa64   $t4, $t2
-    vmovdqa64   $t4, $t1
-    vmovdqa64   $t4, $t0
-    vmovdqa64   $t4, $cur_idx
+    # zeroing t0..n, cur_idx
+    vpxor   $t0xmm, $t0xmm, $t0xmm
+    vmovdqa64   $t0, $cur_idx
+___
+foreach (1..9) {
+    $code.="vmovdqa64   $t0, $t[$_] \n";
+}
+$code.=<<___;
 
 .align 32
 .Lloop:
-    vpcmpq  \$0, $cur_idx, $idx, %k1        # mask of (idx == cur_idx)
-    addq    \$320, $red_tbl                 # 320 = 2 * 20 digits * 8 bytes
-    vpaddq  $ones, $cur_idx, $cur_idx       # increment cur_idx
-    vmovdqu64   -320($red_tbl), $tmp0       # load data from red_tbl
-    vmovdqu64   -288($red_tbl), $tmp1
-    vmovdqu64   -256($red_tbl), $tmp2
-    vmovdqu64   -224($red_tbl), $tmp3
-    vmovdqu64   -192($red_tbl), $tmp4
-    vpblendmq   $tmp0, $t0, ${t0}{%k1}      # extract data when mask is not zero
-    vpblendmq   $tmp1, $t1, ${t1}{%k1}
-    vpblendmq   $tmp2, $t2, ${t2}{%k1}
-    vpblendmq   $tmp3, $t3, ${t3}{%k1}
-    vpblendmq   $tmp4, $t4, ${t4}{%k1}
+    vpcmpq  \$0, $cur_idx, $idx1, %k1       # mask of (idx1 == cur_idx)
+    vpcmpq  \$0, $cur_idx, $idx2, %k2       # mask of (idx2 == cur_idx)
+___
+foreach (0..9) {
+    my $mask = $_<5?"%k1":"%k2";
+$code.=<<___;
+    vmovdqu64   `${_}*32`($red_tbl), $tmp             # load data from red_tbl
+    vpblendmq   $tmp, $t[$_], ${t[$_]}{$mask}         # extract data when mask is not zero
+___
+}
+$code.=<<___;
+    vpaddq  $ones, $cur_idx, $cur_idx       # increment cur_idx
+    addq    \$`2*20*8`, $red_tbl
     cmpq    $red_tbl, %rax
     jne     .Lloop
-
-    vmovdqu64   $t0, ($out)                 # store t0..4
-    vmovdqu64   $t1, 32($out)
-    vmovdqu64   $t2, 64($out)
-    vmovdqu64   $t3, 96($out)
-    vmovdqu64   $t4, 128($out)
-
+___
+# store t0..n
foreach (0..9) {
    $code.="vmovdqu64   $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;
     ret
 .cfi_endproc
 .size   ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
@@ -597,6 +593,8 @@ $code.=<<___;
 .align 32
 .Lones:
     .quad   1,1,1,1
+.Lzeros:
+    .quad   0,0,0,0
 ___
 }
 
@@ -606,7 +604,7 @@ $frame="%rdx";
 $context="%r8";
 $disp="%r9";
 
-$code.=<<___
+$code.=<<___;
 .extern __imp_RtlVirtualUnwind
 .type   rsaz_def_handler,\@abi-omnipotent
 .align  16
@@ -697,32 +695,24 @@ rsaz_def_handler:
 
 .section    .pdata
 .align  4
-    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x1_256
-    .rva    .LSEH_end_ossl_rsaz_amm52x20_x1_256
-    .rva    .LSEH_info_ossl_rsaz_amm52x20_x1_256
-
-    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x2_256
-    .rva    .LSEH_end_ossl_rsaz_amm52x20_x2_256
-    .rva    .LSEH_info_ossl_rsaz_amm52x20_x2_256
+    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x1_ifma256
+    .rva    .LSEH_end_ossl_rsaz_amm52x20_x1_ifma256
+    .rva    .LSEH_info_ossl_rsaz_amm52x20_x1_ifma256
 
-    .rva    .LSEH_begin_ossl_extract_multiplier_2x20_win5
-    .rva    .LSEH_end_ossl_extract_multiplier_2x20_win5
-    .rva    .LSEH_info_ossl_extract_multiplier_2x20_win5
+    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x2_ifma256
+    .rva    .LSEH_end_ossl_rsaz_amm52x20_x2_ifma256
+    .rva    .LSEH_info_ossl_rsaz_amm52x20_x2_ifma256
 
 .section    .xdata
 .align  8
-.LSEH_info_ossl_rsaz_amm52x20_x1_256:
-    .byte   9,0,0,0
-    .rva    rsaz_def_handler
-    .rva    .Lrsaz_amm52x20_x1_256_body,.Lrsaz_amm52x20_x1_256_epilogue
-.LSEH_info_ossl_rsaz_amm52x20_x2_256:
+.LSEH_info_ossl_rsaz_amm52x20_x1_ifma256:
     .byte   9,0,0,0
     .rva    rsaz_def_handler
-    .rva    .Lrsaz_amm52x20_x2_256_body,.Lrsaz_amm52x20_x2_256_epilogue
-.LSEH_info_ossl_extract_multiplier_2x20_win5:
+    .rva    .Lossl_rsaz_amm52x20_x1_ifma256_body,.Lossl_rsaz_amm52x20_x1_ifma256_epilogue
+.LSEH_info_ossl_rsaz_amm52x20_x2_ifma256:
     .byte   9,0,0,0
     .rva    rsaz_def_handler
-    .rva    .LSEH_begin_ossl_extract_multiplier_2x20_win5,.LSEH_begin_ossl_extract_multiplier_2x20_win5
+    .rva    .Lossl_rsaz_amm52x20_x2_ifma256_body,.Lossl_rsaz_amm52x20_x2_ifma256_epilogue
 ___
 }
 }}} else {{{    # fallback for old assembler
 $code.=<<___;
@@ -736,16 +726,16 @@ ossl_rsaz_avx512ifma_eligible:
     ret
 .size   ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
 
-.globl  ossl_rsaz_amm52x20_x1_256
-.globl  ossl_rsaz_amm52x20_x2_256
+.globl  ossl_rsaz_amm52x20_x1_ifma256
+.globl  ossl_rsaz_amm52x20_x2_ifma256
 .globl  ossl_extract_multiplier_2x20_win5
-.type   ossl_rsaz_amm52x20_x1_256,\@abi-omnipotent
-ossl_rsaz_amm52x20_x1_256:
-ossl_rsaz_amm52x20_x2_256:
+.type   ossl_rsaz_amm52x20_x1_ifma256,\@abi-omnipotent
+ossl_rsaz_amm52x20_x1_ifma256:
+ossl_rsaz_amm52x20_x2_ifma256:
 ossl_extract_multiplier_2x20_win5:
     .byte   0x0f,0x0b   # ud2
     ret
-.size   ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
+.size   ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
 ___
 }}}
diff --git a/crypto/bn/asm/rsaz-3k-avx512.pl b/crypto/bn/asm/rsaz-3k-avx512.pl
new file mode 100644
index 0000000000..e294afd294
--- /dev/null
+++ b/crypto/bn/asm/rsaz-3k-avx512.pl
@@ -0,0 +1,874 @@
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# Originally written by Sergey Kirillov and Andrey Matyukov
+# Intel Corporation
+#
+# March 2021
+#
+# Initial release.
+#
+# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
+#
+# IceLake-Client @ 1.3GHz
+# |---------+-----------------------+---------------+-------------|
+# |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
+# |---------+-----------------------+---------------+-------------|
+# | rsa3072 | 6 397 637             | 2 866 593     | cycles/sign |
+# |         | 203.2                 | 453.5 / +123% | sign/s      |
+# |---------+-----------------------+---------------+-------------|
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+$avx512ifma=0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+    $avx512ifma = ($1>=2.26);
+}
+
+if (!$avx512 && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
+}
+
+if (!$avx512 && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+    $avx512ifma = ($2>=7.0);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+if ($avx512ifma>0) {{{
+@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+###############################################################################
+# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52.
+#
+# AMM is defined as presented in the paper [1].
+#
+# The input and output are presented in 2^52 radix domain, i.e.
+#   |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
+#
+#   NOTE: the function uses zero-padded data - 2 high QWs is a padding.
+#
+# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
+#
+# NB: the AMM implementation does not perform "conditional" subtraction step
+# specified in the original algorithm as according to the Lemma 1 from the paper
+# [2], the result will be always < 2*m and can be used as a direct input to
+# the next AMM iteration. This post-condition is true, provided the correct
+# parameter |s| (notion of the Lemma 1 from [2]) is choosen, i.e. s >= n + 2 * k,
+# which matches our case: 1560 > 1536 + 2 * 1.
+#
+# [1] Gueron, S. Efficient software implementations of modular exponentiation.
+#     DOI: 10.1007/s13389-012-0031-5
+# [2] Gueron, S. Enhanced Montgomery Multiplication.
+#     DOI: 10.1007/3-540-36400-5_5
+#
+# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
+#                                    const BN_ULONG *a,
+#                                    const BN_ULONG *b,
+#                                    const BN_ULONG *m,
+#                                    BN_ULONG k0);
+###############################################################################
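The Montgomery coefficient k0 is not computed in these files; callers derive
it from the odd low limb m0 of the modulus. One standard way to do that, shown
here as an assumption for illustration rather than as this patch's code, is
Hensel lifting, which doubles the number of correct low bits each step:

    #include <stdint.h>

    /* k0 = -1/m mod 2^64, from the odd low limb m0 of the modulus.
     * x = m0 is already an inverse mod 8 (odd^2 == 1 mod 8); each step
     * x = x*(2 - m0*x) doubles the correct low bits:
     * 3 -> 6 -> 12 -> 24 -> 48 -> 96 >= 64. */
    static uint64_t mont_k0(uint64_t m0)
    {
        uint64_t x = m0;
        for (int i = 0; i < 5; i++)
            x *= 2 - m0 * x;
        return 0 - x;   /* negate to get -1/m0 mod 2^64 */
    }

As the header comment notes, the per-iteration reduction only ever consumes
the low 52 bits of this value.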
+{
+# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
+my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
+
+my $mask52 = "%rax";
+my $acc0_0 = "%r9";
+my $acc0_0_low = "%r9d";
+my $acc0_1 = "%r15";
+my $acc0_1_low = "%r15d";
+my $b_ptr = "%r11";
+
+my $iter = "%ebx";
+
+my $zero = "%ymm0";
+my $Bi = "%ymm1";
+my $Yi = "%ymm2";
+my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
+my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));
+
+# Registers mapping for normalization
+my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));
+
+sub amm52x30_x1() {
+# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
+# of data for corresponding AMM operation;
+# _b_offset - offset in the |b| array pointing to the next qword digit;
+my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
+my $_R0_xmm = $_R0;
+$_R0_xmm =~ s/%y/%x/;
+$code.=<<___;
+    movq    $_b_offset($b_ptr), %r13    # b[i]
+
+    vpbroadcastq    %r13, $Bi           # broadcast b[i]
+    movq    $_data_offset($a), %rdx
+    mulx    %r13, %r13, %r12            # a[0]*b[i] = (t0,t2)
+    addq    %r13, $_acc                 # acc += t0
+    movq    %r12, %r10
+    adcq    \$0, %r10                   # t2 += CF
+
+    movq    $_k0, %r13
+    imulq   $_acc, %r13                 # acc * k0
+    andq    $mask52, %r13               # yi = (acc * k0) & mask52
+
+    vpbroadcastq    %r13, $Yi           # broadcast y[i]
+    movq    $_data_offset($m), %rdx
+    mulx    %r13, %r13, %r12            # yi * m[0] = (t0,t1)
+    addq    %r13, $_acc                 # acc += t0
+    adcq    %r12, %r10                  # t2 += (t1 + CF)
+
+    shrq    \$52, $_acc
+    salq    \$12, %r10
+    or      %r10, $_acc                 # acc = ((acc >> 52) | (t2 << 12))
+
+    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
+    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
+    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
+    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
+    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
+    vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
+    vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
+    vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
+
+    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
+    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
+    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
+    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
+    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
+    vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
+    vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
+    vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
+
+    # Shift accumulators right by 1 qword, zero extending the highest one
+    valignq \$1, $_R0, $_R0h, $_R0
+    valignq \$1, $_R0h, $_R1, $_R0h
+    valignq \$1, $_R1, $_R1h, $_R1
+    valignq \$1, $_R1h, $_R2, $_R1h
+    valignq \$1, $_R2, $_R2h, $_R2
+    valignq \$1, $_R2h, $_R3, $_R2h
+    valignq \$1, $_R3, $_R3h, $_R3
+    valignq \$1, $_R3h, $zero, $_R3h
+
+    vmovq   $_R0_xmm, %r13
+    addq    %r13, $_acc    # acc += R0[0]
+
+    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
+    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
+    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
+    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
+    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
+    vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
+    vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
+    vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
+
+    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
+    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
+    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
+    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
+    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
+    vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
+    vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
+    vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
+___
+}
+
+# Normalization routine: handles carry bits and gets bignum qwords to normalized
+# 2^52 representation.
+#
+# Uses %r8-14,%e[abcd]x
+sub amm52x30_x1_norm {
+my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
+$code.=<<___;
+    # Put accumulator to low qword in R0
+    vpbroadcastq    $_acc, $T0
+    vpblendd    \$3, $T0, $_R0, $_R0
+
+    # Extract "carries" (12 high bits) from each QW of the bignum
+    # Save them to LSB of QWs in T0..Tn
+    vpsrlq  \$52, $_R0, $T0
+    vpsrlq  \$52, $_R0h, $T0h
+    vpsrlq  \$52, $_R1, $T1
+    vpsrlq  \$52, $_R1h, $T1h
+    vpsrlq  \$52, $_R2, $T2
+    vpsrlq  \$52, $_R2h, $T2h
+    vpsrlq  \$52, $_R3, $T3
+    vpsrlq  \$52, $_R3h, $T3h
+
+    # "Shift left" T0..Tn by 1 QW
+    valignq \$3, $T3, $T3h, $T3h
+    valignq \$3, $T2h, $T3, $T3
+    valignq \$3, $T2, $T2h, $T2h
+    valignq \$3, $T1h, $T2, $T2
+    valignq \$3, $T1, $T1h, $T1h
+    valignq \$3, $T0h, $T1, $T1
+    valignq \$3, $T0, $T0h, $T0h
+    valignq \$3, .Lzeros(%rip), $T0, $T0
+
+    # Drop "carries" from R0..Rn QWs
+    vpandq  .Lmask52x4(%rip), $_R0, $_R0
+    vpandq  .Lmask52x4(%rip), $_R0h, $_R0h
+    vpandq  .Lmask52x4(%rip), $_R1, $_R1
+    vpandq  .Lmask52x4(%rip), $_R1h, $_R1h
+    vpandq  .Lmask52x4(%rip), $_R2, $_R2
+    vpandq  .Lmask52x4(%rip), $_R2h, $_R2h
+    vpandq  .Lmask52x4(%rip), $_R3, $_R3
+    vpandq  .Lmask52x4(%rip), $_R3h, $_R3h
+
+    # Sum R0..Rn with corresponding adjusted carries
+    vpaddq  $T0, $_R0, $_R0
+    vpaddq  $T0h, $_R0h, $_R0h
+    vpaddq  $T1, $_R1, $_R1
+    vpaddq  $T1h, $_R1h, $_R1h
+    vpaddq  $T2, $_R2, $_R2
+    vpaddq  $T2h, $_R2h, $_R2h
+    vpaddq  $T3, $_R3, $_R3
+    vpaddq  $T3h, $_R3h, $_R3h
+
+    # Now handle carry bits from this addition
+    # Get mask of QWs whose 52-bit parts overflow
+    vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1     # OP=nle (i.e. gt)
+    vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
+    kmovb   %k1,%r14d
+    kmovb   %k2,%r13d
+    shl     \$4,%r13b
+    or      %r13b,%r14b
+
+    vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
+    vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
+    kmovb   %k1,%r13d
+    kmovb   %k2,%r12d
+    shl     \$4,%r12b
+    or      %r12b,%r13b
+
+    vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
+    vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
+    kmovb   %k1,%r12d
+    kmovb   %k2,%r11d
+    shl     \$4,%r11b
+    or      %r11b,%r12b
+
+    vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
+    vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
+    kmovb   %k1,%r11d
+    kmovb   %k2,%r10d
+    shl     \$4,%r10b
+    or      %r10b,%r11b
+
+    addb    %r14b,%r14b
+    adcb    %r13b,%r13b
+    adcb    %r12b,%r12b
+    adcb    %r11b,%r11b
+
+    # Get mask of QWs whose 52-bit parts saturated
+    vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1     # OP=eq
+    vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
+    kmovb   %k1,%r9d
+    kmovb   %k2,%r8d
+    shl     \$4,%r8b
+    or      %r8b,%r9b
+
+    vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
+    vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
+    kmovb   %k1,%r8d
+    kmovb   %k2,%edx
+    shl     \$4,%dl
+    or      %dl,%r8b
+
+    vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
+    vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
+    kmovb   %k1,%edx
+    kmovb   %k2,%ecx
+    shl     \$4,%cl
+    or      %cl,%dl
+
+    vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
+    vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
+    kmovb   %k1,%ecx
+    kmovb   %k2,%ebx
+    shl     \$4,%bl
+    or      %bl,%cl
+
+    addb    %r9b,%r14b
+    adcb    %r8b,%r13b
+    adcb    %dl,%r12b
+    adcb    %cl,%r11b
+
+    xor     %r9b,%r14b
+    xor     %r8b,%r13b
+    xor     %dl,%r12b
+    xor     %cl,%r11b
+
+    kmovb   %r14d,%k1
+    shr     \$4,%r14b
+    kmovb   %r14d,%k2
+    kmovb   %r13d,%k3
+    shr     \$4,%r13b
+    kmovb   %r13d,%k4
+    kmovb   %r12d,%k5
+    shr     \$4,%r12b
+    kmovb   %r12d,%k6
+    kmovb   %r11d,%k7
+
+    vpsubq  .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
+    vpsubq  .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
+    vpsubq  .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
+    vpsubq  .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
+    vpsubq  .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
+    vpsubq  .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
+    vpsubq  .Lmask52x4(%rip), $_R3, ${_R3}{%k7}
+
+    vpandq  .Lmask52x4(%rip), $_R0, $_R0
+    vpandq  .Lmask52x4(%rip), $_R0h, $_R0h
+    vpandq  .Lmask52x4(%rip), $_R1, $_R1
+    vpandq  .Lmask52x4(%rip), $_R1h, $_R1h
+    vpandq  .Lmask52x4(%rip), $_R2, $_R2
+    vpandq  .Lmask52x4(%rip), $_R2h, $_R2h
+    vpandq  .Lmask52x4(%rip), $_R3, $_R3
+
+    shr     \$4,%r11b
+    kmovb   %r11d,%k1
+
+    vpsubq  .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
+
+    vpandq  .Lmask52x4(%rip), $_R3h, $_R3h
+___
+}
+
+$code.=<<___;
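The mask choreography in the normalization routine (vpcmpuq/kmovb/addb/adcb/
xor) is a branchless ripple carry: one byte-wide addition lets a carry run
through a whole chain of saturated digits at once, and the xor recovers
exactly which digits must absorb +1. A scalar C model of the idea, offered as
a sketch of my reading of the code rather than a drop-in equivalent:

    #include <stdint.h>

    #define MASK52 ((1ULL << 52) - 1)

    /* r[i] may exceed 2^52-1 by a small excess after the carry addition;
     * normalize all digits without a sequential per-digit carry loop. */
    static void propagate_carries(uint64_t *r, int n)   /* n <= 64 */
    {
        uint64_t overflow = 0, saturated = 0, carries;

        for (int i = 0; i < n; i++) {
            overflow  |= (uint64_t)(r[i] > MASK52)  << i;  /* generates carry */
            saturated |= (uint64_t)(r[i] == MASK52) << i;  /* propagates carry */
        }

        /* shift the generated carries up one digit, then let them ripple
         * through runs of saturated digits in a single addition; the xor
         * leaves set exactly the digits that must absorb +1 */
        carries = ((overflow << 1) + saturated) ^ saturated;

        for (int i = 0; i < n; i++) {
            r[i] += (carries >> i) & 1;
            r[i] &= MASK52;   /* also drops bit 52 of overflowed digits */
        }
    }

In the assembly the "+1" is realized as a masked vpsubq of 2^52-1 followed by
vpandq with the same mask, which is the same operation modulo 2^52.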