author     fangming.fang <fangming.fang@arm.com>      2021-03-19 06:45:57 +0000
committer  Pauli <pauli@openssl.org>                   2021-05-09 23:15:07 +1000
commit     10646160125ac1328d892f1dd27f2847892d33c5 (patch)
tree       173014dbe2144c6b47dd4f95c79538cf08c318b4 /crypto/bn
parent     f0f4a46c4f5c82d4d9d0fb8a51d546c3135668a2 (diff)
Optimize RSA on armv8
Add a Neon path for RSA on armv8. This optimisation targets the A72 and N1, which are among the important infrastructure cores; other platforms are not impacted.

A72                     old         new    improved
rsa   512 sign       9828.6      9738.7     -1%
rsa   512 verify   121497.2    122367.7      1%
rsa  1024 sign         1818      1816.9      0%
rsa  1024 verify    37175.6     37161.3      0%
rsa  2048 sign        267.3       267.4      0%
rsa  2048 verify    10127.6     10119.6      0%
rsa  3072 sign         86.8          87      0%
rsa  3072 verify     4604.2      4956.2      8%
rsa  4096 sign         38.3        38.5      1%
rsa  4096 verify     2619.8      2972.1     13%
rsa  7680 sign            5           7     40%
rsa  7680 verify        756       929.4     23%
rsa 15360 sign          0.8           1     25%
rsa 15360 verify      190.4         246     29%

N1                      old         new    improved
rsa   512 sign      12599.2     12596.7      0%
rsa   512 verify   148636.1    148656.2      0%
rsa  1024 sign       2150.6      2148.9      0%
rsa  1024 verify    42353.5     42265.2      0%
rsa  2048 sign        305.5       305.3      0%
rsa  2048 verify    11209.7     11205.2      0%
rsa  3072 sign         97.8        98.2      0%
rsa  3072 verify     5061.3      5990.7     18%
rsa  4096 sign         42.8          43      0%
rsa  4096 verify     2867.6      3509.8     22%
rsa  7680 sign          5.5         8.4     53%
rsa  7680 verify      823.5      1058.3     29%
rsa 15360 sign          0.9         1.1     22%
rsa 15360 verify        207       273.9     32%

CustomizedGitHooks: yes
Change-Id: I01c732cc429d793c4eb5ffd27ccd30ff9cebf8af
Jira: SECLIB-540

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14761)
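For orientation, the gate the patch adds in front of the NEON code can be read from the bn_mul_mont changes below: the 8x NEON routine is entered only when the limb count is a multiple of 4, the operand is larger than 32 limbs, and the runtime capability flag OPENSSL_armv8_rsa_neonized is non-zero. The following is a minimal C sketch of that decision logic, not OpenSSL code: the helper name use_neon_path is illustrative only, and the flag's C type is assumed here (the assembly loads it as a 32-bit word).

    /* Illustrative sketch only -- the real check lives in the generated
     * AArch64 assembly below.  "num" is the modulus size in BN_ULONG
     * limbs, as passed to bn_mul_mont(). */
    extern unsigned int OPENSSL_armv8_rsa_neonized; /* set elsewhere during CPU capability detection; type assumed */

    static int use_neon_path(int num)
    {
        if (num % 4 != 0)   /* tst num,#3  : NEON path needs a multiple of 4 limbs */
            return 0;
        if (num <= 32)      /* cmp num,#32 : small operands stay on the scalar code */
            return 0;
        return OPENSSL_armv8_rsa_neonized != 0; /* cbnz w17,bn_mul8x_mont_neon */
    }

This matches the benchmark tables above: the gains appear only for the larger moduli (3072-bit verify and up), while smaller sizes continue to use the existing scalar sqr8x/mul4x paths.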
Diffstat (limited to 'crypto/bn')
-rwxr-xr-x  crypto/bn/asm/armv8-mont.pl   381
-rw-r--r--  crypto/bn/build.info            1
2 files changed, 382 insertions, 0 deletions
diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl
index e8bdfa3bb8..0867ccabee 100755
--- a/crypto/bn/asm/armv8-mont.pl
+++ b/crypto/bn/asm/armv8-mont.pl
@@ -67,16 +67,34 @@ $n0="x4"; # const BN_ULONG *n0,
$num="x5"; # int num);
$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armv8_rsa_neonized
+.hidden OPENSSL_armv8_rsa_neonized
+#endif
.text
.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
+.Lbn_mul_mont:
+ tst $num,#3
+ b.ne .Lmul_mont
+ cmp $num,#32
+ b.le .Lscalar_impl
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armv8_rsa_neonized
+ ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
+ cbnz w17, bn_mul8x_mont_neon
+#endif
+
+.Lscalar_impl:
tst $num,#7
b.eq __bn_sqr8x_mont
tst $num,#3
b.eq __bn_mul4x_mont
+
.Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
@@ -274,6 +292,369 @@ bn_mul_mont:
.size bn_mul_mont,.-bn_mul_mont
___
{
+my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
+my ($Z,$Temp)=("v4.16b","v5");
+my @ACC=map("v$_",(6..13));
+my ($Bi,$Ni,$M0)=map("v$_",(28..30));
+my $sBi="s28";
+my $sM0="s30";
+my $zero="v14";
+my $temp="v15";
+my $ACCTemp="v16";
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
+
+$code.=<<___;
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ stp x29,x30,[sp,#-80]!
+ mov x16,sp
+ stp d8,d9,[sp,#16]
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+ lsl $num,$num,#1
+ eor $zero.16b,$zero.16b,$zero.16b
+
+.align 4
+.LNEON_8n:
+ eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
+ sub $toutptr,sp,#128
+ eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
+ sub $toutptr,$toutptr,$num,lsl#4
+ eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
+ and $toutptr,$toutptr,#-64
+ eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
+ mov sp,$toutptr // alloca
+ eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
+ add $toutptr,$toutptr,#256
+ eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
+ sub $inner,$num,#8
+ eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
+ eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
+
+.LNEON_8n_init:
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+ subs $inner,$inner,#8
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+ st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
+ bne .LNEON_8n_init
+
+ add $tinptr,sp,#256
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ add $bnptr,sp,#8
+ ldr $sM0,[$n0],#4
+ mov $outer,$num
+ b .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+ ldr $sBi,[$bptr],#4 // *b++
+ uxtl $Bi.4s,$Bi.4h
+ add $toutptr,sp,#128
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
+
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ shl $Ni.2d,@ACC[0].2d,#16
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ mul $Ni.2s,$Ni.2s,$M0.2s
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ uxtl $Ni.4s,$Ni.4h
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+ ldr $sBi,[$bptr],#4 // *b++
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ uxtl $Bi.4s,$Bi.4h
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ ushr $temp.2d,@ACC[0].2d,#16
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ ushr @ACC[0].2d,@ACC[0].2d,#16
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
+ ins @ACC[1].d[0],$ACCTemp.d[0]
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
+___
+ push(@ACC,shift(@ACC)); $i++;
+$code.=<<___;
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr],#16
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ shl $Ni.2d,@ACC[0].2d,#16
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ mul $Ni.2s,$Ni.2s,$M0.2s
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ uxtl $Ni.4s,$Ni.4h
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ mov $Temp.16b,@ACC[0].16b
+ ushr $Temp.2d,$Temp.2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ add @ACC[0].2d,@ACC[0].2d,$Temp.2d
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ ushr @ACC[0].2d,@ACC[0].2d,#16
+ eor $temp.16b,$temp.16b,$temp.16b
+ ins @ACC[0].d[1],$temp.d[0]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
+ add $bnptr,sp,#8 // rewind
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ sub $inner,$num,#8
+ b .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
+ subs $inner,$inner,#8
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ b.eq .LInner_jump
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
+.LInner_jump:
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+ ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ st1 {@ACC[0].2d},[$toutptr],#16
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ b.eq .LInner_jump$i
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
+.LInner_jump$i:
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+ b.ne .LInner_after_rewind$i
+ sub $aptr,$aptr,$num,lsl#2 // rewind
+.LInner_after_rewind$i:
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ add $bnptr,sp,#8 // rewind
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ st1 {@ACC[0].2d},[$toutptr],#16
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+
+ bne .LNEON_8n_inner
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ add $tinptr,sp,#128
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+ eor $N0.16b,$N0.16b,$N0.16b // $N0
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+ eor $N1.16b,$N1.16b,$N1.16b // $N1
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+ st1 {@ACC[6].2d},[$toutptr]
+
+ subs $outer,$outer,#8
+ ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
+ ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
+ ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
+ ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
+
+ b.eq .LInner_8n_jump_2steps
+ sub $nptr,$nptr,$num,lsl#2 // rewind
+ b .LNEON_8n_outer
+
+.LInner_8n_jump_2steps:
+ add $toutptr,sp,#128
+ st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
+ mov $Temp.16b,@ACC[0].16b
+ ushr $temp.2d,@ACC[0].2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ ushr $temp.2d,@ACC[0].2d,#16
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
+ ins $temp.d[1],$zero.d[0]
+
+ mov $inner,$num
+ b .LNEON_tail_entry
+
+.align 4
+.LNEON_tail:
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ mov $Temp.16b,@ACC[0].16b
+ ushr $temp.2d,@ACC[0].2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
+ ushr $temp.2d,@ACC[0].2d,#16
+ ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
+ ins $temp.d[1],$zero.d[0]
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
+ st1 {@ACC[0].s}[0], [$toutptr],#4
+ ushr $temp.2d,@ACC[1].2d,#16
+ mov $Temp.16b,@ACC[1].16b
+ ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
+ ushr $temp.2d,@ACC[1].2d,#16
+ zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
+ ins $temp.d[1],$zero.d[0]
+___
+ push(@ACC,shift(@ACC));
+}
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
+ subs $inner,$inner,#8
+ st1 {@ACC[7].s}[0], [$toutptr],#4
+ bne .LNEON_tail
+
+ st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
+ sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
+ subs $aptr,sp,#0 // clear carry flag
+ add $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ ldp w8,w9,[$nptr],#8
+ ldp w10,w11,[$nptr],#8
+ sbcs w8,w4,w8
+ sbcs w9,w5,w9
+ sbcs w10,w6,w10
+ sbcs w11,w7,w11
+ sub x17,$bptr,$aptr
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ cbnz x17,.LNEON_sub
+
+ ldr w10, [$aptr] // load top-most bit
+ mov x11,sp
+ eor v0.16b,v0.16b,v0.16b
+ sub x11,$bptr,x11 // this is num*4
+ eor v1.16b,v1.16b,v1.16b
+ mov $aptr,sp
+ sub $rptr,$rptr,x11 // rewind $rptr
+ mov $nptr,$bptr // second 3/4th of frame
+ sbcs w10,w10,wzr // result is carry flag
+
+.LNEON_copy_n_zap:
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ ldp w8,w9,[$rptr],#8
+ ldp w10,w11,[$rptr]
+ sub $rptr,$rptr,#8
+ b.cs .LCopy_1
+ mov w8,w4
+ mov w9,w5
+ mov w10,w6
+ mov w11,w7
+.LCopy_1:
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ sub $aptr,$aptr,#32
+ ldp w8,w9,[$rptr],#8
+ ldp w10,w11,[$rptr]
+ sub $rptr,$rptr,#8
+ b.cs .LCopy_2
+ mov w8, w4
+ mov w9, w5
+ mov w10, w6
+ mov w11, w7
+.LCopy_2:
+ st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ sub x17,$bptr,$aptr // preserves carry
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ cbnz x17,.LNEON_copy_n_zap
+
+ mov sp,x16
+ ldp d14,d15,[sp,#64]
+ ldp d12,d13,[sp,#48]
+ ldp d10,d11,[sp,#32]
+ ldp d8,d9,[sp,#16]
+ ldr x29,[sp],#80
+ ret // bx lr
+
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+___
+}
+{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
diff --git a/crypto/bn/build.info b/crypto/bn/build.info
index 3c32e83067..d0c1034bde 100644
--- a/crypto/bn/build.info
+++ b/crypto/bn/build.info
@@ -177,3 +177,4 @@ INCLUDE[armv4-mont.o]=..
GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl
INCLUDE[armv4-gf2m.o]=..
GENERATE[armv8-mont.S]=asm/armv8-mont.pl
+INCLUDE[armv8-mont.o]=..