-rw-r--r--  .github/workflows/run-checker-daily.yml |     1
-rw-r--r--  CHANGES.md                              |     7
-rwxr-xr-x  Configure                               |     1
-rw-r--r--  INSTALL.md                              |     4
-rw-r--r--  crypto/ec/asm/ecp_sm2p256-armv8.pl      |   820
-rw-r--r--  crypto/ec/build.info                    |    12
-rw-r--r--  crypto/ec/ec_curve.c                    |    11
-rw-r--r--  crypto/ec/ec_local.h                    |     5
-rw-r--r--  crypto/ec/ecp_sm2p256.c                 |   800
-rw-r--r--  crypto/ec/ecp_sm2p256_table.c           | 16387
10 files changed, 18045 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/run-checker-daily.yml b/.github/workflows/run-checker-daily.yml
index 45b97f21cd..e2bf91cddc 100644
--- a/.github/workflows/run-checker-daily.yml
+++ b/.github/workflows/run-checker-daily.yml
@@ -102,6 +102,7 @@ jobs:
no-siphash,
no-siv,
no-sm2,
+ no-sm2-precomp,
no-sm3,
no-sm4,
no-sock,
diff --git a/CHANGES.md b/CHANGES.md
index ec4e1a892d..11a0ee793d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -25,6 +25,13 @@ OpenSSL 3.2
### Changes between 3.1 and 3.2 [xx XXX xxxx]
+ * Added an optimization for the SM2 algorithm on aarch64. It uses a large
+   precomputed table for scalar multiplication of the base point, which
+   increases the size of libcrypto from 4.4 MB to 4.9 MB. A new Configure
+   option, `no-sm2-precomp`, has been added to disable the precomputed table.
+
+ *Xu Yizhou*
+
* Added client side support for QUIC
*Hugo Landau, Matt Caswell, Paul Dale, Tomáš Mráz, Richard Levitte*
diff --git a/Configure b/Configure
index 2c17f4186b..364b699c57 100755
--- a/Configure
+++ b/Configure
@@ -497,6 +497,7 @@ my @disablables = (
"siphash",
"siv",
"sm2",
+ "sm2-precomp",
"sm3",
"sm4",
"sock",
diff --git a/INSTALL.md b/INSTALL.md
index fb6f4be60a..3322acb6a2 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -914,6 +914,10 @@ Do not create shared libraries, only static ones.
See [Notes on shared libraries](#notes-on-shared-libraries) below.
+### no-sm2-precomp
+
+Disable using the SM2 precomputed table on aarch64 to make the library smaller.
+
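In OpenSSL's build system each `no-xxx` option surfaces as an `OPENSSL_NO_XXX` macro, so the code in this patch can guard the table at compile time. A minimal sketch of the guard, mirroring what `crypto/ec/ecp_sm2p256.c` below actually does:

    #ifndef OPENSSL_NO_SM2_PRECOMP
    /* 512 KB table of base-point multiples, linked from ecp_sm2p256_table.c */
    extern const BN_ULONG ecp_sm2p256_precomputed[8 * 32 * 256];
    #endif

Configuring with `./Configure no-sm2-precomp` also drops `ecp_sm2p256_table.c` from the build entirely, as the `build.info` hunk further down shows.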
### no-sock
Don't build support for socket BIOs.
diff --git a/crypto/ec/asm/ecp_sm2p256-armv8.pl b/crypto/ec/asm/ecp_sm2p256-armv8.pl
new file mode 100644
index 0000000000..da4c16c309
--- /dev/null
+++ b/crypto/ec/asm/ecp_sm2p256-armv8.pl
@@ -0,0 +1,820 @@
+#! /usr/bin/env perl
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+$flavour = shift;
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
+my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
+my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
+my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..19));
+
+sub bn_mod_add() {
+ my $mod = shift;
+$code.=<<___;
+ # Load inputs
+ ldp $s0,$s1,[x1]
+ ldp $s2,$s3,[x1,#16]
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+
+ # Addition
+ adds $s0,$s0,$s4
+ adcs $s1,$s1,$s5
+ adcs $s2,$s2,$s6
+ adcs $s3,$s3,$s7
+ adc $t4,xzr,xzr
+
+ # Load polynomial
+ adr x2,$mod
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+
+ # Backup Addition
+ mov $t0,$s0
+ mov $t1,$s1
+ mov $t2,$s2
+ mov $t3,$s3
+
+ # Sub polynomial
+ subs $t0,$t0,$s4
+ sbcs $t1,$t1,$s5
+ sbcs $t2,$t2,$s6
+ sbcs $t3,$t3,$s7
+ sbcs $t4,$t4,xzr
+
+ # Select based on carry
+ csel $s0,$s0,$t0,cc
+ csel $s1,$s1,$t1,cc
+ csel $s2,$s2,$t2,cc
+ csel $s3,$s3,$t3,cc
+
+ # Store results
+ stp $s0,$s1,[x0]
+ stp $s2,$s3,[x0,#16]
+___
+}
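For readers who don't speak AArch64, here is a hypothetical C model of the pattern this sub emits (assumptions: 4 x 64-bit limbs, GCC/Clang `unsigned __int128`; all names are invented for illustration): add with carry, trial-subtract the modulus, then select the result by the net borrow, with no branch on secret data.

    #include <stdint.h>

    typedef uint64_t limb;               /* one 64-bit word, like BN_ULONG */

    /* r = a + b mod m, for fully reduced a, b in [0, m) */
    static void mod_add_sketch(limb r[4], const limb a[4],
                               const limb b[4], const limb m[4])
    {
        unsigned __int128 acc = 0, borrow = 0;
        limb sum[4], diff[4], keep_sum;
        int i;

        for (i = 0; i < 4; i++) {        /* the adds/adcs chain */
            acc += (unsigned __int128)a[i] + b[i];
            sum[i] = (limb)acc;
            acc >>= 64;                  /* carry out, 0 or 1 */
        }
        for (i = 0; i < 4; i++) {        /* the subs/sbcs chain */
            borrow = (unsigned __int128)sum[i] - m[i] - (limb)borrow;
            diff[i] = (limb)borrow;
            borrow = (borrow >> 64) & 1; /* borrow out, 0 or 1 */
        }
        /* net borrow only if the add did not carry but the sub borrowed */
        keep_sum = (limb)0 - (limb)(acc < borrow);   /* all-ones mask */
        for (i = 0; i < 4; i++)          /* the csel selection */
            r[i] = (diff[i] & ~keep_sum) | (sum[i] & keep_sum);
    }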
+
+sub bn_mod_sub() {
+ my $mod = shift;
+$code.=<<___;
+ # Load inputs
+ ldp $s0,$s1,[x1]
+ ldp $s2,$s3,[x1,#16]
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+
+ # Subtraction
+ subs $s0,$s0,$s4
+ sbcs $s1,$s1,$s5
+ sbcs $s2,$s2,$s6
+ sbcs $s3,$s3,$s7
+ sbc $t4,xzr,xzr
+
+ # Load polynomial
+ adr x2,$mod
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+
+ # Backup subtraction
+ mov $t0,$s0
+ mov $t1,$s1
+ mov $t2,$s2
+ mov $t3,$s3
+
+ # Add polynomial
+ adds $t0,$t0,$s4
+ adcs $t1,$t1,$s5
+ adcs $t2,$t2,$s6
+ adcs $t3,$t3,$s7
+ tst $t4,$t4
+
+ # Select based on carry
+ csel $s0,$s0,$t0,eq
+ csel $s1,$s1,$t1,eq
+ csel $s2,$s2,$t2,eq
+ csel $s3,$s3,$t3,eq
+
+ # Store results
+ stp $s0,$s1,[x0]
+ stp $s2,$s3,[x0,#16]
+___
+}
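The subtraction path is the mirror image. A sketch under the same assumptions as `mod_add_sketch` above: subtract with borrow, then add the modulus back only when the subtraction underflowed (the `tst`/`csel eq` selection in the asm).

    /* r = a - b mod m, for fully reduced a, b in [0, m) */
    static void mod_sub_sketch(limb r[4], const limb a[4],
                               const limb b[4], const limb m[4])
    {
        unsigned __int128 borrow = 0, acc = 0;
        limb diff[4], mask;
        int i;

        for (i = 0; i < 4; i++) {        /* the subs/sbcs chain */
            borrow = (unsigned __int128)a[i] - b[i] - (limb)borrow;
            diff[i] = (limb)borrow;
            borrow = (borrow >> 64) & 1;
        }
        mask = (limb)0 - (limb)borrow;   /* all-ones iff a < b */
        for (i = 0; i < 4; i++) {        /* conditional add-back of m */
            acc += (unsigned __int128)diff[i] + (m[i] & mask);
            r[i] = (limb)acc;
            acc >>= 64;
        }
    }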
+
+sub bn_mod_div_by_2() {
+ my $mod = shift;
+$code.=<<___;
+ # Load inputs
+ ldp $s0,$s1,[x1]
+ ldp $s2,$s3,[x1,#16]
+
+ # Save the least significant bit
+ mov $t0,$s0
+
+ # Right shift 1
+ extr $s0,$s1,$s0,#1
+ extr $s1,$s2,$s1,#1
+ extr $s2,$s3,$s2,#1
+ lsr $s3,$s3,#1
+
+ # Load mod
+ adr x2,$mod
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+
+ # Parity check
+ tst $t0,#1
+ csel $s4,xzr,$s4,eq
+ csel $s5,xzr,$s5,eq
+ csel $s6,xzr,$s6,eq
+ csel $s7,xzr,$s7,eq
+
+ # Add
+ adds $s0,$s0,$s4
+ adcs $s1,$s1,$s5
+ adcs $s2,$s2,$s6
+ adc $s3,$s3,$s7
+
+ # Store results
+ stp $s0,$s1,[x0]
+ stp $s2,$s3,[x0,#16]
+___
+}
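Note that `$mod` here is bound to `.Lpoly_div_2` or `.Lord_div_2` below, i.e. the precomputed (p+1)/2 or (n+1)/2, so halving an odd residue reduces to a shift plus one conditional addition. A sketch under the same assumptions as the helpers above:

    /* r = a / 2 mod m; half_m holds the precomputed (m+1)/2 */
    static void mod_div_by_2_sketch(limb r[4], const limb a[4],
                                    const limb half_m[4])
    {
        limb mask = (limb)0 - (a[0] & 1);  /* all-ones iff a is odd */
        unsigned __int128 acc = 0;
        int i;

        for (i = 0; i < 4; i++) {
            /* logical right shift across limbs (the extr/lsr sequence) */
            limb shifted = (a[i] >> 1) | (i < 3 ? a[i + 1] << 63 : 0);
            acc += (unsigned __int128)shifted + (half_m[i] & mask);
            r[i] = (limb)acc;
            acc >>= 64;
        }
    }

For odd a this computes (a >> 1) + (m+1)/2 = (a + m)/2, which stays below m whenever a does, so no final reduction step is needed.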
+
+{
+$code.=<<___;
+#include "arm_arch.h"
+.arch armv8-a
+.text
+
+.align 5
+// The polynomial p
+.Lpoly:
+.quad 0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
+// The order of polynomial n
+.Lord:
+.quad 0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
+// (p + 1) / 2
+.Lpoly_div_2:
+.quad 0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
+// (n + 1) / 2
+.Lord_div_2:
+.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
+
+// void bn_rshift1(BN_ULONG *a);
+.globl bn_rshift1
+.type bn_rshift1,%function
+.align 5
+bn_rshift1:
+ AARCH64_VALID_CALL_TARGET
+ # Load inputs
+ ldp $s0,$s1,[x0]
+ ldp $s2,$s3,[x0,#16]
+
+ # Right shift
+ extr $s0,$s1,$s0,#1
+ extr $s1,$s2,$s1,#1
+ extr $s2,$s3,$s2,#1
+ lsr $s3,$s3,#1
+
+ # Store results
+ stp $s0,$s1,[x0]
+ stp $s2,$s3,[x0,#16]
+
+ ret
+.size bn_rshift1,.-bn_rshift1
+
+// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+.globl bn_sub
+.type bn_sub,%function
+.align 5
+bn_sub:
+ AARCH64_VALID_CALL_TARGET
+ # Load inputs
+ ldp $s0,$s1,[x1]
+ ldp $s2,$s3,[x1,#16]
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+
+ # Subtraction
+ subs $s0,$s0,$s4
+ sbcs $s1,$s1,$s5
+ sbcs $s2,$s2,$s6
+ sbc $s3,$s3,$s7
+
+ # Store results
+ stp $s0,$s1,[x0]
+ stp $s2,$s3,[x0,#16]
+
+ ret
+.size bn_sub,.-bn_sub
+
+// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_div_by_2
+.type ecp_sm2p256_div_by_2,%function
+.align 5
+ecp_sm2p256_div_by_2:
+ AARCH64_VALID_CALL_TARGET
+___
+ &bn_mod_div_by_2(".Lpoly_div_2");
+$code.=<<___;
+ ret
+.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2
+
+// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_div_by_2_mod_ord
+.type ecp_sm2p256_div_by_2_mod_ord,%function
+.align 5
+ecp_sm2p256_div_by_2_mod_ord:
+ AARCH64_VALID_CALL_TARGET
+___
+ &bn_mod_div_by_2(".Lord_div_2");
+$code.=<<___;
+ ret
+.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord
+
+// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_mul_by_3
+.type ecp_sm2p256_mul_by_3,%function
+.align 5
+ecp_sm2p256_mul_by_3:
+ AARCH64_VALID_CALL_TARGET
+ # Load inputs
+ ldp $s0,$s1,[x1]
+ ldp $s2,$s3,[x1,#16]
+
+ # 2*a
+ adds $s0,$s0,$s0
+ adcs $s1,$s1,$s1
+ adcs $s2,$s2,$s2
+ adcs $s3,$s3,$s3
+ adcs $t4,xzr,xzr
+
+ mov $t0,$s0
+ mov $t1,$s1
+ mov $t2,$s2
+ mov $t3,$s3
+
+ # Sub polynomial
+ adr x2,.Lpoly
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+ subs $s0,$s0,$s4
+ sbcs $s1,$s1,$s5
+ sbcs $s2,$s2,$s6
+ sbcs $s3,$s3,$s7
+ sbcs $t4,$t4,xzr
+
+ csel $s0,$s0,$t0,cs
+ csel $s1,$s1,$t1,cs
+ csel $s2,$s2,$t2,cs
+ csel $s3,$s3,$t3,cs
+ eor $t4,$t4,$t4
+
+ # 3*a
+ ldp $s4,$s5,[x1]
+ ldp $s6,$s7,[x1,#16]
+ adds $s0,$s0,$s4
+ adcs $s1,$s1,$s5
+ adcs $s2,$s2,$s6
+ adcs $s3,$s3,$s7
+ adcs $t4,xzr,xzr
+
+ mov $t0,$s0
+ mov $t1,$s1
+ mov $t2,$s2
+ mov $t3,$s3
+
+ # Sub polynomial
+ adr x2,.Lpoly
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+ subs $s0,$s0,$s4
+ sbcs $s1,$s1,$s5
+ sbcs $s2,$s2,$s6
+ sbcs $s3,$s3,$s7
+ sbcs $t4,$t4,xzr
+
+ csel $s0,$s0,$t0,cs
+ csel $s1,$s1,$t1,cs
+ csel $s2,$s2,$t2,cs
+ csel $s3,$s3,$t3,cs
+
+ # Store results
+ stp $s0,$s1,[x0]
+ stp $s2,$s3,[x0,#16]
+
+ ret
+.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3
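The body above is simply two reduced additions. In terms of the hypothetical helpers from the earlier sketches:

    /* r = 3*a mod m, composed from two modular additions */
    static void mod_mul_by_3_sketch(limb r[4], const limb a[4],
                                    const limb m[4])
    {
        limb t[4];

        mod_add_sketch(t, a, a, m); /* t = 2a mod m */
        mod_add_sketch(r, t, a, m); /* r = 2a + a mod m */
    }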
+
+// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_add
+.type ecp_sm2p256_add,%function
+.align 5
+ecp_sm2p256_add:
+ AARCH64_VALID_CALL_TARGET
+___
+ &bn_mod_add(".Lpoly");
+$code.=<<___;
+ ret
+.size ecp_sm2p256_add,.-ecp_sm2p256_add
+
+// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_sub
+.type ecp_sm2p256_sub,%function
+.align 5
+ecp_sm2p256_sub:
+ AARCH64_VALID_CALL_TARGET
+___
+ &bn_mod_sub(".Lpoly");
+$code.=<<___;
+ ret
+.size ecp_sm2p256_sub,.-ecp_sm2p256_sub
+
+// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_sub_mod_ord
+.type ecp_sm2p256_sub_mod_ord,%function
+.align 5
+ecp_sm2p256_sub_mod_ord:
+ AARCH64_VALID_CALL_TARGET
+___
+ &bn_mod_sub(".Lord");
+$code.=<<___;
+ ret
+.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord
+
+.macro RDC
+ # a = | s7 | ... | s0 |, where si are 64-bit quantities
+ # = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
+ # | s7 | s6 | s5 | s4 |
+ # | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 |
+ # | s3 | s2 | s1 | s0 |
+ # | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 |
+ # =================================================
+ # | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+)
+ # | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+)
+ # | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+)
+ # | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+)
+ # | a12 | 0 | s7 | a13 | 0 | s6 | (+)
+ # | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+)
+ # | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+)
+ # | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
+ # | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
+ # | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
+ # | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
+ # | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
+ # | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
+ # | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-)
+ # | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-)
+ # | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-)
+ # | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-)
+ # | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
+ # | V[3] | V[2] | V[1] | V[0] |
+
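+ # The (+) and (-) rows above encode the congruence
+ #   2^256 == 2^224 + 2^96 - 2^64 + 1 (mod p),
+ # applied repeatedly to fold the high words a8..a15 into 256 bits.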
+ # 1. 64-bit addition
+ # t2=s6+s7+s7
+ adds $t2,$s6,$s7
+ adcs $t1,xzr,xzr
+ adds $t2,$t2,$s7
+ adcs $t1,$t1,xzr
+ # t3=s4+s5+t2
+ adds $t3,$s4,$t2
+ adcs $t4,$t1,xzr
+ adds $t3,$t3,$s5
+ adcs $t4,$t4,xzr
+ # sum
+ adds $s0,$s0,$t3
+ adcs $s1,$s1,$t4
+ adcs $s2,$s2,$t2
+ adcs $s3,$s3,$s7
+ adcs $t0,xzr,xzr
+ adds $s3,$s3,$t1
+ adcs $t0,$t0,xzr
+
+ stp $s0,$s1,[sp,#32]
+ stp $s2,$s3,[sp,#48]
+
+ # 2. 64-bit to 32-bit spread
+ mov $t1,#0xffffffff
+ mov $s0,$s4
+ mov $s1,$s5
+ mov $s2,$s6
+ mov $s3,$s7
+ and $s0,$s0,$t1 // a8
+ and $s1,$s1,$t1 // a10
+ and $s2,$s2,$t1 // a12
+ and $s3,$s3,$t1 // a14
+ lsr $s4,$s4,#32 // a9
+ lsr $s5,$s5,#32 // a11
+ lsr $s6,$s6,#32 // a13
+ lsr $s7,$s7,#32 // a15
+
+ # 3. 32-bit addition
+ add $t1,$a14,$a12 // t1 <- a12 + a14
+ add $t2,$a15,$a13 // t2 <- a13 + a15
+ add $t3,$a8,$a9 // t3 <- a8 + a9
+ add $t4,$a14,$a10 // t4 <- a10 + a14
+ add $a15,$a15,$a11 // a15 <- a11 + a15
+ add $a12,$t2,$t1 // a12 <- a12 + a13 + a14 + a15
+ add $a10,$a10,$a12 // a10 <- a10 + a12 + a13 + a14 + a15
+ add $a10,$a10,$a12 // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
+ add $a10,$a10,$t3 // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
+ add $a10,$a10,$a11 // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+ add $a12,$a12,$a13 // a12 <- a12 + 2*a13 + a14 + a15
+ add $a12,$a12,$a11 // a12 <- a11 + a12 + 2*a13 + a14 + a15
+ add $a12,$a12,$a8 // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
+ add $t3,$t3,$a14 // t3 <- a8 + a9 + a14
+ add $t3,$t3,$a13 // t3 <- a8 + a9 + a13 + a14
+ add $a9,$a9,$t2 // a9 <- a9 + a13 + a15
+ add $a11,$a11,$a9 // a11 <- a9 + a11 + a13 + a15
+ add $a11,$a11,$t2 // a11 <- a9 + a11 + 2*(a13 + a15)
+ add $t1,$t1,$t4 // t1 <- a10 + a12 + 2*a14
+
+ # U[0] s5 a9 + a11 + 2*(a13 + a15)
+ # U[1] t1 a10 + a12 + 2*a14
+ # U[2] -t3 a8 + a9 + a13 + a14
+ # U[3] s2 a8 + a11 + a12 + 2*a13 + a14 + a15
+ # U[4] s4 a9 + a13 + a15
+ # U[5] t4 a10 + a14
+ # U[6] s7 a11 + a15
+ # U[7] s1 a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+
+ # 4. 32-bit to 64-bit
+ lsl $s0,$t1,#32
+ extr $t1,$s2,$t1,#32
+ extr $s2,$t4,$s2,#32
+ extr $t4,$s1,$t4,#32
+ lsr $s1,$s1,#32
+
+ # 5. 64-bit addition
+ adds $s5,$s5,$s0
+ adcs $t1,$t1,xzr
+ adcs $s4,$s4,$s2
+ adcs $s7,$s7,$t4
+ adcs $t0,$t0,$s1
+
+ # V[0] s5
+ # V[1] t1
+ # V[2] s4
+ # V[3] s7
+ # carry t0
+ # sub t3
+
+ # 6. Process s0-s3
+ ldp $s0,$s1,[sp,#32]
+ ldp $s2,$s3,[sp,#48]
+ # add with V0-V3
+ adds $s0,$s0,$s5
+ adcs $s1,$s1,$t1
+ adcs $s2,$s2,$s4
+ adcs $s3,$s3,$s7
+ adcs $t0,$t0,xzr
+ # sub with t3
+ subs $s1,$s1,$t3
+ sbcs $s2,$s2,xzr
+ sbcs $s3,$s3,xzr
+ sbcs $t0,$t0,xzr
+
+ # 7. MOD
+ # First Mod
+ lsl $t1,$t0,#32
+ subs $t2,$t1,$t0
+
+ adds $s0,$s0,$t0
+ adcs $s1,$s1,$t2
+ adcs $s2,$s2,xzr
+ adcs $s3,$s3,$t1
+
+ # Last Mod
+ # return y - p if y > p else y
+ mov $s4,$s0
+ mov $s5,$s1
+ mov $s6,$s2
+ mov $s7,$s3
+
+ adr $t0,.Lpoly
+ ldp $t1,$t2,[$t0]
+ ldp $t3,$t4,[$t0,#16]
+
+ adcs $t5,xzr,xzr
+
+ subs $s0,$s0,$t1
+ sbcs $s1,$s1,$t2
+ sbcs $s2,$s2,$t3
+ sbcs $s3,$s3,$t4
+ sbcs $t5,$t5,xzr
+
+ csel $s0,$s0,$s4,cs
+ csel $s1,$s1,$s5,cs
+ csel $s2,$s2,$s6,cs
+ csel $s3,$s3,$s7,cs
+
+.endm
+
+// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+.globl ecp_sm2p256_mul
+.type ecp_sm2p256_mul,%function
+.align 5
+ecp_sm2p256_mul:
+ AARCH64_SIGN_LINK_REGISTER
+ # Store scalar registers
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x16,x17,[sp,#16]
+ stp x18,x19,[sp,#64]
+
+ # Load inputs
+ ldp $s0,$s1,[x1]
+ ldp $s2,$s3,[x1,#16]
+ ldp $s4,$s5,[x2]
+ ldp $s6,$s7,[x2,#16]
+
+### multiplication ###
+ # ========================
+ # s3 s2 s1 s0
+ # * s7 s6 s5 s4
+ # ------------------------
+ # + s0 s0 s0 s0
+ # * * * *
+ # s7 s6 s5 s4
+ # s1 s1 s1 s1
+ # * * * *
+ # s7 s6 s5 s4
+ # s2 s2 s2 s2
+ # * * * *
+ # s7 s6 s5 s4
+ # s3 s3 s3 s3
+ # * * * *
+ # s7 s6 s5 s4
+ # ------------------------
+ # s7 s6 s5 s4 s3 s2 s1 s0
+ # ========================
+
+### s0*s4 ###
+ mul $t5,$s0,$s4
+ umulh $t2,$s0,$s4
+
+### s1*s4 + s0*s5 ###
+ mul $t0,$s1,$s4
+ umulh $t1,$s1,$s4
+ adds $t2,$t2,$t0
+ adcs $t3,$t1,xzr
+
+ mul $t0,$s0,$s5
+ umulh $t1,$s0,$s5
+ adds $t2,$t2,$t0
+ adcs $t3,$t3,$t1
+ adcs $t4,xzr,xzr
+
+### s2*s4 + s1*s5 + s0*s6 ###
+ mul $t0,$s2,$s4
+ umulh $t1,$s2,$s4
+ adds $t3,$t3,$t0
+ adcs $t4,$t4,$t1
+
+ mul $t0,$s1,$s5
+ umulh $t1,$s1,$s5
+ adds $t3,$t3,$t0
+ adcs $t4,$t4,$t1
+ adcs $t6,xzr,xzr
+
+ mul $t0,$s0,$s6
+ umulh $t1,$s0,$s6
+ adds $t3,$t3,$t0
+ adcs $t4,$t4,$t1
+ adcs $t6,$t6,xzr
+
+### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
+ mul $t0,$s3,$s4
+ umulh $t1,$s3,$s4
+ adds $t4,$t4,$t0
+ adcs $t6,$t6,$t1
+ adcs $t7,xzr,xzr
+
+ mul $t0,$s2,$s5
+ umulh $t1,$s2,$s5
+ adds $t4,$t4,$t0
+ adcs $t6,$t6,$t1
+ adcs $t7,$t7,xzr
+
+ mul $t0,$s1,$s6
+ umulh $t1,$s1,$s6
+ adds $t4,$t4,$t0
+ adcs $t6,$t6,$t1
+ adcs $t7,$t7,xzr
+
+ mul $t0,$s0,$s7
+ umulh $t1,$s0,$s7
+ adds $t4,$t4,$t0
+ adcs $t6,$t6,$t1
+ adcs $t7,$t7,xzr
+
+### s3*s5 + s2*s6 + s1*s7 ###
+ mul $t0,$s3,$s5
+ umulh $t1,$s3,$s5
+ adds $t6,$t6,$t0
+ adcs $t7,$t7,$t1
+ adcs $t8,xzr,xzr
+
+ mul $t0,$s2,$s6
+ umulh $t1,$s2,$s6
+ adds $t6,$t6,$t0
+ adcs $t7,$t7,$t1
+ adcs $t8,$t8,xzr
+
+ mul $t0,$s1,$s7
+ umulh $t1,$s1,$s7
+ adds $s4,$t6,$t0
+ adcs $t7,$t7,$t1
+ adcs $t8,$t8,xzr
+
+### s3*s6 + s2*s7 ###
+ mul $t0,$s3,$s6
+ umulh $t1,$s3,$s6
+ adds $t7,$t7,$t0
+ adcs $t8,$t8,$t1
+ adcs $t6,xzr,xzr
+
+ mul $t0,$s2,$s7
+ umulh $t1,$s2,$s7
+ adds $s5,$t7,$t0
+ adcs $t8,$t8,$t1
+ adcs $t6,$t6,xzr
+
+### s3*s7 ###
+ mul $t0,$s3,$s7
+ umulh $t1,$s3,$s7
+ adds $s6,$t8,$t0
+ adcs $s7,$t6,$t1
+
+ mov $s0,$t5
+ mov $s1,$t2
+ mov $s2,$t3
+ mov $s3,$t4
+
+ # result of mul: s7 s6 s5 s4 s3 s2 s1 s0
+
+### Reduction ###
+ RDC
+
+ stp $s0,$s1,[x0]
+ stp $s2,$s3,[x0,#16]
+
+ # Restore scalar registers
+ ldp x16,x17,[sp,#16]
+ ldp x18,x19,[sp,#64]
+ ldp x29,x30,[sp],#80
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_sm2p256_mul,.-ecp_sm2p256_mul
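A hypothetical C model of the product-scanning multiply above (same `limb` assumptions as the earlier sketches; each `mul`/`umulh` pair becomes one 128-bit product). It produces the eight limbs s7..s0 that RDC then folds.

    /* r[0..7] = a * b, schoolbook, accumulated column by column */
    static void mul_256_sketch(limb r[8], const limb a[4], const limb b[4])
    {
        unsigned __int128 acc = 0;      /* running sum for column k */
        int k, i;

        for (k = 0; k < 7; k++) {
            unsigned __int128 high = 0; /* carries into column k+1 */
            for (i = 0; i < 4; i++) {
                int j = k - i;
                if (j >= 0 && j < 4) {
                    unsigned __int128 p = (unsigned __int128)a[i] * b[j];
                    acc  += (limb)p;         /* mul   xlo,a,b */
                    high += (limb)(p >> 64); /* umulh xhi,a,b */
                }
            }
            r[k] = (limb)acc;
            acc = (acc >> 64) + high;
        }
        r[7] = (limb)acc;
    }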
+
+// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
+.globl ecp_sm2p256_sqr
+.type ecp_sm2p256_sqr,%function
+.align 5
+
+ecp_sm2p256_sqr:
+ AARCH64_SIGN_LINK_REGISTER
+ # Store scalar registers
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x16,x17,[sp,#16]
+ stp x18,x19,[sp,#64]
+
+ # Load inputs
+ ldp $s4,$s5,[x1]
+ ldp $s6,$s7,[x1,#16]
+
+### square ###
+ # ========================
+ # s7 s6 s5 s4
+ # * s7 s6 s5 s4
+ # ------------------------
+ # + s4 s4 s4 s4
+ # * * * *
+ # s7 s6 s5 s4
+ # s5 s5 s5 s5
+ # * * * *
+ # s7 s6 s5 s4
+ # s6 s6 s6 s6
+ # * * * *
+ # s7 s6 s5 s4
+ # s7 s7 s7 s7
+ # * * * *
+ # s7 s6 s5 s4
+ # ------------------------
+ # s7 s6 s5 s4 s3 s2 s1 s0
+ # ========================
+
+### s4*s5 ###
+ mul $s1,$s4,$s5
+ umulh $s2,$s4,$s5
+
+### s4*s6 ###
+ mul $t0,$s6,$s4
+ umulh $s3,$s6,$s4
+ adds $s2,$s2,$t0
+ adcs $s3,$s3,xzr
+
+### s4*s7 + s5*s6 ###
+ mul $t0,$s7,$s4
+ umulh $t1,$s7,$s4
+ adds $s3,$s3,$t0
+ adcs $s0,$t1,xzr
+
+ mul $t0,$s6,$s5
+ umulh $t1,$s6,$s5
+ adds $s3,$s3,$t0
+ adcs $s0,$s0,$t1
+ adcs $t2,xzr,xzr
+
+### s5*s7 ###
+ mul $t0,$s7,$s5
+ umulh $t1,$s7,$s5
+ adds $s0,$s0,$t0
+ adcs $t2,$t2,$t1
+
+### s6*s7 ###
+ mul $t0,$s7,$s6
+ umulh $t1,$s7,$s6
+ adds $t2,$t2,$t0
+ adcs $t3,$t1,xzr
+
+### 2*(t3,t2,s0,s3,s2,s1) ###
+ adds $s1,$s1,$s1
+ adcs $s2,$s2,$s2
+ adcs $s3,$s3,$s3
+ adcs $s0,$s0,$s0
+ adcs $t2,$t2,$t2
+ adcs $t3,$t3,$t3
+ adcs $t4,xzr,xzr
+
+### s4*s4 ###
+ mul $t5,$s4,$s4
+ umulh $t6,$s4,$s4
+
+### s5*s5 ###
+ mul $s4,$s5,$s5
+ umulh $s5,$s5,$s5
+
+### s6*s6 ###
+ mul $t0,$s6,$s6
+ umulh $t1,$s6,$s6
+
+### s7*s7 ###
+ mul $t7,$s7,$s7
+ umulh $t8,$s7,$s7
+
+ adds $s1,$s1,$t6
+ adcs $s2,$s2,$s4
+ adcs $s3,$s3,$s5
+ adcs $s0,$s0,$t0
+ adcs $t2,$t2,$t1
+ adcs $t3,$t3,$t7
+ adcs $t4,$t4,$t8
+
+ mov $s4,$s0
+ mov $s0,$t5
+ mov $s5,$t2
+ mov $s6,$t3
+ mov $s7,$t4
+
+ # result of sqr: s7 s6 s5 s4 s3 s2 s1 s0
+
+### Reduction ###
+ RDC
+
+ stp $s0,$s1,[x0]
+ stp $s2,$s3,[x0,#16]
+
+ # Restore scalar registers
+ ldp x16,x17,[sp,#16]
+ ldp x18,x19,[sp,#64]
+ ldp x29,x30,[sp],#80
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
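And a matching C model of the squaring: compute each cross product a[i]*a[j] with i < j once, double the whole partial sum, then add the diagonal squares. That is 10 products instead of the 16 a general multiply needs, which is the point of having a separate sqr path.

    /* r[0..7] = a^2: 6 cross products doubled, plus 4 diagonal squares */
    static void sqr_256_sketch(limb r[8], const limb a[4])
    {
        unsigned __int128 acc;
        limb t[8] = { 0 };
        int i, j;

        for (i = 0; i < 4; i++) {          /* off-diagonal a[i]*a[j], i<j */
            acc = 0;
            for (j = i + 1; j < 4; j++) {
                acc += (unsigned __int128)a[i] * a[j] + t[i + j];
                t[i + j] = (limb)acc;
                acc >>= 64;
            }
            t[i + 4] = (limb)acc;
        }
        acc = 0;                           /* double the partial sum */
        for (i = 0; i < 8; i++) {
            acc += (unsigned __int128)t[i] << 1;
            t[i] = (limb)acc;
            acc >>= 64;
        }
        acc = 0;                           /* add the diagonal squares */
        for (i = 0; i < 4; i++) {
            unsigned __int128 sq = (unsigned __int128)a[i] * a[i];
            acc += (unsigned __int128)t[2 * i] + (limb)sq;
            r[2 * i] = (limb)acc;
            acc >>= 64;
            acc += (unsigned __int128)t[2 * i + 1] + (limb)(sq >> 64);
            r[2 * i + 1] = (limb)acc;
            acc >>= 64;
        }
    }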
+___
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ print $_,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
diff --git a/crypto/ec/build.info b/crypto/ec/build.info
index 4077bead7b..2f376a39c6 100644
--- a/crypto/ec/build.info
+++ b/crypto/ec/build.info
@@ -31,6 +31,13 @@ IF[{- !$disabled{asm} -}]
$ECDEF_armv4=ECP_NISTZ256_ASM
$ECASM_aarch64=ecp_nistz256.c ecp_nistz256-armv8.S
$ECDEF_aarch64=ECP_NISTZ256_ASM
+ IF[{- !$disabled{'sm2'} -}]
+ $ECASM_aarch64=$ECASM_aarch64 ecp_sm2p256.c ecp_sm2p256-armv8.S
+ IF[{- !$disabled{'sm2-precomp'} -}]
+ $ECASM_aarch64=$ECASM_aarch64 ecp_sm2p256_table.c
+ ENDIF
+ $ECDEF_aarch64=$ECDEF_aarch64 ECP_SM2P256_ASM
+ ENDIF
$ECASM_parisc11=
$ECASM_parisc20_64=
@@ -127,3 +134,8 @@ IF[{- !$disabled{'ecx'} -}]
GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl
GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl
ENDIF
+
+IF[{- !$disabled{'sm2'} -}]
+ GENERATE[ecp_sm2p256-armv8.S]=asm/ecp_sm2p256-armv8.pl
+ INCLUDE[ecp_sm2p256-armv8.o]=..
+ENDIF
diff --git a/crypto/ec/ec_curve.c b/crypto/ec/ec_curve.c
index 724525a479..d703d16b3c 100644
--- a/crypto/ec/ec_curve.c
+++ b/crypto/ec/ec_curve.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2002-2021 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2002-2023 The OpenSSL Project Authors. All Rights Reserved.
* Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
@@ -3111,8 +3111,13 @@ static const ec_list_element curve_list[] = {
"RFC 5639 curve over a 512 bit prime field"},
{NID_brainpoolP512t1, &_EC_brainpoolP512t1.h, 0,
"RFC 5639 curve over a 512 bit prime field"},
-# ifndef OPENSSL_NO_SM2
- {NID_sm2, &_EC_sm2p256v1.h, 0,
+#ifndef OPENSSL_NO_SM2
+ {NID_sm2, &_EC_sm2p256v1.h,
+# ifdef ECP_SM2P256_ASM
+ EC_GFp_sm2p256_method,
+# else
+ 0,
+# endif
"SM2 curve over a 256 bit prime field"},
# endif
};
diff --git a/crypto/ec/ec_local.h b/crypto/ec/ec_local.h
index 7181090fca..803786fdd2 100644
--- a/crypto/ec/ec_local.h
+++ b/crypto/ec/ec_local.h
@@ -653,6 +653,11 @@ int ossl_ec_key_simple_generate_key(EC_KEY *eckey);
int ossl_ec_key_simple_generate_public_key(EC_KEY *eckey);
int ossl_ec_key_simple_check_key(const EC_KEY *eckey);
+#ifdef ECP_SM2P256_ASM
+/* Returns optimized methods for SM2 */
+const EC_METHOD *EC_GFp_sm2p256_method(void);
+#endif
+
int ossl_ec_curve_nid_from_params(const EC_GROUP *group, BN_CTX *ctx);
/* EC_METHOD definitions */
diff --git a/crypto/ec/ecp_sm2p256.c b/crypto/ec/ecp_sm2p256.c
new file mode 100644
index 0000000000..49fab47187
--- /dev/null
+++ b/crypto/ec/ecp_sm2p256.c
@@ -0,0 +1,800 @@
+/*
+ * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ */
+
+/*
+ * SM2 low level APIs are deprecated for public use, but still ok for
+ * internal use.
+ */
+#include "internal/deprecated.h"
+
+#include <string.h>
+#include <openssl/err.h>
+#include "crypto/bn.h"
+#include "ec_local.h"
+#include "internal/constant_time.h"
+
+#if defined(__GNUC__)
+# define ALIGN32 __attribute((aligned(32)))
+# define ALIGN64 __attribute((aligned(64)))
+#elif defined(_MSC_VER)
+# define ALIGN32 __declspec(align(32))
+# define ALIGN64 __declspec(align(64))
+#else
+# define ALIGN32
+# define ALIGN64
+#endif
+
+#define P256_LIMBS (256 / BN_BITS2)
+
+#if !defined(OPENSSL_NO_SM2_PRECOMP)
+extern const BN_ULONG ecp_sm2p256_precomputed[8 * 32 * 256];
+#endif
+
+typedef struct {
+ BN_ULONG X[P256_LIMBS];
+ BN_ULONG Y[P256_LIMBS];
+ BN_ULONG Z[P256_LIMBS];
+} P256_POINT;
+
+typedef struct {
+ BN_ULONG X[P256_LIMBS];
+ BN_ULONG Y[P256_LIMBS];
+} P256_POINT_AFFINE;
+
+#if !defined(OPENSSL_NO_SM2_PRECOMP)
+/* Coordinates of G, for which we have precomputed tables */
+static const BN_ULONG def_xG[P256_LIMBS] ALIGN32 = {
+ 0x715a4589334c74c7, 0x8fe30bbff2660be1,
+ 0x5f9904466a39c994, 0x32c4ae2c1f198119
+};
+
+static const BN_ULONG def_yG[P256_LIMBS] ALIGN32 = {
+ 0x02df32e52139f0a0, 0xd0a9877cc62a4740,
+ 0x59bdcee36b692153, 0xbc3736a2f4f6779c,
+};
+#endif
+
+/* p and order for SM2 according to GB/T 32918.5-2017 */
+static const BN_ULONG def_p[P256_LIMBS] ALIGN32 = {
+ 0xffffffffffffffff, 0xffffffff00000000,
+ 0xffffffffffffffff, 0xfffffffeffffffff
+};
+static const BN_ULONG def_ord[P256_LIMBS] ALIGN32 = {
+ 0x53bbf40939d54123, 0x7203df6b21c6052b,
+ 0xffffffffffffffff, 0xfffffffeffffffff
+};
+
+static const BN_ULONG ONE[P256_LIMBS] ALIGN32 = {1, 0, 0, 0};
+
+/* Functions implemented in assembly */
+/*
+ * Most of the functions below preserve the property of their inputs being
+ * fully reduced, i.e. being in the [0, modulus) range. Simply put, if the
+ * inputs are fully reduced, then so is the output.
+ */
+/* Right shift: a >> 1 */
+void bn_rshift1(BN_ULONG *a);
+/* Sub: r = a - b */
+void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+/* Modular div by 2: r = a / 2 mod p */
+void ecp_sm2p256_div_by_2(BN_ULONG *r, const BN_ULONG *a);
+/* Modular div by 2: r = a / 2 mod n, where n = ord(p) */
+void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r, const BN_ULONG *a);
+/* Modular add: r = a + b mod p */
+void ecp_sm2p256_add(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+/* Modular sub: r = a - b mod p */
+void ecp_sm2p256_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+/* Modular sub: r = a - b mod n, where n = ord(p) */
+void ecp_sm2p256_sub_mod_ord(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+/* Modular mul by 3: out = 3 * a mod p */
+void ecp_sm2p256_mul_by_3(BN_ULONG *r, const BN_ULONG *a);
+/* Modular mul: r = a * b mod p */
+void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+/* Modular sqr: r = a ^ 2 mod p */
+void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
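As a hedged illustration of how these primitives compose (the helper name is invented, not part of the patch), a fused multiply-add in the field; per the comment above, fully reduced inputs give a fully reduced result at every step:

    /* t = a*b mod p, then r = t + c mod p; all operands fully reduced */
    static void maddp_sketch(BN_ULONG r[P256_LIMBS],
                             const BN_ULONG a[P256_LIMBS],
                             const BN_ULONG b[P256_LIMBS],
                             const BN_ULONG c[P256_LIMBS])
    {
        BN_ULONG t[P256_LIMBS] ALIGN32;

        ecp_sm2p256_mul(t, a, b);
        ecp_sm2p256_add(r, t, c);
    }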
+
+static ossl_inline BN_ULONG is_zeros(const BN_ULONG *a)
+{
+ BN_ULONG res;
+
+ res = a[0] | a[1] | a[2] | a[3];
+
+ return constant_time_is_zero_64(res);
+}
+
+static ossl_inline int is_equal(const BN_ULONG *a, const BN_ULONG *b)
+{
+ BN_ULONG res;
+
+ res = a[0] ^ b[0];
+ res |= a[1] ^ b[1];
+ res |= a[2] ^ b[2];
+ res |= a[3] ^ b[3];
+
+ return constant_time_is_zero_64(res);
+}
+
+static ossl_inline int is_greater(const BN_ULONG *a, const BN_ULONG *b)
+{
+ int i;
+
+ for (i = P256_LIMBS - 1; i >= 0; --i) {
+ if (a[i] > b[i])
+ return 1;
+ if (a[i] < b[i])
+ return -1;
+ }
+
+ return 0;
+}
+
+#define is_one(a) is_equal(a, ONE)
+#define is_even(a) !(a[0] & 1)
+#define is_point_equal(a, b) \
+ is_equal(a->X, b->X) && \
+ is_equal(a->Y, b->Y) && \
+ is_equal(a->Z, b->Z)
+
+/* Bignum and field elements conversion */
+#define ecp_sm2p256_bignum_field_elem(out, in) \
+ bn_copy_words(out, in, P256_LIMBS)
+
+/* Binary algorithm for inversion in Fp */
+#define BN_MOD_INV(out, in, mod_div, mod_sub, mod) \
+ do { \
+ BN_ULONG u[4] ALIGN32; \
+ BN_ULONG v[4] ALIGN32; \
+ BN_ULONG x1[4] ALIGN32 = {1, 0, 0, 0}; \
+ BN_ULONG x2[4] ALIGN32 = {0}; \
+ \
+ if (is_zeros(in)) \
+ return; \
+ memcpy(u, in, 32); \
+ memcpy(v, mod, 32); \
+ while (!is_one(u) && !is_one(v)) { \
+ while (is_even(u)) { \
+ bn_rshift1(u); \
+ mod_div(x1, x1); \
+ } \
+ while (is_even(v)) { \
+ bn_rshift1(v); \
+ mod_div(x2, x2); \
+ } \
+ if (is_greater(u, v) == 1) { \
+ bn_sub(u, u, v); \
+ mod_sub(x1, x1, x2); \
+ } else { \
+ bn_sub(v, v, u); \
+ mod_sub(x2, x2, x1); \
+ } \
+ }
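The loop body shown above is the classical binary extended GCD specialized to an odd modulus. A sketch of its invariants (my reading, not stated in the source), writing m for mod:

    x_1 \cdot in \equiv u \pmod{m}, \qquad x_2 \cdot in \equiv v \pmod{m}

Whenever u (or v) is halved, x1 (or x2) must be halved modulo m to keep its invariant, which is exactly why the modular division by 2 is threaded in as mod_div; once the loop exits with u or v equal to 1, the corresponding x holds the inverse of in modulo m.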