-rwxr-xr-x  crypto/arm64cpuid.pl                                    |   8
-rw-r--r--  crypto/arm_arch.h                                       |   1
-rw-r--r--  crypto/armcap.c                                         |  10
-rw-r--r--  crypto/evp/e_sm4.c                                      | 193
-rwxr-xr-x  crypto/sm4/asm/sm4-armv8.pl                             | 635
-rw-r--r--  crypto/sm4/build.info                                   |  32
-rw-r--r--  include/crypto/sm4_platform.h                           |  48
-rw-r--r--  providers/implementations/ciphers/cipher_sm4.h          |   1
-rw-r--r--  providers/implementations/ciphers/cipher_sm4_gcm_hw.c   |  20
-rw-r--r--  providers/implementations/ciphers/cipher_sm4_hw.c       |  57
10 files changed, 945 insertions(+), 60 deletions(-)
diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index b30f505339..1841c0cc04 100755
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@@ -80,6 +80,14 @@ _armv8_pmull_probe:
ret
.size _armv8_pmull_probe,.-_armv8_pmull_probe
+.globl _armv8_sm4_probe
+.type _armv8_sm4_probe,%function
+_armv8_sm4_probe:
+ AARCH64_VALID_CALL_TARGET
+ .long 0xcec08400 // sm4e v0.4s, v0.4s
+ ret
+.size _armv8_sm4_probe,.-_armv8_sm4_probe
+
.globl _armv8_sha512_probe
.type _armv8_sha512_probe,%function
_armv8_sha512_probe:
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 77173cae42..291620ebc9 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -80,6 +80,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
# define ARMV8_CPUID (1<<7)
# define ARMV8_RNG (1<<8)
# define ARMV8_SM3 (1<<9)
+# define ARMV8_SM4 (1<<10)
/*
* MIDR_EL1 system register
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 93003c9121..5016987eeb 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -54,6 +54,7 @@ void _armv8_sha256_probe(void);
void _armv8_pmull_probe(void);
# ifdef __aarch64__
void _armv8_sm3_probe(void);
+void _armv8_sm4_probe(void);
void _armv8_sha512_probe(void);
unsigned int _armv8_cpuid_probe(void);
void _armv8_rng_probe(void);
@@ -171,6 +172,7 @@ static unsigned long getauxval(unsigned long key)
# define HWCAP_CE_SHA256 (1 << 6)
# define HWCAP_CPUID (1 << 11)
# define HWCAP_CE_SM3 (1 << 18)
+# define HWCAP_CE_SM4 (1 << 19)
# define HWCAP_CE_SHA512 (1 << 21)
/* AT_HWCAP2 */
# define HWCAP2 26
@@ -242,6 +244,9 @@ void OPENSSL_cpuid_setup(void)
OPENSSL_armcap_P |= ARMV8_SHA256;
# ifdef __aarch64__
+ if (hwcap & HWCAP_CE_SM4)
+ OPENSSL_armcap_P |= ARMV8_SM4;
+
if (hwcap & HWCAP_CE_SHA512)
OPENSSL_armcap_P |= ARMV8_SHA512;
@@ -294,6 +299,11 @@ void OPENSSL_cpuid_setup(void)
}
# if defined(__aarch64__) && !defined(__APPLE__)
if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sm4_probe();
+ OPENSSL_armcap_P |= ARMV8_SM4;
+ }
+
+ if (sigsetjmp(ill_jmp, 1) == 0) {
_armv8_sha512_probe();
OPENSSL_armcap_P |= ARMV8_SHA512;
}
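[Note: the armcap.c change above follows the usual OpenSSL probe pattern: execute one SM4 instruction under a SIGILL handler and set ARMV8_SM4 only if it does not trap. Below is a minimal standalone sketch of that pattern; the handler and function names are illustrative rather than the actual armcap.c internals, and only _armv8_sm4_probe comes from this patch.]

#include <setjmp.h>
#include <signal.h>
#include <string.h>

void _armv8_sm4_probe(void);          /* executes "sm4e v0.4s, v0.4s" */

static sigjmp_buf probe_jmp;          /* illustrative names */

static void probe_ill_handler(int sig)
{
    siglongjmp(probe_jmp, sig);
}

/* Returns 1 if the SM4 probe instruction runs without raising SIGILL. */
static int cpu_has_sm4(void)
{
    struct sigaction sa, old;
    int ok = 0;

    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = probe_ill_handler;
    sigaction(SIGILL, &sa, &old);

    if (sigsetjmp(probe_jmp, 1) == 0) {
        _armv8_sm4_probe();           /* traps on CPUs without SM4 */
        ok = 1;
    }

    sigaction(SIGILL, &old, NULL);
    return ok;
}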
diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index abd603015c..bff79ff197 100644
--- a/crypto/evp/e_sm4.c
+++ b/crypto/evp/e_sm4.c
@@ -17,92 +17,187 @@
# include <openssl/modes.h>
# include "crypto/sm4.h"
# include "crypto/evp.h"
+# include "crypto/sm4_platform.h"
# include "evp_local.h"
typedef struct {
- SM4_KEY ks;
+ union {
+ OSSL_UNION_ALIGN;
+ SM4_KEY ks;
+ } ks;
+ block128_f block;
+ union {
+ ecb128_f ecb;
+ cbc128_f cbc;
+ ctr128_f ctr;
+ } stream;
} EVP_SM4_KEY;
+# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER sm4_##mode = { \
+ nid##_##nmode,blocksize,128/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ EVP_ORIG_GLOBAL, \
+ sm4_init_key, \
+ sm4_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_SM4_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_sm4_##mode(void) \
+{ return &sm4_##mode; }
+
+#define DEFINE_BLOCK_CIPHERS(nid,flags) \
+ BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags)
+
static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
- ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+ int mode;
+ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+ mode = EVP_CIPHER_CTX_get_mode(ctx);
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc) {
+#ifdef HWSM4_CAPABLE
+ if (HWSM4_CAPABLE) {
+ HWSM4_set_decrypt_key(key, &dat->ks.ks);
+ dat->block = (block128_f) HWSM4_decrypt;
+ dat->stream.cbc = NULL;
+# ifdef HWSM4_cbc_encrypt
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
+# endif
+# ifdef HWSM4_ecb_encrypt
+ if (mode == EVP_CIPH_ECB_MODE)
+ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+# endif
+ } else
+#endif
+ {
+ dat->block = (block128_f) ossl_sm4_decrypt;
+ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+ }
+ } else
+#ifdef HWSM4_CAPABLE
+ if (HWSM4_CAPABLE) {
+ HWSM4_set_encrypt_key(key, &dat->ks.ks);
+ dat->block = (block128_f) HWSM4_encrypt;
+ dat->stream.cbc = NULL;
+# ifdef HWSM4_cbc_encrypt
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
+ else
+# endif
+# ifdef HWSM4_ecb_encrypt
+ if (mode == EVP_CIPH_ECB_MODE)
+ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+ else
+# endif
+# ifdef HWSM4_ctr32_encrypt_blocks
+ if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
+ else
+# endif
+ (void)0; /* terminate potentially open 'else' */
+ } else
+#endif
+ {
+ dat->block = (block128_f) ossl_sm4_encrypt;
+ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+ }
return 1;
}
-static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const SM4_KEY *key,
- unsigned char *ivec, const int enc)
+static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
{
- if (enc)
- CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
- (block128_f)ossl_sm4_encrypt);
+ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+ if (dat->stream.cbc)
+ (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv,
+ EVP_CIPHER_CTX_is_encrypting(ctx));
+ else if (EVP_CIPHER_CTX_is_encrypting(ctx))
+ CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv,
+ dat->block);
else
- CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
- (block128_f)ossl_sm4_decrypt);
+ CRYPTO_cbc128_decrypt(in, out, len, &dat->ks,
+ ctx->iv, dat->block);
+ return 1;
}
-static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out,
- size_t length, const SM4_KEY *key,
- unsigned char *ivec, int *num, const int enc)
+static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
{
- CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
- (block128_f)ossl_sm4_encrypt);
+ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+ int num = EVP_CIPHER_CTX_get_num(ctx);
+
+ CRYPTO_cfb128_encrypt(in, out, len, &dat->ks,
+ ctx->iv, &num,
+ EVP_CIPHER_CTX_is_encrypting(ctx), dat->block);
+ EVP_CIPHER_CTX_set_num(ctx, num);
+ return 1;
}
-static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
- const SM4_KEY *key, const int enc)
+static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
{
- if (enc)
- ossl_sm4_encrypt(in, out, key);
+ size_t bl = EVP_CIPHER_CTX_get_block_size(ctx);
+ size_t i;
+ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+ if (len < bl)
+ return 1;
+
+ if (dat->stream.ecb != NULL)
+ (*dat->stream.ecb) (in, out, len, &dat->ks.ks,
+ EVP_CIPHER_CTX_is_encrypting(ctx));
else
- ossl_sm4_decrypt(in, out, key);
+ for (i = 0, len -= bl; i <= len; i += bl)
+ (*dat->block) (in + i, out + i, &dat->ks);
+
+ return 1;
}
-static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out,
- size_t length, const SM4_KEY *key,
- unsigned char *ivec, int *num)
+static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
{
- CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
- (block128_f)ossl_sm4_encrypt);
-}
+ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+ int num = EVP_CIPHER_CTX_get_num(ctx);
-IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4,
- 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1,
- sm4_init_key, 0, 0, 0, 0)
+ CRYPTO_ofb128_encrypt(in, out, len, &dat->ks,
+ ctx->iv, &num, dat->block);
+ EVP_CIPHER_CTX_set_num(ctx, num);
+ return 1;
+}
static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len)
{
int n = EVP_CIPHER_CTX_get_num(ctx);
unsigned int num;
- EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx);
+ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
if (n < 0)
return 0;
num = (unsigned int)n;
- CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv,
- EVP_CIPHER_CTX_buf_noconst(ctx), &num,
- (block128_f)ossl_sm4_encrypt);
+ if (dat->stream.ctr)
+ CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks,
+ ctx->iv,
+ EVP_CIPHER_CTX_buf_noconst(ctx),
+ &num, dat->stream.ctr);
+ else
+ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks,
+ ctx->iv,
+ EVP_CIPHER_CTX_buf_noconst(ctx), &num,
+ dat->block);
EVP_CIPHER_CTX_set_num(ctx, num);
return 1;
}
-static const EVP_CIPHER sm4_ctr_mode = {
- NID_sm4_ctr, 1, 16, 16,
- EVP_CIPH_CTR_MODE,
- EVP_ORIG_GLOBAL,
- sm4_init_key,
- sm4_ctr_cipher,
- NULL,
- sizeof(EVP_SM4_KEY),
- NULL, NULL, NULL, NULL
-};
-
-const EVP_CIPHER *EVP_sm4_ctr(void)
-{
- return &sm4_ctr_mode;
-}
-
+DEFINE_BLOCK_CIPHERS(NID_sm4, 0)
#endif
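[Note: with this change applications keep using the ordinary EVP interface; the sm4_v8_* routines are selected transparently inside sm4_init_key() whenever ARMV8_SM4 is set. A minimal caller-side sketch follows; the function name and error handling are illustrative and not part of the patch.]

#include <openssl/evp.h>

/* SM4-CBC encrypt "len" bytes; key and iv are 16 bytes each.  The output
 * buffer must have room for len plus one extra block of PKCS#7 padding.
 * Returns the ciphertext length, or -1 on error. */
static int sm4_cbc_encrypt_demo(const unsigned char *key,
                                const unsigned char *iv,
                                const unsigned char *in, int len,
                                unsigned char *out)
{
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int outl = 0, tmpl = 0;

    if (ctx == NULL)
        return -1;
    if (!EVP_EncryptInit_ex(ctx, EVP_sm4_cbc(), NULL, key, iv)
            || !EVP_EncryptUpdate(ctx, out, &outl, in, len)
            || !EVP_EncryptFinal_ex(ctx, out + outl, &tmpl)) {
        EVP_CIPHER_CTX_free(ctx);
        return -1;
    }
    EVP_CIPHER_CTX_free(ctx);
    return outl + tmpl;
}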
diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
new file mode 100755
index 0000000000..7358a6e6a2
--- /dev/null
+++ b/crypto/sm4/asm/sm4-armv8.pl
@@ -0,0 +1,635 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# This module implements SM4 hardware support on aarch64
+# Oct 2021
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$prefix="sm4_v8";
+my @rks=map("v$_",(0..7));
+
+sub rev32() {
+my $dst = shift;
+my $src = shift;
+$code.=<<___;
+#ifndef __ARMEB__
+ rev32 $dst.16b,$src.16b
+#endif
+___
+}
+
+sub enc_blk () {
+my $data = shift;
+$code.=<<___;
+ sm4e $data.4s,@rks[0].4s
+ sm4e $data.4s,@rks[1].4s
+ sm4e $data.4s,@rks[2].4s
+ sm4e $data.4s,@rks[3].4s
+ sm4e $data.4s,@rks[4].4s
+ sm4e $data.4s,@rks[5].4s
+ sm4e $data.4s,@rks[6].4s
+ sm4e $data.4s,@rks[7].4s
+ rev64 $data.4S,$data.4S
+ ext $data.16b,$data.16b,$data.16b,#8
+___
+}
+
+sub enc_4blks () {
+my $data0 = shift;
+my $data1 = shift;
+my $data2 = shift;
+my $data3 = shift;
+$code.=<<___;
+ sm4e $data0.4s,@rks[0].4s
+ sm4e $data1.4s,@rks[0].4s
+ sm4e $data2.4s,@rks[0].4s
+ sm4e $data3.4s,@rks[0].4s
+
+ sm4e $data0.4s,@rks[1].4s
+ sm4e $data1.4s,@rks[1].4s
+ sm4e $data2.4s,@rks[1].4s
+ sm4e $data3.4s,@rks[1].4s
+
+ sm4e $data0.4s,@rks[2].4s
+ sm4e $data1.4s,@rks[2].4s
+ sm4e $data2.4s,@rks[2].4s
+ sm4e $data3.4s,@rks[2].4s
+
+ sm4e $data0.4s,@rks[3].4s
+ sm4e $data1.4s,@rks[3].4s
+ sm4e $data2.4s,@rks[3].4s
+ sm4e $data3.4s,@rks[3].4s
+
+ sm4e $data0.4s,@rks[4].4s
+ sm4e $data1.4s,@rks[4].4s
+ sm4e $data2.4s,@rks[4].4s
+ sm4e $data3.4s,@rks[4].4s
+
+ sm4e $data0.4s,@rks[5].4s
+ sm4e $data1.4s,@rks[5].4s
+ sm4e $data2.4s,@rks[5].4s
+ sm4e $data3.4s,@rks[5].4s
+
+ sm4e $data0.4s,@rks[6].4s
+ sm4e $data1.4s,@rks[6].4s
+ sm4e $data2.4s,@rks[6].4s
+ sm4e $data3.4s,@rks[6].4s
+
+ sm4e $data0.4s,@rks[7].4s
+ rev64 $data0.4S,$data0.4S
+ sm4e $data1.4s,@rks[7].4s
+ ext $data0.16b,$data0.16b,$data0.16b,#8
+ rev64 $data1.4S,$data1.4S
+ sm4e $data2.4s,@rks[7].4s
+ ext $data1.16b,$data1.16b,$data1.16b,#8
+ rev64 $data2.4S,$data2.4S
+ sm4e $data3.4s,@rks[7].4s
+ ext $data2.16b,$data2.16b,$data2.16b,#8
+ rev64 $data3.4S,$data3.4S
+ ext $data3.16b,$data3.16b,$data3.16b,#8
+___
+}
+
+$code=<<___;
+#include "arm_arch.h"
+.arch armv8-a+crypto
+.text
+___
+
+{{{
+$code.=<<___;
+.align 6
+.Lck:
+ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+ .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
+___
+}}}
+
+{{{
+my ($key,$keys)=("x0","x1");
+my ($tmp)=("x2");
+my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7));
+my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
+my ($fkconst) = ("v24");
+$code.=<<___;
+.globl ${prefix}_set_encrypt_key
+.type ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$key0.4s},[$key]
+ adr $tmp,.Lfk
+ ld1 {$fkconst.4s},[$tmp]
+ adr $tmp,.Lck
+ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
+___
+ &rev32($key0, $key0);
+$code.=<<___;
+ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
+ eor $key0.16b,$key0.16b,$fkconst.16b;
+ sm4ekey $key0.4S,$key0.4S,$const0.4S
+ sm4ekey $key1.4S,$key0.4S,$const1.4S
+ sm4ekey $key2.4S,$key1.4S,$const2.4S
+ sm4ekey $key3.4S,$key2.4S,$const3.4S
+ sm4ekey $key4.4S,$key3.4S,$const4.4S
+ st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64
+ sm4ekey $key5.4S,$key4.4S,$const5.4S
+ sm4ekey $key6.4S,$key5.4S,$const6.4S
+ sm4ekey $key7.4S,$key6.4S,$const7.4S
+ st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys]
+ ret
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+___
+}}}
+
+{{{
+my ($key,$keys)=("x0","x1");
+my ($tmp)=("x2");
+my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7));
+my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
+my ($fkconst) = ("v24");
+$code.=<<___;
+.globl ${prefix}_set_decrypt_key
+.type ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$key0.4s},[$key]
+ adr $tmp,.Lfk
+ ld1 {$fkconst.4s},[$tmp]
+ adr $tmp, .Lck
+ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
+___
+ &rev32($key0, $key0);
+$code.=<<___;
+ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
+ eor $key0.16b, $key0.16b,$fkconst.16b;
+ sm4ekey $key0.4S,$key0.4S,$const0.4S
+ sm4ekey $key1.4S,$key0.4S,$const1.4S
+ sm4ekey $key2.4S,$key1.4S,$const2.4S
+ rev64 $key0.4s,$key0.4s
+ rev64 $key1.4s,$key1.4s
+ ext $key0.16b,$key0.16b,$key0.16b,#8
+ ext $key1.16b,$key1.16b,$key1.16b,#8
+ sm4ekey $key3.4S,$key2.4S,$const3.4S
+ sm4ekey $key4.4S,$key3.4S,$const4.4S
+ rev64 $key2.4s,$key2.4s
+ rev64 $key3.4s,$key3.4s
+ ext $key2.16b,$key2.16b,$key2.16b,#8
+ ext $key3.16b,$key3.16b,$key3.16b,#8
+ sm4ekey $key5.4S,$key4.4S,$const5.4S
+ sm4ekey $key6.4S,$key5.4S,$const6.4S
+ rev64 $key4.4s,$key4.4s
+ rev64 $key5.4s,$key5.4s
+ ext $key4.16b,$key4.16b,$key4.16b,#8
+ ext $key5.16b,$key5.16b,$key5.16b,#8
+ sm4ekey $key7.4S,$key6.4S,$const7.4S
+ rev64 $key6.4s, $key6.4s
+ rev64 $key7.4s, $key7.4s
+ ext $key6.16b,$key6.16b,$key6.16b,#8
+ ext $key7.16b,$key7.16b,$key7.16b,#8
+ st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64
+ st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys]
+ ret
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($inp,$out,$rk)=map("x$_",(0..2));
+my ($data)=("v16");
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$data.4s},[$inp]
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+___
+ &rev32($data,$data);
+ &enc_blk($data);
+ &rev32($data,$data);
+$code.=<<___;
+ st1 {$data.4s},[$out]
+ ret
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+
+&gen_block("en");
+&gen_block("de");
+}}}
+
+{{{
+my ($inp,$out,$len,$rk)=map("x$_",(0..3));
+my ($enc) = ("w4");
+my @dat=map("v$_",(16..23));
+$code.=<<___;
+.globl ${prefix}_ecb_encrypt
+.type ${prefix}_ecb_encrypt,%function
+.align 5
+${prefix}_ecb_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+1:
+ cmp $len,#64
+ b.lt 1f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+ cmp $len,#128
+ b.lt 2f
+ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
+ // 8 blocks
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+$code.=<<___;
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+___
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+ subs $len,$len,#128
+ b.gt 1b
+ ret
+ // 4 blocks
+2:
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ subs $len,$len,#64
+ b.gt 1b
+1:
+ subs $len,$len,#16
+ b.lt 1f
+ ld1 {@dat[0].4s},[$inp],#16
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ st1 {@dat[0].4s},[$out],#16
+ b.ne 1b
+1:
+ ret
+.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
+
+{{{
+my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
+my ($enc) = ("w5");
+my @dat=map("v$_",(16..23));
+my @in=map("v$_",(24..31));
+my ($ivec) = ("v8");
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ stp d8,d9,[sp, #-16]!
+
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+ ld1 {$ivec.4s},[$ivp]
+ cmp $enc,#0
+ b.eq .Ldec
+1:
+ cmp $len, #64
+ b.lt 1f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+ eor @dat[0].16b,@dat[0].16b,$ivec.16b
+___
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &enc_blk(@dat[0]);
+$code.=<<___;
+ eor @dat[1].16b,@dat[1].16b,@dat[0].16b
+___
+ &enc_blk(@dat[1]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[2].16b,@dat[2].16b,@dat[1].16b
+___
+ &enc_blk(@dat[2]);
+ &rev32(@dat[1],@dat[1]);
+$code.=<<___;
+ eor @dat[3].16b,@dat[3].16b,@dat[2].16b
+___
+ &enc_blk(@dat[3]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+ mov $ivec.16b,@dat[3].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ subs $len,$len,#64
+ b.ne 1b
+1:
+ subs $len,$len,#16
+ b.lt 3f
+ ld1 {@dat[0].4s},[$inp],#16
+ eor $ivec.16b,$ivec.16b,@dat[0].16b
+___
+ &rev32($ivec,$ivec);
+ &enc_blk($ivec);
+ &rev32($ivec,$ivec);
+$code.=<<___;
+ st1 {$ivec.16b},[$out],#16
+ b.ne 1b
+ b 3f
+.Ldec:
+1:
+ cmp $len, #64
+ b.lt 1f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
+ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
+ cmp $len,#128
+ b.lt 2f
+ // 8 blocks mode
+ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
+ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],$dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],$dat[7]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+ eor @dat[0].16b,@dat[0].16b,$ivec.16b
+ eor @dat[1].16b,@dat[1].16b,@in[0].16b
+ eor @dat[2].16b,@dat[2].16b,@in[1].16b
+ mov $ivec.16b,@in[7].16b
+ eor @dat[3].16b,$dat[3].16b,@in[2].16b
+ eor @dat[4].16b,$dat[4].16b,@in[3].16b
+ eor @dat[5].16b,$dat[5].16b,@in[4].16b
+ eor @dat[6].16b,$dat[6].16b,@in[5].16b
+ eor @dat[7].16b,$dat[7].16b,@in[6].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+ subs $len,$len,128
+ b.gt 1b
+ b 3f
+ // 4 blocks mode
+2:
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],$dat[3]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+ eor @dat[0].16b,@dat[0].16b,$ivec.16b
+ eor @dat[1].16b,@dat[1].16b,@in[0].16b
+ mov $ivec.16b,@in[3].16b
+ eor @dat[2].16b,@dat[2].16b,@in[1].16b
+ eor @dat[3].16b,$dat[3].16b,@in[2].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ subs $len,$len,#64
+ b.gt 1b
+1:
+ subs $len,$len,#16
+ b.lt 3f
+ ld1 {@dat[0].4s},[$inp],#16
+ mov @in[0].16b,@dat[0].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b,@dat[0].16b,$ivec.16b
+ mov $ivec.16b,@in[0].16b
+ st1 {@dat[0].16b},[$out],#16
+ b.ne 1b
+3:
+ // save back IV
+ st1 {$ivec.16b},[$ivp]
+ ldp d8,d9,[sp],#16
+ ret
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+
+{{{
+my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
+my ($ctr)=("w5");
+my @dat=map("v$_",(16..23));
+my @in=map("v$_",(24..31));
+my ($ivec)=("v8");
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ stp d8,d9,[sp, #-16]!
+
+ ld1 {$ivec.4s},[$ivp]
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+___
+ &rev32($ivec,$ivec);
+$code.=<<___;
+ mov $ctr,$ivec.s[3]
+1:
+ cmp $len,#4
+ b.lt 1f
+ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
+ mov @dat[0].16b,$ivec.16b
+ mov @dat[1].16b,$ivec.16b
+ mov @dat[2].16b,$ivec.16b
+ mov @dat[3].16b,$ivec.16b
+ add $ctr,$ctr,#1
+ mov $dat[1].s[3],$ctr
+ add $ctr,$ctr,#1
+ mov @dat[2].s[3],$ctr
+ add $ctr,$ctr,#1
+ mov @dat[3].s[3],$ctr
+ cmp $len,#8
+ b.lt 2f
+ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
+ mov @dat[4].16b,$ivec.16b
+ mov @dat[5].16b,$ivec.16b
+ mov @dat[6].16b,$ivec.16b
+ mov @dat[7].16b,$ivec.16b
+ add $ctr,$ctr,#1
+ mov $dat[4].s[3],$ctr
+ add $ctr,$ctr,#1
+ mov @dat[5].s[3],$ctr
+ add $ctr,$ctr,#1
+ mov @dat[6].s[3],$ctr
+ add $ctr,$ctr,#1
+ mov @dat[7].s[3],$ctr
+___
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+ eor @dat[0].16b,@dat[0].16b,@in[0].16b
+ eor @dat[1].16b,@dat[1].16b,@in[1].16b
+ eor @dat[2].16b,@dat[2].16b,@in[2].16b
+ eor @dat[3].16b,@dat[3].16b,@in[3].16b
+ eor @dat[4].16b,@dat[4].16b,@in[4].16b
+ eor @dat[5].16b,@dat[5].16b,@in[5].16b
+ eor @dat[6].16b,@dat[6].16b,@in[6].16b
+ eor @dat[7].16b,@dat[7].16b,@in[7].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+ subs $len,$len,#8
+ b.eq 3f
+ add $ctr,$ctr,#1
+ mov $ivec.s[3],$ctr
+ b 1b
+2:
+___
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+ eor @dat[0].16b,@dat[0].16b,@in[0].16b
+ eor @dat[1].16b,@dat[1].16b,@in[1].16b
+ eor @dat[2].16b,@dat[2].16b,@in[2].16b
+ eor @dat[3].16b,@dat[3].16b,@in[3].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ subs $len,$len,#4
+ b.eq 3f
+ add $ctr,$ctr,#1
+ mov $ivec.s[3],$ctr
+ b 1b
+1:
+ subs $len,$len,#1
+ b.lt 3f
+ mov $dat[0].16b,$ivec.16b
+ ld1 {@in[0].4s},[$inp],#16
+___
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor $dat[0].16b,$dat[0].16b,@in[0].16b
+ st1 {$dat[0].4s},[$out],#16
+ b.eq 3f
+ add $ctr,$ctr,#1
+ mov $ivec.s[3],$ctr
+ b 1b
+3:
+ ldp d8,d9,[sp],#16
+ ret
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+########################################
+{ my %opcode = (
+ "sm4e" => 0xcec08400,
+ "sm4ekey" => 0xce60c800);
+
+ sub unsm4 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
+ print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
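[Note: the unsm4() helper above exists because older assemblers do not know the SM4 mnemonics, so each sm4e/sm4ekey line is emitted as a raw .inst word packed as opcode | Vd | (Vn << 5) | (Vm << 16), with Vm contributing 0 for the two-operand sm4e. A standalone sketch of the same arithmetic; the helper name is illustrative.]

#include <stdint.h>
#include <stdio.h>

/* Same packing as unsm4(): base opcode | Vd | (Vn << 5) | (Vm << 16). */
static uint32_t sm4_inst(uint32_t opcode, unsigned vd, unsigned vn, unsigned vm)
{
    return opcode | vd | (vn << 5) | (vm << 16);
}

int main(void)
{
    /* sm4e v0.4s, v0.4s -> 0xcec08400, the probe word used in arm64cpuid.pl */
    printf("0x%08x\n", sm4_inst(0xcec08400, 0, 0, 0));
    /* sm4ekey v1.4s, v2.4s, v16.4s -> 0xce70c841 */
    printf("0x%08x\n", sm4_inst(0xce60c800, 1, 2, 16));
    return 0;
}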
diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
index b65a7d149e..e27aa49e67 100644
--- a/crypto/sm4/build.info
+++ b/crypto/sm4/build.info
@@ -1,4 +1,32 @@
LIBS=../../libcrypto
-SOURCE[../../libcrypto]=\
- sm4.c
+IF[{- !$disabled{asm} -}]
+ $SM4DEF_aarch64=SM4_ASM
+ $SM4ASM_aarch64=sm4-armv8.S
+
+ # Now that we have defined all the arch specific variables, use the
+ # appropriate one, and define the appropriate macros
+ IF[$SM4ASM_{- $target{asm_arch} -}]
+ $SM4ASM=$SM4ASM_{- $target{asm_arch} -}
+ $SM4DEF=$SM4DEF_{- $target{asm_arch} -}
+ ENDIF
+ENDIF
+
+SOURCE[../../libcrypto]= $SM4ASM sm4.c
+
+
+# Implementations are now spread across several libraries, so the defines
+# need to be applied to all affected libraries and modules.
+DEFINE[../../libcrypto]=$SM4DEF
+DEFINE[../../providers/libfips.a]=$SM4DEF
+DEFINE[../../providers/libdefault.a]=$SM4DEF
+# We only need to include the SM4DEF stuff in the legacy provider when it's a
+# separate module and it's dynamically linked with libcrypto. Otherwise, it
+# already gets everything that the static libcrypto.a has, and doesn't need it
+# added again.
+IF[{- !$disabled{module} && !$disabled{shared} -}]
+ DEFINE[../providers/liblegacy.a]=$SM4DEF
+ENDIF
+
+GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
+INCLUDE[sm4-armv8.o]=..
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
new file mode 100644
index 0000000000..42c8b44a43
--- /dev/null
+++ b/include/crypto/sm4_platform.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef OSSL_SM4_PLATFORM_H
+# define OSSL_SM4_PLATFORM_H
+# pragma once
+
+# if defined(OPENSSL_CPUID_OBJ)
+# if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+# include "arm_arch.h"
+# if __ARM_MAX_ARCH__>=8
+# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
+# define HWSM4_encrypt sm4_v8_encrypt
+# define HWSM4_decrypt sm4_v8_decrypt
+# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
+# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
+# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks