diff options
author | XiaokangQian <xiaokang.qian@arm.com> | 2021-06-09 06:35:46 +0000 |
---|---|---|
committer | Pauli <pauli@openssl.org> | 2022-01-25 14:30:00 +1100 |
commit | 954f45ba4c504570206ff5bed811e512cf92dc8e (patch) | |
tree | 6d2521f79615afd4c8b35cb2c6794a57aded5602 /crypto/armcap.c | |
parent | 44a563dde1584cd9284e80b6e45ee5019be8d36c (diff) |
Optimize AES-GCM for uarchs with unroll and new instructions
Increase the number of blocks processed per iteration to 8. Increase the hash
table capacity. Make use of the EOR3 instruction to improve performance.
This can improve performance by 25-40% on out-of-order microarchitectures
with a large number of fast execution units, such as Neoverse V1. We also
see 20-30% performance improvements on other architectures such as the M1.
Assembly code reviewed by Tom Cosgrove (ARM).
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/15916)
Diffstat (limited to 'crypto/armcap.c')
-rw-r--r-- | crypto/armcap.c | 24 |
1 file changed, 21 insertions, 3 deletions
diff --git a/crypto/armcap.c b/crypto/armcap.c index 5016987eeb..c50322f504 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -171,6 +171,7 @@ static unsigned long getauxval(unsigned long key) # define HWCAP_CE_SHA1 (1 << 5) # define HWCAP_CE_SHA256 (1 << 6) # define HWCAP_CPUID (1 << 11) +# define HWCAP_SHA3 (1 << 17) # define HWCAP_CE_SM3 (1 << 18) # define HWCAP_CE_SM4 (1 << 19) # define HWCAP_CE_SHA512 (1 << 21) @@ -216,11 +217,20 @@ void OPENSSL_cpuid_setup(void) */ # else { - unsigned int sha512; - size_t len = sizeof(sha512); + unsigned int feature; + size_t len = sizeof(feature); + char uarch[64]; - if (sysctlbyname("hw.optional.armv8_2_sha512", &sha512, &len, NULL, 0) == 0 && sha512 == 1) + if (sysctlbyname("hw.optional.armv8_2_sha512", &feature, &len, NULL, 0) == 0 && feature == 1) OPENSSL_armcap_P |= ARMV8_SHA512; + feature = 0; + if (sysctlbyname("hw.optional.armv8_2_sha3", &feature, &len, NULL, 0) == 0 && feature == 1) { + OPENSSL_armcap_P |= ARMV8_SHA3; + len = sizeof(uarch); + if ((sysctlbyname("machdep.cpu.brand_string", uarch, &len, NULL, 0) == 0) && + (strncmp(uarch, "Apple M1", 8) == 0)) + OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3; + } } # endif # endif @@ -255,6 +265,8 @@ void OPENSSL_cpuid_setup(void) if (hwcap & HWCAP_CE_SM3) OPENSSL_armcap_P |= ARMV8_SM3; + if (hwcap & HWCAP_SHA3) + OPENSSL_armcap_P |= ARMV8_SHA3; # endif } # ifdef __aarch64__ @@ -311,6 +323,9 @@ void OPENSSL_cpuid_setup(void) if (sigsetjmp(ill_jmp, 1) == 0) { _armv8_sm3_probe(); OPENSSL_armcap_P |= ARMV8_SM3; + if (sigsetjmp(ill_jmp, 1) == 0) { + _armv8_eor3_probe(); + OPENSSL_armcap_P |= ARMV8_SHA3; } # endif } @@ -340,6 +355,9 @@ void OPENSSL_cpuid_setup(void) (OPENSSL_armcap_P & ARMV7_NEON)) { OPENSSL_armv8_rsa_neonized = 1; } + if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1)) && + (OPENSSL_armcap_P & ARMV8_SHA3)) + OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3; # endif } #endif |