summaryrefslogtreecommitdiffstats
path: root/crypto/modes
diff options
context:
space:
mode:
authorAndrey Matyukov <andrey.matyukov@intel.com>2021-06-09 14:38:40 -0700
committerTomas Mraz <tomas@openssl.org>2022-11-11 10:04:05 +0100
commit0a23b2b53084c41026349aaf8adf2884fcc8468d (patch)
treee8876844e25163e31acda2fa84e4718b84a43d91 /crypto/modes
parentaf84bf2d52d250d716f4f58834b5603001d45f80 (diff)
AES-GCM enabled with AVX512 vAES and vPCLMULQDQ.
Vectorized 'stitched' encrypt + ghash implementation of AES-GCM enabled with AVX512 vAES and vPCLMULQDQ instructions (available starting Intel's IceLake micro-architecture). The performance details for representative IceLake Server and Client platforms are shown below Performance data: OpenSSL Speed KBs/Sec Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz (1Core/1Thread) Payload in Bytes 16 64 256 1024 8192 16384 AES-128-GCM Baseline 478708.27 1118296.96 2428092.52 3518199.4 4172355.99 4235762.07 Patched 534613.95 2009345.55 3775588.15 5059517.64 8476794.88 8941541.79 Speedup 1.12 1.80 1.55 1.44 2.03 2.11 AES-256-GCM Baseline 399237.27 961699.9 2136377.65 2979889.15 3554823.37 3617757.5 Patched 475948.13 1720128.51 3462407.12 4696832.2 7532013.16 7924953.91 Speedup 1.19 1.79 1.62 1.58 2.12 2.19 Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz (1Core/1Thread) Payload in Bytes 16 64 256 1024 8192 16384 AES-128-GCM Baseline 259128.54 570756.43 1362554.16 1990654.57 2359128.88 2401671.58 Patched 292139.47 1079320.95 2001974.63 2829007.46 4510318.59 4705314.41 Speedup 1.13 1.89 1.47 1.42 1.91 1.96 AES-256-GCM Baseline 236000.34 550506.76 1234638.08 1716734.57 2011255.6 2028099.99 Patched 247256.32 919731.34 1773270.43 2553239.55 3953115.14 4111227.29 Speedup 1.05 1.67 1.44 1.49 1.97 2.03 Reviewed-by: TJ O'Dwyer, Marcel Cornu, Pablo de Lara Reviewed-by: Paul Dale <pauli@openssl.org> Reviewed-by: Tomas Mraz <tomas@openssl.org> (Merged from https://github.com/openssl/openssl/pull/17239) (cherry picked from commit 63b996e752ac698186c38177232280e6515d571b)
Diffstat (limited to 'crypto/modes')
-rw-r--r--crypto/modes/asm/aes-gcm-avx512.pl4975
-rw-r--r--crypto/modes/build.info3
2 files changed, 4977 insertions, 1 deletions
diff --git a/crypto/modes/asm/aes-gcm-avx512.pl b/crypto/modes/asm/aes-gcm-avx512.pl
new file mode 100644
index 0000000000..1c7ee8769a
--- /dev/null
+++ b/crypto/modes/asm/aes-gcm-avx512.pl
@@ -0,0 +1,4975 @@
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
+# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
+# (https://github.com/intel/intel-ipsec-mb).
+# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
+#
+# References:
+# [1] Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on
+# Intel Architecture Processors. August, 2010.
+# [2] Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on
+# Intel Architecture Processors. October, 2012.
+# [3] Shay Gueron et al. Intel Carry-Less Multiplication Instruction and its
+# Usage for Computing the GCM Mode. May, 2010.
+#
+#
+# December 2021
+#
+# Initial release.
+#
+# GCM128_CONTEXT structure has storage for 16 hkeys only, but this
+# implementation can use up to 48. To avoid extending the context size,
+# precompute and store in the context first 16 hkeys only, and compute the rest
+# on demand keeping them in the local frame.
+#
+#======================================================================
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+# Windows is recognized from the perlasm flavour or an .asm output name.
+$win64 = 0;
+$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Set to non-zero below only when the assembler is new enough to accept the
+# AVX512 VAES / VPCLMULQDQ mnemonics; code generation is gated on this flag.
+$avx512vaes = 0;
+
+# Locate the perlasm x86_64 translator next to this script or in perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+$dir = $1;
+($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
+  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
+  or die "can't locate x86_64-xlate.pl";
+
+# GNU assembler: require version 2.30 or newer.
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+  $avx512vaes = ($1 >= 2.30);
+}
+
+# nasm (Windows): require 2.13.3 or newer (any 2.14+ also accepted).
+if (!$avx512vaes
+  && $win64
+  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
+  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
+{
+  $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
+}
+
+# clang / LLVM integrated assembler: require version 7.0 or newer.
+if (!$avx512vaes && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+  $avx512vaes = ($2 >= 7.0);
+}
+
+# All generated code is piped through the perlasm translator into $output.
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
+  or die "can't call $xlate: $!";
+*STDOUT = *OUT;
+
+#======================================================================
+if ($avx512vaes>0) { #<<<
+
+# ossl_vaes_vpclmulqdq_capable(): runtime probe returning non-zero iff all
+# six required feature bits (mask below) are set in the 64-bit word at
+# OPENSSL_ia32cap_P+8.
+$code .= <<___;
+.extern OPENSSL_ia32cap_P
+.globl ossl_vaes_vpclmulqdq_capable
+.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
+.align 32
+ossl_vaes_vpclmulqdq_capable:
+  mov OPENSSL_ia32cap_P+8(%rip), %rcx
+  # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
+  mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
+  xor %eax,%eax
+  and %rdx,%rcx
+  cmp %rdx,%rcx
+  cmove %rcx,%rax
+  ret
+.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
+___
+
+# ; Mapping key length -> AES rounds count
+# ; (counts the middle vaesenc rounds only: round 0 is the whitening xor and
+# ;  round N+1 is vaesenclast - see ZMM_AESENC_ROUND_BLOCKS_0_16)
+my %aes_rounds = (
+  128 => 9,
+  192 => 11,
+  256 => 13);
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Code generation control switches
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+# ; ABI-aware zeroing of volatile registers in EPILOG().
+# ; Disabled due to performance reasons.
+my $CLEAR_SCRATCH_REGISTERS = 0;
+
+# ; Zero HKeys storage from the stack if they are stored there
+my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;
+
+# ; Enable / disable check of function arguments for null pointer
+# ; Currently disabled, as this check is handled outside.
+my $CHECK_FUNCTION_ARGUMENTS = 0;
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Global constants
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+# AES block size in bytes
+my $AES_BLOCK_SIZE = 16;
+
+# Storage capacity in elements (one element = one 16-byte AES block / hkey)
+my $HKEYS_STORAGE_CAPACITY = 48;
+my $LOCAL_STORAGE_CAPACITY = 48;
+my $HKEYS_CONTEXT_CAPACITY = 16;
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Stack frame definition
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
+# (2) -> +8-byte space for 16-byte alignment of XMM storage
+# (3) -> Frame pointer (%RBP)
+# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
+# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
+# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
+# (7) -> +768-byte HKEYS storage
+# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary
+
+my $GP_STORAGE = $win64 ? 8 * 8 : 8 * 6; # ; space for saved non-volatile GP registers (pushed on stack)
+my $XMM_STORAGE = $win64 ? (10 * 16) : 0; # ; space for saved XMM registers
+my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for HKeys^i, i=1..48
+my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for up to 48 AES blocks
+
+# ; Offsets inside the dynamic stack area: hkeys at the bottom (lowest
+# ; addresses), local AES block storage directly above them.
+my $STACK_HKEYS_OFFSET = 0;
+my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Function arguments abstraction
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);
+
+# ; This implementation follows the convention: for non-leaf functions (they
+# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from
+# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This
+# ; helps to facilitate SEH handlers writing.
+#
+# ; Leaf functions here do not use more than 4 input arguments.
+# ; Win64 ABI passes the first four arguments in %rcx/%rdx/%r8/%r9; further
+# ; arguments are read off the caller frame relative to the %rbp anchor.
+if ($win64) {
+  $arg1 = "%rcx";
+  $arg2 = "%rdx";
+  $arg3 = "%r8";
+  $arg4 = "%r9";
+  $arg5 = "`$GP_STORAGE + 8 + 8*5`(%rbp)"; # +8 - alignment bytes
+  $arg6 = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
+  $arg7 = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
+  $arg8 = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
+  $arg9 = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
+  $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
+  $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
+} else {
+  $arg1 = "%rdi";
+  $arg2 = "%rsi";
+  $arg3 = "%rdx";
+  $arg4 = "%rcx";
+  $arg5 = "%r8";
+  $arg6 = "%r9";
+  $arg7 = "`$GP_STORAGE + 8*1`(%rbp)";
+  $arg8 = "`$GP_STORAGE + 8*2`(%rbp)";
+  $arg9 = "`$GP_STORAGE + 8*3`(%rbp)";
+  $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
+  $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
+}
+
+# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
+my $CTX_OFFSET_CurCount = (16 * 0); # ; (Yi) Current counter for generation of encryption key
+my $CTX_OFFSET_PEncBlock = (16 * 1); # ; (repurposed EKi field) Partial block buffer
+my $CTX_OFFSET_EK0 = (16 * 2); # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
+my $CTX_OFFSET_AadLen = (16 * 3); # ; (len.u[0]) Length of Hash which has been input
+my $CTX_OFFSET_InLen = ((16 * 3) + 8); # ; (len.u[1]) Length of input data which will be encrypted or decrypted
+my $CTX_OFFSET_AadHash = (16 * 4); # ; (Xi) Current hash
+my $CTX_OFFSET_HTable = (16 * 6); # ; (Htable) Precomputed table (allows 16 values)
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Helper functions
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
# ; Generates "random" local labels
# ;
# ; Returns a 15-character string over [a-zA-Z0-9_], used to make the
# ; assembler-local labels emitted by the macros unique per expansion.
sub random_string() {
  my @chars  = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
  my $length = 15;

  # Index across the whole alphabet; the previous rand(33) could only ever
  # pick 'a'..'z' and 'A'..'G', needlessly shrinking the label space.
  return join '', map { $chars[rand(scalar @chars)] } 1 .. $length;
}
+
# ; Returns the 8-bit (low byte) alias of the given 64-bit GP register
# ; name, e.g. %rax -> %al, %rsi -> %sil, %r10 -> %r10b. Dies on any
# ; unrecognized register string.
sub BYTE {
  my ($gpr) = @_;
  return $gpr if $gpr =~ s/%r([abcd])x/%${1}l/i;     # %rax..%rdx -> %al..%dl
  return $gpr if $gpr =~ s/%r([sdb][ip])/%${1}l/i;   # %rsi/%rdi/%rbp/%rsp -> %sil/%dil/%bpl/%spl
  return $gpr if $gpr =~ s/%(r[0-9]{1,2})/%${1}b/i;  # %r8..%r15 -> %r8b..%r15b
  die "BYTE: unknown register: $gpr\n";
}
+
# ; Returns the 16-bit (word) alias of the given 64-bit GP register name,
# ; e.g. %rax -> %ax, %r12 -> %r12w. Dies on anything unrecognized.
sub WORD {
  my ($gpr) = @_;

  # legacy registers: drop the leading 'r' (%rax -> %ax, %rdi -> %di)
  return $gpr if $gpr =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;

  # numbered registers: append 'w' (%r8 -> %r8w); the guard match here is
  # case-sensitive in the original and is kept that way
  if ($gpr =~ /%r[0-9]{1,2}/) {
    $gpr =~ s/%(r[0-9]{1,2})/%${1}w/i;
    return $gpr;
  }
  die "WORD: unknown register: $gpr\n";
}
+
# ; Returns the 32-bit (dword) alias of the given 64-bit GP register name,
# ; e.g. %rax -> %eax, %r11 -> %r11d. Dies on anything unrecognized.
sub DWORD {
  my ($gpr) = @_;
  return $gpr if $gpr =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;  # %rax -> %eax, %rsi -> %esi
  return $gpr if $gpr =~ s/%(r[0-9]{1,2})/%${1}d/i;            # %r8 -> %r8d
  die "DWORD: unknown register: $gpr\n";
}
+
# ; Converts any SIMD register name (%xmmN/%ymmN/%zmmN) to its 128-bit
# ; %xmmN form; dies for non-SIMD names.
sub XWORD {
  my ($simd) = @_;
  $simd =~ s/%[xyz]mm/%xmm/i
    or die "XWORD: unknown register: $simd\n";
  return $simd;
}
+
# ; Converts any SIMD register name (%xmmN/%ymmN/%zmmN) to its 256-bit
# ; %ymmN form; dies for non-SIMD names.
sub YWORD {
  my ($simd) = @_;
  $simd =~ s/%[xyz]mm/%ymm/i
    or die "YWORD: unknown register: $simd\n";
  return $simd;
}
+
# ; Converts any SIMD register name (%xmmN/%ymmN/%zmmN) to its 512-bit
# ; %zmmN form; dies for non-SIMD names.
sub ZWORD {
  my ($simd) = @_;
  $simd =~ s/%[xyz]mm/%zmm/i
    or die "ZWORD: unknown register: $simd\n";
  return $simd;
}
+
# ; Builds an effective-address operand string from a base register plus
# ; either a numeric offset or an offset held in a register, with an
# ; optional constant displacement (defaults to 0).
sub EffectiveAddress {
  my ($base, $offset, $displacement) = @_;
  $displacement ||= 0;

  # A purely numeric offset is folded into the displacement by perlasm at
  # translation time (the backquoted expression); a register offset turns
  # into a base+index*1 addressing form.
  return "`$offset + $displacement`($base)" if $offset =~ /^\d+\z/;
  return "$displacement($base,$offset,1)";
}
+
# ; Provides the memory operand string for the given HashKey power.
# ; A base of %rsp selects the local frame copy; any other base is treated
# ; as the GCM128_CONTEXT pointer.
sub HashKeyByIdx {
  my ($idx, $base) = @_;
  my $storage = ($base eq "%rsp") ? "frame" : "context";
  return &HashKeyOffsetByIdx($idx, $storage) . "($base)";
}
+
# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage.
# ; Keys are laid out highest power first, one 16-byte AES block each;
# ; "frame" storage holds up to 48 powers, "context" storage up to 16.
sub HashKeyOffsetByIdx {
  my ($idx, $base) = @_;
  die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
    if (($base ne "frame") && ($base ne "context"));

  my ($origin, $capacity);
  if ($base eq "frame") {
    die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
    ($origin, $capacity) = ($STACK_HKEYS_OFFSET, $HKEYS_STORAGE_CAPACITY);
  } else {
    die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
    ($origin, $capacity) = ($CTX_OFFSET_HTable, $HKEYS_CONTEXT_CAPACITY);
  }
  return $origin + $AES_BLOCK_SIZE * ($capacity - $idx);
}
+
+# ; Creates local frame and does back up of non-volatile registers.
+# ; Holds stack unwinding directives.
+# ;
+# ; Arguments:
+# ;   $need_hkeys_stack_storage - reserve $HKEYS_STORAGE bytes of frame space
+# ;   $need_aes_stack_storage   - additionally reserve $LOCAL_STORAGE bytes
+# ;                               (valid only together with hkeys storage)
+# ;   $func_name                - prefix for the .L<name>_seh_* bookkeeping
+# ;                               labels emitted for unwind/SEH purposes
+sub PROLOG {
+  my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;
+
+  my $DYNAMIC_STACK_ALLOC_SIZE = 0;
+  # ; Slack so %rsp can be aligned down to a 64-byte boundary at the end.
+  my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;
+
+  if ($need_hkeys_stack_storage) {
+    $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
+  }
+
+  if ($need_aes_stack_storage) {
+    if (!$need_hkeys_stack_storage) {
+      die "PROLOG: unsupported case - aes storage without hkeys one";
+    }
+    $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
+  }
+
+  $code .= <<___;
+  push %rbx
+.cfi_push %rbx
+.L${func_name}_seh_push_rbx:
+  push %rbp
+.cfi_push %rbp
+.L${func_name}_seh_push_rbp:
+  push %r12
+.cfi_push %r12
+.L${func_name}_seh_push_r12:
+  push %r13
+.cfi_push %r13
+.L${func_name}_seh_push_r13:
+  push %r14
+.cfi_push %r14
+.L${func_name}_seh_push_r14:
+  push %r15
+.cfi_push %r15
+.L${func_name}_seh_push_r15:
+___
+
+  # ; Windows additionally preserves %rdi/%rsi here and xmm6-xmm15 below.
+  if ($win64) {
+    $code .= <<___;
+  push %rdi
+.L${func_name}_seh_push_rdi:
+  push %rsi
+.L${func_name}_seh_push_rsi:
+
+  sub \$`$XMM_STORAGE+8`,%rsp # +8 alignment
+.L${func_name}_seh_allocstack_xmm:
+___
+  }
+  $code .= <<___;
+  # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
+  # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
+  # ; handlers. The requirement for a frame pointer is that its offset from
+  # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer
+  # ; itself seems to be reasonable to use here, because later we do 64-byte stack
+  # ; alignment which gives us non-determinate offsets and complicates writing
+  # ; SEH handlers.
+  #
+  # ; It also serves as an anchor for retrieving stack arguments on both Linux
+  # ; and Windows.
+  lea `$XMM_STORAGE`(%rsp),%rbp
+.cfi_def_cfa_register %rbp
+.L${func_name}_seh_setfp:
+___
+  if ($win64) {
+
+    # ; xmm6:xmm15 need to be preserved on Windows
+    foreach my $reg_idx (6 .. 15) {
+      my $xmm_reg_offset = ($reg_idx - 6) * 16;
+      $code .= <<___;
+  vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
+.L${func_name}_seh_save_xmm${reg_idx}:
+___
+    }
+  }
+
+  $code .= <<___;
+# Prolog ends here. Next stack allocation is treated as "dynamic".
+.L${func_name}_seh_prolog_end:
+___
+
+  # ; Allocate the dynamic area and 64-byte-align %rsp.
+  if ($DYNAMIC_STACK_ALLOC_SIZE) {
+    $code .= <<___;
+  sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
+  and \$(-64),%rsp
+___
+  }
+}
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Restore register content for the caller.
+# ;;; And cleanup stack.
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+sub EPILOG {
+  my ($hkeys_storage_on_stack, $payload_len) = @_;
+  # ; $hkeys_storage_on_stack - non-zero if PROLOG reserved hkeys frame space
+  # ; $payload_len            - length operand; hkeys cleanup is skipped when
+  # ;                           it is <= 16*16 bytes (no hkeys were spilled)
+
+  my $rndsuffix = &random_string();
+
+  if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {
+
+    # ; There is no need in hkeys cleanup if payload len was small, i.e. no hkeys
+    # ; were stored in the local frame storage
+    $code .= <<___;
+  cmpq \$`16*16`,$payload_len
+  jbe .Lskip_hkeys_cleanup_${rndsuffix}
+  vpxor %xmm0,%xmm0,%xmm0
+___
+    for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
+      $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
+    }
+    $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n";
+  }
+
+  if ($CLEAR_SCRATCH_REGISTERS) {
+    &clear_scratch_gps_asm();
+    &clear_scratch_zmms_asm();
+  } else {
+    # ; cheap alternative: just drop the upper ZMM/YMM state
+    $code .= "vzeroupper\n";
+  }
+
+  if ($win64) {
+
+    # ; restore xmm15:xmm6
+    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
+      my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;
+      # NOTE(review): the emitted line below ends with a stray ',' after
+      # %xmm${reg_idx}; presumably harmless once perlasm splits the operand
+      # list, but confirm it never reaches the assembler verbatim.
+      $code .= <<___;
+  vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx},
+___
+    }
+  }
+
+  if ($win64) {
+
+    # Forming valid epilog for SEH with use of frame pointer.
+    # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
+    $code .= "lea 8(%rbp),%rsp\n";
+  } else {
+    $code .= "lea (%rbp),%rsp\n";
+    $code .= ".cfi_def_cfa_register %rsp\n";
+  }
+
+  # ; pops mirror the push order in PROLOG (Windows also saved rdi/rsi)
+  if ($win64) {
+    $code .= <<___;
+  pop %rsi
+.cfi_pop %rsi
+  pop %rdi
+.cfi_pop %rdi
+___
+  }
+  $code .= <<___;
+  pop %r15
+.cfi_pop %r15
+  pop %r14
+.cfi_pop %r14
+  pop %r13
+.cfi_pop %r13
+  pop %r12
+.cfi_pop %r12
+  pop %rbp
+.cfi_pop %rbp
+  pop %rbx
+.cfi_pop %rbx
+___
+}
+
# ; Clears all scratch ZMM registers
# ;
# ; It should be called before restoring the XMM registers
# ; for Windows (XMM6-XMM15).
# ;
sub clear_scratch_zmms_asm {
  if ($win64) {
    # ; xmm6-xmm15 are non-volatile on Windows, so only clear xmm0-xmm5 here
    $code .= "vpxorq %xmm${_},%xmm${_},%xmm${_}\n" for (0 .. 5);
  } else {
    # ; On Linux, all ZMM registers are scratch registers
    $code .= "vzeroall\n";
  }

  # ; registers 16..31 are cleared explicitly on both ABIs
  $code .= "vpxorq %xmm${_},%xmm${_},%xmm${_}\n" for (16 .. 31);
}
+
# Clears all scratch GP registers.
# %rsi/%rdi are callee-saved on Windows, so they are zeroed only elsewhere.
sub clear_scratch_gps_asm {
  my @volatile = ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
  push @volatile, "%rsi", "%rdi" if (!$win64);
  $code .= "xor $_,$_\n" for @volatile;
}
+
+# ; Precomputes HashKey powers into the stack frame storage.
+# ; The context holds HashKey^1..^16 only; higher powers (up to 48) are
+# ; derived via GHASH_MUL and kept in the frame. All work is skipped when
+# ; $HKEYS_READY is non-zero.
+sub precompute_hkeys_on_stack {
+  my $GCM128_CTX = $_[0];
+  my $HKEYS_READY = $_[1];
+  my $ZTMP0 = $_[2];
+  my $ZTMP1 = $_[3];
+  my $ZTMP2 = $_[4];
+  my $ZTMP3 = $_[5];
+  my $ZTMP4 = $_[6];
+  my $ZTMP5 = $_[7];
+  my $ZTMP6 = $_[8];
+  my $HKEYS_RANGE = $_[9]; # ; "first16", "mid16", "all", "first32", "last32"
+
+  die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
+    if ($HKEYS_RANGE ne "first16"
+    && $HKEYS_RANGE ne "mid16"
+    && $HKEYS_RANGE ne "all"
+    && $HKEYS_RANGE ne "first32"
+    && $HKEYS_RANGE ne "last32");
+
+  my $rndsuffix = &random_string();
+
+  $code .= <<___;
+  test $HKEYS_READY,$HKEYS_READY
+  jnz .L_skip_hkeys_precomputation_${rndsuffix}
+___
+
+  if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {
+
+    # ; Fill the stack with the first 16 hkeys from the context
+    $code .= <<___;
+  # ; Move 16 hkeys from the context to stack
+  vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
+  vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}
+
+  vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
+  vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}
+
+  # ; broadcast HashKey^8
+  vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
+
+  vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
+  vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}
+
+  vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
+  vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
+___
+  }
+
+  # ; These ranges read HashKey^5..^16 back from the frame (%rsp) - they
+  # ; presumably rely on an earlier call having populated it; verify callers.
+  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
+    $code .= <<___;
+  vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1
+
+  # ; broadcast HashKey^8
+  vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
+
+  vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
+  vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
+___
+
+  }
+
+  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
+
+    # ; Precompute hkeys^i, i=17..32
+    my $i = 20;
+    foreach (1 .. int((32 - 16) / 8)) {
+
+      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
+      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
+      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
+      $i += 4;
+
+      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
+      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
+      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
+      $i += 4;
+    }
+  }
+
+  if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
+
+    # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
+    my $i = 36;
+    foreach (1 .. int((48 - 32) / 8)) {
+
+      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
+      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
+      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
+      $i += 4;
+
+      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
+      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
+      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
+      $i += 4;
+    }
+  }
+
+  $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n";
+}
+
+# ;; =============================================================================
+# ;; Generic macro to produce code that executes $OPCODE instruction
+# ;; on selected number of AES blocks (16 bytes long ) between 0 and 16.
+# ;; All three operands of the instruction come from registers.
+# ;; Note: if 3 blocks are left at the end instruction is produced to operate all
+# ;; 4 blocks (full width of ZMM)
+sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
+  my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
+  my $OPCODE = $_[1]; # [in] instruction name
+  my @DST;
+  $DST[0] = $_[2]; # [out] destination ZMM register
+  $DST[1] = $_[3]; # [out] destination ZMM register
+  $DST[2] = $_[4]; # [out] destination ZMM register
+  $DST[3] = $_[5]; # [out] destination ZMM register
+  my @SRC1;
+  $SRC1[0] = $_[6]; # [in] source 1 ZMM register
+  $SRC1[1] = $_[7]; # [in] source 1 ZMM register
+  $SRC1[2] = $_[8]; # [in] source 1 ZMM register
+  $SRC1[3] = $_[9]; # [in] source 1 ZMM register
+  my @SRC2;
+  $SRC2[0] = $_[10]; # [in] source 2 ZMM register
+  $SRC2[1] = $_[11]; # [in] source 2 ZMM register
+  $SRC2[2] = $_[12]; # [in] source 2 ZMM register
+  $SRC2[3] = $_[13]; # [in] source 2 ZMM register
+
+  die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
+    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
+
+  my $reg_idx = 0;
+  my $blocks_left = $NUM_BLOCKS;
+
+  # ; emit full-width (4-block) ZMM operations first
+  foreach (1 .. ($NUM_BLOCKS / 4)) {
+    $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
+    $reg_idx++;
+    $blocks_left -= 4;
+  }
+
+  # ; tail: 1 block -> xmm width, 2 -> ymm, 3 -> full zmm (see header note)
+  my $DSTREG = $DST[$reg_idx];
+  my $SRC1REG = $SRC1[$reg_idx];
+  my $SRC2REG = $SRC2[$reg_idx];
+
+  if ($blocks_left == 1) {
+    $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
+  } elsif ($blocks_left == 2) {
+    $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
+  } elsif ($blocks_left == 3) {
+    $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
+  }
+}
+
+# ;; =============================================================================
+# ;; Loads specified number of AES blocks into ZMM registers using mask register
+# ;; for the last loaded register (xmm, ymm or zmm).
+# ;; Loads take place at 1 byte granularity.
+sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
+  my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
+  my $INP = $_[1]; # [in] input data pointer to read from
+  my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical)
+  my @DST;
+  $DST[0] = $_[3]; # [out] ZMM register with loaded data
+  $DST[1] = $_[4]; # [out] ZMM register with loaded data
+  $DST[2] = $_[5]; # [out] ZMM register with loaded data
+  $DST[3] = $_[6]; # [out] ZMM register with loaded data
+  my $MASK = $_[7]; # [in] mask register
+
+  die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
+    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
+
+  my $src_offset = 0;
+  my $dst_idx = 0;
+  my $blocks_left = $NUM_BLOCKS;
+
+  # ; unmasked full 64-byte loads for every register except the last one
+  if ($NUM_BLOCKS > 0) {
+    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
+      $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
+      $src_offset += 64;
+      $dst_idx++;
+      $blocks_left -= 4;
+    }
+  }
+
+  # ; the last register is loaded with a byte mask; {z} zeroes the
+  # ; unselected bytes
+  my $DSTREG = $DST[$dst_idx];
+
+  if ($blocks_left == 1) {
+    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
+  } elsif ($blocks_left == 2) {
+    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
+  } elsif (($blocks_left == 3 || $blocks_left == 4)) {
+    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
+  }
+}
+
+# ;; =============================================================================
+# ;; Stores specified number of AES blocks from ZMM registers with mask register
+# ;; for the last loaded register (xmm, ymm or zmm).
+# ;; Stores take place at 1 byte granularity.
+sub ZMM_STORE_MASKED_BLOCKS_0_16 {
+  my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
+  my $OUTP = $_[1]; # [in] output data pointer to write to
+  my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical)
+  my @SRC;
+  $SRC[0] = $_[3]; # [in] ZMM register with data to store
+  $SRC[1] = $_[4]; # [in] ZMM register with data to store
+  $SRC[2] = $_[5]; # [in] ZMM register with data to store
+  $SRC[3] = $_[6]; # [in] ZMM register with data to store
+  my $MASK = $_[7]; # [in] mask register
+
+  die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
+    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
+
+  my $dst_offset = 0;
+  my $src_idx = 0;
+  my $blocks_left = $NUM_BLOCKS;
+
+  # ; unmasked full 64-byte stores for every register except the last one
+  if ($NUM_BLOCKS > 0) {
+    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
+      $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
+      $dst_offset += 64;
+      $src_idx++;
+      $blocks_left -= 4;
+    }
+  }
+
+  # ; the last register is stored through the byte mask (no zeroing on store)
+  my $SRCREG = $SRC[$src_idx];
+
+  if ($blocks_left == 1) {
+    $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
+  } elsif ($blocks_left == 2) {
+    $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
+  } elsif ($blocks_left == 3 || $blocks_left == 4) {
+    $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
+  }
+}
+
+# ;;; ===========================================================================
+# ;;; Handles AES encryption rounds
+# ;;; It handles special cases: the last and first rounds
+# ;;; Optionally, it performs XOR with data after the last AES round.
+# ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
+# ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
+sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
+  my $L0B0_3 = $_[0]; # [in/out] zmm; blocks 0 to 3
+  my $L0B4_7 = $_[1]; # [in/out] zmm; blocks 4 to 7
+  my $L0B8_11 = $_[2]; # [in/out] zmm; blocks 8 to 11
+  my $L0B12_15 = $_[3]; # [in/out] zmm; blocks 12 to 15
+  my $KEY = $_[4]; # [in] zmm containing round key
+  my $ROUND = $_[5]; # [in] round number
+  my $D0_3 = $_[6]; # [in] zmm or no_data; plain/cipher text blocks 0-3
+  my $D4_7 = $_[7]; # [in] zmm or no_data; plain/cipher text blocks 4-7
+  my $D8_11 = $_[8]; # [in] zmm or no_data; plain/cipher text blocks 8-11
+  my $D12_15 = $_[9]; # [in] zmm or no_data; plain/cipher text blocks 12-15
+  my $NUMBL = $_[10]; # [in] number of blocks; numerical value
+  my $NROUNDS = $_[11]; # [in] number of rounds; numerical value
+
+  # ; $ROUND semantics: <1 -> key-whitening vpxorq; 1..$NROUNDS -> vaesenc;
+  # ; >$NROUNDS -> vaesenclast, optionally xored with the supplied data.
+
+  # ;;; === first AES round
+  if ($ROUND < 1) {
+
+    # ;; round 0
+    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
+      $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
+      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
+  }
+
+  # ;;; === middle AES rounds
+  if ($ROUND >= 1 && $ROUND <= $NROUNDS) {
+
+    # ;; rounds 1 to 9/11/13
+    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
+      $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
+      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
+  }
+
+  # ;;; === last AES round
+  if ($ROUND > $NROUNDS) {
+
+    # ;; the last round - mix enclast with text xor's
+    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
+      $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
+      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
+
+    # ;;; === XOR with data
+    if ( ($D0_3 ne "no_data")
+      && ($D4_7 ne "no_data")
+      && ($D8_11 ne "no_data")
+      && ($D12_15 ne "no_data"))
+    {
+      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
+        $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
+        $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
+    }
+  }
+}
+
+# ;;; Horizontal XOR - 4 x 128bits xored together
+# ;;; Result ends up in the low 128 bits (xmm part) of $REG.
+sub VHPXORI4x128 {
+  my $REG = $_[0]; # [in/out] ZMM with 4x128bits to xor; 128bit output
+  my $TMP = $_[1]; # [clobbered] ZMM temporary register
+  $code .= <<___;
+  vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
+  vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
+  vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
+  vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
+___
+}
+
+# ;;; AVX512 reduction macro
+# ;;; Two-phase reduction of the 256-bit GHASH product ($HI128:$LO128) by the
+# ;;; field polynomial $POLY; the reduced 128-bit result lands in $OUT.
+sub VCLMUL_REDUCE {
+  my $OUT = $_[0]; # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
+  my $POLY = $_[1]; # [in] zmm/ymm/xmm: polynomial
+  my $HI128 = $_[2]; # [in] zmm/ymm/xmm: high 128b of hash to reduce
+  my $LO128 = $_[3]; # [in] zmm/ymm/xmm: low 128b of hash to reduce
+  my $TMP0 = $_[4]; # [in] zmm/ymm/xmm: temporary register
+  my $TMP1 = $_[5]; # [in] zmm/ymm/xmm: temporary register
+
+  $code .= <<___;
+  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  # ;; first phase of the reduction
+  vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
+  vpslldq \$8,$TMP0,$TMP0 # ; shift-L 2 DWs
+  vpxorq $TMP0,$LO128,$TMP0 # ; first phase of the reduction complete
+  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  # ;; second phase of the reduction
+  vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
+  vpsrldq \$4,$TMP1,$TMP1 # ; shift-R only 1-DW to obtain 2-DWs shift-R
+  vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
+  vpslldq \$4,$OUT,$OUT # ; shift-L 1-DW to obtain result with no shifts
+  vpternlogq \$0x96,$HI128,$TMP1,$OUT # ; OUT/GHASH = OUT xor TMP1 xor HI128
+  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+___
+}
+
+# ;; ===========================================================================
+# ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
+# ;; - it is assumed that data read from $INPTR is already shuffled and
+# ;; $INPTR address is 64 byte aligned
+# ;; - there is an option to pass ready blocks through ZMM registers too.
+# ;; 4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty
+sub GHASH_16 {
+ my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
+ # end_reduce (end with reduction), start_reduce
+ my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits
+ my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits
+ my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits
+ my $INPTR = $_[4]; # [in] data input pointer
+ my $INOFF = $_[5]; # [in] data input offset
+ my $INDIS = $_[6]; # [in] data input displacement
+ my $HKPTR = $_[7]; # [in] hash key pointer
+ my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset)
+ my $HKDIS = $_[9]; # [in] hash key displacement
+ my $HASH = $_[10]; # [in/out] ZMM hash value in/out
+ my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM
+ my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM
+ my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM
+ my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM
+ my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM
+ my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM
+ my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM
+ my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM
+ my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM
+ my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
+ my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
+ my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
+ my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
+ my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
+
+ my $start_ghash = 0;
+ my $do_reduction = 0;
+ if ($TYPE eq "start") {
+ $start_ghash = 1;
+ }
+
+ if ($TYPE eq "start_reduce") {
+ $start_ghash = 1;
+ $do_reduction = 1;
+ }
+
+ if ($TYPE eq "end_reduce") {
+ $do_reduction = 1;
+ }
+
+ # ;; ghash blocks 0-3
+ if (scalar(@_) == 21) {
+ $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
+ } else {
+ $ZTMP9 = $DAT0;
+ }
+
+ if ($start_ghash != 0) {
+ $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n";
+ }
+ $code .= <<___;