diff options
Diffstat (limited to 'crypto')
-rw-r--r-- | crypto/modes/asm/aes-gcm-avx512.pl | 4975 | ||||
-rw-r--r-- | crypto/modes/build.info | 3 |
2 files changed, 4977 insertions, 1 deletion
diff --git a/crypto/modes/asm/aes-gcm-avx512.pl b/crypto/modes/asm/aes-gcm-avx512.pl
new file mode 100644
index 0000000000..1c7ee8769a
--- /dev/null
+++ b/crypto/modes/asm/aes-gcm-avx512.pl
@@ -0,0 +1,4975 @@
# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
# (https://github.com/intel/intel-ipsec-mb).
# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
#
# References:
# [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
#     Intel Architecture Processors. August, 2010.
# [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
#     Intel Architecture Processors. October, 2012.
# [3] Shay Gueron et. al. Intel Carry-Less Multiplication Instruction and its
#     Usage for Computing the GCM Mode. May, 2010.
#
#
# December 2021
#
# Initial release.
#
# GCM128_CONTEXT structure has storage for 16 hkeys only, but this
# implementation can use up to 48. To avoid extending the context size,
# precompute and store in the context first 16 hkeys only, and compute the rest
# on demand keeping them in the local frame.
#
#======================================================================
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$avx512vaes = 0;

# ; Locate the perlasm translator next to this script, or in crypto/perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
  or die "can't locate x86_64-xlate.pl";

# ; Probe the toolchain for VAES/VPCLMULQDQ support. NOTE(review): the
# ; captured version strings are compared numerically ("2.30" == 2.3),
# ; which matches the convention of the other perlasm scripts.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  =~ /GNU assembler version ([2-9]\.[0-9]+)/)
{
  $avx512vaes = ($1 >= 2.30);
}

if (!$avx512vaes
  && $win64
  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
{
  # ; VAES supported since NASM 2.13.03 or 2.14
  $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
}

if (!$avx512vaes
  && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/)
{
  $avx512vaes = ($2 >= 7.0);
}

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  or die "can't call $xlate: $!";
*STDOUT = *OUT;

#======================================================================
if ($avx512vaes > 0) {    #<<<

# ; Runtime CPUID check: all of AVX512F/DQ/BW/VL + VAES + VPCLMULQDQ must
# ; be present (bits taken from OPENSSL_ia32cap_P words 2/3, i.e. CPUID.7).
$code .= <<___;
.extern OPENSSL_ia32cap_P
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
.align 32
ossl_vaes_vpclmulqdq_capable:
        mov OPENSSL_ia32cap_P+8(%rip), %rcx
        # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
        mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
        xor %eax,%eax
        and %rdx,%rcx
        cmp %rdx,%rcx
        cmove %rcx,%rax
        ret
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
___

# ; Mapping key length -> AES rounds count
my %aes_rounds = (
  128 => 9,
  192 => 11,
  256 => 13);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Code generation control switches
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; ABI-aware zeroing of volatile registers in EPILOG().
# ; Disabled due to performance reasons.
my $CLEAR_SCRATCH_REGISTERS = 0;

# ; Zero HKeys storage from the stack if they are stored there
my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;

# ; Enable / disable check of function arguments for null pointer
# ; Currently disabled, as this check is handled outside.
my $CHECK_FUNCTION_ARGUMENTS = 0;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Global constants
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# AES block size in bytes
my $AES_BLOCK_SIZE = 16;

# Storage capacity in elements
my $HKEYS_STORAGE_CAPACITY = 48;
my $LOCAL_STORAGE_CAPACITY = 48;
my $HKEYS_CONTEXT_CAPACITY = 16;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Stack frame definition
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
# (2) -> +8-byte space for 16-byte alignment of XMM storage
# (3) -> Frame pointer (%RBP)
# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
# (7) -> +768-byte HKEYS storage
# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary

my $GP_STORAGE  = $win64 ? 8 * 8 : 8 * 6;     # ; space for saved non-volatile GP registers (pushed on stack)
my $XMM_STORAGE = $win64 ? (10 * 16) : 0;     # ; space for saved XMM registers
my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for HKeys^i, i=1..48
my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for up to 48 AES blocks

my $STACK_HKEYS_OFFSET = 0;
my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Function arguments abstraction
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);

# ; This implementation follows the convention: for non-leaf functions (they
# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from
# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This
# ; helps to facilitate SEH handlers writing.
#
# ; Leaf functions here do not use more than 4 input arguments.
if ($win64) {
  $arg1  = "%rcx";
  $arg2  = "%rdx";
  $arg3  = "%r8";
  $arg4  = "%r9";
  $arg5  = "`$GP_STORAGE + 8 + 8*5`(%rbp)";    # +8 - alignment bytes
  $arg6  = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
  $arg7  = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
  $arg8  = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
  $arg9  = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
} else {
  $arg1  = "%rdi";
  $arg2  = "%rsi";
  $arg3  = "%rdx";
  $arg4  = "%rcx";
  $arg5  = "%r8";
  $arg6  = "%r9";
  $arg7  = "`$GP_STORAGE + 8*1`(%rbp)";
  $arg8  = "`$GP_STORAGE + 8*2`(%rbp)";
  $arg9  = "`$GP_STORAGE + 8*3`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
}

# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
my $CTX_OFFSET_CurCount  = (16 * 0);          # ; (Yi) Current counter for generation of encryption key
my $CTX_OFFSET_PEncBlock = (16 * 1);          # ; (repurposed EKi field) Partial block buffer
my $CTX_OFFSET_EK0       = (16 * 2);          # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
my $CTX_OFFSET_AadLen    = (16 * 3);          # ; (len.u[0]) Length of Hash which has been input
my $CTX_OFFSET_InLen     = ((16 * 3) + 8);    # ; (len.u[1]) Length of input data which will be encrypted or decrypted
my $CTX_OFFSET_AadHash   = (16 * 4);          # ; (Xi) Current hash
my $CTX_OFFSET_HTable    = (16 * 6);          # ; (Htable) Precomputed table (allows 16 values)

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Helper functions
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; Generates "random" local labels
sub random_string() {
  my @chars  = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
  my $length = 15;
  my $str;
  # (review) draw from the whole alphabet; the previous rand(33) limited the
  # pool to the first 33 characters for no apparent reason.  Labels are only
  # used for uniqueness, so widening the pool is behaviour-compatible.
  $str .= $chars[rand(scalar @chars)] for 1 .. $length;
  return $str;
}

# ; Maps a 64-bit GP register name to its 8-bit (low byte) alias.
sub BYTE {
  my ($reg) = @_;
  if ($reg =~ /%r[abcd]x/i) {
    $reg =~ s/%r([abcd])x/%${1}l/i;
  } elsif ($reg =~ /%r[sdb][ip]/i) {
    $reg =~ s/%r([sdb][ip])/%${1}l/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;
  } else {
    die "BYTE: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps a 64-bit GP register name to its 16-bit alias.
sub WORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
  } else {
    die "WORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps a 64-bit GP register name to its 32-bit alias.
sub DWORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;
  } else {
    die "DWORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps an xmm/ymm/zmm register name to its 128-bit (xmm) form.
sub XWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%xmm/i;
  } else {
    die "XWORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps an xmm/ymm/zmm register name to its 256-bit (ymm) form.
sub YWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%ymm/i;
  } else {
    die "YWORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps an xmm/ymm/zmm register name to its 512-bit (zmm) form.
sub ZWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%zmm/i;
  } else {
    die "ZWORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Helper function to construct effective address based on two kinds of
# ; offsets: numerical or located in the register
sub EffectiveAddress {
  my ($base, $offset, $displacement) = @_;
  $displacement = 0 if (!$displacement);

  if ($offset =~ /^\d+\z/) {    # numerical offset
    return "`$offset + $displacement`($base)";
  } else {                      # offset resides in register
    return "$displacement($base,$offset,1)";
  }
}

# ; Provides memory location of corresponding HashKey power
sub HashKeyByIdx {
  my ($idx, $base) = @_;
  my $base_str = ($base eq "%rsp") ? "frame" : "context";

  my $offset = &HashKeyOffsetByIdx($idx, $base_str);
  return "$offset($base)";
}

# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
sub HashKeyOffsetByIdx {
  my ($idx, $base) = @_;
  die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
    if (($base ne "frame") && ($base ne "context"));

  my $offset_base;
  my $offset_idx;
  if ($base eq "frame") {    # frame storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
    $offset_base = $STACK_HKEYS_OFFSET;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx));
  } else {                   # context storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
    $offset_base = $CTX_OFFSET_HTable;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx));
  }
  return $offset_base + $offset_idx;
}

# ; Creates local frame and does back up of non-volatile registers.
# ; Holds stack unwinding directives.
sub PROLOG {
  my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;

  my $DYNAMIC_STACK_ALLOC_SIZE            = 0;
  my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;

  if ($need_hkeys_stack_storage) {
    $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
  }

  if ($need_aes_stack_storage) {
    if (!$need_hkeys_stack_storage) {
      die "PROLOG: unsupported case - aes storage without hkeys one";
    }
    $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
  }

  $code .= <<___;
        push %rbx
.cfi_push %rbx
.L${func_name}_seh_push_rbx:
        push %rbp
.cfi_push %rbp
.L${func_name}_seh_push_rbp:
        push %r12
.cfi_push %r12
.L${func_name}_seh_push_r12:
        push %r13
.cfi_push %r13
.L${func_name}_seh_push_r13:
        push %r14
.cfi_push %r14
.L${func_name}_seh_push_r14:
        push %r15
.cfi_push %r15
.L${func_name}_seh_push_r15:
___

  if ($win64) {
    $code .= <<___;
        push %rdi
.L${func_name}_seh_push_rdi:
        push %rsi
.L${func_name}_seh_push_rsi:

        sub \$`$XMM_STORAGE+8`,%rsp    # +8 alignment
.L${func_name}_seh_allocstack_xmm:
___
  }
  $code .= <<___;
        # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
        # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
        # ; handlers. The requirement for a frame pointer is that its offset from
        # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer
        # ; itself seems to be reasonable to use here, because later we do 64-byte stack
        # ; alignment which gives us non-determinate offsets and complicates writing
        # ; SEH handlers.
        #
        # ; It also serves as an anchor for retrieving stack arguments on both Linux
        # ; and Windows.
        lea `$XMM_STORAGE`(%rsp),%rbp
.cfi_def_cfa_register %rbp
.L${func_name}_seh_setfp:
___
  if ($win64) {

    # ; xmm6:xmm15 need to be preserved on Windows
    foreach my $reg_idx (6 .. 15) {
      my $xmm_reg_offset = ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
.L${func_name}_seh_save_xmm${reg_idx}:
___
    }
  }

  $code .= <<___;
# Prolog ends here. Next stack allocation is treated as "dynamic".
.L${func_name}_seh_prolog_end:
___

  if ($DYNAMIC_STACK_ALLOC_SIZE) {
    $code .= <<___;
        sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
        and \$(-64),%rsp
___
  }
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Restore register content for the caller.
# ;;; And cleanup stack.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub EPILOG {
  my ($hkeys_storage_on_stack, $payload_len) = @_;

  my $rndsuffix = &random_string();

  if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {

    # ; There is no need in hkeys cleanup if payload len was small, i.e. no hkeys
    # ; were stored in the local frame storage
    $code .= <<___;
        cmpq \$`16*16`,$payload_len
        jbe .Lskip_hkeys_cleanup_${rndsuffix}
        vpxor %xmm0,%xmm0,%xmm0
___
    for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
      $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
    }
    $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n";
  }

  if ($CLEAR_SCRATCH_REGISTERS) {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
  } else {
    $code .= "vzeroupper\n";
  }

  if ($win64) {

    # ; restore xmm15:xmm6
    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
      my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;

      # (review) removed a stray trailing comma after the destination register;
      # it was emitted verbatim into the generated assembly and would not
      # assemble.
      $code .= <<___;
        vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}
___
    }
  }

  if ($win64) {

    # Forming valid epilog for SEH with use of frame pointer.
    # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
    $code .= "lea 8(%rbp),%rsp\n";
  } else {
    $code .= "lea (%rbp),%rsp\n";
    $code .= ".cfi_def_cfa_register %rsp\n";
  }

  if ($win64) {
    $code .= <<___;
        pop %rsi
.cfi_pop %rsi
        pop %rdi
.cfi_pop %rdi
___
  }
  $code .= <<___;
        pop %r15
.cfi_pop %r15
        pop %r14
.cfi_pop %r14
        pop %r13
.cfi_pop %r13
        pop %r12
.cfi_pop %r12
        pop %rbp
.cfi_pop %rbp
        pop %rbx
.cfi_pop %rbx
___
}

# ; Clears all scratch ZMM registers
# ;
# ; It should be called before restoring the XMM registers
# ; for Windows (XMM6-XMM15).
# ;
sub clear_scratch_zmms_asm {

  # ; On Linux, all ZMM registers are scratch registers
  if (!$win64) {
    $code .= "vzeroall\n";
  } else {
    foreach my $i (0 .. 5) {
      $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
    }
  }
  foreach my $i (16 .. 31) {
    $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  }
}

# Clears all scratch GP registers
sub clear_scratch_gps_asm {
  foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") {
    $code .= "xor $reg,$reg\n";
  }
  if (!$win64) {
    foreach my $reg ("%rsi", "%rdi") {
      $code .= "xor $reg,$reg\n";
    }
  }
}

# ; Precomputes the requested range of hashkey powers into the stack frame,
# ; seeding from the 16 powers stored in the context (see file header note).
# ; Skips all the work at runtime if $HKEYS_READY is non-zero.
sub precompute_hkeys_on_stack {
  my $GCM128_CTX  = $_[0];
  my $HKEYS_READY = $_[1];
  my $ZTMP0       = $_[2];
  my $ZTMP1       = $_[3];
  my $ZTMP2       = $_[4];
  my $ZTMP3       = $_[5];
  my $ZTMP4       = $_[6];
  my $ZTMP5       = $_[7];
  my $ZTMP6       = $_[8];
  my $HKEYS_RANGE = $_[9];    # ; "first16", "mid16", "all", "first32", "last32"

  die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
    if ($HKEYS_RANGE ne "first16"
    && $HKEYS_RANGE ne "mid16"
    && $HKEYS_RANGE ne "all"
    && $HKEYS_RANGE ne "first32"
    && $HKEYS_RANGE ne "last32");

  my $rndsuffix = &random_string();

  $code .= <<___;
        test $HKEYS_READY,$HKEYS_READY
        jnz .L_skip_hkeys_precomputation_${rndsuffix}
___

  if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {

    # ; Fill the stack with the first 16 hkeys from the context
    $code .= <<___;
        # ; Move 16 hkeys from the context to stack
        vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
        vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
        vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
        vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
        vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
___
  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
        vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
___

  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=17..32
    my $i = 20;
    foreach (1 .. int((32 - 16) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
    my $i = 36;
    foreach (1 .. int((48 - 32) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n";
}

# ;; =============================================================================
# ;; Generic macro to produce code that executes $OPCODE instruction
# ;; on selected number of AES blocks (16 bytes long ) between 0 and 16.
# ;; All three operands of the instruction come from registers.
# ;; Note: if 3 blocks are left at the end instruction is produced to operate all
# ;; 4 blocks (full width of ZMM)
sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
  my $NUM_BLOCKS = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OPCODE     = $_[1];    # [in] instruction name
  my @DST;
  $DST[0] = $_[2];           # [out] destination ZMM register
  $DST[1] = $_[3];           # [out] destination ZMM register
  $DST[2] = $_[4];           # [out] destination ZMM register
  $DST[3] = $_[5];           # [out] destination ZMM register
  my @SRC1;
  $SRC1[0] = $_[6];          # [in] source 1 ZMM register
  $SRC1[1] = $_[7];          # [in] source 1 ZMM register
  $SRC1[2] = $_[8];          # [in] source 1 ZMM register
  $SRC1[3] = $_[9];          # [in] source 1 ZMM register
  my @SRC2;
  $SRC2[0] = $_[10];         # [in] source 2 ZMM register
  $SRC2[1] = $_[11];         # [in] source 2 ZMM register
  $SRC2[2] = $_[12];         # [in] source 2 ZMM register
  $SRC2[3] = $_[13];         # [in] source 2 ZMM register

  die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $reg_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  # ; full 4-block (one ZMM) groups first
  foreach (1 .. ($NUM_BLOCKS / 4)) {
    $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
    $reg_idx++;
    $blocks_left -= 4;
  }

  my $DSTREG  = $DST[$reg_idx];
  my $SRC1REG = $SRC1[$reg_idx];
  my $SRC2REG = $SRC2[$reg_idx];

  # ; trailing 1/2 blocks use xmm/ymm width; 3 blocks use the full zmm (see note)
  if ($blocks_left == 1) {
    $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 2) {
    $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 3) {
    $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
  }
}

# ;; =============================================================================
# ;; Loads specified number of AES blocks into ZMM registers using mask register
# ;; for the last loaded register (xmm, ymm or zmm).
# ;; Loads take place at 1 byte granularity.
sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $INP         = $_[1];    # [in] input data pointer to read from
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @DST;
  $DST[0] = $_[3];            # [out] ZMM register with loaded data
  $DST[1] = $_[4];            # [out] ZMM register with loaded data
  $DST[2] = $_[5];            # [out] ZMM register with loaded data
  $DST[3] = $_[6];            # [out] ZMM register with loaded data
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $src_offset  = 0;
  my $dst_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
      $src_offset += 64;
      $dst_idx++;
      $blocks_left -= 4;
    }
  }

  my $DSTREG = $DST[$dst_idx];

  # ; last (possibly partial) group: masked + zeroing load at matching width
  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif (($blocks_left == 3 || $blocks_left == 4)) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
  }
}

# ;; =============================================================================
# ;; Stores specified number of AES blocks from ZMM registers with mask register
# ;; for the last loaded register (xmm, ymm or zmm).
# ;; Stores take place at 1 byte granularity.
sub ZMM_STORE_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OUTP        = $_[1];    # [in] output data pointer to write to
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @SRC;
  $SRC[0] = $_[3];            # [in] ZMM register with data to store
  $SRC[1] = $_[4];            # [in] ZMM register with data to store
  $SRC[2] = $_[5];            # [in] ZMM register with data to store
  $SRC[3] = $_[6];            # [in] ZMM register with data to store
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $dst_offset  = 0;
  my $src_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
      $dst_offset += 64;
      $src_idx++;
      $blocks_left -= 4;
    }
  }

  my $SRCREG = $SRC[$src_idx];

  # ; last (possibly partial) group: masked store at matching width
  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 3 || $blocks_left == 4) {
    $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  }
}

# ;;; ===========================================================================
# ;;; Handles AES encryption rounds
# ;;; It handles special cases: the last and first rounds
# ;;; Optionally, it performs XOR with data after the last AES round.
# ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
# ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
  my $L0B0_3   = $_[0];     # [in/out] zmm; blocks 0 to 3
  my $L0B4_7   = $_[1];     # [in/out] zmm; blocks 4 to 7
  my $L0B8_11  = $_[2];     # [in/out] zmm; blocks 8 to 11
  my $L0B12_15 = $_[3];     # [in/out] zmm; blocks 12 to 15
  my $KEY      = $_[4];     # [in] zmm containing round key
  my $ROUND    = $_[5];     # [in] round number
  my $D0_3     = $_[6];     # [in] zmm or no_data; plain/cipher text blocks 0-3
  my $D4_7     = $_[7];     # [in] zmm or no_data; plain/cipher text blocks 4-7
  my $D8_11    = $_[8];     # [in] zmm or no_data; plain/cipher text blocks 8-11
  my $D12_15   = $_[9];     # [in] zmm or no_data; plain/cipher text blocks 12-15
  my $NUMBL    = $_[10];    # [in] number of blocks; numerical value
  my $NROUNDS  = $_[11];    # [in] number of rounds; numerical value

  # ;;; === first AES round
  if ($ROUND < 1) {

    # ;; round 0 - XOR in the whitening key
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  }

  # ;;; === middle AES rounds
  if ($ROUND >= 1 && $ROUND <= $NROUNDS) {

    # ;; rounds 1 to 9/11/13
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  }

  # ;;; === last AES round
  if ($ROUND > $NROUNDS) {

    # ;; the last round - mix enclast with text xor's
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);

    # ;;; === XOR with data
    if ( ($D0_3 ne "no_data")
      && ($D4_7 ne "no_data")
      && ($D8_11 ne "no_data")
      && ($D12_15 ne "no_data"))
    {
      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
        $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
    }
  }
}

# ;;; Horizontal XOR - 4 x 128bits xored together
sub VHPXORI4x128 {
  my $REG = $_[0];    # [in/out] ZMM with 4x128bits to xor; 128bit output
  my $TMP = $_[1];    # [clobbered] ZMM temporary register
  $code .= <<___;
        vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
        vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
        vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
        vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
___
}

# ;;; AVX512 reduction macro
sub VCLMUL_REDUCE {
  my $OUT   = $_[0];    # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
  my $POLY  = $_[1];    # [in] zmm/ymm/xmm: polynomial
  my $HI128 = $_[2];    # [in] zmm/ymm/xmm: high 128b of hash to reduce
  my $LO128 = $_[3];    # [in] zmm/ymm/xmm: low 128b of hash to reduce
  my $TMP0  = $_[4];    # [in] zmm/ymm/xmm: temporary register
  my $TMP1  = $_[5];    # [in] zmm/ymm/xmm: temporary register

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; first phase of the reduction
        vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
        vpslldq \$8,$TMP0,$TMP0         # ; shift-L 2 DWs
        vpxorq $TMP0,$LO128,$TMP0       # ; first phase of the reduction complete
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; second phase of the reduction
        vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
        vpsrldq \$4,$TMP1,$TMP1         # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
        vpslldq \$4,$OUT,$OUT           # ; shift-L 1-DW to obtain result with no shifts
        vpternlogq \$0x96,$HI128,$TMP1,$OUT    # ; OUT/GHASH = OUT xor TMP1 xor HI128
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}

# ;; ===========================================================================
# ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
# ;; - it is assumed that data read from $INPTR is already shuffled and
# ;;   $INPTR address is 64 byte aligned
# ;; - there is an option to pass ready blocks through ZMM registers too.
+# ;; 4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty +sub GHASH_16 { + my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction), + # end_reduce (end with reduction), start_reduce + my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits + my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits + my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits + my $INPTR = $_[4]; # [in] data input pointer + my $INOFF = $_[5]; # [in] data input offset + my $INDIS = $_[6]; # [in] data input displacement + my $HKPTR = $_[7]; # [in] hash key pointer + my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset) + my $HKDIS = $_[9]; # [in] hash key displacement + my $HASH = $_[10]; # [in/out] ZMM hash value in/out + my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM + my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM + my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM + my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM + my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM + my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM + my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM + my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM + my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM + my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided + my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) + my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) + my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) + my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) + + my $start_ghash = 0; + my $do_reduction = 0; + if ($TYPE eq "start") { + $start_ghash = 1; + } + + if ($TYPE eq "start_reduce") { + $start_ghash = 1; + $do_reduction = 1; + } + + if ($TYPE eq "end_reduce") { + $do_reduction = 1; + } + + # ;; ghash 
blocks 0-3 + if (scalar(@_) == 21) { + $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n"; + } else { + $ZTMP9 = $DAT0; + } + + if ($start_ghash != 0) { + $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n"; + } + $code .= <<___; + vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8 + vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1 + vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0 + vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0 + vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1 +___ + + # ;; ghash blocks 4-7 + if (scalar(@_) == 21) { + $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n"; + } else { + $ZTMP9 = $DAT1; + } + $code .= <<___; + vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8 + vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1 + vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0 + vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0 + vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1 +___ + + # ;; update sums + if ($start_ghash != 0) { + $code .= <<___; + vpxorq $ZTMP6,$ZTMP2,$GM # ; GM = T0M1 + T1M1 + vpxorq $ZTMP4,$ZTMP0,$GH # ; GH = T0H + T1H + vpxorq $ZTMP5,$ZTMP1,$GL # ; GL = T0L + T1L + vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM = T0M2 + T1M1 +___ + } else { # ;; mid, end, end_reduce + $code .= <<___; + vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1 + vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H + vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L + vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1 +___ + } + + # ;; ghash blocks 8-11 + if (scalar(@_) == 21) { + $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n"; + } else { + $ZTMP9 = $DAT2; + } + $code .= <<___; + vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8 + vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1 + vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0 + vpclmulqdq 
\$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0 + vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1 +___ + + # ;; ghash blocks 12-15 + if (scalar(@_) == 21) { + $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n"; + } else { + $ZTMP9 = $DAT3; + } + $code .= <<___; + vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8 + vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1 + vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0 + vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0 + vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1 + # ;; update sums + vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1 + vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H + vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L + vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1 +___ + if ($do_reduction != 0) { + $cod |