summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CHANGES.md4
-rw-r--r--crypto/modes/asm/aes-gcm-avx512.pl4975
-rw-r--r--crypto/modes/build.info3
-rw-r--r--include/crypto/modes.h4
-rw-r--r--providers/implementations/ciphers/cipher_aes_gcm_hw_aesni.inc13
-rw-r--r--providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc205
6 files changed, 5199 insertions, 5 deletions
diff --git a/CHANGES.md b/CHANGES.md
index 212532bce2..05f96dd0a1 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -50,6 +50,10 @@ OpenSSL 3.1
*Felipe Gasper*
+ * AES-GCM enabled with AVX512 vAES and vPCLMULQDQ.
+
+ *Tomasz Kantecki, Andrey Matyukov*
+
* The default SSL/TLS security level has been changed from 1 to 2. RSA,
DSA and DH keys of 1024 bits and above and less than 2048 bits and ECC keys
of 160 bits and above and less than 224 bits were previously accepted by
diff --git a/crypto/modes/asm/aes-gcm-avx512.pl b/crypto/modes/asm/aes-gcm-avx512.pl
new file mode 100644
index 0000000000..1c7ee8769a
--- /dev/null
+++ b/crypto/modes/asm/aes-gcm-avx512.pl
@@ -0,0 +1,4975 @@
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
+# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
+# (https://github.com/intel/intel-ipsec-mb).
+# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
+#
+# References:
+# [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
+# Intel Architecture Processors. August, 2010.
+# [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
+# Intel Architecture Processors. October, 2012.
+# [3] Shay Gueron et. al. Intel Carry-Less Multiplication Instruction and its
+# Usage for Computing the GCM Mode. May, 2010.
+#
+#
+# December 2021
+#
+# Initial release.
+#
+# The GCM128_CONTEXT structure has storage for only 16 hkeys, while this
+# implementation can use up to 48 of them. To avoid extending the context
+# size, only the first 16 hkeys are precomputed and stored in the context;
+# the remaining ones are computed on demand and kept in the local stack frame.
+#
+#======================================================================
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+# Windows ABI is selected by the asm flavour (nasm/masm/mingw64) or by an
+# explicit .asm output name. NOTE(review): $flavour/$output may be undef here;
+# warnings are not enabled, so the undef matches are silent.
+$win64 = 0;
+$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Stays 0 (stub-only output) unless a capable assembler is detected below.
+$avx512vaes = 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+$dir = $1;
+($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
+ or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
+ or die "can't locate x86_64-xlate.pl";
+
+# GNU as >= 2.30 can encode the required VAES/VPCLMULQDQ EVEX instructions.
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx512vaes = ($1 >= 2.30);
+}
+
+# nasm >= 2.13.3 (or any >= 2.14) on Windows.
+if (!$avx512vaes
+ && $win64
+ && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
+ && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
+{
+ $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
+}
+
+# clang/LLVM >= 7.0 integrated assembler.
+if (!$avx512vaes && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+ $avx512vaes = ($2 >= 7.0);
+}
+
+# Pipe all generated code through the perlasm translator.
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT = *OUT;
+
+#======================================================================
+if ($avx512vaes>0) { #<<<
+
+# Runtime capability probe: returns non-zero in %rax only when all six
+# required feature bits listed below are set in OPENSSL_ia32cap_P words 2-3
+# (CPUID leaf 7 EBX in the low dword, ECX in the high dword -- TODO confirm
+# against crypto/ia32cap layout when changing).
+$code .= <<___;
+.extern OPENSSL_ia32cap_P
+.globl ossl_vaes_vpclmulqdq_capable
+.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
+.align 32
+ossl_vaes_vpclmulqdq_capable:
+ mov OPENSSL_ia32cap_P+8(%rip), %rcx
+ # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
+ mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
+ xor %eax,%eax
+ and %rdx,%rcx
+ cmp %rdx,%rcx
+ cmove %rcx,%rax
+ ret
+.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
+___
+
+# ; Mapping key length -> AES rounds count
+my %aes_rounds = (
+ 128 => 9,
+ 192 => 11,
+ 256 => 13);
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Code generation control switches
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+# ; ABI-aware zeroing of volatile registers in EPILOG().
+# ; Disabled due to performance reasons.
+my $CLEAR_SCRATCH_REGISTERS = 0;
+
+# ; Zero HKeys storage from the stack if they are stored there
+my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;
+
+# ; Enable / disable check of function arguments for null pointer
+# ; Currently disabled, as this check is handled outside.
+my $CHECK_FUNCTION_ARGUMENTS = 0;
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Global constants
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+# AES block size in bytes
+my $AES_BLOCK_SIZE = 16;
+
+# Storage capacity in elements
+my $HKEYS_STORAGE_CAPACITY = 48;
+my $LOCAL_STORAGE_CAPACITY = 48;
+my $HKEYS_CONTEXT_CAPACITY = 16;
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Stack frame definition
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
+# (2) -> +8-byte space for 16-byte alignment of XMM storage
+# (3) -> Frame pointer (%RBP)
+# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
+# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
+# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
+# (7) -> +768-byte HKEYS storage
+# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary
+
+# ; Win64 additionally preserves %rdi/%rsi (8 pushed regs vs 6 on Linux)
+my $GP_STORAGE = $win64 ? 8 * 8 : 8 * 6; # ; space for saved non-volatile GP registers (pushed on stack)
+my $XMM_STORAGE = $win64 ? (10 * 16) : 0; # ; space for saved XMM registers
+my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for HKeys^i, i=1..48
+my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for up to 48 AES blocks
+
+# ; HKeys sit at the bottom of the dynamic area, LOCAL storage right above them
+my $STACK_HKEYS_OFFSET = 0;
+my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Function arguments abstraction
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);
+
+# ; This implementation follows the convention: for non-leaf functions (they
+# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from
+# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This
+# ; helps to facilitate SEH handlers writing.
+#
+# ; Leaf functions here do not use more than 4 input arguments.
+if ($win64) {
+ $arg1 = "%rcx";
+ $arg2 = "%rdx";
+ $arg3 = "%r8";
+ $arg4 = "%r9";
+ # ; stack args: %rbp + GP_STORAGE + 8 = entry %rsp; +8*5 skips the return
+ # ; address and the 32-byte shadow (home) space mandated by the Win64 ABI
+ $arg5 = "`$GP_STORAGE + 8 + 8*5`(%rbp)"; # +8 - alignment bytes
+ $arg6 = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
+ $arg7 = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
+ $arg8 = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
+ $arg9 = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
+ $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
+ $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
+} else {
+ $arg1 = "%rdi";
+ $arg2 = "%rsi";
+ $arg3 = "%rdx";
+ $arg4 = "%rcx";
+ $arg5 = "%r8";
+ $arg6 = "%r9";
+ # ; 7th and later args arrive on the stack (SysV AMD64 ABI)
+ $arg7 = "`$GP_STORAGE + 8*1`(%rbp)";
+ $arg8 = "`$GP_STORAGE + 8*2`(%rbp)";
+ $arg9 = "`$GP_STORAGE + 8*3`(%rbp)";
+ $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
+ $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
+}
+
+# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
+# ; NOTE(review): keep these in sync with the C struct layout when changing.
+my $CTX_OFFSET_CurCount = (16 * 0); # ; (Yi) Current counter for generation of encryption key
+my $CTX_OFFSET_PEncBlock = (16 * 1); # ; (repurposed EKi field) Partial block buffer
+my $CTX_OFFSET_EK0 = (16 * 2); # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
+my $CTX_OFFSET_AadLen = (16 * 3); # ; (len.u[0]) Length of Hash which has been input
+my $CTX_OFFSET_InLen = ((16 * 3) + 8); # ; (len.u[1]) Length of input data which will be encrypted or decrypted
+my $CTX_OFFSET_AadHash = (16 * 4); # ; (Xi) Current hash
+my $CTX_OFFSET_HTable = (16 * 6); # ; (Htable) Precomputed table (allows 16 values)
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Helper functions
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+# ; Generates "random" local labels. Uniqueness per macro expansion is what
+# ; matters here, not unpredictability.
+sub random_string() {
+ my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
+ my $length = 15;
+ my $str;
+ # ; index over the whole character set: the previous bound of rand(33)
+ # ; silently ignored every character past the first 33 of the 63 available
+ $str .= $chars[rand(scalar @chars)] for (1 .. $length);
+ return $str;
+}
+
+# ; Maps a 64-bit GP register name to its 8-bit (low byte) form,
+# ; e.g. %rax -> %al, %rsi -> %sil, %r12 -> %r12b. Dies on anything else.
+sub BYTE {
+ my ($r) = @_;
+ return $r if $r =~ s/%r([abcd])x/%${1}l/i; # legacy a/b/c/d registers
+ return $r if $r =~ s/%r([sdb][ip])/%${1}l/i; # rsi/rdi/rbp/rsp family
+ return $r if $r =~ s/%(r[0-9]{1,2})/%${1}b/i; # numbered r8..r15
+ die "BYTE: unknown register: $r\n";
+}
+
+# ; Maps a 64-bit GP register name to its 16-bit form,
+# ; e.g. %rax -> %ax, %r9 -> %r9w. Dies on anything else.
+sub WORD {
+ my ($r) = @_;
+ if ($r =~ /%r[abcdsdb][xip]/i) {
+ $r =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
+ return $r;
+ }
+ if ($r =~ /%r[0-9]{1,2}/) { # numbered r8..r15 (case-sensitive, as before)
+ $r =~ s/%(r[0-9]{1,2})/%${1}w/i;
+ return $r;
+ }
+ die "WORD: unknown register: $r\n";
+}
+
+# ; Maps a 64-bit GP register name to its 32-bit form,
+# ; e.g. %rax -> %eax, %r10 -> %r10d. Dies on anything else.
+sub DWORD {
+ my ($r) = @_;
+ return $r if $r =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i; # legacy registers
+ return $r if $r =~ s/%(r[0-9]{1,2})/%${1}d/i; # numbered r8..r15
+ die "DWORD: unknown register: $r\n";
+}
+
+# ; Narrows any xmm/ymm/zmm register name to its 128-bit (xmm) form.
+sub XWORD {
+ my ($r) = @_;
+ die "XWORD: unknown register: $r\n" unless $r =~ /%[xyz]mm/i;
+ $r =~ s/%[xyz]mm/%xmm/i;
+ return $r;
+}
+
+# ; Converts any xmm/ymm/zmm register name to its 256-bit (ymm) form.
+sub YWORD {
+ my ($r) = @_;
+ die "YWORD: unknown register: $r\n" unless $r =~ /%[xyz]mm/i;
+ $r =~ s/%[xyz]mm/%ymm/i;
+ return $r;
+}
+
+# ; Widens any xmm/ymm/zmm register name to its 512-bit (zmm) form.
+sub ZWORD {
+ my ($r) = @_;
+ die "ZWORD: unknown register: $r\n" unless $r =~ /%[xyz]mm/i;
+ $r =~ s/%[xyz]mm/%zmm/i;
+ return $r;
+}
+
+# ; Builds an effective-address string for the generated assembly.
+# ; A purely numerical $offset is folded into the displacement at xlate time
+# ; (backtick-eval); an $offset held in a register becomes a base+index form.
+sub EffectiveAddress {
+ my ($base, $offset, $displacement) = @_;
+ $displacement ||= 0; # any false value (undef/0/"") means no displacement
+ return "`$offset + $displacement`($base)" if $offset =~ /^\d+\z/;
+ return "$displacement($base,$offset,1)";
+}
+
+# ; Provides the memory operand of the given HashKey power. The local frame
+# ; is used when $base is %rsp, the GCM context structure otherwise.
+sub HashKeyByIdx {
+ my ($idx, $base) = @_;
+ my $storage = ($base eq "%rsp") ? "frame" : "context";
+ return &HashKeyOffsetByIdx($idx, $storage) . "($base)";
+}
+
+# ; Offset (in bytes) of the given HashKey power from the storage base.
+# ; Keys are laid out from the highest power downwards, so power 1 sits last.
+sub HashKeyOffsetByIdx {
+ my ($idx, $base) = @_;
+ die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
+ if (($base ne "frame") && ($base ne "context"));
+
+ if ($base eq "frame") { # frame storage: up to 48 keys on the stack
+ die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n"
+ if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
+ return $STACK_HKEYS_OFFSET + $AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx);
+ }
+
+ # context storage: only 16 keys fit into the Htable field
+ die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n"
+ if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
+ return $CTX_OFFSET_HTable + $AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx);
+}
+
+# ; Creates local frame and does back up of non-volatile registers.
+# ; Holds stack unwinding directives.
+sub PROLOG {
+ my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;
+ # ; $need_hkeys_stack_storage - [in] allocate the HKeys area in the frame
+ # ; $need_aes_stack_storage - [in] additionally allocate the AES-blocks
+ # ; area (valid only together with the HKeys area)
+ # ; $func_name - [in] prefix for the SEH/unwind marker labels
+
+ my $DYNAMIC_STACK_ALLOC_SIZE = 0;
+ # ; extra bytes so that "and $-64" below can always reach 64-byte alignment
+ my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;
+
+ if ($need_hkeys_stack_storage) {
+ $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
+ }
+
+ if ($need_aes_stack_storage) {
+ if (!$need_hkeys_stack_storage) {
+ die "PROLOG: unsupported case - aes storage without hkeys one";
+ }
+ $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
+ }
+
+ # ; save all non-volatile GP registers; every push carries both a CFI
+ # ; directive and an SEH marker label
+ $code .= <<___;
+ push %rbx
+.cfi_push %rbx
+.L${func_name}_seh_push_rbx:
+ push %rbp
+.cfi_push %rbp
+.L${func_name}_seh_push_rbp:
+ push %r12
+.cfi_push %r12
+.L${func_name}_seh_push_r12:
+ push %r13
+.cfi_push %r13
+.L${func_name}_seh_push_r13:
+ push %r14
+.cfi_push %r14
+.L${func_name}_seh_push_r14:
+ push %r15
+.cfi_push %r15
+.L${func_name}_seh_push_r15:
+___
+
+ if ($win64) {
+ # ; %rdi/%rsi are callee-saved on Windows only
+ $code .= <<___;
+ push %rdi
+.L${func_name}_seh_push_rdi:
+ push %rsi
+.L${func_name}_seh_push_rsi:
+
+ sub \$`$XMM_STORAGE+8`,%rsp # +8 alignment
+.L${func_name}_seh_allocstack_xmm:
+___
+ }
+ $code .= <<___;
+ # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
+ # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
+ # ; handlers. The requirement for a frame pointer is that its offset from
+ # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer
+ # ; itself seems to be reasonable to use here, because later we do 64-byte stack
+ # ; alignment which gives us non-determinate offsets and complicates writing
+ # ; SEH handlers.
+ #
+ # ; It also serves as an anchor for retrieving stack arguments on both Linux
+ # ; and Windows.
+ lea `$XMM_STORAGE`(%rsp),%rbp
+.cfi_def_cfa_register %rbp
+.L${func_name}_seh_setfp:
+___
+ if ($win64) {
+
+ # ; xmm6:xmm15 need to be preserved on Windows
+ foreach my $reg_idx (6 .. 15) {
+ my $xmm_reg_offset = ($reg_idx - 6) * 16;
+ $code .= <<___;
+ vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
+.L${func_name}_seh_save_xmm${reg_idx}:
+___
+ }
+ }
+
+ $code .= <<___;
+# Prolog ends here. Next stack allocation is treated as "dynamic".
+.L${func_name}_seh_prolog_end:
+___
+
+ if ($DYNAMIC_STACK_ALLOC_SIZE) {
+ # ; allocate the HKeys (+ optional AES blocks) area and align %rsp to 64
+ $code .= <<___;
+ sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
+ and \$(-64),%rsp
+___
+ }
+}
+
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# ;;; Restore register content for the caller.
+# ;;; And cleanup stack.
+# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+sub EPILOG {
+ my ($hkeys_storage_on_stack, $payload_len) = @_;
+ # ; $hkeys_storage_on_stack - [in] non-zero if the frame holds an HKeys area
+ # ; $payload_len - [in] operand holding the processed payload length
+
+ my $rndsuffix = &random_string();
+
+ if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {
+
+ # ; There is no need in hkeys cleanup if payload len was small, i.e. no hkeys
+ # ; were stored in the local frame storage
+ $code .= <<___;
+ cmpq \$`16*16`,$payload_len
+ jbe .Lskip_hkeys_cleanup_${rndsuffix}
+ vpxor %xmm0,%xmm0,%xmm0
+___
+ # ; wipe the whole HKeys area in 64-byte strides
+ for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
+ $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
+ }
+ $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n";
+ }
+
+ if ($CLEAR_SCRATCH_REGISTERS) {
+ &clear_scratch_gps_asm();
+ &clear_scratch_zmms_asm();
+ } else {
+ $code .= "vzeroupper\n";
+ }
+
+ if ($win64) {
+
+ # ; restore xmm15:xmm6
+ for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
+ my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;
+ # ; FIX: dropped the stray trailing comma after the destination
+ # ; register; "vmovdqu mem,%xmmN," is not valid AT&T two-operand
+ # ; syntax and at best relied on the translator tolerating it.
+ $code .= <<___;
+ vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}
+___
+ }
+ }
+
+ if ($win64) {
+
+ # Forming valid epilog for SEH with use of frame pointer.
+ # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
+ $code .= "lea 8(%rbp),%rsp\n";
+ } else {
+ $code .= "lea (%rbp),%rsp\n";
+ $code .= ".cfi_def_cfa_register %rsp\n";
+ }
+
+ if ($win64) {
+ $code .= <<___;
+ pop %rsi
+.cfi_pop %rsi
+ pop %rdi
+.cfi_pop %rdi
+___
+ }
+ # ; pops mirror the PROLOG pushes in reverse order
+ $code .= <<___;
+ pop %r15
+.cfi_pop %r15
+ pop %r14
+.cfi_pop %r14
+ pop %r13
+.cfi_pop %r13
+ pop %r12
+.cfi_pop %r12
+ pop %rbp
+.cfi_pop %rbp
+ pop %rbx
+.cfi_pop %rbx
+___
+}
+
+# ; Clears all scratch ZMM registers
+# ;
+# ; It should be called before restoring the XMM registers
+# ; for Windows (XMM6-XMM15).
+sub clear_scratch_zmms_asm {
+
+ # ; On Linux every vector register is volatile, so a single vzeroall covers
+ # ; xmm0-15; on Windows xmm6-15 are callee-saved, so clear only xmm0-5.
+ if ($win64) {
+ $code .= "vpxorq %xmm${_},%xmm${_},%xmm${_}\n" foreach (0 .. 5);
+ } else {
+ $code .= "vzeroall\n";
+ }
+
+ # ; zmm16-31 are not touched by vzeroall and are volatile on both ABIs;
+ # ; an EVEX write to the xmm part zeroes the register's upper bits
+ $code .= "vpxorq %xmm${_},%xmm${_},%xmm${_}\n" foreach (16 .. 31);
+}
+
+# Clears all scratch (caller-saved) GP registers for the active ABI
+sub clear_scratch_gps_asm {
+ my @vol_gps = ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+ # ; %rsi/%rdi are volatile on SysV only; Windows treats them as callee-saved
+ push @vol_gps, ("%rsi", "%rdi") if (!$win64);
+ $code .= "xor $_,$_\n" foreach @vol_gps;
+}
+
+# ; Populates the frame's HKeys storage: copies powers 1..16 from the context
+# ; and/or derives higher powers (17..48) in place, depending on $HKEYS_RANGE.
+# ; All work is skipped when $HKEYS_READY is non-zero.
+sub precompute_hkeys_on_stack {
+ my $GCM128_CTX = $_[0];
+ my $HKEYS_READY = $_[1];
+ my $ZTMP0 = $_[2];
+ my $ZTMP1 = $_[3];
+ my $ZTMP2 = $_[4];
+ my $ZTMP3 = $_[5];
+ my $ZTMP4 = $_[6];
+ my $ZTMP5 = $_[7];
+ my $ZTMP6 = $_[8];
+ my $HKEYS_RANGE = $_[9]; # ; "first16", "mid16", "all", "first32", "last32"
+
+ die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
+ if ($HKEYS_RANGE ne "first16"
+ && $HKEYS_RANGE ne "mid16"
+ && $HKEYS_RANGE ne "all"
+ && $HKEYS_RANGE ne "first32"
+ && $HKEYS_RANGE ne "last32");
+
+ my $rndsuffix = &random_string();
+
+ $code .= <<___;
+ test $HKEYS_READY,$HKEYS_READY
+ jnz .L_skip_hkeys_precomputation_${rndsuffix}
+___
+
+ if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {
+
+ # ; Fill the stack with the first 16 hkeys from the context
+ $code .= <<___;
+ # ; Move 16 hkeys from the context to stack
+ vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
+ vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}
+
+ vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
+ vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}
+
+ # ; broadcast HashKey^8
+ vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
+
+ vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
+ vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}
+
+ vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
+ vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
+___
+ }
+
+ if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
+ # ; keys 1..16 are already on the stack: reload what the multiplies need
+ $code .= <<___;
+ vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1
+
+ # ; broadcast HashKey^8
+ vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
+
+ vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
+ vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
+___
+
+ }
+
+ if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
+
+ # ; Precompute hkeys^i, i=17..32
+ # ; GHASH_MUL is the carry-less multiply-reduce helper defined elsewhere
+ # ; in this file; each call advances a group of 4 powers by 8 (x HashKey^8)
+ my $i = 20;
+ foreach (1 .. int((32 - 16) / 8)) {
+
+ # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
+ &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
+ $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
+ $i += 4;
+
+ # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
+ &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
+ $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
+ $i += 4;
+ }
+ }
+
+ if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
+
+ # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
+ my $i = 36;
+ foreach (1 .. int((48 - 32) / 8)) {
+
+ # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
+ &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
+ $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
+ $i += 4;
+
+ # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
+ &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
+ $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
+ $i += 4;
+ }
+ }
+
+ $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n";
+}
+
+# ;; =============================================================================
+# ;; Emits $OPCODE (a 3-operand AVX512 instruction, all operands in registers)
+# ;; over 0..16 AES blocks spread across up to four ZMM registers.
+# ;; Full ZMM groups are processed first; a 1- or 2-block tail narrows the
+# ;; operands to xmm/ymm, while a 3-block tail intentionally uses the full
+# ;; ZMM width (processing a 4th block is harmless for these opcodes).
+sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
+ my $NUM_BLOCKS = shift; # [in] numerical value, number of AES blocks (0 to 16)
+ my $OPCODE = shift; # [in] instruction name
+ my @DST = (shift, shift, shift, shift); # [out] destination ZMM registers
+ my @SRC1 = (shift, shift, shift, shift); # [in] source 1 ZMM registers
+ my @SRC2 = (shift, shift, shift, shift); # [in] source 2 ZMM registers
+
+ die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
+ if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
+
+ my $idx = 0;
+ my $left = $NUM_BLOCKS;
+
+ # ; full-width groups of 4 blocks
+ while ($left >= 4) {
+ $code .= "$OPCODE $SRC2[$idx],$SRC1[$idx],$DST[$idx]\n";
+ $idx++;
+ $left -= 4;
+ }
+
+ # ; trailing partial group (0..3 blocks)
+ if ($left == 1) {
+ $code .= "$OPCODE @{[XWORD($SRC2[$idx])]},@{[XWORD($SRC1[$idx])]},@{[XWORD($DST[$idx])]}\n";
+ } elsif ($left == 2) {
+ $code .= "$OPCODE @{[YWORD($SRC2[$idx])]},@{[YWORD($SRC1[$idx])]},@{[YWORD($DST[$idx])]}\n";
+ } elsif ($left == 3) {
+ $code .= "$OPCODE $SRC2[$idx],$SRC1[$idx],$DST[$idx]\n";
+ }
+}
+
+# ;; =============================================================================
+# ;; Loads 0..16 AES blocks into up to four ZMM registers: all but the last
+# ;; register take full 64-byte loads, the last (partial) one is loaded
+# ;; through $MASK with zeroing, at 1-byte granularity.
+sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
+ my $NUM_BLOCKS = shift; # [in] numerical value, number of AES blocks (0 to 16)
+ my $INP = shift; # [in] input data pointer to read from
+ my $DATA_OFFSET = shift; # [in] offset to the input pointer (GP or numerical)
+ my @DST = (shift, shift, shift, shift); # [out] ZMM registers with loaded data
+ my $MASK = shift; # [in] mask register for the last (partial) load
+
+ die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
+ if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
+
+ my $src_offset = 0;
+ my $dst_idx = 0;
+ my $blocks_left = $NUM_BLOCKS;
+
+ # ; unmasked full-register loads for everything before the last register
+ while ($blocks_left > 4) {
+ $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
+ $src_offset += 64;
+ $dst_idx++;
+ $blocks_left -= 4;
+ }
+
+ # ; masked (zeroing) load for the trailing 1..4 blocks
+ my $DSTREG = $DST[$dst_idx];
+ if ($blocks_left == 1) {
+ $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
+ } elsif ($blocks_left == 2) {
+ $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
+ } elsif ($blocks_left == 3 || $blocks_left == 4) {
+ $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
+ }
+}
+
+# ;; =============================================================================
+# ;; Stores 0..16 AES blocks from up to four ZMM registers: all but the last
+# ;; register produce full 64-byte stores, the last (partial) one is stored
+# ;; through $MASK, at 1-byte granularity.
+sub ZMM_STORE_MASKED_BLOCKS_0_16 {
+ my $NUM_BLOCKS = shift; # [in] numerical value, number of AES blocks (0 to 16)
+ my $OUTP = shift; # [in] output data pointer to write to
+ my $DATA_OFFSET = shift; # [in] offset to the output pointer (GP or numerical)
+ my @SRC = (shift, shift, shift, shift); # [in] ZMM registers with data to store
+ my $MASK = shift; # [in] mask register for the last (partial) store
+
+ die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
+ if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
+
+ my $dst_offset = 0;
+ my $src_idx = 0;
+ my $blocks_left = $NUM_BLOCKS;
+
+ # ; unmasked full-register stores for everything before the last register
+ while ($blocks_left > 4) {
+ $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
+ $dst_offset += 64;
+ $src_idx++;
+ $blocks_left -= 4;
+ }
+
+ # ; masked store for the trailing 1..4 blocks
+ my $SRCREG = $SRC[$src_idx];
+ if ($blocks_left == 1) {
+ $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
+ } elsif ($blocks_left == 2) {
+ $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
+ } elsif ($blocks_left == 3 || $blocks_left == 4) {
+ $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
+ }
+}
+
+# ;;; ===========================================================================
+# ;;; Handles AES encryption rounds
+# ;;; It handles special cases: the last and first rounds
+# ;;; Optionally, it performs XOR with data after the last AES round.
+# ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
+# ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
+sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
+ my $L0B0_3 = $_[0]; # [in/out] zmm; blocks 0 to 3
+ my $L0B4_7 = $_[1]; # [in/out] zmm; blocks 4 to 7
+ my $L0B8_11 = $_[2]; # [in/out] zmm; blocks 8 to 11
+ my $L0B12_15 = $_[3]; # [in/out] zmm; blocks 12 to 15
+ my $KEY = $_[4]; # [in] zmm containing round key
+ my $ROUND = $_[5]; # [in] round number
+ my $D0_3 = $_[6]; # [in] zmm or no_data; plain/cipher text blocks 0-3
+ my $D4_7 = $_[7]; # [in] zmm or no_data; plain/cipher text blocks 4-7
+ my $D8_11 = $_[8]; # [in] zmm or no_data; plain/cipher text blocks 8-11
+ my $D12_15 = $_[9]; # [in] zmm or no_data; plain/cipher text blocks 12-15
+ my $NUMBL = $_[10]; # [in] number of blocks; numerical value
+ my $NROUNDS = $_[11]; # [in] number of rounds; numerical value
+
+ # ;;; === first AES round ($ROUND < 1): whitening XOR with round key 0
+ if ($ROUND < 1) {
+
+ # ;; round 0
+ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
+ $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
+ $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
+ }
+
+ # ;;; === middle AES rounds (1 .. $NROUNDS)
+ if ($ROUND >= 1 && $ROUND <= $NROUNDS) {
+
+ # ;; rounds 1 to 9/11/13
+ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
+ $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
+ $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
+ }
+
+ # ;;; === last AES round ($ROUND > $NROUNDS)
+ if ($ROUND > $NROUNDS) {
+
+ # ;; the last round - mix enclast with text xor's
+ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
+ $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
+ $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
+
+ # ;;; === XOR with data
+ # ;; performed only when ALL four data operands are real registers;
+ # ;; a single "no_data" marker disables the whole XOR step
+ if ( ($D0_3 ne "no_data")
+ && ($D4_7 ne "no_data")
+ && ($D8_11 ne "no_data")
+ && ($D12_15 ne "no_data"))
+ {
+ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
+ $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
+ $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
+ }
+ }
+}
+
+# ;;; Horizontal XOR - 4 x 128bits xored together
+sub VHPXORI4x128 {
+ my $REG = $_[0]; # [in/out] ZMM with 4x128bits to xor; 128bit output
+ my $TMP = $_[1]; # [clobbered] ZMM temporary register
+ # ; fold 512 -> 256 -> 128 bits; the result lands in the low xmm lane of $REG
+ $code .= <<___;
+ vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
+ vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
+ vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
+ vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
+___
+}
+
+# ;;; AVX512 reduction macro: folds a 256-bit carry-less product (HI128:LO128)
+# ;;; modulo the GHASH polynomial into a single 128-bit result in $OUT.
+sub VCLMUL_REDUCE {
+ my $OUT = $_[0]; # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
+ my $POLY = $_[1]; # [in] zmm/ymm/xmm: polynomial
+ my $HI128 = $_[2]; # [in] zmm/ymm/xmm: high 128b of hash to reduce
+ my $LO128 = $_[3]; # [in] zmm/ymm/xmm: low 128b of hash to reduce
+ my $TMP0 = $_[4]; # [clobbered] zmm/ymm/xmm: temporary register
+ my $TMP1 = $_[5]; # [clobbered] zmm/ymm/xmm: temporary register
+
+ $code .= <<___;
+ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ # ;; first phase of the reduction
+ vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
+ vpslldq \$8,$TMP0,$TMP0 # ; shift-L 2 DWs
+ vpxorq $TMP0,$LO128,$TMP0 # ; first phase of the reduction complete
+ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ # ;; second phase of the reduction
+ vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
+ vpsrldq \$4,$TMP1,$TMP1 # ; shift-R only 1-DW to obtain 2-DWs shift-R
+ vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
+ vpslldq \$4,$OUT,$OUT # ; shift-L 1-DW to obtain result with no shifts
+ vpternlogq \$0x96,$HI128,$TMP1,$OUT # ; OUT/GHASH = OUT xor TMP1 xor HI128
+ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+___
+}
+
+# ;; ===========================================================================
+# ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
+# ;; - it is assumed that data read from $INPTR is already shuffled and
+# ;; $INPTR address is 64 byte aligned
+# ;; - there is an option to pass ready blocks through ZMM registers too.
+# ;; 4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty
+sub GHASH_16 {
+ my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
+ # end_reduce (end with reduction), start_reduce
+ my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits
+ my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits
+ my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits
+ my $INPTR = $_[4]; # [in] data input pointer
+ my $INOFF = $_[5]; # [in] data input offset
+ my $INDIS = $_[6]; # [in] data input displacement
+ my $HKPTR = $_[7]; # [in] hash key pointer
+ my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset)
+ my $HKDIS = $_[9]; # [in] hash key displacement
+ my $HASH = $_[10]; # [in/out] ZMM hash value in/out
+ my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM
+ my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM
+ my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM
+ my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM
+ my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM
+ my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM
+ my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM
+ my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM
+ my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM
+ my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
+ my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
+ my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
+ my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
+ my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
+
+ my $start_ghash = 0;
+ my $do_reduction = 0;
+ if ($TYPE eq "start") {
+ $start_ghash = 1;
+ }
+
+ if ($TYPE eq "start_reduce") {
+ $start_ghash = 1;
+ $do_reduction = 1;
+ }
+
+ if ($TYPE eq "end_reduce") {
+ $do_reduction = 1;
+ }
+
+ # ;; ghash blocks 0-3
+ if (scalar(@_) == 21) {
+ $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
+ } else {
+ $ZTMP9 = $DAT0;
+ }
+
+ if ($start_ghash != 0) {
+ $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n";
+ }
+ $code .= <<___;
+ vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8
+ vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1
+ vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0
+ vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0
+ vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1
+___
+
+ # ;; ghash blocks 4-7
+ if (scalar(@_) == 21) {
+ $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n";
+ } else {
+ $ZTMP9 = $DAT1;
+ }
+ $code .= <<___;
+ vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8
+ vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1
+ vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0
+ vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0
+ vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1
+___
+
+ # ;; update sums
+ if ($start_ghash != 0) {
+ $code .= <<___;
+ vpxorq $ZTMP6,$ZTMP2,$GM # ; GM = T0M1 + T1M1
+ vpxorq $ZTMP4,$ZTMP0,$GH # ; GH = T0H + T1H
+ vpxorq $ZTMP5,$ZTMP1,$GL # ; GL = T0L + T1L
+ vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM = T0M2 + T1M1
+___
+ } else { # ;; mid, end, end_reduce
+ $code .= <<___;
+ vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1
+ vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H
+ vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L
+ vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M