diff options
Diffstat (limited to 'crypto')
-rw-r--r-- | crypto/modes/asm/aes-gcm-avx512.pl | 4975 | ||||
-rw-r--r-- | crypto/modes/build.info | 3 |
2 files changed, 4977 insertions, 1 deletion
diff --git a/crypto/modes/asm/aes-gcm-avx512.pl b/crypto/modes/asm/aes-gcm-avx512.pl
new file mode 100644
index 0000000000..1c7ee8769a
--- /dev/null
+++ b/crypto/modes/asm/aes-gcm-avx512.pl
@@ -0,0 +1,4975 @@
# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
# (https://github.com/intel/intel-ipsec-mb).
# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
#
# References:
# [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
#     Intel Architecture Processors. August, 2010.
# [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
#     Intel Architecture Processors. October, 2012.
# [3] Shay Gueron et. al. Intel Carry-Less Multiplication Instruction and its
#     Usage for Computing the GCM Mode. May, 2010.
#
#
# December 2021
#
# Initial release.
#
# GCM128_CONTEXT structure has storage for 16 hkeys only, but this
# implementation can use up to 48. To avoid extending the context size,
# precompute and store in the context first 16 hkeys only, and compute the rest
# on demand keeping them in the local frame.
#
#======================================================================
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$avx512vaes = 0;

# ; Locate the perlasm translator next to this script, or in crypto/perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
  or die "can't locate x86_64-xlate.pl";

# ; Probe the toolchain for VAES/VPCLMULQDQ support. NOTE(review): the
# ; captured version strings are compared numerically ("2.30" == 2.3),
# ; which matches the convention of the other perlasm scripts.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  =~ /GNU assembler version ([2-9]\.[0-9]+)/)
{
  $avx512vaes = ($1 >= 2.30);
}

if (!$avx512vaes
  && $win64
  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
{
  # ; VAES supported since NASM 2.13.03 or 2.14
  $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
}

if (!$avx512vaes
  && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/)
{
  $avx512vaes = ($2 >= 7.0);
}

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  or die "can't call $xlate: $!";
*STDOUT = *OUT;

#======================================================================
if ($avx512vaes > 0) {    #<<<

# ; Runtime CPUID check: all of AVX512F/DQ/BW/VL + VAES + VPCLMULQDQ must
# ; be present (bits taken from OPENSSL_ia32cap_P words 2/3, i.e. CPUID.7).
$code .= <<___;
.extern OPENSSL_ia32cap_P
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
.align 32
ossl_vaes_vpclmulqdq_capable:
        mov OPENSSL_ia32cap_P+8(%rip), %rcx
        # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
        mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
        xor %eax,%eax
        and %rdx,%rcx
        cmp %rdx,%rcx
        cmove %rcx,%rax
        ret
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
___

# ; Mapping key length -> AES rounds count
my %aes_rounds = (
  128 => 9,
  192 => 11,
  256 => 13);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Code generation control switches
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; ABI-aware zeroing of volatile registers in EPILOG().
# ; Disabled due to performance reasons.
my $CLEAR_SCRATCH_REGISTERS = 0;

# ; Zero HKeys storage from the stack if they are stored there
my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;

# ; Enable / disable check of function arguments for null pointer
# ; Currently disabled, as this check is handled outside.
my $CHECK_FUNCTION_ARGUMENTS = 0;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Global constants
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# AES block size in bytes
my $AES_BLOCK_SIZE = 16;

# Storage capacity in elements
my $HKEYS_STORAGE_CAPACITY = 48;
my $LOCAL_STORAGE_CAPACITY = 48;
my $HKEYS_CONTEXT_CAPACITY = 16;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Stack frame definition
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
# (2) -> +8-byte space for 16-byte alignment of XMM storage
# (3) -> Frame pointer (%RBP)
# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
# (7) -> +768-byte HKEYS storage
# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary

my $GP_STORAGE  = $win64 ? 8 * 8 : 8 * 6;     # ; space for saved non-volatile GP registers (pushed on stack)
my $XMM_STORAGE = $win64 ? (10 * 16) : 0;     # ; space for saved XMM registers
my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for HKeys^i, i=1..48
my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for up to 48 AES blocks

my $STACK_HKEYS_OFFSET = 0;
my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Function arguments abstraction
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);

# ; This implementation follows the convention: for non-leaf functions (they
# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from
# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This
# ; helps to facilitate SEH handlers writing.
#
# ; Leaf functions here do not use more than 4 input arguments.
if ($win64) {
  $arg1  = "%rcx";
  $arg2  = "%rdx";
  $arg3  = "%r8";
  $arg4  = "%r9";
  $arg5  = "`$GP_STORAGE + 8 + 8*5`(%rbp)";    # +8 - alignment bytes
  $arg6  = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
  $arg7  = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
  $arg8  = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
  $arg9  = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
} else {
  $arg1  = "%rdi";
  $arg2  = "%rsi";
  $arg3  = "%rdx";
  $arg4  = "%rcx";
  $arg5  = "%r8";
  $arg6  = "%r9";
  $arg7  = "`$GP_STORAGE + 8*1`(%rbp)";
  $arg8  = "`$GP_STORAGE + 8*2`(%rbp)";
  $arg9  = "`$GP_STORAGE + 8*3`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
}

# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
my $CTX_OFFSET_CurCount  = (16 * 0);          # ; (Yi) Current counter for generation of encryption key
my $CTX_OFFSET_PEncBlock = (16 * 1);          # ; (repurposed EKi field) Partial block buffer
my $CTX_OFFSET_EK0       = (16 * 2);          # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
my $CTX_OFFSET_AadLen    = (16 * 3);          # ; (len.u[0]) Length of Hash which has been input
my $CTX_OFFSET_InLen     = ((16 * 3) + 8);    # ; (len.u[1]) Length of input data which will be encrypted or decrypted
my $CTX_OFFSET_AadHash   = (16 * 4);          # ; (Xi) Current hash
my $CTX_OFFSET_HTable    = (16 * 6);          # ; (Htable) Precomputed table (allows 16 values)

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Helper functions
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; Generates "random" local labels
sub random_string() {
  my @chars  = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
  my $length = 15;
  my $str;
  # (review) draw from the whole alphabet; the previous rand(33) limited the
  # pool to the first 33 characters for no apparent reason.  Labels are only
  # used for uniqueness, so widening the pool is behaviour-compatible.
  $str .= $chars[rand(scalar @chars)] for 1 .. $length;
  return $str;
}

# ; Maps a 64-bit GP register name to its 8-bit (low byte) alias.
sub BYTE {
  my ($reg) = @_;
  if ($reg =~ /%r[abcd]x/i) {
    $reg =~ s/%r([abcd])x/%${1}l/i;
  } elsif ($reg =~ /%r[sdb][ip]/i) {
    $reg =~ s/%r([sdb][ip])/%${1}l/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;
  } else {
    die "BYTE: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps a 64-bit GP register name to its 16-bit alias.
sub WORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
  } else {
    die "WORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps a 64-bit GP register name to its 32-bit alias.
sub DWORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;
  } else {
    die "DWORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps an xmm/ymm/zmm register name to its 128-bit (xmm) form.
sub XWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%xmm/i;
  } else {
    die "XWORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps an xmm/ymm/zmm register name to its 256-bit (ymm) form.
sub YWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%ymm/i;
  } else {
    die "YWORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Maps an xmm/ymm/zmm register name to its 512-bit (zmm) form.
sub ZWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%zmm/i;
  } else {
    die "ZWORD: unknown register: $reg\n";
  }
  return $reg;
}

# ; Helper function to construct effective address based on two kinds of
# ; offsets: numerical or located in the register
sub EffectiveAddress {
  my ($base, $offset, $displacement) = @_;
  $displacement = 0 if (!$displacement);

  if ($offset =~ /^\d+\z/) {    # numerical offset
    return "`$offset + $displacement`($base)";
  } else {                      # offset resides in register
    return "$displacement($base,$offset,1)";
  }
}

# ; Provides memory location of corresponding HashKey power
sub HashKeyByIdx {
  my ($idx, $base) = @_;
  my $base_str = ($base eq "%rsp") ? "frame" : "context";

  my $offset = &HashKeyOffsetByIdx($idx, $base_str);
  return "$offset($base)";
}

# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
sub HashKeyOffsetByIdx {
  my ($idx, $base) = @_;
  die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
    if (($base ne "frame") && ($base ne "context"));

  my $offset_base;
  my $offset_idx;
  if ($base eq "frame") {    # frame storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
    $offset_base = $STACK_HKEYS_OFFSET;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx));
  } else {                   # context storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
    $offset_base = $CTX_OFFSET_HTable;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx));
  }
  return $offset_base + $offset_idx;
}

# ; Creates local frame and does back up of non-volatile registers.
# ; Holds stack unwinding directives.
sub PROLOG {
  my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;

  my $DYNAMIC_STACK_ALLOC_SIZE            = 0;
  my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;

  if ($need_hkeys_stack_storage) {
    $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
  }

  if ($need_aes_stack_storage) {
    if (!$need_hkeys_stack_storage) {
      die "PROLOG: unsupported case - aes storage without hkeys one";
    }
    $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
  }

  $code .= <<___;
        push %rbx
.cfi_push %rbx
.L${func_name}_seh_push_rbx:
        push %rbp
.cfi_push %rbp
.L${func_name}_seh_push_rbp:
        push %r12
.cfi_push %r12
.L${func_name}_seh_push_r12:
        push %r13
.cfi_push %r13
.L${func_name}_seh_push_r13:
        push %r14
.cfi_push %r14
.L${func_name}_seh_push_r14:
        push %r15
.cfi_push %r15
.L${func_name}_seh_push_r15:
___

  if ($win64) {
    $code .= <<___;
        push %rdi
.L${func_name}_seh_push_rdi:
        push %rsi
.L${func_name}_seh_push_rsi:

        sub \$`$XMM_STORAGE+8`,%rsp    # +8 alignment
.L${func_name}_seh_allocstack_xmm:
___
  }
  $code .= <<___;
        # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
        # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
        # ; handlers. The requirement for a frame pointer is that its offset from
        # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer
        # ; itself seems to be reasonable to use here, because later we do 64-byte stack
        # ; alignment which gives us non-determinate offsets and complicates writing
        # ; SEH handlers.
        #
        # ; It also serves as an anchor for retrieving stack arguments on both Linux
        # ; and Windows.
        lea `$XMM_STORAGE`(%rsp),%rbp
.cfi_def_cfa_register %rbp
.L${func_name}_seh_setfp:
___
  if ($win64) {

    # ; xmm6:xmm15 need to be preserved on Windows
    foreach my $reg_idx (6 .. 15) {
      my $xmm_reg_offset = ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
.L${func_name}_seh_save_xmm${reg_idx}:
___
    }
  }

  $code .= <<___;
# Prolog ends here. Next stack allocation is treated as "dynamic".
.L${func_name}_seh_prolog_end:
___

  if ($DYNAMIC_STACK_ALLOC_SIZE) {
    $code .= <<___;
        sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
        and \$(-64),%rsp
___
  }
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Restore register content for the caller.
# ;;; And cleanup stack.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub EPILOG {
  my ($hkeys_storage_on_stack, $payload_len) = @_;

  my $rndsuffix = &random_string();

  if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {

    # ; There is no need in hkeys cleanup if payload len was small, i.e. no hkeys
    # ; were stored in the local frame storage
    $code .= <<___;
        cmpq \$`16*16`,$payload_len
        jbe .Lskip_hkeys_cleanup_${rndsuffix}
        vpxor %xmm0,%xmm0,%xmm0
___
    for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
      $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
    }
    $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n";
  }

  if ($CLEAR_SCRATCH_REGISTERS) {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
  } else {
    $code .= "vzeroupper\n";
  }

  if ($win64) {

    # ; restore xmm15:xmm6
    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
      my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;

      # (review) removed a stray trailing comma after the destination register;
      # it was emitted verbatim into the generated assembly and would not
      # assemble.
      $code .= <<___;
        vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}
___
    }
  }

  if ($win64) {

    # Forming valid epilog for SEH with use of frame pointer.
    # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
    $code .= "lea 8(%rbp),%rsp\n";
  } else {
    $code .= "lea (%rbp),%rsp\n";
    $code .= ".cfi_def_cfa_register %rsp\n";
  }

  if ($win64) {
    $code .= <<___;
        pop %rsi
.cfi_pop %rsi
        pop %rdi
.cfi_pop %rdi
___
  }
  $code .= <<___;
        pop %r15
.cfi_pop %r15
        pop %r14
.cfi_pop %r14
        pop %r13
.cfi_pop %r13
        pop %r12
.cfi_pop %r12
        pop %rbp
.cfi_pop %rbp
        pop %rbx
.cfi_pop %rbx
___
}

# ; Clears all scratch ZMM registers
# ;
# ; It should be called before restoring the XMM registers
# ; for Windows (XMM6-XMM15).
# ;
sub clear_scratch_zmms_asm {

  # ; On Linux, all ZMM registers are scratch registers
  if (!$win64) {
    $code .= "vzeroall\n";
  } else {
    foreach my $i (0 .. 5) {
      $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
    }
  }
  foreach my $i (16 .. 31) {
    $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  }
}

# Clears all scratch GP registers
sub clear_scratch_gps_asm {
  foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") {
    $code .= "xor $reg,$reg\n";
  }
  if (!$win64) {
    foreach my $reg ("%rsi", "%rdi") {
      $code .= "xor $reg,$reg\n";
    }
  }
}

# ; Precomputes the requested range of hashkey powers into the stack frame,
# ; seeding from the 16 powers stored in the context (see file header note).
# ; Skips all the work at runtime if $HKEYS_READY is non-zero.
sub precompute_hkeys_on_stack {
  my $GCM128_CTX  = $_[0];
  my $HKEYS_READY = $_[1];
  my $ZTMP0       = $_[2];
  my $ZTMP1       = $_[3];
  my $ZTMP2       = $_[4];
  my $ZTMP3       = $_[5];
  my $ZTMP4       = $_[6];
  my $ZTMP5       = $_[7];
  my $ZTMP6       = $_[8];
  my $HKEYS_RANGE = $_[9];    # ; "first16", "mid16", "all", "first32", "last32"

  die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
    if ($HKEYS_RANGE ne "first16"
    && $HKEYS_RANGE ne "mid16"
    && $HKEYS_RANGE ne "all"
    && $HKEYS_RANGE ne "first32"
    && $HKEYS_RANGE ne "last32");

  my $rndsuffix = &random_string();

  $code .= <<___;
        test $HKEYS_READY,$HKEYS_READY
        jnz .L_skip_hkeys_precomputation_${rndsuffix}
___

  if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {

    # ; Fill the stack with the first 16 hkeys from the context
    $code .= <<___;
        # ; Move 16 hkeys from the context to stack
        vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
        vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
        vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
        vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
        vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
___
  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
        vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
___

  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=17..32
    my $i = 20;
    foreach (1 .. int((32 - 16) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
    my $i = 36;
    foreach (1 .. int((48 - 32) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n";
}

# ;; =============================================================================
# ;; Generic macro to produce code that executes $OPCODE instruction
# ;; on selected number of AES blocks (16 bytes long ) between 0 and 16.
# ;; All three operands of the instruction come from registers.
# ;; Note: if 3 blocks are left at the end instruction is produced to operate all
# ;; 4 blocks (full width of ZMM)
sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
  my $NUM_BLOCKS = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OPCODE     = $_[1];    # [in] instruction name
  my @DST;
  $DST[0] = $_[2];           # [out] destination ZMM register
  $DST[1] = $_[3];           # [out] destination ZMM register
  $DST[2] = $_[4];           # [out] destination ZMM register
  $DST[3] = $_[5];           # [out] destination ZMM register
  my @SRC1;
  $SRC1[0] = $_[6];          # [in] source 1 ZMM register
  $SRC1[1] = $_[7];          # [in] source 1 ZMM register
  $SRC1[2] = $_[8];          # [in] source 1 ZMM register
  $SRC1[3] = $_[9];          # [in] source 1 ZMM register
  my @SRC2;
  $SRC2[0] = $_[10];         # [in] source 2 ZMM register
  $SRC2[1] = $_[11];         # [in] source 2 ZMM register
  $SRC2[2] = $_[12];         # [in] source 2 ZMM register
  $SRC2[3] = $_[13];         # [in] source 2 ZMM register

  die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $reg_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  # ; full 4-block (one ZMM) groups first
  foreach (1 .. ($NUM_BLOCKS / 4)) {
    $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
    $reg_idx++;
    $blocks_left -= 4;
  }

  my $DSTREG  = $DST[$reg_idx];
  my $SRC1REG = $SRC1[$reg_idx];
  my $SRC2REG = $SRC2[$reg_idx];

  # ; trailing 1/2 blocks use xmm/ymm width; 3 blocks use the full zmm (see note)
  if ($blocks_left == 1) {
    $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 2) {
    $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 3) {
    $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
  }
}

# ;; =============================================================================
# ;; Loads specified number of AES blocks into ZMM registers using mask register
# ;; for the last loaded register (xmm, ymm or zmm).
# ;; Loads take place at 1 byte granularity.
sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $INP         = $_[1];    # [in] input data pointer to read from
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @DST;
  $DST[0] = $_[3];            # [out] ZMM register with loaded data
  $DST[1] = $_[4];            # [out] ZMM register with loaded data
  $DST[2] = $_[5];            # [out] ZMM register with loaded data
  $DST[3] = $_[6];            # [out] ZMM register with loaded data
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $src_offset  = 0;
  my $dst_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
      $src_offset += 64;
      $dst_idx++;
      $blocks_left -= 4;
    }
  }

  my $DSTREG = $DST[$dst_idx];

  # ; last (possibly partial) group: masked + zeroing load at matching width
  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif (($blocks_left == 3 || $blocks_left == 4)) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
  }
}

# ;; =============================================================================
# ;; Stores specified number of AES blocks from ZMM registers with mask register
# ;; for the last loaded register (xmm, ymm or zmm).
# ;; Stores take place at 1 byte granularity.
sub ZMM_STORE_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OUTP        = $_[1];    # [in] output data pointer to write to
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @SRC;
  $SRC[0] = $_[3];            # [in] ZMM register with data to store
  $SRC[1] = $_[4];            # [in] ZMM register with data to store
  $SRC[2] = $_[5];            # [in] ZMM register with data to store
  $SRC[3] = $_[6];            # [in] ZMM register with data to store
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $dst_offset  = 0;
  my $src_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
      $dst_offset += 64;
      $src_idx++;
      $blocks_left -= 4;
    }
  }

  my $SRCREG = $SRC[$src_idx];

  # ; last (possibly partial) group: masked store at matching width
  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 3 || $blocks_left == 4) {
    $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  }
}

# ;;; ===========================================================================
# ;;; Handles AES encryption rounds
# ;;; It handles special cases: the last and first rounds
# ;;; Optionally, it performs XOR with data after the last AES round.
# ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
# ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
  my $L0B0_3   = $_[0];     # [in/out] zmm; blocks 0 to 3
  my $L0B4_7   = $_[1];     # [in/out] zmm; blocks 4 to 7
  my $L0B8_11  = $_[2];     # [in/out] zmm; blocks 8 to 11
  my $L0B12_15 = $_[3];     # [in/out] zmm; blocks 12 to 15
  my $KEY      = $_[4];     # [in] zmm containing round key
  my $ROUND    = $_[5];     # [in] round number
  my $D0_3     = $_[6];     # [in] zmm or no_data; plain/cipher text blocks 0-3
  my $D4_7     = $_[7];     # [in] zmm or no_data; plain/cipher text blocks 4-7
  my $D8_11    = $_[8];     # [in] zmm or no_data; plain/cipher text blocks 8-11
  my $D12_15   = $_[9];     # [in] zmm or no_data; plain/cipher text blocks 12-15
  my $NUMBL    = $_[10];    # [in] number of blocks; numerical value
  my $NROUNDS  = $_[11];    # [in] number of rounds; numerical value

  # ;;; === first AES round
  if ($ROUND < 1) {

    # ;; round 0 - XOR in the whitening key
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  }

  # ;;; === middle AES rounds
  if ($ROUND >= 1 && $ROUND <= $NROUNDS) {

    # ;; rounds 1 to 9/11/13
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  }

  # ;;; === last AES round
  if ($ROUND > $NROUNDS) {

    # ;; the last round - mix enclast with text xor's
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);

    # ;;; === XOR with data
    if ( ($D0_3 ne "no_data")
      && ($D4_7 ne "no_data")
      && ($D8_11 ne "no_data")
      && ($D12_15 ne "no_data"))
    {
      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
        $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
    }
  }
}

# ;;; Horizontal XOR - 4 x 128bits xored together
sub VHPXORI4x128 {
  my $REG = $_[0];    # [in/out] ZMM with 4x128bits to xor; 128bit output
  my $TMP = $_[1];    # [clobbered] ZMM temporary register
  $code .= <<___;
        vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
        vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
        vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
        vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
___
}

# ;;; AVX512 reduction macro
sub VCLMUL_REDUCE {
  my $OUT   = $_[0];    # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
  my $POLY  = $_[1];    # [in] zmm/ymm/xmm: polynomial
  my $HI128 = $_[2];    # [in] zmm/ymm/xmm: high 128b of hash to reduce
  my $LO128 = $_[3];    # [in] zmm/ymm/xmm: low 128b of hash to reduce
  my $TMP0  = $_[4];    # [in] zmm/ymm/xmm: temporary register
  my $TMP1  = $_[5];    # [in] zmm/ymm/xmm: temporary register

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; first phase of the reduction
        vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
        vpslldq \$8,$TMP0,$TMP0         # ; shift-L 2 DWs
        vpxorq $TMP0,$LO128,$TMP0       # ; first phase of the reduction complete
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; second phase of the reduction
        vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
        vpsrldq \$4,$TMP1,$TMP1         # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
        vpslldq \$4,$OUT,$OUT           # ; shift-L 1-DW to obtain result with no shifts
        vpternlogq \$0x96,$HI128,$TMP1,$OUT    # ; OUT/GHASH = OUT xor TMP1 xor HI128
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}

# ;; ===========================================================================
# ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
# ;; - it is assumed that data read from $INPTR is already shuffled and
# ;;   $INPTR address is 64 byte aligned
# ;; - there is an option to pass ready blocks through ZMM registers too.
+# ;; 4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty +sub GHASH_16 { + my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction), + # end_reduce (end with reduction), start_reduce + my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits + my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits + my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits + my $INPTR = $_[4]; # [in] data input pointer + my $INOFF = $_[5]; # [in] data input offset + my $INDIS = $_[6]; # [in] data input displacement + my $HKPTR = $_[7]; # [in] hash key pointer + my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset) + my $HKDIS = $_[9]; # [in] hash key displacement + my $HASH = $_[10]; # [in/out] ZMM hash value in/out + my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM + my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM + my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM + my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM + my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM + my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM + my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM + my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM + my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM + my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided + my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) + my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) + my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) + my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) + + my $start_ghash = 0; + my $do_reduction = 0; + if ($TYPE eq "start") { + $start_ghash = 1; + } + + if ($TYPE eq "start_reduce") { + $start_ghash = 1; + $do_reduction = 1; + } + + if ($TYPE eq "end_reduce") { + $do_reduction = 1; + } + + # ;; ghash 
blocks 0-3 + if (scalar(@_) == 21) { + $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n"; + } else { + $ZTMP9 = $DAT0; + } + + if ($start_ghash != 0) { + $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n"; + } + $code .= <<___; + vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8 + vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1 + vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0 + vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0 + vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1 +___ + + # ;; ghash blocks 4-7 + if (scalar(@_) == 21) { + $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n"; + } else { + $ZTMP9 = $DAT1; + } + $code .= <<___; + vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8 + vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1 + vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0 + vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0 + vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1 +___ + + # ;; update sums + if ($start_ghash != 0) { + $code .= <<___; + vpxorq $ZTMP6,$ZTMP2,$GM # ; GM = T0M1 + T1M1 + vpxorq $ZTMP4,$ZTMP0,$GH # ; GH = T0H + T1H + vpxorq $ZTMP5,$ZTMP1,$GL # ; GL = T0L + T1L + vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM = T0M2 + T1M1 +___ + } else { # ;; mid, end, end_reduce + $code .= <<___; + vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1 + vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H + vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L + vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1 +___ + } + + # ;; ghash blocks 8-11 + if (scalar(@_) == 21) { + $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n"; + } else { + $ZTMP9 = $DAT2; + } + $code .= <<___; + vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8 + vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1 + vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0 + vpclmulqdq 
\$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0 + vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1 +___ + + # ;; ghash blocks 12-15 + if (scalar(@_) == 21) { + $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n"; + } else { + $ZTMP9 = $DAT3; + } + $code .= <<___; + vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8 + vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1 + vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0 + vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0 + vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1 + # ;; update sums + vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1 + vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H + vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L + vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1 +___ + if ($do_reduction != 0) { + $cod |