#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such way that its
# performance is rather close to the sum of its streamed components,
# in the context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
# byte processed with 128-bit key on Haswell processor, 0.74 - on
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
# measurements for favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx = ($1>=2.20) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
$avx = ($1>=2.09) + ($1>=2.10);
}
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./) {
$avx = ($1>=10) + ($1>=11);
}
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx>1) {{{
($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
($Ii,$T1,$T2,$Hkey,
$Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
$code=<<___;
.text
.type _aesni_ctr32_ghash_6x,\@abi-omnipotent
.align 32
_aesni_ctr32_ghash_6x:
.cfi_startproc
vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
sub \$6,$len
vpxor $Z0,$Z0,$Z0 # $Z0 = 0
vmovdqu 0x00-0x80($key),$rndkey
vpaddb $T2,$T1,$inout1
vpaddb $T2,$inout1,$inout2
vpaddb $T2,$inout2,$inout3
vpaddb $T2,$inout3,$inout4
vpaddb $T2,$inout4,$inout5
vpxor $rndkey,$T1,$inout0
vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
jmp .Loop6x
.align 32
.Loop6x:
add \$`6<<24`,$counter
jc .Lhandle_ctr32 # discard $inout[1-5]?
vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
vpaddb $T2,$inout5,$T1 # next counter value
vpxor $rndkey,$inout1,$inout1
vpxor $rndkey,$inout2,$inout2
.Lresume_ctr32:
vmovdqu $T1,($ivp) # save next counter value
vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
vpxor $rndkey,$inout3,$inout3
vmovups 0x10-0x80($key),$T2