#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2011
#
# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
# in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibit *very* low instruction-level
# parallelism, interleaving it with another algorithm would allow to
# utilize processor resources better and achieve better performance.
# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
# AESNI code is weaved into it. Below are performance numbers in
# cycles per processed byte, less is better, for standalone AESNI-CBC
# encrypt, sum of the latter and standalone SHA1, and "stitched"
# subroutine:
#
# AES-128-CBC +SHA1 stitch gain
# Westmere 3.77[+5.6] 9.37 6.65 +41%
# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
#
# AES-192-CBC
# Westmere 4.51 10.11 6.97 +45%
# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
#
# AES-256-CBC
# Westmere 5.25 10.85 7.25 +50%
# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
#
# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for
# background information. Above numbers in parentheses are SSSE3
# results collected on AVX-capable CPU, i.e. apply on OSes that
# don't support AVX.
#
# Needless to mention that it makes no sense to implement "stitched"
# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
# fully utilize parallelism, so stitching would not give any gain
# anyway. Well, there might be some, e.g. because of better cache
# locality... For reference, here are performance results for
# standalone AESNI-CBC decrypt:
#
# AES-128-CBC AES-192-CBC AES-256-CBC
# Westmere 1.31 1.55 1.80
# Sandy Bridge 0.93 1.06 1.22
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
$1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
$1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10);
open STDOUT,"| $^X $xlate $flavour $output";
# void aesni_cbc_sha1_enc(const void *inp,
# void *out,
# size_t length,
# const AES_KEY *key,
# unsigned char *iv,
# SHA_CTX *ctx,
# const void *in0);
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P
.globl aesni_cbc_sha1_enc
.type aesni_cbc_sha1_enc,\@abi-omnipotent
.align 16
aesni_cbc_sha1_enc:
# caller should check for SSSE3 and AES-NI bits
mov OPENSSL_ia32cap_P+0(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r11d
___
$code.=<<___ if ($avx);
and \$`1<<28`,%r11d # mask AVX bit
and \$`1<<30`,%r10d # mask "Intel CPU" bit
or %r11d,%r10d
cmp \$`1<<28|1<<30`,%r10d
je aesni_cbc_sha1_enc_avx
___
$code.=<<___;
jmp aesni_cbc_sha1_enc_ssse3
ret
.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
my $Xi=4;
m