#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Interface to OpenSSL as "almost" drop-in replacement for
# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-x86_64.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86_64.pl column -
# [also large-block CBC] encrypt/decrypt.
#
# aes-x86_64.pl vpaes-x86_64.pl
#
# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
# Nehalem 29.6/40.3/14.6 10.0/11.8
# Atom 57.3/74.2/32.1 60.9/77.2(***)
# Silvermont 52.7/64.0/19.5 48.8/60.8(***)
# Goldmont 38.9/49.0/17.8 10.6/12.6
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
# majority of contemporary cores share cache, slower code path
# is common place. In other words "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's respectable +36%/62% improvement on Core 2
# (as implied, over "hyper-threading-safe" code path).
#
# <appro@openssl.org>
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
$PREFIX="vpaes";
$code.=<<___;
.text
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.type _vpaes_encrypt_core,\@abi-omnipotent
.align 16
_vpaes_encrypt_core:
.cfi_startproc
mov %rdx, %r9
mov \$16, %r11
mov 240(%rdx),%eax
movdqa %xmm9, %xmm1
movdqa .Lk_ipt(%rip), %xmm2 # iptlo
pandn %xmm0, %xmm1
movdqu (%r9), %xmm5 # round0 key
psrld \$4, %xmm1
pand %xmm9, %xmm0
pshufb %xmm0, %xmm2
movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
pshufb %xmm1, %xmm0
pxor %xmm5, %xmm2
add \$16, %r9
pxor %xmm2, %xmm0
lea .Lk_mc_backward(%rip),%r10
jmp .Lenc_entry
.align 16
.Lenc_loop:
# middle of middle round
movdqa %xmm13, %xmm4 # 4 : sb1u
movdqa %xmm12, %xmm0 # 0 : sb1t
pshufb %xmm2, %xmm4 # 4 = sb1u
pshufb %xmm3, %xmm0 # 0 = sb1t
pxor %xmm5, %xmm4 # 4 = sb1u + k
movdqa %xmm15, %xmm5 # 4 : sb2u
pxor %xmm4, %xmm0 # 0 = A
movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
pshufb %xmm2, %xmm5 # 4 = sb2u
movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
movdqa %xmm14, %xmm2 # 2 : sb2t
pshufb %xmm3, %xmm2 # 2 = sb2t
movdqa %xmm0, %xmm3 # 3 = A
pxor %xmm5, %xmm2 # 2 = 2A
pshufb %xmm1, %xmm0 # 0 = B
add \$16, %r9 # next key
pxor %xmm2, %xmm0 # 0 = 2A+B
pshufb %xmm4, %xmm3 # 3 = D
add \$16, %r11 # next mc
pxor %xmm0, %xmm3 # 3 = 2A+B+D
pshufb %xmm1, %xmm0 # 0 = 2B+C
and \$0x30, %r11 # ... mod 4
sub \$1,%rax # nr--
pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
.Lenc_entry:
# top of round
movdqa %xmm9, %xmm1 # 1 : i
movdqa %xmm11, %xmm5 # 2 : a/k
pandn %xmm0, %xmm1 # 1 = i<<4
psrld \$4, %xmm1 # 1 = i
pand %xmm9, %xmm0 # 0 = k
pshufb %xmm0, %xmm5 # 2 = a/k
movdqa %xmm10, %