summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBernd Edlinger <bernd.edlinger@hotmail.de>2019-08-23 10:17:31 +0200
committerBernd Edlinger <bernd.edlinger@hotmail.de>2019-09-07 10:26:48 +0200
commit87bea6550ae0dda7c40937cff2e86cc2b0b09491 (patch)
treecb5453981d2307a9807847efab7b6b475bc22e9c
parenta6186f39802f94937a46f7a41ef0c86b6334b592 (diff)
Remove x86/x86_64 BSAES and AES_ASM support
This leaves VPAES and AESNI support. The VPAES performance is comparable but BSAES is not completely constant time. There are table lookups using secret key data in AES_set_encrypt/decrypt_key and in ctr mode short data uses the non-constant time AES_encrypt function instead of bit-slicing. Furthermore the AES_ASM is by far outperformed by recent GCC versions. Since BSAES calls back to AES_ASM for short data blocks the performance on those is also worse than the pure software implementaion. Fixes: #9640 Reviewed-by: Richard Levitte <levitte@openssl.org> (Merged from https://github.com/openssl/openssl/pull/9675)
-rw-r--r--Configurations/00-base-templates.conf4
-rwxr-xr-xcrypto/aes/asm/aes-586.pl3000
-rwxr-xr-xcrypto/aes/asm/aes-x86_64.pl2916
-rw-r--r--crypto/aes/asm/bsaes-x86_64.pl3239
-rw-r--r--crypto/evp/e_aes.c2
5 files changed, 3 insertions, 9158 deletions
diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf
index 5fd995cb33..e01dc63a8b 100644
--- a/Configurations/00-base-templates.conf
+++ b/Configurations/00-base-templates.conf
@@ -198,7 +198,7 @@ my %targets=(
bn_asm_src => "bn-586.s co-586.s x86-mont.s x86-gf2m.s",
ec_asm_src => "ecp_nistz256.c ecp_nistz256-x86.s",
des_asm_src => "des-586.s crypt586.s",
- aes_asm_src => "aes-586.s vpaes-x86.s aesni-x86.s",
+ aes_asm_src => "aes_core.c aes_cbc.c vpaes-x86.s aesni-x86.s",
bf_asm_src => "bf-586.s",
md5_asm_src => "md5-586.s",
cast_asm_src => "cast-586.s",
@@ -223,7 +223,7 @@ my %targets=(
cpuid_asm_src => "x86_64cpuid.s",
bn_asm_src => "asm/x86_64-gcc.c x86_64-mont.s x86_64-mont5.s x86_64-gf2m.s rsaz_exp.c rsaz-x86_64.s rsaz-avx2.s",
ec_asm_src => "ecp_nistz256.c ecp_nistz256-x86_64.s x25519-x86_64.s",
- aes_asm_src => "aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s",
+ aes_asm_src => "aes_core.c aes_cbc.c vpaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s",
md5_asm_src => "md5-x86_64.s",
sha1_asm_src => "sha1-x86_64.s sha256-x86_64.s sha512-x86_64.s sha1-mb-x86_64.s sha256-mb-x86_64.s",
rc4_asm_src => "rc4-x86_64.s rc4-md5-x86_64.s",
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
deleted file mode 100755
index 29059edf8b..0000000000
--- a/crypto/aes/asm/aes-586.pl
+++ /dev/null
@@ -1,3000 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License"). You may not use
-# this file except in compliance with the License. You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Version 4.3.
-#
-# You might fail to appreciate this module performance from the first
-# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
-# to be *the* best Intel C compiler without -KPIC, performance appears
-# to be virtually identical... But try to re-configure with shared
-# library support... Aha! Intel compiler "suddenly" lags behind by 30%
-# [on P4, more on others]:-) And if compared to position-independent
-# code generated by GNU C, this code performs *more* than *twice* as
-# fast! Yes, all this buzz about PIC means that unlike other hand-
-# coded implementations, this one was explicitly designed to be safe
-# to use even in shared library context... This also means that this
-# code isn't necessarily absolutely fastest "ever," because in order
-# to achieve position independence an extra register has to be
-# off-loaded to stack, which affects the benchmark result.
-#
-# Special note about instruction choice. Do you recall RC4_INT code
-# performing poorly on P4? It might be the time to figure out why.
-# RC4_INT code implies effective address calculations in base+offset*4
-# form. Trouble is that it seems that offset scaling turned to be
-# critical path... At least eliminating scaling resulted in 2.8x RC4
-# performance improvement [as you might recall]. As AES code is hungry
-# for scaling too, I [try to] avoid the latter by favoring off-by-2
-# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
-#
-# As was shown by Dean Gaudet, the above note turned out to be
-# void. Performance improvement with off-by-2 shifts was observed on
-# intermediate implementation, which was spilling yet another register
-# to stack... Final offset*4 code below runs just a tad faster on P4,
-# but exhibits up to 10% improvement on other cores.
-#
-# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
-# This made it possible to implement little-endian variant of the
-# algorithm without modifying the base C code. Motivating factor for
-# the undertaken effort was that it appeared that in tight IA-32
-# register window little-endian flavor could achieve slightly higher
-# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
-#
-# Third version adds AES_cbc_encrypt implementation, which resulted in
-# up to 40% performance improvement of CBC benchmark results. 40% was
-# observed on P4 core, where "overall" improvement coefficient, i.e. if
-# compared to PIC generated by GCC and in CBC mode, was observed to be
-# as large as 4x:-) CBC performance is virtually identical to ECB now
-# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
-# Opteron, because certain function prologues and epilogues are
-# effectively taken out of the loop...
-#
-# Version 3.2 implements compressed tables and prefetch of these tables
-# in CBC[!] mode. Former means that 3/4 of table references are now
-# misaligned, which unfortunately has negative impact on elder IA-32
-# implementations, Pentium suffered 30% penalty, PIII - 10%.
-#
-# Version 3.3 avoids L1 cache aliasing between stack frame and
-# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
-# latter is achieved by copying the key schedule to controlled place in
-# stack. This unfortunately has rather strong impact on small block CBC
-# performance, ~2x deterioration on 16-byte block if compared to 3.3.
-#
-# Version 3.5 checks if there is L1 cache aliasing between user-supplied
-# key schedule and S-boxes and abstains from copying the former if
-# there is no. This allows end-user to consciously retain small block
-# performance by aligning key schedule in specific manner.
-#
-# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
-#
-# Current ECB performance numbers for 128-bit key in CPU cycles per
-# processed byte [measure commonly used by AES benchmarkers] are:
-#
-# small footprint fully unrolled
-# P4 24 22
-# AMD K8 20 19
-# PIII 25 23
-# Pentium 81 78
-#
-# Version 3.7 reimplements outer rounds as "compact." Meaning that
-# first and last rounds reference compact 256 bytes S-box. This means
-# that first round consumes a lot more CPU cycles and that encrypt
-# and decrypt performance becomes asymmetric. Encrypt performance
-# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
-# aggressively pre-fetched.
-#
-# Version 4.0 effectively rolls back to 3.6 and instead implements
-# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
-# which use exclusively 256 byte S-box. These functions are to be
-# called in modes not concealing plain text, such as ECB, or when
-# we're asked to process smaller amount of data [or unconditionally
-# on hyper-threading CPU]. Currently it's called unconditionally from
-# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
-# still needs to be modified to switch between slower and faster
-# mode when appropriate... But in either case benchmark landscape
-# changes dramatically and below numbers are CPU cycles per processed
-# byte for 128-bit key.
-#
-# ECB encrypt ECB decrypt CBC large chunk
-# P4 52[54] 83[95] 23
-# AMD K8 46[41] 66[70] 18
-# PIII 41[50] 60[77] 24
-# Core 2 31[36] 45[64] 18.5
-# Atom 76[100] 96[138] 60
-# Pentium 115 150 77
-#
-# Version 4.1 switches to compact S-box even in key schedule setup.
-#
-# Version 4.2 prefetches compact S-box in every SSE round or in other
-# words every cache-line is *guaranteed* to be accessed within ~50
-# cycles window. Why just SSE? Because it's needed on hyper-threading
-# CPU! Which is also why it's prefetched with 64 byte stride. Best
-# part is that it has no negative effect on performance:-)
-#
-# Version 4.3 implements switch between compact and non-compact block
-# functions in AES_cbc_encrypt depending on how much data was asked
-# to be processed in one stroke.
-#
-######################################################################
-# Timing attacks are classified in two classes: synchronous when
-# attacker consciously initiates cryptographic operation and collects
-# timing data of various character afterwards, and asynchronous when
-# malicious code is executed on same CPU simultaneously with AES,
-# instruments itself and performs statistical analysis of this data.
-#
-# As far as synchronous attacks go the root to the AES timing
-# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
-# are referred to in single 128-bit block operation. Well, in C
-# implementation with 4 distinct tables it's actually as little as 40
-# references per 256 elements table, but anyway... Secondly, even
-# though S-box elements are clustered into smaller amount of cache-
-# lines, smaller than 160 and even 40, it turned out that for certain
-# plain-text pattern[s] or simply put chosen plain-text and given key
-# few cache-lines remain unaccessed during block operation. Now, if
-# attacker can figure out this access pattern, he can deduct the key
-# [or at least part of it]. The natural way to mitigate this kind of
-# attacks is to minimize the amount of cache-lines in S-box and/or
-# prefetch them to ensure that every one is accessed for more uniform
-# timing. But note that *if* plain-text was concealed in such way that
-# input to block function is distributed *uniformly*, then attack
-# wouldn't apply. Now note that some encryption modes, most notably
-# CBC, do mask the plain-text in this exact way [secure cipher output
-# is distributed uniformly]. Yes, one still might find input that
-# would reveal the information about given key, but if amount of
-# candidate inputs to be tried is larger than amount of possible key
-# combinations then attack becomes infeasible. This is why revised
-# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
-# of data is to be processed in one stroke. The current size limit of
-# 512 bytes is chosen to provide same [diminishingly low] probability
-# for cache-line to remain untouched in large chunk operation with
-# large S-box as for single block operation with compact S-box and
-# surely needs more careful consideration...
-#
-# As for asynchronous attacks. There are two flavours: attacker code
-# being interleaved with AES on hyper-threading CPU at *instruction*
-# level, and two processes time sharing single core. As for latter.
-# Two vectors. 1. Given that attacker process has higher priority,
-# yield execution to process performing AES just before timer fires
-# off the scheduler, immediately regain control of CPU and analyze the
-# cache state. For this attack to be efficient attacker would have to
-# effectively slow down the operation by several *orders* of magnitude,
-# by ratio of time slice to duration of handful of AES rounds, which
-# unlikely to remain unnoticed. Not to mention that this also means
-# that he would spend correspondingly more time to collect enough
-# statistical data to mount the attack. It's probably appropriate to
-# say that if adversary reckons that this attack is beneficial and
-# risks to be noticed, you probably have larger problems having him
-# mere opportunity. In other words suggested code design expects you
-# to preclude/mitigate this attack by overall system security design.
-# 2. Attacker manages to make his code interrupt driven. In order for
-# this kind of attack to be feasible, interrupt rate has to be high
-# enough, again comparable to duration of handful of AES rounds. But
-# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
-# generates interrupts at such raging rate...
-#
-# And now back to the former, hyper-threading CPU or more specifically
-# Intel P4. Recall that asynchronous attack implies that malicious
-# code instruments itself. And naturally instrumentation granularity
-# has be noticeably lower than duration of codepath accessing S-box.
-# Given that all cache-lines are accessed during that time that is.
-# Current implementation accesses *all* cache-lines within ~50 cycles
-# window, which is actually *less* than RDTSC latency on Intel P4!
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-$output = pop;
-open OUT,">$output";
-*STDOUT=*OUT;
-
-&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
-&static_label("AES_Te");
-&static_label("AES_Td");
-
-$s0="eax";
-$s1="ebx";
-$s2="ecx";
-$s3="edx";
-$key="edi";
-$acc="esi";
-$tbl="ebp";
-
-# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
-# by caller
-$__ra=&DWP(0,"esp"); # return address
-$__s0=&DWP(4,"esp"); # s0 backing store
-$__s1=&DWP(8,"esp"); # s1 backing store
-$__s2=&DWP(12,"esp"); # s2 backing store
-$__s3=&DWP(16,"esp"); # s3 backing store
-$__key=&DWP(20,"esp"); # pointer to key schedule
-$__end=&DWP(24,"esp"); # pointer to end of key schedule
-$__tbl=&DWP(28,"esp"); # %ebp backing store
-
-# stack frame layout in AES_[en|crypt] routines, which differs from
-# above by 4 and overlaps by %ebp backing store
-$_tbl=&DWP(24,"esp");
-$_esp=&DWP(28,"esp");
-
-sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
-
-$speed_limit=512; # chunks smaller than $speed_limit are
- # processed with compact routine in CBC mode
-$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
- # recent µ-archs], but ~5 times smaller!
- # I favor compact code to minimize cache
- # contention and in hope to "collect" 5% back
- # in real-life applications...
-
-$vertical_spin=0; # shift "vertically" defaults to 0, because of
- # its proof-of-concept status...
-# Note that there is no decvert(), as well as last encryption round is
-# performed with "horizontal" shifts. This is because this "vertical"
-# implementation [one which groups shifts on a given $s[i] to form a
-# "column," unlike "horizontal" one, which groups shifts on different
-# $s[i] to form a "row"] is work in progress. It was observed to run
-# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
-# whole 12% slower:-( So we face a trade-off... Shall it be resolved
-# some day? Till then the code is considered experimental and by
-# default remains dormant...
-
-sub encvert()
-{ my ($te,@s) = @_;
- my ($v0,$v1) = ($acc,$key);
-
- &mov ($v0,$s[3]); # copy s3
- &mov (&DWP(4,"esp"),$s[2]); # save s2
- &mov ($v1,$s[0]); # copy s0
- &mov (&DWP(8,"esp"),$s[1]); # save s1
-
- &movz ($s[2],&HB($s[0]));
- &and ($s[0],0xFF);
- &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
- &shr ($v1,16);
- &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
- &movz ($s[1],&HB($v1));
- &and ($v1,0xFF);
- &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
- &mov ($v1,$v0);
- &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
-
- &and ($v0,0xFF);
- &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
- &movz ($v0,&HB($v1));
- &shr ($v1,16);
- &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
- &movz ($v0,&HB($v1));
- &and ($v1,0xFF);
- &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
- &mov ($v1,&DWP(4,"esp")); # restore s2
- &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
-
- &mov ($v0,$v1);
- &and ($v1,0xFF);
- &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
- &movz ($v1,&HB($v0));
- &shr ($v0,16);
- &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
- &movz ($v1,&HB($v0));
- &and ($v0,0xFF);
- &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
- &mov ($v0,&DWP(8,"esp")); # restore s1
- &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
-
- &mov ($v1,$v0);
- &and ($v0,0xFF);
- &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
- &movz ($v0,&HB($v1));
- &shr ($v1,16);
- &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
- &movz ($v0,&HB($v1));
- &and ($v1,0xFF);
- &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
- &mov ($key,$__key); # reincarnate v1 as key
- &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
-}
-
-# Another experimental routine, which features "horizontal spin," but
-# eliminates one reference to stack. Strangely enough runs slower...
-sub enchoriz()
-{ my ($v0,$v1) = ($key,$acc);
-
- &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
- &rotr ($s2,8); # 8,11,10, 9
- &mov ($v1,&DWP(0,$te,$v0,8)); # 0
- &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
- &rotr ($s3,16); # 13,12,15,14
- &xor ($v1,&DWP(3,$te,$v0,8)); # 5
- &movz ($v0,&HB($s2)); # 8,11,10*, 9
- &rotr ($s0,16); # 1, 0, 3, 2
- &xor ($v1,&DWP(2,$te,$v0,8)); # 10
- &movz ($v0,&HB($s3)); # 13,12,15*,14
- &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
- &mov ($__s0,$v1); # t[0] saved
-
- &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
- &shr ($s1,16); # -, -, 7, 6
- &mov ($v1,&DWP(0,$te,$v0,8)); # 4
- &movz ($v0,&LB($s3)); # 13,12,15,14*
- &xor ($v1,&DWP(2,$te,$v0,8)); # 14
- &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
- &and ($s3,0xffff0000); # 13,12, -, -
- &xor ($v1,&DWP(1,$te,$v0,8)); # 3
- &movz ($v0,&LB($s2)); # 8,11,10, 9*
- &or ($s3,$s1); # 13,12, 7, 6
- &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
- &mov ($s1,$v1); # s[1]=t[1]
-
- &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
- &shr ($s2,16); # -, -, 8,11
- &mov ($v1,&DWP(2,$te,$v0,8)); # 2
- &movz ($v0,&HB($s3)); # 13,12, 7*, 6
- &xor ($v1,&DWP(1,$te,$v0,8)); # 7
- &movz ($v0,&HB($s2)); # -, -, 8*,11
- &xor ($v1,&DWP(0,$te,$v0,8)); # 8
- &mov ($v0,$s3);
- &shr ($v0,24); # 13
- &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
-
- &movz ($v0,&LB($s2)); # -, -, 8,11*
- &shr ($s0,24); # 1*
- &mov ($s2,&DWP(1,$te,$v0,8)); # 11
- &xor ($s2,&DWP(3,$te,$s0,8)); # 1
- &mov ($s0,$__s0); # s[0]=t[0]
- &movz ($v0,&LB($s3)); # 13,12, 7, 6*
- &shr ($s3,16); # , ,13,12
- &xor ($s2,&DWP(2,$te,$v0,8)); # 6
- &mov ($key,$__key); # reincarnate v0 as key
- &and ($s3,0xff); # , ,13,12*
- &mov ($s3,&DWP(0,$te,$s3,8)); # 12
- &xor ($s3,$s2); # s[2]=t[3] collected
- &mov ($s2,$v1); # s[2]=t[2]
-}
-
-# More experimental code... SSE one... Even though this one eliminates
-# *all* references to stack, it's not faster...
-sub sse_encbody()
-{
- &movz ($acc,&LB("eax")); # 0
- &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
- &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
- &movz ("edx",&HB("eax")); # 1
- &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
- &shr ("eax",16); # 5, 4
-
- &movz ($acc,&LB("ebx")); # 10
- &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
- &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
- &movz ($acc,&HB("ebx")); # 11
- &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
- &shr ("ebx",16); # 15,14
-
- &movz ($acc,&HB("eax")); # 5
- &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
- &movq ("mm3",QWP(16,$key));
- &movz ($acc,&HB("ebx")); # 15
- &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
- &movd ("mm0","ecx"); # t[0] collected
-
- &movz ($acc,&LB("eax")); # 4
- &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
- &movd ("eax","mm2"); # 7, 6, 3, 2
- &movz ($acc,&LB("ebx")); # 14
- &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
- &movd ("ebx","mm6"); # 13,12, 9, 8
-
- &movz ($acc,&HB("eax")); # 3
- &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
- &movz ($acc,&HB("ebx")); # 9
- &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
- &movd ("mm1","ecx"); # t[1] collected
-
- &movz ($acc,&LB("eax")); # 2
- &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
- &shr ("eax",16); # 7, 6
- &punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ($acc,&LB("ebx")); # 8
- &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
- &shr ("ebx",16); # 13,12
-
- &movz ($acc,&HB("eax")); # 7
- &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
- &pxor ("mm0","mm3");
- &movz ("eax",&LB("eax")); # 6
- &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
- &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
- &movz ($acc,&HB("ebx")); # 13
- &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
- &xor ("ecx",&DWP(24,$key)); # t[2]
- &movd ("mm4","ecx"); # t[2] collected
- &movz ("ebx",&LB("ebx")); # 12
- &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
- &shr ("ecx",16);
- &movd ("eax","mm1"); # 5, 4, 1, 0
- &mov ("ebx",&DWP(28,$key)); # t[3]
- &xor ("ebx","edx");
- &movd ("mm5","ebx"); # t[3] collected
- &and ("ebx",0xffff0000);
- &or ("ebx","ecx");
-
- &punpckldq ("mm4","mm5"); # t[2,3] collected
-}
-
-######################################################################
-# "Compact" block function
-######################################################################
-
-sub enccompact()
-{ my $Fn = \&mov;
- while ($#_>5) { pop(@_); $Fn=sub{}; }
- my ($i,$te,@s)=@_;
- my $tmp = $key;
- my $out = $i==3?$s[0]:$acc;
-
- # $Fn is used in first compact round and its purpose is to
- # void restoration of some values from stack, so that after
- # 4xenccompact with extra argument $key value is left there...
- if ($i==3) { &$Fn ($key,$__key); }##%edx
- else { &mov ($out,$s[0]); }
- &and ($out,0xFF);
- if ($i==1) { &shr ($s[0],16); }#%ebx[1]
- if ($i==2) { &shr ($s[0],24); }#%ecx[2]
- &movz ($out,&BP(-128,$te,$out,1));
-
- if ($i==3) { $tmp=$s[1]; }##%eax
- &movz ($tmp,&HB($s[1]));
- &movz ($tmp,&BP(-128,$te,$tmp,1));
- &shl ($tmp,8);
- &xor ($out,$tmp);
-
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
- else { &mov ($tmp,$s[2]);
- &shr ($tmp,16); }
- if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
- &and ($tmp,0xFF);
- &movz ($tmp,&BP(-128,$te,$tmp,1));
- &shl ($tmp,16);
- &xor ($out,$tmp);
-
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
- elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
- else { &mov ($tmp,$s[3]);
- &shr ($tmp,24); }
- &movz ($tmp,&BP(-128,$te,$tmp,1));
- &shl ($tmp,24);
- &xor ($out,$tmp);
- if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
- if ($i==3) { &mov ($s[3],$acc); }
- &comment();
-}
-
-sub enctransform()
-{ my @s = ($s0,$s1,$s2,$s3);
- my $i = shift;
- my $tmp = $tbl;
- my $r2 = $key ;
-
- &and ($tmp,$s[$i]);
- &lea ($r2,&DWP(0,$s[$i],$s[$i]));
- &mov ($acc,$tmp);
- &shr ($tmp,7);
- &and ($r2,0xfefefefe);
- &sub ($acc,$tmp);
- &mov ($tmp,$s[$i]);
- &and ($acc,0x1b1b1b1b);
- &rotr ($tmp,16);
- &xor ($acc,$r2); # r2
- &mov ($r2,$s[$i]);
-
- &xor ($s[$i],$acc); # r0 ^ r2
- &rotr ($r2,16+8);
- &xor ($acc,$tmp);
- &rotl ($s[$i],24);
- &xor ($acc,$r2);
- &mov ($tmp,0x80808080) if ($i!=1);
- &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
-}
-
-&function_begin_B("_x86_AES_encrypt_compact");
- # note that caller is expected to allocate stack frame for me!
- &mov ($__key,$key); # save key
-
- &xor ($s0,&DWP(0,$key)); # xor with key
- &xor ($s1,&DWP(4,$key));
- &xor ($s2,&DWP(8,$key));
- &xor ($s3,&DWP(12,$key));
-
- &mov ($acc,&DWP(240,$key)); # load key->rounds
- &lea ($acc,&DWP(-2,$acc,$acc));
- &lea ($acc,&DWP(0,$key,$acc,8));
- &mov ($__end,$acc); # end of key schedule
-
- # prefetch Te4
- &mov ($key,&DWP(0-128,$tbl));
- &mov ($acc,&DWP(32-128,$tbl));
- &mov ($key,&DWP(64-128,$tbl));
- &mov ($acc,&DWP(96-128,$tbl));
- &mov ($key,&DWP(128-128,$tbl));
- &mov ($acc,&DWP(160-128,$tbl));
- &mov ($key,&DWP(192-128,$tbl));
- &mov ($acc,&DWP(224-128,$tbl));
-
- &set_label("loop",16);
-
- &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
- &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
- &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
- &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
- &mov ($tbl,0x80808080);
- &enctransform(2);
- &enctransform(3);
- &enctransform(0);
- &enctransform(1);
- &mov ($key,$__key);
- &mov ($tbl,$__tbl);
- &add ($key,16); # advance rd_key
- &xor ($s0,&DWP(0,$key));
- &xor ($s1,&DWP(4,$key));
- &xor ($s2,&DWP(8,$key));
- &xor ($s3,&DWP(12,$key));
-
- &cmp ($key,$__end);
- &mov ($__key,$key);
- &jb (&label("loop"));
-
- &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
- &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
- &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
- &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
-
- &xor ($s0,&DWP(16,$key));
- &xor ($s1,&DWP(20,$key));
- &xor ($s2,&DWP(24,$key));
- &xor ($s3,&DWP(28,$key));
-
- &ret ();
-&function_end_B("_x86_AES_encrypt_compact");
-
-######################################################################
-# "Compact" SSE block function.
-######################################################################
-#
-# Performance is not actually extraordinary in comparison to pure
-# x86 code. In particular encrypt performance is virtually the same.
-# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
-# better on PIII:-) And additionally on the pros side this code
-# eliminates redundant references to stack and thus relieves/
-# minimizes the pressure on the memory bus.
-#
-# MMX register layout lsb
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# | mm4 | mm0 |
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# | s3 | s2 | s1 | s0 |
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-#
-# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
-# In this terms encryption and decryption "compact" permutation
-# matrices can be depicted as following:
-#
-# encryption lsb # decryption lsb
-# +----++----+----+----+----+ # +----++----+----+----+----+
-# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
-# +----++----+----+----+----+ # +----++----+----+----+----+
-# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
-# +----++----+----+----+----+ # +----++----+----+----+----+
-# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
-# +----++----+----+----+----+ # +----++----+----+----+----+
-# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
-# +----++----+----+----+----+ # +----++----+----+----+----+
-#
-######################################################################
-# Why not xmm registers? Short answer. It was actually tested and
-# was not any faster, but *contrary*, most notably on Intel CPUs.
-# Longer answer. Main advantage of using mm registers is that movd
-# latency is lower, especially on Intel P4. While arithmetic
-# instructions are twice as many, they can be scheduled every cycle
-# and not every second one when they are operating on xmm register,
-# so that "arithmetic throughput" remains virtually the same. And
-# finally the code can be executed even on elder SSE-only CPUs:-)
-
-sub sse_enccompact()
-{
- &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
- &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
- &movd ("eax","mm1"); # 5, 4, 1, 0
- &movd ("ebx","mm5"); # 15,14,11,10
- &mov ($__key,$key);
-
- &movz ($acc,&LB("eax")); # 0
- &movz ("edx",&HB("eax")); # 1
- &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &movz ($key,&LB("ebx")); # 10
- &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
- &shr ("eax",16); # 5, 4
- &shl ("edx",8); # 1
-
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
- &movz ($key,&HB("ebx")); # 11
- &shl ($acc,16); # 10
- &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
- &or ("ecx",$acc); # 10
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
- &movz ($key,&HB("eax")); # 5
- &shl ($acc,24); # 11
- &shr ("ebx",16); # 15,14
- &or ("edx",$acc); # 11
-
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
- &movz ($key,&HB("ebx")); # 15
- &shl ($acc,8); # 5
- &or ("ecx",$acc); # 5
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 15
- &movz ($key,&LB("eax")); # 4
- &shl ($acc,24); # 15
- &or ("ecx",$acc); # 15
-
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
- &movz ($key,&LB("ebx")); # 14
- &movd ("eax","mm2"); # 7, 6, 3, 2
- &movd ("mm0","ecx"); # t[0] collected
- &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
- &movz ($key,&HB("eax")); # 3
- &shl ("ecx",16); # 14
- &movd ("ebx","mm6"); # 13,12, 9, 8
- &or ("ecx",$acc); # 14
-
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 3
- &movz ($key,&HB("ebx")); # 9
- &shl ($acc,24); # 3
- &or ("ecx",$acc); # 3
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
- &movz ($key,&LB("ebx")); # 8
- &shl ($acc,8); # 9
- &shr ("ebx",16); # 13,12
- &or ("ecx",$acc); # 9
-
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 8
- &movz ($key,&LB("eax")); # 2
- &shr ("eax",16); # 7, 6
- &movd ("mm1","ecx"); # t[1] collected
- &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
- &movz ($key,&HB("eax")); # 7
- &shl ("ecx",16); # 2
- &and ("eax",0xff); # 6
- &or ("ecx",$acc); # 2
-
- &punpckldq ("mm0","mm1"); # t[0,1] collected
-
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
- &movz ($key,&HB("ebx")); # 13
- &shl ($acc,24); # 7
- &and ("ebx",0xff); # 12
- &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
- &or ("ecx",$acc); # 7
- &shl ("eax",16); # 6
- &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
- &or ("edx","eax"); # 6
- &shl ($acc,8); # 13
- &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
- &or ("ecx",$acc); # 13
- &or ("edx","ebx"); # 12
- &mov ($key,$__key);
- &movd ("mm4","ecx"); # t[2] collected
- &movd ("mm5","edx"); # t[3] collected
-
- &punpckldq ("mm4","mm5"); # t[2,3] collected
-}
-
- if (!$x86only) {
-&function_begin_B("_sse_AES_encrypt_compact");
- &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
- &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
-
- # note that caller is expected to allocate stack frame for me!
- &mov ($acc,&DWP(240,$key)); # load key->rounds
- &lea ($acc,&DWP(-2,$acc,$acc));
- &lea ($acc,&DWP(0,$key,$acc,8));
- &mov ($__end,$acc); # end of key schedule
-
- &mov ($s0,0x1b1b1b1b); # magic constant
- &mov (&DWP(8,"esp"),$s0);
- &mov (&DWP(12,"esp"),$s0);
-
- # prefetch Te4
- &mov ($s0,&DWP(0-128,$tbl));
- &mov ($s1,&DWP(32-128,$tbl));
- &mov ($s2,&DWP(64-128,$tbl));
- &mov ($s3,&DWP(96-128,$tbl));
- &mov ($s0,&DWP(128-128,$tbl));
- &mov ($s1,&DWP(160-128,$tbl));
- &mov ($s2,&DWP(192-128,$tbl));
- &mov ($s3,&DWP(224-128,$tbl));
-
- &set_label("loop",16);
- &sse_enccompact();
- &add ($key,16);
- &cmp ($key,$__end);
- &ja (&label("out"));
-
- &movq ("mm2",&QWP(8,"esp"));
- &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
- &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
- &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
- &pand ("mm3","mm2"); &pand ("mm7","mm2");
- &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
- &paddb ("mm0","mm0"); &paddb ("mm4","mm4");
- &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
- &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
- &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
- &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
-
- &movq ("mm2","mm3"); &movq ("mm6","mm7");
- &pslld ("mm3",8); &pslld ("mm7",8);
- &psrld ("mm2",24); &psrld ("mm6",24);
- &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
- &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
-
- &movq ("mm3","mm1"); &movq ("mm7","mm5");
- &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
- &psrld ("mm1",8); &psrld ("mm5",8);
- &mov ($s0,&DWP(0-128,$tbl));
- &pslld ("mm3",24); &pslld ("mm7",24);
- &mov ($s1,&DWP(64-128,$tbl));
- &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
- &mov ($s2,&DWP(128-128,$tbl));
- &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
- &mov ($s3,&DWP(192-128,$tbl));
-
- &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
- &jmp (&label("loop"));
-
- &set_label("out",16);
- &pxor ("mm0",&QWP(0,$key));
- &pxor ("mm4",&QWP(8,$key));
-
- &ret ();
-&function_end_B("_sse_AES_encrypt_compact");
- }
-
-######################################################################
-# Vanilla block function.
-######################################################################
-
-sub encstep()
-{ my ($i,$te,@s) = @_;
- my $tmp = $key;
- my $out = $i==3?$s[0]:$acc;
-
- # lines marked with #%e?x[i] denote "reordered" instructions...
- if ($i==3) { &mov ($key,$__key); }##%edx
- else { &mov ($out,$s[0]);
- &and ($out,0xFF); }
- if ($i==1) { &shr ($s[0],16); }#%ebx[1]
- if ($i==2) { &shr ($s[0],24); }#%ecx[2]
- &mov ($out,&DWP(0,$te,$out,8));
-
- if ($i==3) { $tmp=$s[1]; }##%eax
- &movz ($tmp,&HB($s[1]));
- &xor ($out,&DWP(3,$te,$tmp,8));
-
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
- else { &mov ($tmp,$s[2]);
- &shr ($tmp,16); }
- if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
- &and ($tmp,0xFF);
- &xor ($out,&DWP(2,$te,$tmp,8));
-
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
- elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
- else { &mov ($tmp,$s[3]);
- &shr ($tmp,24) }
- &xor ($out,&DWP(1,$te,$tmp,8));
- if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
- if ($i==3) { &mov ($s[3],$acc); }
- &comment();
-}
-
-sub enclast()
-{ my ($i,$te,@s)=@_;
- my $tmp = $key;
- my $out = $i==3?$s[0]:$acc;
-
- if ($i==3) { &mov ($key,$__key); }##%edx
- else { &mov ($out,$s[0]); }
- &and ($out,0xFF);
- if ($i==1) { &shr ($s[0],16); }#%ebx[1]
- if ($i==2) { &shr ($s[0],24); }#%ecx[2]
- &mov ($out,&DWP(2,$te,$out,8));
- &and ($out,0x000000ff);
-
- if ($i==3) { $tmp=$s[1]; }##%eax
- &movz ($tmp,&HB($s[1]));
- &mov ($tmp,&DWP(0,$te,$tmp,8));
- &and ($tmp,0x0000ff00);
- &xor ($out,$tmp);
-
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
- else {