summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2011-08-23 20:51:38 +0000
committerAndy Polyakov <appro@openssl.org>2011-08-23 20:51:38 +0000
commitc608171d9c30eb54a30fe71f9a02a922514dfec7 (patch)
tree5117f51c9ae6ec1ecf69775b2a3e9083ac528639
parentc2d4c2867b517ca9114cc80503e3ba18e6bc8a83 (diff)
Add RC4-MD5 and AESNI-SHA1 "stitched" implementations.
-rw-r--r--CHANGES5
-rwxr-xr-xConfigure2
-rw-r--r--TABLE44
-rw-r--r--crypto/aes/Makefile2
-rw-r--r--crypto/aes/asm/aesni-sha1-x86_64.pl1249
-rw-r--r--crypto/evp/Makefile31
-rw-r--r--crypto/evp/c_allc.c7
-rw-r--r--crypto/evp/e_aes_cbc_hmac_sha1.c403
-rw-r--r--crypto/evp/e_rc4_hmac_md5.c292
-rw-r--r--crypto/evp/evp.h7
-rw-r--r--crypto/evp/names.c2
-rw-r--r--crypto/objects/obj_dat.h21
-rw-r--r--crypto/objects/obj_mac.h16
-rw-r--r--crypto/objects/obj_mac.num4
-rw-r--r--crypto/objects/objects.txt6
-rw-r--r--crypto/rc4/Makefile2
-rw-r--r--crypto/rc4/asm/rc4-md5-x86_64.pl631
-rw-r--r--ssl/ssl_algs.c7
18 files changed, 2703 insertions, 28 deletions
diff --git a/CHANGES b/CHANGES
index 65bbce6c39..48537e854e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -271,6 +271,11 @@
Changes between 1.0.0e and 1.0.1 [xx XXX xxxx]
+ *) Add RC4-MD5 and AESNI-SHA1 "stiched" implementations.
+
+ This work was sponsored by Intel.
+ [Andy Polyakov]
+
*) Redirect HMAC and CMAC operations to FIPS module in FIPS mode. If an
ENGINE is used then we cannot handle that in the FIPS module so we
keep original code iff non-FIPS operations are allowed.
diff --git a/Configure b/Configure
index 9d9bd72b8b..fc793b02ed 100755
--- a/Configure
+++ b/Configure
@@ -127,7 +127,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5
my $x86_elf_asm="$x86_asm:elf";
-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o:void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
diff --git a/TABLE b/TABLE
index b5aea36d97..d6940be031 100644
--- a/TABLE
+++ b/TABLE
@@ -299,12 +299,12 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -779,12 +779,12 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -1387,12 +1387,12 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -1547,12 +1547,12 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -2315,12 +2315,12 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -2507,12 +2507,12 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -2571,12 +2571,12 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -4075,12 +4075,12 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -4235,12 +4235,12 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -5195,12 +5195,12 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@@ -5227,12 +5227,12 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
-$aes_obj = aes-x86_64.o aesni-x86_64.o
+$aes_obj = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o
+$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
index cf1b3e545f..ae16e659e4 100644
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@@ -57,6 +57,8 @@ aes-x86_64.s: asm/aes-x86_64.pl
$(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@
aesni-x86_64.s: asm/aesni-x86_64.pl
$(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl
+ $(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@
aes-sparcv9.s: asm/aes-sparcv9.pl
$(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000000..c6f6b3334a
--- /dev/null
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1249 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibit *very* low instruction-level
+# parallelism, interleaving it with another algorithm would allow to
+# utilize processor resources better and achieve better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# AESNI code is weaved into it. Below are performance numbers in
+# cycles per processed byte, less is better, for standalone AESNI-CBC
+# encrypt, sum of the latter and standalone SHA1, and "stitched"
+# subroutine:
+#
+# AES-128-CBC +SHA1 stitch gain
+# Westmere 3.77[+5.6] 9.37 6.65 +41%
+# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
+#
+# AES-192-CBC
+# Westmere 4.51 10.11 6.97 +45%
+# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
+#
+# AES-256-CBC
+# Westmere 5.25 10.85 7.25 +50%
+# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
+#
+# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for
+# background information. Above numbers in parentheses are SSSE3
+# results collected on AVX-capable CPU, i.e. apply on OSes that
+# don't support AVX.
+#
+# Needless to mention that it makes no sense to implement "stitched"
+# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
+# fully utilize parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+# AES-128-CBC AES-192-CBC AES-256-CBC
+# Westmere 1.31 1.55 1.80
+# Sandy Bridge 0.93 1.06 1.22
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# void aesni_cbc_sha1_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA_CTX *ctx,
+# const void *in0);
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 16
+aesni_cbc_sha1_enc:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+ jmp aesni_cbc_sha1_enc_ssse3
+ ret
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_ssse3 # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ movdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @X[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @X[2],@X[-3&7]
+ pshufb @X[2],@X[-2&7]
+ pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movups ($key),$rndkey0 # $key[0]
+ movups 16($key),$rndkey[0] # forward reference
+ jmp .Loop_ssse3
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@Tx[0],@X[0]);
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_ssse3");
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+
+sub body_20_39 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+
+sub body_40_59 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+$code.=<<___;
+.align 16
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups $iv,($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_avx,\@function,6
+.align 16
+aesni_cbc_sha1_enc_avx:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_avx # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ vmovdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add \$112,$key # size optimization
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp