summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xConfigure2
-rw-r--r--TABLE72
-rw-r--r--crypto/aes/Makefile2
-rw-r--r--crypto/aes/asm/aesni-mb-x86_64.pl1231
-rw-r--r--crypto/sha/Makefile2
-rw-r--r--crypto/sha/asm/sha1-mb-x86_64.pl1050
-rw-r--r--crypto/sha/asm/sha256-mb-x86_64.pl949
7 files changed, 3271 insertions, 37 deletions
diff --git a/Configure b/Configure
index 7eb0c1a112..a8bbcf4109 100755
--- a/Configure
+++ b/Configure
@@ -128,7 +128,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5
my $x86_elf_asm="$x86_asm:elf";
-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:";
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
diff --git a/TABLE b/TABLE
index c7ceea9d01..7a723d4b93 100644
--- a/TABLE
+++ b/TABLE
@@ -308,10 +308,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -407,10 +407,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -836,10 +836,10 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -1529,10 +1529,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj =
$rmd160_obj =
@@ -1694,10 +1694,10 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -1793,10 +1793,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj =
$rmd160_obj =
@@ -1859,10 +1859,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -1892,10 +1892,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -2123,10 +2123,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -2651,10 +2651,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -2849,10 +2849,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -2915,10 +2915,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -4565,10 +4565,10 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -4598,10 +4598,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -4631,10 +4631,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -4829,10 +4829,10 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -5819,10 +5819,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
@@ -5852,10 +5852,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
-$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
+$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
-$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
+$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj =
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
index fdfda4218d..ce9bb925fd 100644
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@@ -67,6 +67,8 @@ aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl
$(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@
aesni-sha256-x86_64.s: asm/aesni-sha256-x86_64.pl
$(PERL) asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-mb-x86_64.s: asm/aesni-mb-x86_64.pl
+ $(PERL) asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME) > $@
aes-sparcv9.s: asm/aes-sparcv9.pl
$(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl
new file mode 100644
index 0000000000..e6ac9f9ecf
--- /dev/null
+++ b/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -0,0 +1,1231 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Multi-buffer AES-NI procedures process several independent buffers
+# in parallel by interleaving independent instructions.
+#
+# Cycles per byte for interleave factor 4:
+#
+# asymptotic measured
+# ---------------------------
+# Westmere 5.00/4=1.25 5.13/4=1.28
+# Atom 15.0/4=3.75 ?15.7/4=3.93
+# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
+# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
+# Haswell 4.44/4=1.11 4.44/4=1.11
+# Bulldozer 5.75/4=1.44 5.76/4=1.44
+#
+# Cycles per byte for interleave factor 8 (not implemented for
+# pre-AVX processors, where higher interleave factor incidentally
+# doesn't result in improvement):
+#
+# asymptotic measured
+# ---------------------------
+# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
+# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
+# Haswell 5.00/8=0.63 5.00/8=0.63
+# Bulldozer 5.75/8=0.72 5.77/8=0.72
+#
+# (*) Sandy/Ivy Bridge are known to handle high interleave factors
+# suboptimally;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=0;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void aesni_multi_cbc_encrypt (
+# struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
+# const AES_KEY *key,
+# int num); /* 1 or 2 */
+#
+$inp="%rdi"; # 1st arg
+$key="%rsi"; # 2nd arg
+$num="%edx";
+
+@inptr=map("%r$_",(8..11));
+@outptr=map("%r$_",(12..15));
+
+($rndkey0,$rndkey1)=("%xmm0","%xmm1");
+@out=map("%xmm$_",(2..5));
+@inp=map("%xmm$_",(6..9));
+($counters,$mask,$zero)=map("%xmm$_",(10..12));
+
+($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_multi_cbc_encrypt
+.type aesni_multi_cbc_encrypt,\@function,3
+.align 32
+aesni_multi_cbc_encrypt:
+___
+$code.=<<___ if ($avx);
+ cmp \$2,$num
+ jb .Lenc_non_avx
+ mov OPENSSL_ia32cap_P+4(%rip),%ecx
+ test \$`1<<28`,%ecx # AVX bit
+ jnz _avx_cbc_enc_shortcut
+ jmp .Lenc_non_avx
+.align 16
+.Lenc_non_avx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x78(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+
+ sub \$48,%rsp
+ and \$-64,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Lenc4x_body:
+ movdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*2($inp),$inp
+
+.Lenc4x_loop_grande:
+ mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*2`($inp),@inptr[$i]
+ cmp $num,$one
+ mov `40*$i+8-40*2`($inp),@outptr[$i]
+ cmovg $one,$num # find maximum
+ test $one,$one
+ movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@inptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Lenc4x_done
+
+ movups 0x10-0x78($key),$rndkey1
+ pxor $zero,@out[0]
+ movups 0x20-0x78($key),$rndkey0
+ pxor $zero,@out[1]
+ mov 0xf0-0x78($key),$rounds
+ pxor $zero,@out[2]
+ movdqu (@inptr[0]),@inp[0] # load inputs
+ pxor $zero,@out[3]
+ movdqu (@inptr[1]),@inp[1]
+ pxor @inp[0],@out[0]
+ movdqu (@inptr[2]),@inp[2]
+ pxor @inp[1],@out[1]
+ movdqu (@inptr[3]),@inp[3]
+ pxor @inp[2],@out[2]
+ pxor @inp[3],@out[3]
+ movdqa 32(%rsp),$counters # load counters
+ xor $offset,$offset
+ jmp .Loop_enc4x
+
+.align 32
+.Loop_enc4x:
+ add \$16,$offset
+ lea 16(%rsp),$sink # sink pointer
+ mov \$1,$one # constant of 1
+ sub $offset,$sink
+
+ aesenc $rndkey1,@out[0]
+ prefetcht0 31(@inptr[0],$offset) # prefetch input
+ prefetcht0 31(@inptr[1],$offset)
+ aesenc $rndkey1,@out[1]
+ prefetcht0 31(@inptr[2],$offset)
+ prefetcht0 31(@inptr[2],$offset)
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0x30-0x78($key),$rndkey1
+___
+for($i=0;$i<4;$i++) {
+my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
+$code.=<<___;
+ cmp `32+4*$i`(%rsp),$one
+ aesenc $rndkey,@out[0]
+ aesenc $rndkey,@out[1]
+ aesenc $rndkey,@out[2]
+ cmovge $sink,@inptr[$i] # cancel input
+ cmovg $sink,@outptr[$i] # sink output
+ aesenc $rndkey,@out[3]
+ movups `0x40+16*$i-0x78`($key),$rndkey
+___
+}
+$code.=<<___;
+ movdqa $counters,$mask
+ aesenc $rndkey0,@out[0]
+ prefetcht0 15(@outptr[0],$offset) # prefetch output
+ prefetcht0 15(@outptr[1],$offset)
+ aesenc $rndkey0,@out[1]
+ prefetcht0 15(@outptr[2],$offset)
+ prefetcht0 15(@outptr[3],$offset)
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0x80-0x78($key),$rndkey0
+ pxor $zero,$zero
+
+ aesenc $rndkey1,@out[0]
+ pcmpgtd $zero,$mask
+ movdqu -0x78($key),$zero # reload 0-round key
+ aesenc $rndkey1,@out[1]
+ paddd $mask,$counters # decrement counters
+ movdqa $counters,32(%rsp) # update counters
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0x90-0x78($key),$rndkey1
+
+ cmp \$11,$rounds
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xa0-0x78($key),$rndkey0
+
+ jb .Lenc4x_tail
+
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0xb0-0x78($key),$rndkey1
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xc0-0x78($key),$rndkey0
+
+ je .Lenc4x_tail
+
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0xd0-0x78($key),$rndkey1
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xe0-0x78($key),$rndkey0
+ jmp .Lenc4x_tail
+
+.align 32
+.Lenc4x_tail:
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movdqu (@inptr[0],$offset),@inp[0]
+ movdqu 0x10-0x78($key),$rndkey1
+
+ aesenclast $rndkey0,@out[0]
+ movdqu (@inptr[1],$offset),@inp[1]
+ pxor $zero,@inp[0]
+ aesenclast $rndkey0,@out[1]
+ movdqu (@inptr[2],$offset),@inp[2]
+ pxor $zero,@inp[1]
+ aesenclast $rndkey0,@out[2]
+ movdqu (@inptr[3],$offset),@inp[3]
+ pxor $zero,@inp[2]
+ aesenclast $rndkey0,@out[3]
+ movdqu 0x20-0x78($key),$rndkey0
+ pxor $zero,@inp[3]
+
+ movups @out[0],-16(@outptr[0],$offset)
+ pxor @inp[0],@out[0]
+ movups @out[1],-16(@outptr[1],$offset)
+ pxor @inp[1],@out[1]
+ movups @out[2],-16(@outptr[2],$offset)
+ pxor @inp[2],@out[2]
+ movups @out[3],-16(@outptr[3],$offset)
+ pxor @inp[3],@out[3]
+
+ dec $num
+ jnz .Loop_enc4x
+
+ mov 16(%rsp),%rax # original %rsp
+ mov 24(%rsp),$num
+
+ #pxor @inp[0],@out[0]
+ #pxor @inp[1],@out[1]
+ #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
+ #pxor @inp[2],@out[2]
+ #movdqu @out[1],`40*1+24-40*2`($inp)
+ #pxor @inp[3],@out[3]
+ #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
+ #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
+
+ lea `40*4`($inp),$inp
+ dec $num
+ jnz .Lenc4x_loop_grande
+
+.Lenc4x_done:
+___
+$code.=<<___ if ($win64);
+ movaps -0xa8(%rax),%xmm6
+ movaps -0x98(%rax),%xmm7
+ movaps -0x88(%rax),%xmm8
+ movaps -0x78(%rax),%xmm9
+ movaps -0x68(%rax),%xmm10
+ movaps -0x58(%rax),%xmm11
+ movaps -0x48(%rax),%xmm12
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+ ret
+.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
+
+.globl aesni_multi_cbc_decrypt
+.type aesni_multi_cbc_decrypt,\@function,3
+.align 32
+aesni_multi_cbc_decrypt:
+___
+$code.=<<___ if ($avx);
+ cmp \$2,$num
+ jb .Ldec_non_avx
+ mov OPENSSL_ia32cap_P+4(%rip),%ecx
+ test \$`1<<28`,%ecx # AVX bit
+ jnz _avx_cbc_dec_shortcut
+ jmp .Ldec_non_avx
+.align 16
+.Ldec_non_avx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x78(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+
+ sub \$48,%rsp
+ and \$-64,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Ldec4x_body:
+ movdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*2($inp),$inp
+
+.Ldec4x_loop_grande:
+ mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*2`($inp),@inptr[$i]
+ cmp $num,$one
+ mov `40*$i+8-40*2`($inp),@outptr[$i]
+ cmovg $one,$num # find maximum
+ test $one,$one
+ movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@inptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldec4x_done
+
+ movups 0x10-0x78($key),$rndkey1
+ movups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+ movdqu (@inptr[0]),@out[0] # load inputs
+ movdqu (@inptr[1]),@out[1]
+ pxor $zero,@out[0]
+ movdqu (@inptr[2]),@out[2]
+ pxor $zero,@out[1]
+ movdqu (@inptr[3]),@out[3]
+ pxor $zero,@out[2]
+ pxor $zero,@out[3]
+ movdqa 32(%rsp),$counters # load counters
+ xor $offset,$offset
+ jmp .Loop_dec4x
+
+.align 32
+.Loop_dec4x:
+ add \$16,$offset
+ lea 16(%rsp),$sink # sink pointer
+ mov \$1,$one # constant of 1
+ sub $offset,$sink
+
+ aesdec $rndkey1,@out[0]
+ prefetcht0 31(@inptr[0],$offset) # prefetch input
+ prefetcht0 31(@inptr[1],$offset)
+ aesdec $rndkey1,@out[1]
+ prefetcht0 31(@inptr[2],$offset)
+ prefetcht0 31(@inptr[3],$offset)
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0x30-0x78($key),$rndkey1
+___
+for($i=0;$i<4;$i++) {
+my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
+$code.=<<___;
+ cmp `32+4*$i`(%rsp),$one
+ aesdec $rndkey,@out[0]
+ aesdec $rndkey,@out[1]
+ cmovge $sink,@inptr[$i] # cancel input
+ aesdec $rndkey,@out[2]
+ cmovg $sink,@outptr[$i] # sink output
+ aesdec $rndkey,@out[3]
+ movups `0x40+16*$i-0x78`($key),$rndkey
+___
+}
+$code.=<<___;
+ movdqa $counters,$mask
+ aesdec $rndkey0,@out[0]
+ prefetcht0 15(@outptr[0],$offset) # prefetch output
+ prefetcht0 15(@outptr[1],$offset)
+ aesdec $rndkey0,@out[1]
+ prefetcht0 15(@outptr[2],$offset)
+ prefetcht0 15(@outptr[3],$offset)
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0x80-0x78($key),$rndkey0
+ pxor $zero,$zero
+
+ aesdec $rndkey1,@out[0]
+ pcmpgtd $zero,$mask
+ movdqu -0x78($key),$zero # reload 0-round key
+ aesdec $rndkey1,@out[1]
+ paddd $mask,$counters # decrement counters
+ movdqa $counters,32(%rsp) # update counters
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0x90-0x78($key),$rndkey1
+
+ cmp \$11,$rounds
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xa0-0x78($key),$rndkey0
+
+ jb .Ldec4x_tail
+
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0xb0-0x78($key),$rndkey1
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xc0-0x78($key),$rndkey0
+
+ je .Ldec4x_tail
+
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0xd0-0x78($key),$rndkey1
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xe0-0x78($key),$rndkey0
+ jmp .Ldec4x_tail
+
+.align 32
+.Ldec4x_tail:
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ pxor $rndkey0,@inp[0]
+ pxor $rndkey0,@inp[1]
+ aesdec $rndkey1,@out[3]
+ movdqu 0x10-0x78($key),$rndkey1
+ pxor $rndkey0,@inp[2]
+ pxor $rndkey0,@inp[3]
+ movdqu 0x20-0x78($key),$rndkey0
+
+ aesdeclast @inp[0],@out[0]
+ aesdeclast @inp[1],@out[1]
+ movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
+ movdqu -16(@inptr[1],$offset),@inp[1]
+ aesdeclast @inp[2],@out[2]
+ aesdeclast @inp[3],@out[3]
+ movdqu -16(@inptr[2],$offset),@inp[2]
+ movdqu -16(@inptr[3],$offset),@inp[3]
+
+ movups @out[0],-16(@outptr[0],$offset)
+ movdqu (@inptr[0],$offset),@out[0]
+ movups @out[1],-16(@outptr[1],$offset)
+ movdqu (@inptr[1],$offset),@out[1]
+ pxor $zero,@out[0]
+ movups @out[2],-16(@outptr[2],$offset)
+ movdqu (@inptr[2],$offset),@out[2]
+ pxor $zero,@out[1]
+ movups @out[3],-16(@outptr[3],$offset)
+ movdqu (@inptr[3],$offset),@out[3]
+ pxor $zero,@out[2]
+ pxor $zero,@out[3]
+
+ dec $num
+ jnz .Loop_dec4x
+
+ mov 16(%rsp),%rax # original %rsp
+ mov 24(%rsp),$num
+
+ lea `40*4`($inp),$inp
+ dec $num
+ jnz .Ldec4x_loop_grande
+
+.Ldec4x_done:
+___
+$code.=<<___ if ($win64);
+ movaps -0xa8(%rax),%xmm6
+ movaps -0x98(%rax),%xmm7
+ movaps -0x88(%rax),%xmm8
+ movaps -0x78(%rax),%xmm9
+ movaps -0x68(%rax),%xmm10
+ movaps -0x58(%rax),%xmm11
+ movaps -0x48(%rax),%xmm12
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+ ret
+.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+___
+
+ if ($avx) {{{
+my @ptr=map("%r$_",(8..15));
+my $offload=$sink;
+
+my @out=map("%xmm$_",(2..9));
+my @inp=map("%xmm$_",(10..13));
+my ($counters,$zero)=("%xmm14","%xmm15");
+
+$code.=<<___;
+.type aesni_multi_cbc_encrypt_avx,\@function,3
+.align 32
+aesni_multi_cbc_encrypt_avx:
+_avx_cbc_enc_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+ # +64 distances between inputs and outputs
+ # +128 off-load area for @inp[0..3]
+
+ sub \$192,%rsp
+ and \$-128,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Lenc8x_body:
+ vzeroupper
+ vmovdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*4($inp),$inp
+ shr \$1,$num
+
+.Lenc8x_loop_grande:
+ #mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<8;$i++) {
+ my $temp = $i ? $offload : $offset;
+ $code.=<<___;
+ mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ cmp $num,$one
+ mov `40*$i+8-40*4`($inp),$temp # output pointer
+ cmovg $one,$num # find maximum
+ test $one,$one
+ vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+ sub @ptr[$i],$temp # distance between input and output
+ mov $temp,`64+8*$i`(%rsp) # initialize distances
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Lenc8x_done
+
+ vmovups 0x10-0x78($key),$rndkey1
+ vmovups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+
+ vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
+ lea 128(%rsp),$offload # offload area
+ vpxor (@ptr[1]),$zero,@inp[1]
+ vpxor (@ptr[2]),$zero,@inp[2]
+ vpxor (@ptr[3]),$zero,@inp[3]
+ vpxor @inp[0],@out[0],@out[0]
+ vpxor (@ptr[4]),$zero,@inp[0]
+ vpxor @inp[1],@out[1],@out[1]
+ vpxor (@ptr[5]),$zero,@inp[1]
+ vpxor @inp[2],@out[2],@out[2]
+ vpxor (@ptr[6]),$zero,@inp[2]
+ vpxor @inp[3],@out[3],@out[3]
+ vpxor (@ptr[7]),$zero,@inp[3]
+ vpxor @inp[0],@out[4],@out[4]
+ mov \$1,$one # constant of 1
+ vpxor @inp[1],@out[5],@out[5]
+ vpxor @inp[2],@out[6],@out[6]
+ vpxor @inp[3],@out[7],@out[7]
+ jmp .Loop_enc8x
+
+.align 32
+.Loop_enc8x:
+___
+for($i=0;$i<8;$i++) {
+my $rndkey=($i&1)?$rndkey0:$rndkey1;
+$code