diff options
-rwxr-xr-x | Configure | 2 | ||||
-rw-r--r-- | TABLE | 72 | ||||
-rw-r--r-- | crypto/aes/Makefile | 2 | ||||
-rw-r--r-- | crypto/aes/asm/aesni-mb-x86_64.pl | 1231 | ||||
-rw-r--r-- | crypto/sha/Makefile | 2 | ||||
-rw-r--r-- | crypto/sha/asm/sha1-mb-x86_64.pl | 1050 | ||||
-rw-r--r-- | crypto/sha/asm/sha256-mb-x86_64.pl | 949 |
7 files changed, 3271 insertions, 37 deletions
@@ -128,7 +128,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5 my $x86_elf_asm="$x86_asm:elf"; -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:"; +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; @@ -308,10 +308,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -407,10 +407,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -836,10 +836,10 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -1529,10 +1529,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = $rmd160_obj = @@ -1694,10 +1694,10 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -1793,10 +1793,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = $rmd160_obj = @@ -1859,10 +1859,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -1892,10 +1892,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -2123,10 +2123,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -2651,10 +2651,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -2849,10 +2849,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -2915,10 +2915,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -4565,10 +4565,10 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -4598,10 +4598,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -4631,10 +4631,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -4829,10 +4829,10 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -5819,10 +5819,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = @@ -5852,10 +5852,10 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = -$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o -$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o $cast_obj = $rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o $rmd160_obj = diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile index fdfda4218d..ce9bb925fd 100644 --- a/crypto/aes/Makefile +++ b/crypto/aes/Makefile @@ -67,6 +67,8 @@ aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl $(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@ aesni-sha256-x86_64.s: asm/aesni-sha256-x86_64.pl $(PERL) asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME) > $@ +aesni-mb-x86_64.s: asm/aesni-mb-x86_64.pl + $(PERL) asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME) > $@ aes-sparcv9.s: asm/aes-sparcv9.pl $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl new file mode 100644 index 0000000000..e6ac9f9ecf --- /dev/null +++ b/crypto/aes/asm/aesni-mb-x86_64.pl @@ -0,0 +1,1231 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Multi-buffer AES-NI procedures process several independent buffers +# in parallel by interleaving independent instructions. +# +# Cycles per byte for interleave factor 4: +# +# asymptotic measured +# --------------------------- +# Westmere 5.00/4=1.25 5.13/4=1.28 +# Atom 15.0/4=3.75 ?15.7/4=3.93 +# Sandy Bridge 5.06/4=1.27 5.18/4=1.29 +# Ivy Bridge 5.06/4=1.27 5.14/4=1.29 +# Haswell 4.44/4=1.11 4.44/4=1.11 +# Bulldozer 5.75/4=1.44 5.76/4=1.44 +# +# Cycles per byte for interleave factor 8 (not implemented for +# pre-AVX processors, where higher interleave factor incidentally +# doesn't result in improvement): +# +# asymptotic measured +# --------------------------- +# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*) +# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*) +# Haswell 5.00/8=0.63 5.00/8=0.63 +# Bulldozer 5.75/8=0.72 5.77/8=0.72 +# +# (*) Sandy/Ivy Bridge are known to handle high interleave factors +# suboptimally; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=0; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +# void aesni_multi_cbc_encrypt ( +# struct { void *inp,*out; int blocks; double iv[2]; } inp[8]; +# const AES_KEY *key, +# int num); /* 1 or 2 */ +# +$inp="%rdi"; # 1st arg +$key="%rsi"; # 2nd arg +$num="%edx"; + +@inptr=map("%r$_",(8..11)); +@outptr=map("%r$_",(12..15)); + +($rndkey0,$rndkey1)=("%xmm0","%xmm1"); +@out=map("%xmm$_",(2..5)); +@inp=map("%xmm$_",(6..9)); +($counters,$mask,$zero)=map("%xmm$_",(10..12)); + +($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl aesni_multi_cbc_encrypt +.type aesni_multi_cbc_encrypt,\@function,3 +.align 32 +aesni_multi_cbc_encrypt: +___ +$code.=<<___ if ($avx); + cmp \$2,$num + jb .Lenc_non_avx + mov OPENSSL_ia32cap_P+4(%rip),%ecx + test \$`1<<28`,%ecx # AVX bit + jnz _avx_cbc_enc_shortcut + jmp .Lenc_non_avx +.align 16 +.Lenc_non_avx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x78(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + + sub \$48,%rsp + and \$-64,%rsp + mov %rax,16(%rsp) # original %rsp + +.Lenc4x_body: + movdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*2($inp),$inp + +.Lenc4x_loop_grande: + mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*2`($inp),@inptr[$i] + cmp $num,$one + mov `40*$i+8-40*2`($inp),@outptr[$i] + cmovg $one,$num # find maximum + test $one,$one + movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@inptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Lenc4x_done + + movups 0x10-0x78($key),$rndkey1 + pxor $zero,@out[0] + movups 0x20-0x78($key),$rndkey0 + pxor $zero,@out[1] + mov 0xf0-0x78($key),$rounds + pxor $zero,@out[2] + movdqu (@inptr[0]),@inp[0] # load inputs + pxor $zero,@out[3] + movdqu (@inptr[1]),@inp[1] + pxor @inp[0],@out[0] + movdqu (@inptr[2]),@inp[2] + pxor @inp[1],@out[1] + movdqu (@inptr[3]),@inp[3] + pxor @inp[2],@out[2] + pxor @inp[3],@out[3] + movdqa 32(%rsp),$counters # load counters + xor $offset,$offset + jmp .Loop_enc4x + +.align 32 +.Loop_enc4x: + add \$16,$offset + lea 16(%rsp),$sink # sink pointer + mov \$1,$one # constant of 1 + sub $offset,$sink + + aesenc $rndkey1,@out[0] + prefetcht0 31(@inptr[0],$offset) # prefetch input + prefetcht0 31(@inptr[1],$offset) + aesenc $rndkey1,@out[1] + prefetcht0 31(@inptr[2],$offset) + prefetcht0 31(@inptr[2],$offset) + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0x30-0x78($key),$rndkey1 +___ +for($i=0;$i<4;$i++) { +my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; +$code.=<<___; + cmp `32+4*$i`(%rsp),$one + aesenc $rndkey,@out[0] + aesenc $rndkey,@out[1] + aesenc $rndkey,@out[2] + cmovge $sink,@inptr[$i] # cancel input + cmovg $sink,@outptr[$i] # sink output + aesenc $rndkey,@out[3] + movups `0x40+16*$i-0x78`($key),$rndkey +___ +} +$code.=<<___; + movdqa $counters,$mask + aesenc $rndkey0,@out[0] + prefetcht0 15(@outptr[0],$offset) # prefetch output + prefetcht0 15(@outptr[1],$offset) + aesenc $rndkey0,@out[1] + prefetcht0 15(@outptr[2],$offset) + prefetcht0 15(@outptr[3],$offset) + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0x80-0x78($key),$rndkey0 + pxor $zero,$zero + + aesenc $rndkey1,@out[0] + pcmpgtd $zero,$mask + movdqu -0x78($key),$zero # reload 0-round key + aesenc $rndkey1,@out[1] + paddd $mask,$counters # decrement counters + movdqa $counters,32(%rsp) # update counters + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0x90-0x78($key),$rndkey1 + + cmp \$11,$rounds + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xa0-0x78($key),$rndkey0 + + jb .Lenc4x_tail + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0xb0-0x78($key),$rndkey1 + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xc0-0x78($key),$rndkey0 + + je .Lenc4x_tail + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0xd0-0x78($key),$rndkey1 + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xe0-0x78($key),$rndkey0 + jmp .Lenc4x_tail + +.align 32 +.Lenc4x_tail: + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movdqu (@inptr[0],$offset),@inp[0] + movdqu 0x10-0x78($key),$rndkey1 + + aesenclast $rndkey0,@out[0] + movdqu (@inptr[1],$offset),@inp[1] + pxor $zero,@inp[0] + aesenclast $rndkey0,@out[1] + movdqu (@inptr[2],$offset),@inp[2] + pxor $zero,@inp[1] + aesenclast $rndkey0,@out[2] + movdqu (@inptr[3],$offset),@inp[3] + pxor $zero,@inp[2] + aesenclast $rndkey0,@out[3] + movdqu 0x20-0x78($key),$rndkey0 + pxor $zero,@inp[3] + + movups @out[0],-16(@outptr[0],$offset) + pxor @inp[0],@out[0] + movups @out[1],-16(@outptr[1],$offset) + pxor @inp[1],@out[1] + movups @out[2],-16(@outptr[2],$offset) + pxor @inp[2],@out[2] + movups @out[3],-16(@outptr[3],$offset) + pxor @inp[3],@out[3] + + dec $num + jnz .Loop_enc4x + + mov 16(%rsp),%rax # original %rsp + mov 24(%rsp),$num + + #pxor @inp[0],@out[0] + #pxor @inp[1],@out[1] + #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! + #pxor @inp[2],@out[2] + #movdqu @out[1],`40*1+24-40*2`($inp) + #pxor @inp[3],@out[3] + #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller + #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... + + lea `40*4`($inp),$inp + dec $num + jnz .Lenc4x_loop_grande + +.Lenc4x_done: +___ +$code.=<<___ if ($win64); + movaps -0xa8(%rax),%xmm6 + movaps -0x98(%rax),%xmm7 + movaps -0x88(%rax),%xmm8 + movaps -0x78(%rax),%xmm9 + movaps -0x68(%rax),%xmm10 + movaps -0x58(%rax),%xmm11 + movaps -0x48(%rax),%xmm12 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt + +.globl aesni_multi_cbc_decrypt +.type aesni_multi_cbc_decrypt,\@function,3 +.align 32 +aesni_multi_cbc_decrypt: +___ +$code.=<<___ if ($avx); + cmp \$2,$num + jb .Ldec_non_avx + mov OPENSSL_ia32cap_P+4(%rip),%ecx + test \$`1<<28`,%ecx # AVX bit + jnz _avx_cbc_dec_shortcut + jmp .Ldec_non_avx +.align 16 +.Ldec_non_avx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x78(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + + sub \$48,%rsp + and \$-64,%rsp + mov %rax,16(%rsp) # original %rsp + +.Ldec4x_body: + movdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*2($inp),$inp + +.Ldec4x_loop_grande: + mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*2`($inp),@inptr[$i] + cmp $num,$one + mov `40*$i+8-40*2`($inp),@outptr[$i] + cmovg $one,$num # find maximum + test $one,$one + movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@inptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldec4x_done + + movups 0x10-0x78($key),$rndkey1 + movups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + movdqu (@inptr[0]),@out[0] # load inputs + movdqu (@inptr[1]),@out[1] + pxor $zero,@out[0] + movdqu (@inptr[2]),@out[2] + pxor $zero,@out[1] + movdqu (@inptr[3]),@out[3] + pxor $zero,@out[2] + pxor $zero,@out[3] + movdqa 32(%rsp),$counters # load counters + xor $offset,$offset + jmp .Loop_dec4x + +.align 32 +.Loop_dec4x: + add \$16,$offset + lea 16(%rsp),$sink # sink pointer + mov \$1,$one # constant of 1 + sub $offset,$sink + + aesdec $rndkey1,@out[0] + prefetcht0 31(@inptr[0],$offset) # prefetch input + prefetcht0 31(@inptr[1],$offset) + aesdec $rndkey1,@out[1] + prefetcht0 31(@inptr[2],$offset) + prefetcht0 31(@inptr[3],$offset) + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0x30-0x78($key),$rndkey1 +___ +for($i=0;$i<4;$i++) { +my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; +$code.=<<___; + cmp `32+4*$i`(%rsp),$one + aesdec $rndkey,@out[0] + aesdec $rndkey,@out[1] + cmovge $sink,@inptr[$i] # cancel input + aesdec $rndkey,@out[2] + cmovg $sink,@outptr[$i] # sink output + aesdec $rndkey,@out[3] + movups `0x40+16*$i-0x78`($key),$rndkey +___ +} +$code.=<<___; + movdqa $counters,$mask + aesdec $rndkey0,@out[0] + prefetcht0 15(@outptr[0],$offset) # prefetch output + prefetcht0 15(@outptr[1],$offset) + aesdec $rndkey0,@out[1] + prefetcht0 15(@outptr[2],$offset) + prefetcht0 15(@outptr[3],$offset) + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0x80-0x78($key),$rndkey0 + pxor $zero,$zero + + aesdec $rndkey1,@out[0] + pcmpgtd $zero,$mask + movdqu -0x78($key),$zero # reload 0-round key + aesdec $rndkey1,@out[1] + paddd $mask,$counters # decrement counters + movdqa $counters,32(%rsp) # update counters + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0x90-0x78($key),$rndkey1 + + cmp \$11,$rounds + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xa0-0x78($key),$rndkey0 + + jb .Ldec4x_tail + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0xb0-0x78($key),$rndkey1 + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xc0-0x78($key),$rndkey0 + + je .Ldec4x_tail + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0xd0-0x78($key),$rndkey1 + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xe0-0x78($key),$rndkey0 + jmp .Ldec4x_tail + +.align 32 +.Ldec4x_tail: + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + pxor $rndkey0,@inp[0] + pxor $rndkey0,@inp[1] + aesdec $rndkey1,@out[3] + movdqu 0x10-0x78($key),$rndkey1 + pxor $rndkey0,@inp[2] + pxor $rndkey0,@inp[3] + movdqu 0x20-0x78($key),$rndkey0 + + aesdeclast @inp[0],@out[0] + aesdeclast @inp[1],@out[1] + movdqu -16(@inptr[0],$offset),@inp[0] # load next IV + movdqu -16(@inptr[1],$offset),@inp[1] + aesdeclast @inp[2],@out[2] + aesdeclast @inp[3],@out[3] + movdqu -16(@inptr[2],$offset),@inp[2] + movdqu -16(@inptr[3],$offset),@inp[3] + + movups @out[0],-16(@outptr[0],$offset) + movdqu (@inptr[0],$offset),@out[0] + movups @out[1],-16(@outptr[1],$offset) + movdqu (@inptr[1],$offset),@out[1] + pxor $zero,@out[0] + movups @out[2],-16(@outptr[2],$offset) + movdqu (@inptr[2],$offset),@out[2] + pxor $zero,@out[1] + movups @out[3],-16(@outptr[3],$offset) + movdqu (@inptr[3],$offset),@out[3] + pxor $zero,@out[2] + pxor $zero,@out[3] + + dec $num + jnz .Loop_dec4x + + mov 16(%rsp),%rax # original %rsp + mov 24(%rsp),$num + + lea `40*4`($inp),$inp + dec $num + jnz .Ldec4x_loop_grande + +.Ldec4x_done: +___ +$code.=<<___ if ($win64); + movaps -0xa8(%rax),%xmm6 + movaps -0x98(%rax),%xmm7 + movaps -0x88(%rax),%xmm8 + movaps -0x78(%rax),%xmm9 + movaps -0x68(%rax),%xmm10 + movaps -0x58(%rax),%xmm11 + movaps -0x48(%rax),%xmm12 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt +___ + + if ($avx) {{{ +my @ptr=map("%r$_",(8..15)); +my $offload=$sink; + +my @out=map("%xmm$_",(2..9)); +my @inp=map("%xmm$_",(10..13)); +my ($counters,$zero)=("%xmm14","%xmm15"); + +$code.=<<___; +.type aesni_multi_cbc_encrypt_avx,\@function,3 +.align 32 +aesni_multi_cbc_encrypt_avx: +_avx_cbc_enc_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + # +64 distances between inputs and outputs + # +128 off-load area for @inp[0..3] + + sub \$192,%rsp + and \$-128,%rsp + mov %rax,16(%rsp) # original %rsp + +.Lenc8x_body: + vzeroupper + vmovdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*4($inp),$inp + shr \$1,$num + +.Lenc8x_loop_grande: + #mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<8;$i++) { + my $temp = $i ? $offload : $offset; + $code.=<<___; + mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + cmp $num,$one + mov `40*$i+8-40*4`($inp),$temp # output pointer + cmovg $one,$num # find maximum + test $one,$one + vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input + sub @ptr[$i],$temp # distance between input and output + mov $temp,`64+8*$i`(%rsp) # initialize distances +___ +} +$code.=<<___; + test $num,$num + jz .Lenc8x_done + + vmovups 0x10-0x78($key),$rndkey1 + vmovups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + + vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round + lea 128(%rsp),$offload # offload area + vpxor (@ptr[1]),$zero,@inp[1] + vpxor (@ptr[2]),$zero,@inp[2] + vpxor (@ptr[3]),$zero,@inp[3] + vpxor @inp[0],@out[0],@out[0] + vpxor (@ptr[4]),$zero,@inp[0] + vpxor @inp[1],@out[1],@out[1] + vpxor (@ptr[5]),$zero,@inp[1] + vpxor @inp[2],@out[2],@out[2] + vpxor (@ptr[6]),$zero,@inp[2] + vpxor @inp[3],@out[3],@out[3] + vpxor (@ptr[7]),$zero,@inp[3] + vpxor @inp[0],@out[4],@out[4] + mov \$1,$one # constant of 1 + vpxor @inp[1],@out[5],@out[5] + vpxor @inp[2],@out[6],@out[6] + vpxor @inp[3],@out[7],@out[7] + jmp .Loop_enc8x + +.align 32 +.Loop_enc8x: +___ +for($i=0;$i<8;$i++) { +my $rndkey=($i&1)?$rndkey0:$rndkey1; +$code |