"Monolithic" x86 assembler replacement for aes_core.c. Up to +15% better

performance on recent microarchitectures.
author: Andy Polyakov <appro@openssl.org> 2005-01-13 15:35:44 +0000
committer: Andy Polyakov <appro@openssl.org> 2005-01-13 15:35:44 +0000
commit: e7e1150706f8c8bcc807d8184bd0ebd08b6f5aff (patch)
tree: b4ab68be2540b291fe75ceaf0dba283d50dbf1c8
parent: 5d727078ac8c16ccc0d987234c168a589a2ab767 (diff)
4 files changed, 1227 insertions, 781 deletions
diff --git a/Configure b/Configure
index f9d827d362..068b7c10e4 100755
--- a/Configure
+++ b/Configure
@@ -114,9 +114,9 @@ my $tlib="-lnsl -lsocket";
 my $bits1="THIRTY_TWO_BIT ";
 my $bits2="SIXTY_FOUR_BIT ";
 
-my $x86_elf_asm="x86cpuid-elf.o:asm/bn86-elf.o asm/co86-elf.o:asm/dx86-elf.o asm/yx86-elf.o:aes_core.o asm/ax86-elf.o:asm/bx86-elf.o:asm/mx86-elf.o:asm/sx86-elf.o asm/s512sse2-elf.o:asm/cx86-elf.o:asm/rx86-elf.o:asm/rm86-elf.o:asm/r586-elf.o";
-my $x86_coff_asm="x86cpuid-cof.o:asm/bn86-cof.o asm/co86-cof.o:asm/dx86-cof.o asm/yx86-cof.o:aes_core.o asm/ax86-cof.o:asm/bx86-cof.o:asm/mx86-cof.o:asm/sx86-cof.o asm/s512sse2-cof.o:asm/cx86-cof.o:asm/rx86-cof.o:asm/rm86-cof.o:asm/r586-cof.o";
-my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:aes_core.o ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o";
+my $x86_elf_asm="x86cpuid-elf.o:asm/bn86-elf.o asm/co86-elf.o:asm/dx86-elf.o asm/yx86-elf.o:asm/ax86-elf.o:asm/bx86-elf.o:asm/mx86-elf.o:asm/sx86-elf.o asm/s512sse2-elf.o:asm/cx86-elf.o:asm/rx86-elf.o:asm/rm86-elf.o:asm/r586-elf.o";
+my $x86_coff_asm="x86cpuid-cof.o:asm/bn86-cof.o asm/co86-cof.o:asm/dx86-cof.o asm/yx86-cof.o:asm/ax86-cof.o:asm/bx86-cof.o:asm/mx86-cof.o:asm/sx86-cof.o asm/s512sse2-cof.o:asm/cx86-cof.o:asm/rx86-cof.o:asm/rm86-cof.o:asm/r586-cof.o";
+my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o";
 
 my $ia64_asm=":asm/ia64.o::aes_core.o asm/aes-ia64.o:::asm/sha1-ia64.o asm/sha256-ia64.o asm/sha512-ia64.o::asm/rc4-ia64.o::";
 
diff --git a/TABLE b/TABLE
index 5632f9ecb2..e910ed47fc 100644
--- a/TABLE
+++ b/TABLE
@@ -92,7 +92,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-cof.o
 $bn_obj       = asm/bn86-cof.o asm/co86-cof.o
 $des_obj      = asm/dx86-cof.o asm/yx86-cof.o
-$aes_obj      = aes_core.o asm/ax86-cof.o
+$aes_obj      = asm/ax86-cof.o
 $bf_obj       = asm/bx86-cof.o
 $md5_obj      = asm/mx86-cof.o
 $sha1_obj     = asm/sx86-cof.o asm/s512sse2-cof.o
@@ -146,7 +146,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-out.o
 $bn_obj       = bn86-out.o co86-out.o
 $des_obj      = dx86-out.o yx86-out.o
-$aes_obj      = aes_core.o ax86-out.o
+$aes_obj      = ax86-out.o
 $bf_obj       = bx86-out.o
 $md5_obj      = mx86-out.o
 $sha1_obj     = sx86-out.o s512sse2-out.o
@@ -173,7 +173,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-out.o
 $bn_obj       = bn86-out.o co86-out.o
 $des_obj      = dx86-out.o yx86-out.o
-$aes_obj      = aes_core.o ax86-out.o
+$aes_obj      = ax86-out.o
 $bf_obj       = bx86-out.o
 $md5_obj      = mx86-out.o
 $sha1_obj     = sx86-out.o s512sse2-out.o
@@ -227,7 +227,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -551,7 +551,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-out.o
 $bn_obj       = bn86-out.o co86-out.o
 $des_obj      = dx86-out.o yx86-out.o
-$aes_obj      = aes_core.o ax86-out.o
+$aes_obj      = ax86-out.o
 $bf_obj       = bx86-out.o
 $md5_obj      = mx86-out.o
 $sha1_obj     = sx86-out.o s512sse2-out.o
@@ -767,7 +767,7 @@ $bn_ops       = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -794,7 +794,7 @@ $bn_ops       = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1334,7 +1334,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1685,7 +1685,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1739,7 +1739,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1766,7 +1766,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1847,7 +1847,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1874,7 +1874,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1901,7 +1901,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1928,7 +1928,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1955,7 +1955,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -2090,7 +2090,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -2279,7 +2279,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -2738,7 +2738,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3035,7 +3035,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-out.o
 $bn_obj       = bn86-out.o co86-out.o
 $des_obj      = dx86-out.o yx86-out.o
-$aes_obj      = aes_core.o ax86-out.o
+$aes_obj      = ax86-out.o
 $bf_obj       = bx86-out.o
 $md5_obj      = mx86-out.o
 $sha1_obj     = sx86-out.o s512sse2-out.o
@@ -3062,7 +3062,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3116,7 +3116,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3197,7 +3197,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3332,7 +3332,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3413,7 +3413,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3629,7 +3629,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-cof.o
 $bn_obj       = asm/bn86-cof.o asm/co86-cof.o
 $des_obj      = asm/dx86-cof.o asm/yx86-cof.o
-$aes_obj      = aes_core.o asm/ax86-cof.o
+$aes_obj      = asm/ax86-cof.o
 $bf_obj       = asm/bx86-cof.o
 $md5_obj      = asm/mx86-cof.o
 $sha1_obj     = asm/sx86-cof.o asm/s512sse2-cof.o
@@ -4034,7 +4034,7 @@ $bn_ops       = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -4061,7 +4061,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -4277,7 +4277,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -4547,7 +4547,7 @@ $bn_ops       = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -4574,7 +4574,7 @@ $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj      = aes_core.o asm/ax86-elf.o
+$aes_obj      = asm/ax86-elf.o
 $bf_obj       = asm/bx86-elf.o
 $md5_obj      = asm/mx86-elf.o
 $sha1_obj     = asm/sx86-elf.o asm/s512sse2-elf.o
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index c61ed6bdb2..688fda21ff 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -6,15 +6,21 @@
 # forms are granted according to the OpenSSL license.
 # ====================================================================
 #
+# Version 2.0.
+#
 # You might fail to appreciate this module performance from the first
-# try. If compared to "vanilla" linux-ia32-icc target, i.e. Intel C
-# without -KPIC, performance appears to be virtually identical... But
-# try to configure with shared library support... Aha! Intel compiler
-# "suddenly" lags behind by 30% [on P4]:-) And if compared to
-# position-independent code generated by GNU C, this code performs
-# more than *twice* as fast! Yes, all this buzz about PIC means that
-# [unlike other implementations] this module was explicitly designed
-# to be safe to use even in shared library context...
+# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
+# to be *the* best Intel C compiler without -KPIC, performance appears
+# to be virtually identical... But try to re-configure with shared
+# library support... Aha! Intel compiler "suddenly" lags behind by 30%
+# [on P4, more on others]:-) And if compared to position-independent
+# code generated by GNU C, this code performs *more* than *twice* as
+# fast! Yes, all this buzz about PIC means that unlike other hand-
+# coded implementations, this one was explicitly designed to be safe
+# to use even in shared library context... This also means that this
+# code isn't necessarily absolutely fastest "ever," because in order
+# to achieve position independence an extra register has to be
+# off-loaded to stack, which affects the benchmark result.
 #
 # Special note about instruction choice. Do you recall RC4_INT code
 # performing poorly on P4? It might be the time to figure out why.
@@ -30,212 +36,326 @@
 # intermediate implementation, which was spilling yet another register
 # to stack... Final offset*4 code below runs just a tad faster on P4,
 # but exhibits up to 10% improvement on other cores.
+#
+# Second version is "monolithic" replacement for aes_core.c, which in
+# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
+# This made it possible to implement little-endian variant of the
+# algorithm without modifying the base C code. Motivating factor for
+# the undertaken effort was that it appeared that in tight IA-32
+# register window little-endian flavor could achieve slightly higher
+# Instruction Level Parallelism, and it indeed resulted in up to 15%
+# better performance on most recent µ-archs...
+#
+# Current ECB performance numbers for 128-bit key in cycles per byte
+# [measure commonly used by AES benchmarkers] are:
+#
+#		small footprint		fully unrolled
+# P4[-3]	23[24]			22[23]
+# AMD K8	19			18
+# PIII		26(*)			23
+# Pentium	63(*)			52
+#
+# (*)	Performance difference between small footprint code and fully
+#	unrolled in more commonly used CBC mode is not as big, 7% for
+#	PIII and 15% for Pentium, which I consider tolerable.
 
 push(@INC,"perlasm","../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386");
 
-$small_footprint=1;	# $small_footprint=1 code is 4-6% slower, but
-			# 5 times smaller! I default to compact code.
+$small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
+			# recent µ-archs], but ~5 times smaller!
+			# I favor compact code, because it minimizes
+			# cache contention...
+$vertical_spin=0;	# shift "vertically" defaults to 0, because of
+			# its proof-of-concept status, see below...
+
 $s0="eax";
 $s1="ebx";
 $s2="ecx";
 $s3="edx";
+$key="esi";
+$acc="edi";
+
+if ($vertical_spin) {
+	# I need high parts of volatile registers to be accessible...
+	$s1="esi";	$key="ebx";
+	$s2="edi";	$acc="ecx";
+}
+# Note that there is no decvert(), and that the last encryption round
+# is performed with "horizontal" shifts. This is because this "vertical"
+# implementation [one which groups shifts on a given $s[i] to form a
+# "column," unlike "horizontal" one, which groups shifts on different
+# $s[i] to form a "row"] is a work in progress. It was observed to run
+# a few percent faster on Intel cores, but not on AMD. On AMD K8 core
+# it's a whole 12% slower:-( So we face a trade-off... Shall it be
+# resolved some day? Till then the code is considered experimental and
+# by default remains dormant...
+
+sub encvert()
+{ my ($te,@s) = @_;
+  my $v0 = $acc, $v1 = $key;
+
+	&mov	($v0,$s[3]);				# copy s3
+	&mov	(&DWP(0,"esp"),$s[2]);			# save s2
+	&mov	($v1,$s[0]);				# copy s0
+	&mov	(&DWP(4,"esp"),$s[1]);			# save s1
+
+	&movz	($s[2],&HB($s[0]));
+	&and	($s[0],0xFF);
+	&mov	($s[0],&DWP(1024*0,$te,$s[0],4));	# s0>>0
+	&shr	($v1,16);
+	&mov	($s[3],&DWP(1024*1,$te,$s[2],4));	# s0>>8
+	&movz	($s[1],&HB($v1));
+	&and	($v1,0xFF);
+	&mov	($s[2],&DWP(1024*2,$te,$v1,4));		# s0>>16
+	 &mov	($v1,$v0);
+	&mov	($s[1],&DWP(1024*3,$te,$s[1],4));	# s0>>24
+
+	&and	($v0,0xFF);
+	&xor	($s[3],&DWP(1024*0,$te,$v0,4));		# s3>>0
+	&movz	($v0,&HB($v1));
+	&shr	($v1,16);
+	&xor	($s[2],&DWP(1024*1,$te,$v0,4));		# s3>>8
+	&movz	($v0,&HB($v1));
+	&and	($v1,0xFF);
+	&xor	($s[1],&DWP(1024*2,$te,$v1,4));		# s3>>16
+	 &mov	($v1,&DWP(0,"esp"));			# restore s2
+	&xor	($s[0],&DWP(1024*3,$te,$v0,4));		# s3>>24
+
+	&mov	($v0,$v1);
+	&and	($v1,0xFF);
+	&xor	($s[2],&DWP(1024*0,$te,$v1,4));		# s2>>0
+	&movz	($v1,&HB($v0));
+	&shr	($v0,16);
+	&xor	($s[1],&DWP(1024*1,$te,$v1,4));		# s2>>8
+	&movz	($v1,&HB($v0));
+	&and	($v0,0xFF);
+	&xor	($s[0],&DWP(1024*2,$te,$v0,4));		# s2>>16
+	 &mov	($v0,&DWP(4,"esp"));			# restore s1
+	&xor	($s[3],&DWP(1024*3,$te,$v1,4));		# s2>>24
+
+	&mov	($v1,$v0);
+	&and	($v0,0xFF);
+	&xor	($s[1],&DWP(1024*0,$te,$v0,4));		# s1>>0
+	&movz	($v0,&HB($v1));
+	&shr	($v1,16);
+	&xor	($s[0],&DWP(1024*1,$te,$v0,4));		# s1>>8
+	&movz	($v0,&HB($v1));
+	&and	($v1,0xFF);
+	&xor	($s[3],&DWP(1024*2,$te,$v1,4));		# s1>>16
+	 &mov	($key,&DWP(12,"esp"));			# reincarnate v1 as key
+	&xor	($s[2],&DWP(1024*3,$te,$v0,4));		# s1>>24
+}
 
 sub encstep()
 { my ($i,$te,@s) = @_;
-  my $tmp,$out;
+  my $tmp = $key;
+  my $out = $i==3?$s[0]:$acc;
 
-	# lines marked with ## denote same $sN...
-	if ($i==3)  {	&mov	("edi",&DWP(12,"esp"));
-			&movz	($out=$s[0],&HB($s[0]));	}	##
-	else        {	&mov	($out="esi",$s[0]);
-			&shr	($out,24);			}
+	# lines marked with #%e?x[i] denote "reordered" instructions...
+	if ($i==3)  {	&mov	($key,&DWP(12,"esp"));		}##%edx
+	else        {	&mov	($out,$s[0]);
+			&and	($out,0xFF);			}
+	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
+	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
 			&mov	($out,&DWP(1024*0,$te,$out,4));
 
-	if ($i==2)  {	&movz	($tmp="edi",&LB($s[1]));	}	##
-	else        {	$i==3?$tmp=$s[1]:&mov($tmp="edi",$s[1]);
-			&shr	($tmp,16);
-			&and	($tmp,0xFF);			}
+	if ($i==3)  {	$tmp=$s[1];				}##%eax
+			&movz	($tmp,&HB($s[1]));
 			&xor	($out,&DWP(1024*1,$te,$tmp,4));
 
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],&DWP(0,"esp"));	}
-	else        {	$tmp="edi";				}
-			&movz	($tmp,&HB($s[2]));
+	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],&DWP(0,"esp"));	}##%ebx
+	else        {	&mov	($tmp,$s[2]);
+			&shr	($tmp,16);			}
+	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
+			&and	($tmp,0xFF);
 			&xor	($out,&DWP(1024*2,$te,$tmp,4));
-	if ($i==1)  {	&shr	($s[2],16);			}	##
 
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(4,"esp"));	}
-	else        {	&mov	($tmp="edi",$s[3]);		} 
-			&and	($tmp,0xFF);
+	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(4,"esp"));	}##%ecx
+	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
+	else        {	&mov	($tmp,$s[3]); 
+			&shr	($tmp,24)			}
 			&xor	($out,&DWP(1024*3,$te,$tmp,4));
 	if ($i<2)   {	&mov	(&DWP(4*$i,"esp"),$out);	}
-	if ($i==3)  {	&mov	($s[3],"esi");			}
+	if ($i==3)  {	&mov	($s[3],$acc);			}
+			&comment();
 }
 
 sub enclast()
 { my ($i,$te,@s)=@_;
-  my $tmp,$out;
+  my $tmp = $key;
+  my $out = $i==3?$s[0]:$acc;
 
-	if ($i==3)  {	&mov	("edi",&DWP(12,"esp"));
-			&movz	($out=$s[0],&HB($s[0]));	}	##
-	else        {	&mov	($out="esi",$s[0]);
-			&shr	($out,24);			}
-			&mov	($out,&DWP(0,$te,$out,4));
-			&and	($out,0xff000000);
+	if ($i==3)  {	&mov	($key,&DWP(12,"esp"));		}##%edx
+	else        {	&mov	($out,$s[0]);			}
+			&and	($out,0xFF);
+	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
+	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
+			&mov	($out,&DWP(1024*0,$te,$out,4));
+			&and	($out,0x000000ff);
 
-	if ($i==2)  {	&movz	($tmp="edi",&LB($s[1]));	}	##
-	else        {	$i==3?$tmp=$s[1]:&mov($tmp="edi",$s[1]);
-			&shr	($tmp,16);
-			&and	($tmp,0xFF);			}
+	if ($i==3)  {	$tmp=$s[1];				}##%eax
+			&movz	($tmp,&HB($s[1]));
 			&mov	($tmp,&DWP(0,$te,$tmp,4));
-			&and	($tmp,0x00ff0000);
+			&and	($tmp,0x0000ff00);
 			&xor	($out,$tmp);
 
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],&DWP(0,"esp"));	}
-	else        {	$tmp="edi";				}
-			&movz	($tmp,&HB($s[2]));
+	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],&DWP(0,"esp"));	}##%ebx
+	else        {	mov	($tmp,$s[2]);
+			&shr	($tmp,16);			}
+	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
+			&and	($tmp,0xFF);
 			&mov	($tmp,&DWP(0,$te,$tmp,4));
-			&and	($tmp,0x0000ff00);
-	if ($i==1)  {	&shr	($s[2],16);			}	##
+			&and	($tmp,0x00ff0000);
 			&xor	($out,$tmp);
 
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(4,"esp"));	}
-	else        {	&mov	($tmp="edi",$s[3]);		} 
-			&and	($tmp,0xFF);
+	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(4,"esp"));	}##%ecx
+	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
+	else        {	&mov	($tmp,$s[3]);
+			&shr	($tmp,24);			}
 			&mov	($tmp,&DWP(0,$te,$tmp,4));
-			&and	($tmp,0x000000ff);
+			&and	($tmp,0xff000000);
 			&xor	($out,$tmp);
 	if ($i<2)   {	&mov	(&DWP(4*$i,"esp"),$out);	}
-	if ($i==3)  {	&mov	($s[3],"esi");			}
+	if ($i==3)  {	&mov	($s[3],$acc);			}
 }
 
 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
 &public_label("AES_Te");
 &function_begin("AES_encrypt");
-	&mov	("esi",&wparam(0));		# load inp
-	&mov	("edi",&wparam(2));		# load key
+	&mov	($acc,&wparam(0));		# load inp
+	&mov	($key,&wparam(2));		# load key
 
         &call   (&label("pic_point"));          # make it PIC!
-&set_label("pic_point");
+	&set_label("pic_point");
         &blindpop("ebp");
         &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));
 
 	# allocate aligned stack frame
-	&mov	("eax","esp");
+	&mov	($s0,"esp");
 	&sub	("esp",20);
 	&and	("esp",-16);
 
-	&mov	(&DWP(12,"esp"),"edi");		# save key
-	&mov	(&DWP(16,"esp"),"eax");		# save %esp
-
-	&mov	($s0,&DWP(0,"esi"));		# load input data
-	&mov	($s1,&DWP(4,"esi"));
-	&mov	($s2,&DWP(8,"esi"));
-	&mov	($s3,&DWP(12,"esi"));
-	#
-	# It's perfectly possible to implement algorithm as
-	# little-endian and get rid of bswaps... It would give
-	# less than 1% performance improvement, so I judge it
-	# doesn't worth the trouble...
-	#
-	&bswap	($s0);
-	&bswap	($s1);
-	&bswap	($s2);
-	&bswap	($s3);
-	&xor	($s0,&DWP(0,"edi"));
-	&xor	($s1,&DWP(4,"edi"));
-	&xor	($s2,&DWP(8,"edi"));
-	&xor	($s3,&DWP(12,"edi"));
-
-	&mov	("esi",&DWP(240,"edi"));	# load key->rounds
+	&mov	(&DWP(12,"esp"),$key);		# save key
+	&mov	(&DWP(16,"esp"),$s0);		# save %esp
+
+	&mov	($s0,&DWP(0,$acc));		# load input data
+	&mov	($s1,&DWP(4,$acc));
+	&mov	($s2,&DWP(8,$acc));
+	&mov	($s3,&DWP(12,$acc));
+
+	&xor	($s0,&DWP(0,$key));
+	&xor	($s1,&DWP(4,$key));
+	&xor	($s2,&DWP(8,$key));
+	&xor	($s3,&DWP(12,$key));
+
+	&mov	($acc,&DWP(240,$key));		# load key->rounds
 
 	if ($small_footprint) {
-	    &lea	("esi",&DWP(-2,"esi","esi"));
-	    &lea	("esi",&DWP(0,"edi","esi",8));
-	    &mov	(&DWP(8,"esp"),"esi");	# end of key schedule
+	    &lea	($acc,&DWP(-2,$acc,$acc));
+	    &lea	($acc,&DWP(0,$key,$acc,8));
+	    &mov	(&DWP(8,"esp"),$acc);	# end of key schedule
 	    &align	(4);
 	    &set_label("loop");
-		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
-		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
-		&encstep(2,"ebp",$s2,$s3,$s0,$s1);
-		&encstep(3,"ebp",$s3,$s0,$s1,$s2);
-		&add	("edi",16);			# advance rd_key
-		&xor	($s0,&DWP(0,"edi"));
-		&xor	($s1,&DWP(4,"edi"));
-		&xor	($s2,&DWP(8,"edi"));
-		&xor	($s3,&DWP(12,"edi"));
-	    &cmp	("edi",&DWP(8,"esp"));
-	    &mov	(&DWP(12,"esp"),"edi");
+		if ($vertical_spin) {
+		    &encvert("ebp",$s0,$s1,$s2,$s3);
+		} else {
+		    &encstep(0,"ebp",$s0,$s1,$s2,$s3);
+		    &encstep(1,"ebp",$s1,$s2,$s3,$s0);
+		    &encstep(2,"ebp",$s2,$s3,$s0,$s1);
+		    &encstep(3,"ebp",$s3,$s0,$s1,$s2);
+		}
+		&add	($key,16);			# advance rd_key
+		&xor	($s0,&DWP(0,$key));
+		&xor	($s1,&DWP(4,$key));
+		&xor	($s2,&DWP(8,$key));
+		&xor	($s3,&DWP(12,$key));
+	    &cmp	($key,&DWP(8,"esp"));
+	    &mov	(&DWP(12,"esp"),$key);
 	    &jb		(&label("loop"));
 	}
 	else {
-	    &cmp	("esi",10);
+	    &cmp	($acc,10);
 	    &jle	(&label("10rounds"));
-	    &cmp	("esi",12);
+	    &cmp	($acc,12);
 	    &jle	(&label("12rounds"));
 
 	&set_label("14rounds");
 	    for ($i=1;$i<3;$i++) {
-		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
-		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
-		&encstep(2,"ebp",$s2,$s3,$s0,$s1);
-		&encstep(3,"ebp",$s3,$s0,$s1,$s2);
-		&xor	($s0,&DWP(16*$i+0,"edi"));
-		&xor	($s1,&DWP(16*$i+4,"edi"));
-		&xor	($s2,&DWP(16*$i+8,"edi"));
-		&xor	($s3,&DWP(16*$i+12,"edi"));
+		if ($vertical_spin) {
+		    &encvert("ebp",$s0,$s1,$s2,$s3);
+		} else {
+		    &encstep(0,"ebp",$s0,$s1,$s2,$s3);
+		    &encstep(1,"ebp",$s1,$s2,$s3,$s0);
+		    &encstep(2,"ebp",$s2,$s3,$s0,$s1);
+		    &encstep(3,"ebp",$s3,$s0,$s1,$s2);
+		}
+		&xor	($s0,&DWP(16*$i+0,$key));
+		&xor	($s1,&DWP(16*$i+4,$key));
+		&xor	($s2,&DWP(16*$i+8,$key));
+		&xor	($s3,&DWP(16*$i+12,$key));
 	    }
-	    &add	("edi",32);
-	    &mov	(&DWP(12,"esp"),"edi");		# advance rd_key
+	    &add	($key,32);
+	    &mov	(&DWP(12,"esp"),$key);		# advance rd_key
 	&set_label("12rounds");
 	    for ($i=1;$i<3;$i++) {
-		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
-		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
-		&encstep(2,"ebp",$s2,$s3,$s0,$s1);
-		&encstep(3,"ebp",$s3,$s0,$s1,$s2);
-		&xor	($s0,&DWP(16*$i+0,"edi"));
-		&xor	($s1,&DWP(16*$i+4,"edi"));
-		&xor	($s2,&DWP(16*$i+8,"edi"));
-		&xor	($s3,&DWP(16*$i+12,"edi"));
+		if ($vertical_spin) {
+		    &encvert("ebp",$s0,$s1,$s2,$s3);
+		} else {
+		    &encstep(0,"ebp",$s0,$s1,$s2,$s3);
+		    &encstep(1,"ebp",$s1,$s2,$s3,$s0);
+		    &encstep(2,"ebp",$s2,$s3,$s0,$s1);
+		    &encstep(3,"ebp",$s3,$s0,$s1,$s2);
+		}
+		&xor	($s0,&DWP(16*$i+0,$key));
+		&xor	($s1,&DWP(16*$i+4,$key));
+		&xor	($s2,&DWP(16*$i+8,$key));
+		&xor	($s3,&DWP(16*$i+12,$key));
 	    }
-	    &add	("edi",32);
-	    &mov	(&DWP(12,"esp"),"edi");		# advance rd_key
+	    &add	($key,32);
+	    &mov	(&DWP(12,"esp"),$key);		# advance rd_key
 	&set_label("10rounds");
 	    for ($i=1;$i<10;$i++) {
-		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
-		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
-		&encstep(2,"ebp",$s2,$s3,$s0,$s1);
-		&encstep(3,"ebp",$s3,$s0,$s1,$s2);
-		&xor	($s0,&DWP(16*$i+0,"edi"));
-		&xor	($s1,&DWP(16*$i+4,"edi"));
-		&xor	($s2,&DWP(16*$i+8,"edi"));
-		&xor	($s3,&DWP(16*$i+12,"edi"));
+		if ($vertical_spin) {
+		    &encvert("ebp",$s0,$s1,$s2,$s3);
+		} else {
+		    &encstep(0,"ebp",$s0,$s1,$s2,$s3);
+		    &encstep(1,"ebp",$s1,$s2,$s3,$s0);
+		    &encstep(2,"ebp",$s2,$s3,$s0,$s1);
+		    &encstep(3,"ebp",$s3,$s0,$s1,$s2);
+		}
+		&xor	($s0,&DWP(16*$i+0,$key));
+		&xor	($s1,&DWP(16*$i+4,$key));
+		&xor	($s2,&DWP(16*$i+8,$key));
+		&xor	($s3,&DWP(16*$i+12,$key));
 	    }
 	}
 
 	&add	("ebp",4*1024);			# skip to Te4
+	if ($vertical_spin) {
+	    # "reincarnate" some registers for "horizontal" spin...
+	    &mov	($s1="ebx",$key="esi");
+	    &mov	($s2="ecx",$acc="edi");
+	}
 	&enclast(0,"ebp",$s0,$s1,$s2,$s3);
 	&enclast(1,"ebp",$s1,$s2,$s3,$s0);
 	&enclast(2,"ebp",$s2,$s3,$s0,$s1);
 	&enclast(3,"ebp",$s3,$s0,$s1,$s2);
 
 	&mov	("esp",&DWP(16,"esp"));		# restore %esp
-	&add	("edi",$small_footprint?16:160);
-	&xor	($s0,&DWP(0,"edi"));
-	&xor	($s1,&DWP(4,"edi"));
-	&xor	($s2,&DWP(8,"edi"));
-	&xor	($s3,&DWP(12,"edi"));
-	&bswap	($s0);
-	&bswap	($s1);
-	&bswap	($s2);
-	&bswap	($s3);
-	&mov	("edi",&wparam(1));		# load out
-	&mov	(&DWP(0,"edi"),$s0);		# write output data
-	&mov	(&DWP(4,"edi"),$s1);
-	&mov	(&DWP(8,"edi"),$s2);
-	&mov	(&DWP(12,"edi"),$s3);
+	&add	($key,$small_footprint?16:160);
+	&xor	($s0,&DWP(0,$key));
+	&xor	($s1,&DWP(4,$key));
+	&xor	($s2,&DWP(8,$key));
+	&xor	($s3,&DWP(12,$key));
+
+	&mov	($acc,&wparam(1));		# load out
+	&mov	(&DWP(0,$acc),$s0);		# write output data
+	&mov	(&DWP(4,$acc),$s1);
+	&mov	(&DWP(8,$acc),$s2);
+	&mov	(&DWP(12,$acc),$s3);
 
 	&pop	("edi");
 	&pop	("esi");
@@ -244,265 +364,265 @@ sub enclast()
 	&ret	();
 
 &set_label("AES_Te",64);	# Yes! I keep it in the code segment!
-	&data_word(0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d);
-	&data_word(0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554);
-	&data_word(0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d);
-	&data_word(0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a);
-	&data_word(0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87);
-	&data_word(0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b);
-	&data_word(0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea);
-	&data_word(0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b);
-	&data_word(0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a);
-	&data_word(0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f);
-	&data_word(0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108);
-	&data_word(0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f);
-	&data_word(0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e);
-	&data_word(0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5);
-	&data_word(0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d);
-	&data_word(0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f);
-	&data_word(0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e);
-	&data_word(0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb);
-	&data_word(0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce);
-	&data_word(0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497);
-	&data_word(0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c);
-	&data_word(0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed);
-	&data_word(0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b);
-	&data_word(0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a);
-	&data_word(0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16);
-	&data_word(0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594);
-	&data_word(0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81);
-	&data_word(0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3);
-	&data_word(0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a);
-	&data_word(0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504);
-	&data_word(0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163);
-	&data_word(0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d);
-	&data_word(0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f);
-	&data_word(0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739);
-	&data_word(0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47);
-	&data_word(0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395);
-	&data_word(0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f);
-	&data_word(0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883);
-	&data_word(0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c);
-	&data_word(0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76);
-	&data_word(0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e);
-	&data_word(0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4);
-	&data_word(0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6);
-	&data_word(0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b);
-	&data_word(0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7);
-	&data_word(0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0);
-	&data_word(0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25);
-	&data_word(0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818);
-	&data_word(0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72);
-	&data_word(0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651);
-	&data_word(0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21);
-	&data_word(0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85);
-	&data_word(0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa);
-	&data_word(0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12);
-	&data_word(0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0);
-	&data_word(0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9);
-	&data_word(0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133);
-	&data_word(0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7);
-	&data_word(0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920);
-	&data_word(0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a);
-	&data_word(0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17);
-	&data_word(0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8);
-	&data_word(0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11);
author	Andy Polyakov <appro@openssl.org>	2005-01-13 15:35:44 +0000
committer	Andy Polyakov <appro@openssl.org>	2005-01-13 15:35:44 +0000
commit	e7e1150706f8c8bcc807d8184bd0ebd08b6f5aff (patch)
tree	b4ab68be2540b291fe75ceaf0dba283d50dbf1c8
parent	5d727078ac8c16ccc0d987234c168a589a2ab767 (diff)