summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Andy Polyakov <appro@openssl.org> 2005-01-13 15:35:44 +0000
committer Andy Polyakov <appro@openssl.org> 2005-01-13 15:35:44 +0000
commit e7e1150706f8c8bcc807d8184bd0ebd08b6f5aff (patch)
tree b4ab68be2540b291fe75ceaf0dba283d50dbf1c8
parent 5d727078ac8c16ccc0d987234c168a589a2ab767 (diff)
"Monolithic" x86 assembler replacement for aes_core.c. Up to +15% better
performance on recent microarchitectures.
-rwxr-xr-x Configure 6
-rw-r--r-- TABLE 62
-rwxr-xr-x crypto/aes/asm/aes-586.pl 1934
-rw-r--r-- crypto/engine/eng_padlock.c 6
4 files changed, 1227 insertions, 781 deletions
diff --git a/Configure b/Configure
index f9d827d362..068b7c10e4 100755
--- a/Configure
+++ b/Configure
@@ -114,9 +114,9 @@ my $tlib="-lnsl -lsocket";
my $bits1="THIRTY_TWO_BIT ";
my $bits2="SIXTY_FOUR_BIT ";
-my $x86_elf_asm="x86cpuid-elf.o:asm/bn86-elf.o asm/co86-elf.o:asm/dx86-elf.o asm/yx86-elf.o:aes_core.o asm/ax86-elf.o:asm/bx86-elf.o:asm/mx86-elf.o:asm/sx86-elf.o asm/s512sse2-elf.o:asm/cx86-elf.o:asm/rx86-elf.o:asm/rm86-elf.o:asm/r586-elf.o";
-my $x86_coff_asm="x86cpuid-cof.o:asm/bn86-cof.o asm/co86-cof.o:asm/dx86-cof.o asm/yx86-cof.o:aes_core.o asm/ax86-cof.o:asm/bx86-cof.o:asm/mx86-cof.o:asm/sx86-cof.o asm/s512sse2-cof.o:asm/cx86-cof.o:asm/rx86-cof.o:asm/rm86-cof.o:asm/r586-cof.o";
-my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:aes_core.o ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o";
+my $x86_elf_asm="x86cpuid-elf.o:asm/bn86-elf.o asm/co86-elf.o:asm/dx86-elf.o asm/yx86-elf.o:asm/ax86-elf.o:asm/bx86-elf.o:asm/mx86-elf.o:asm/sx86-elf.o asm/s512sse2-elf.o:asm/cx86-elf.o:asm/rx86-elf.o:asm/rm86-elf.o:asm/r586-elf.o";
+my $x86_coff_asm="x86cpuid-cof.o:asm/bn86-cof.o asm/co86-cof.o:asm/dx86-cof.o asm/yx86-cof.o:asm/ax86-cof.o:asm/bx86-cof.o:asm/mx86-cof.o:asm/sx86-cof.o asm/s512sse2-cof.o:asm/cx86-cof.o:asm/rx86-cof.o:asm/rm86-cof.o:asm/r586-cof.o";
+my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o";
my $ia64_asm=":asm/ia64.o::aes_core.o asm/aes-ia64.o:::asm/sha1-ia64.o asm/sha256-ia64.o asm/sha512-ia64.o::asm/rc4-ia64.o::";
diff --git a/TABLE b/TABLE
index 5632f9ecb2..e910ed47fc 100644
--- a/TABLE
+++ b/TABLE
@@ -92,7 +92,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-cof.o
$bn_obj = asm/bn86-cof.o asm/co86-cof.o
$des_obj = asm/dx86-cof.o asm/yx86-cof.o
-$aes_obj = aes_core.o asm/ax86-cof.o
+$aes_obj = asm/ax86-cof.o
$bf_obj = asm/bx86-cof.o
$md5_obj = asm/mx86-cof.o
$sha1_obj = asm/sx86-cof.o asm/s512sse2-cof.o
@@ -146,7 +146,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-out.o
$bn_obj = bn86-out.o co86-out.o
$des_obj = dx86-out.o yx86-out.o
-$aes_obj = aes_core.o ax86-out.o
+$aes_obj = ax86-out.o
$bf_obj = bx86-out.o
$md5_obj = mx86-out.o
$sha1_obj = sx86-out.o s512sse2-out.o
@@ -173,7 +173,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-out.o
$bn_obj = bn86-out.o co86-out.o
$des_obj = dx86-out.o yx86-out.o
-$aes_obj = aes_core.o ax86-out.o
+$aes_obj = ax86-out.o
$bf_obj = bx86-out.o
$md5_obj = mx86-out.o
$sha1_obj = sx86-out.o s512sse2-out.o
@@ -227,7 +227,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -551,7 +551,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-out.o
$bn_obj = bn86-out.o co86-out.o
$des_obj = dx86-out.o yx86-out.o
-$aes_obj = aes_core.o ax86-out.o
+$aes_obj = ax86-out.o
$bf_obj = bx86-out.o
$md5_obj = mx86-out.o
$sha1_obj = sx86-out.o s512sse2-out.o
@@ -767,7 +767,7 @@ $bn_ops = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -794,7 +794,7 @@ $bn_ops = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1334,7 +1334,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1685,7 +1685,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1739,7 +1739,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1766,7 +1766,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1847,7 +1847,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1874,7 +1874,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1901,7 +1901,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1928,7 +1928,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -1955,7 +1955,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -2090,7 +2090,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -2279,7 +2279,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -2738,7 +2738,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3035,7 +3035,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-out.o
$bn_obj = bn86-out.o co86-out.o
$des_obj = dx86-out.o yx86-out.o
-$aes_obj = aes_core.o ax86-out.o
+$aes_obj = ax86-out.o
$bf_obj = bx86-out.o
$md5_obj = mx86-out.o
$sha1_obj = sx86-out.o s512sse2-out.o
@@ -3062,7 +3062,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3116,7 +3116,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3197,7 +3197,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3332,7 +3332,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3413,7 +3413,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -3629,7 +3629,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-cof.o
$bn_obj = asm/bn86-cof.o asm/co86-cof.o
$des_obj = asm/dx86-cof.o asm/yx86-cof.o
-$aes_obj = aes_core.o asm/ax86-cof.o
+$aes_obj = asm/ax86-cof.o
$bf_obj = asm/bx86-cof.o
$md5_obj = asm/mx86-cof.o
$sha1_obj = asm/sx86-cof.o asm/s512sse2-cof.o
@@ -4034,7 +4034,7 @@ $bn_ops = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -4061,7 +4061,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -4277,7 +4277,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -4547,7 +4547,7 @@ $bn_ops = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
@@ -4574,7 +4574,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
-$aes_obj = aes_core.o asm/ax86-elf.o
+$aes_obj = asm/ax86-elf.o
$bf_obj = asm/bx86-elf.o
$md5_obj = asm/mx86-elf.o
$sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index c61ed6bdb2..688fda21ff 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -6,15 +6,21 @@
# forms are granted according to the OpenSSL license.
# ====================================================================
#
+# Version 2.0.
+#
# You might fail to appreciate this module performance from the first
-# try. If compared to "vanilla" linux-ia32-icc target, i.e. Intel C
-# without -KPIC, performance appears to be virtually identical... But
-# try to configure with shared library support... Aha! Intel compiler
-# "suddenly" lags behind by 30% [on P4]:-) And if compared to
-# position-independent code generated by GNU C, this code performs
-# more than *twice* as fast! Yes, all this buzz about PIC means that
-# [unlike other implementations] this module was explicitly designed
-# to be safe to use even in shared library context...
+# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
+# to be *the* best Intel C compiler without -KPIC, performance appears
+# to be virtually identical... But try to re-configure with shared
+# library support... Aha! Intel compiler "suddenly" lags behind by 30%
+# [on P4, more on others]:-) And if compared to position-independent
+# code generated by GNU C, this code performs *more* than *twice* as
+# fast! Yes, all this buzz about PIC means that unlike other hand-
+# coded implementations, this one was explicitly designed to be safe
+# to use even in shared library context... This also means that this
+# code isn't necessarily the absolute fastest "ever," because in order
+# to achieve position independence an extra register has to be
+# off-loaded to stack, which affects the benchmark result.
#
# Special note about instruction choice. Do you recall RC4_INT code
# performing poorly on P4? It might be the time to figure out why.
@@ -30,212 +36,326 @@
# intermediate implementation, which was spilling yet another register
# to stack... Final offset*4 code below runs just a tad faster on P4,
# but exhibits up to 10% improvement on other cores.
+#
+# The second version is a "monolithic" replacement for aes_core.c, which
+# in addition to AES_[de|en]crypt implements AES_set_[de|en]crypt_key.
+# This made it possible to implement a little-endian variant of the
+# algorithm without modifying the base C code. The motivating factor for
+# the undertaken effort was that it appeared that in the tight IA-32
+# register window a little-endian flavor could achieve slightly higher
+# Instruction Level Parallelism, and it indeed resulted in up to 15%
+# better performance on most recent µ-archs...
+#
+# Current ECB performance numbers for 128-bit key in cycles per byte
+# [measure commonly used by AES benchmarkers] are:
+#
+# small footprint fully unrolled
+# P4[-3] 23[24] 22[23]
+# AMD K8 19 18
+# PIII 26(*) 23
+# Pentium 63(*) 52
+#
+# (*) Performance difference between small footprint code and fully
+# unrolled in more commonly used CBC mode is not as big, 7% for
+# PIII and 15% for Pentium, which I consider tolerable.
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386");
-$small_footprint=1; # $small_footprint=1 code is 4-6% slower, but
- # 5 times smaller! I default to compact code.
+$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
+ # recent µ-archs], but ~5 times smaller!
+ # I favor compact code, because it minimizes
+ # cache contention...
+$vertical_spin=0; # shift "vertically" defaults to 0, because of
+ # its proof-of-concept status, see below...
+
$s0="eax";
$s1="ebx";
$s2="ecx";
$s3="edx";
+$key="esi";
+$acc="edi";
+
+if ($vertical_spin) {
+ # I need high parts of volatile registers to be accessible...
+ $s1="esi"; $key="ebx";
+ $s2="edi"; $acc="ecx";
+}
+# Note that there is no decvert(), and that the last encryption round
+# is performed with "horizontal" shifts. This is because this "vertical"
+# implementation [one which groups shifts on a given $s[i] to form a
+# "column," unlike "horizontal" one, which groups shifts on different
+# $s[i] to form a "row"] is work in progress. It was observed to run
+# a few percent faster on Intel cores, but not AMD. On AMD K8 core it's
+# a whole 12% slower:-( So we face a trade-off... Shall it be resolved
+# some day? Till then the code is considered experimental and by
+# default remains dormant...
+
+sub encvert()
+{ my ($te,@s) = @_;
+ my $v0 = $acc, $v1 = $key;
+
+ &mov ($v0,$s[3]); # copy s3
+ &mov (&DWP(0,"esp"),$s[2]); # save s2
+ &mov ($v1,$s[0]); # copy s0
+ &mov (&DWP(4,"esp"),$s[1]); # save s1
+
+ &movz ($s[2],&HB($s[0]));
+ &and ($s[0],0xFF);
+ &mov ($s[0],&DWP(1024*0,$te,$s[0],4)); # s0>>0
+ &shr ($v1,16);
+ &mov ($s[3],&DWP(1024*1,$te,$s[2],4)); # s0>>8
+ &movz ($s[1],&HB($v1));
+ &and ($v1,0xFF);
+ &mov ($s[2],&DWP(1024*2,$te,$v1,4)); # s0>>16
+ &mov ($v1,$v0);
+ &mov ($s[1],&DWP(1024*3,$te,$s[1],4)); # s0>>24
+
+ &and ($v0,0xFF);
+ &xor ($s[3],&DWP(1024*0,$te,$v0,4)); # s3>>0
+ &movz ($v0,&HB($v1));
+ &shr ($v1,16);
+ &xor ($s[2],&DWP(1024*1,$te,$v0,4)); # s3>>8
+ &movz ($v0,&HB($v1));
+ &and ($v1,0xFF);
+ &xor ($s[1],&DWP(1024*2,$te,$v1,4)); # s3>>16
+ &mov ($v1,&DWP(0,"esp")); # restore s2
+ &xor ($s[0],&DWP(1024*3,$te,$v0,4)); # s3>>24
+
+ &mov ($v0,$v1);
+ &and ($v1,0xFF);
+ &xor ($s[2],&DWP(1024*0,$te,$v1,4)); # s2>>0
+ &movz ($v1,&HB($v0));
+ &shr ($v0,16);
+ &xor ($s[1],&DWP(1024*1,$te,$v1,4)); # s2>>8
+ &movz ($v1,&HB($v0));
+ &and ($v0,0xFF);
+ &xor ($s[0],&DWP(1024*2,$te,$v0,4)); # s2>>16
+ &mov ($v0,&DWP(4,"esp")); # restore s1
+ &xor ($s[3],&DWP(1024*3,$te,$v1,4)); # s2>>24
+
+ &mov ($v1,$v0);
+ &and ($v0,0xFF);
+ &xor ($s[1],&DWP(1024*0,$te,$v0,4)); # s1>>0
+ &movz ($v0,&HB($v1));
+ &shr ($v1,16);
+ &xor ($s[0],&DWP(1024*1,$te,$v0,4)); # s1>>8
+ &movz ($v0,&HB($v1));
+ &and ($v1,0xFF);
+ &xor ($s[3],&DWP(1024*2,$te,$v1,4)); # s1>>16
+ &mov ($key,&DWP(12,"esp")); # reincarnate v1 as key
+ &xor ($s[2],&DWP(1024*3,$te,$v0,4)); # s1>>24
+}
sub encstep()
{ my ($i,$te,@s) = @_;
- my $tmp,$out;
+ my $tmp = $key;
+ my $out = $i==3?$s[0]:$acc;
- # lines marked with ## denote same $sN...
- if ($i==3) { &mov ("edi",&DWP(12,"esp"));
- &movz ($out=$s[0],&HB($s[0])); } ##
- else { &mov ($out="esi",$s[0]);
- &shr ($out,24); }
+ # lines marked with #%e?x[i] denote "reordered" instructions...
+ if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx
+ else { &mov ($out,$s[0]);
+ &and ($out,0xFF); }
+ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
+ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
&mov ($out,&DWP(1024*0,$te,$out,4));
- if ($i==2) { &movz ($tmp="edi",&LB($s[1])); } ##
- else { $i==3?$tmp=$s[1]:&mov($tmp="edi",$s[1]);
- &shr ($tmp,16);
- &and ($tmp,0xFF); }
+ if ($i==3) { $tmp=$s[1]; }##%eax
+ &movz ($tmp,&HB($s[1]));
&xor ($out,&DWP(1024*1,$te,$tmp,4));
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }
- else { $tmp="edi"; }
- &movz ($tmp,&HB($s[2]));
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }##%ebx
+ else { &mov ($tmp,$s[2]);
+ &shr ($tmp,16); }
+ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
+ &and ($tmp,0xFF);
&xor ($out,&DWP(1024*2,$te,$tmp,4));
- if ($i==1) { &shr ($s[2],16); } ##
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }
- else { &mov ($tmp="edi",$s[3]); }
- &and ($tmp,0xFF);
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }##%ecx
+ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
+ else { &mov ($tmp,$s[3]);
+ &shr ($tmp,24) }
&xor ($out,&DWP(1024*3,$te,$tmp,4));
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); }
- if ($i==3) { &mov ($s[3],"esi"); }
+ if ($i==3) { &mov ($s[3],$acc); }
+ &comment();
}
sub enclast()
{ my ($i,$te,@s)=@_;
- my $tmp,$out;
+ my $tmp = $key;
+ my $out = $i==3?$s[0]:$acc;
- if ($i==3) { &mov ("edi",&DWP(12,"esp"));
- &movz ($out=$s[0],&HB($s[0])); } ##
- else { &mov ($out="esi",$s[0]);
- &shr ($out,24); }
- &mov ($out,&DWP(0,$te,$out,4));
- &and ($out,0xff000000);
+ if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx
+ else { &mov ($out,$s[0]); }
+ &and ($out,0xFF);
+ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
+ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
+ &mov ($out,&DWP(1024*0,$te,$out,4));
+ &and ($out,0x000000ff);
- if ($i==2) { &movz ($tmp="edi",&LB($s[1])); } ##
- else { $i==3?$tmp=$s[1]:&mov($tmp="edi",$s[1]);
- &shr ($tmp,16);
- &and ($tmp,0xFF); }
+ if ($i==3) { $tmp=$s[1]; }##%eax
+ &movz ($tmp,&HB($s[1]));
&mov ($tmp,&DWP(0,$te,$tmp,4));
- &and ($tmp,0x00ff0000);
+ &and ($tmp,0x0000ff00);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }
- else { $tmp="edi"; }
- &movz ($tmp,&HB($s[2]));
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }##%ebx
+ else { mov ($tmp,$s[2]);
+ &shr ($tmp,16); }
+ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
+ &and ($tmp,0xFF);
&mov ($tmp,&DWP(0,$te,$tmp,4));
- &and ($tmp,0x0000ff00);
- if ($i==1) { &shr ($s[2],16); } ##
+ &and ($tmp,0x00ff0000);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }
- else { &mov ($tmp="edi",$s[3]); }
- &and ($tmp,0xFF);
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }##%ecx
+ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
+ else { &mov ($tmp,$s[3]);
+ &shr ($tmp,24); }
&mov ($tmp,&DWP(0,$te,$tmp,4));
- &and ($tmp,0x000000ff);
+ &and ($tmp,0xff000000);
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); }
- if ($i==3) { &mov ($s[3],"esi"); }
+ if ($i==3) { &mov ($s[3],$acc); }
}
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
&public_label("AES_Te");
&function_begin("AES_encrypt");
- &mov ("esi",&wparam(0)); # load inp
- &mov ("edi",&wparam(2)); # load key
+ &mov ($acc,&wparam(0)); # load inp
+ &mov ($key,&wparam(2)); # load key
&call (&label("pic_point")); # make it PIC!
-&set_label("pic_point");
+ &set_label("pic_point");
&blindpop("ebp");
&lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));
# allocate aligned stack frame
- &mov ("eax","esp");
+ &mov ($s0,"esp");
&sub ("esp",20);
&and ("esp",-16);
- &mov (&DWP(12,"esp"),"edi"); # save key
- &mov (&DWP(16,"esp"),"eax"); # save %esp
-
- &mov ($s0,&DWP(0,"esi")); # load input data
- &mov ($s1,&DWP(4,"esi"));
- &mov ($s2,&DWP(8,"esi"));
- &mov ($s3,&DWP(12,"esi"));
- #
- # It's perfectly possible to implement algorithm as
- # little-endian and get rid of bswaps... It would give
- # less than 1% performance improvement, so I judge it
- # doesn't worth the trouble...
- #
- &bswap ($s0);
- &bswap ($s1);
- &bswap ($s2);
- &bswap ($s3);
- &xor ($s0,&DWP(0,"edi"));
- &xor ($s1,&DWP(4,"edi"));
- &xor ($s2,&DWP(8,"edi"));
- &xor ($s3,&DWP(12,"edi"));
-
- &mov ("esi",&DWP(240,"edi")); # load key->rounds
+ &mov (&DWP(12,"esp"),$key); # save key
+ &mov (&DWP(16,"esp"),$s0); # save %esp
+
+ &mov ($s0,&DWP(0,$acc)); # load input data
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
if ($small_footprint) {
- &lea ("esi",&DWP(-2,"esi","esi"));
- &lea ("esi",&DWP(0,"edi","esi",8));
- &mov (&DWP(8,"esp"),"esi"); # end of key schedule
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov (&DWP(8,"esp"),$acc); # end of key schedule
&align (4);
&set_label("loop");
- &encstep(0,"ebp",$s0,$s1,$s2,$s3);
- &encstep(1,"ebp",$s1,$s2,$s3,$s0);
- &encstep(2,"ebp",$s2,$s3,$s0,$s1);
- &encstep(3,"ebp",$s3,$s0,$s1,$s2);
- &add ("edi",16); # advance rd_key
- &xor ($s0,&DWP(0,"edi"));
- &xor ($s1,&DWP(4,"edi"));
- &xor ($s2,&DWP(8,"edi"));
- &xor ($s3,&DWP(12,"edi"));
- &cmp ("edi",&DWP(8,"esp"));
- &mov (&DWP(12,"esp"),"edi");
+ if ($vertical_spin) {
+ &encvert("ebp",$s0,$s1,$s2,$s3);
+ } else {
+ &encstep(0,"ebp",$s0,$s1,$s2,$s3);
+ &encstep(1,"ebp",$s1,$s2,$s3,$s0);
+ &encstep(2,"ebp",$s2,$s3,$s0,$s1);
+ &encstep(3,"ebp",$s3,$s0,$s1,$s2);
+ }
+ &add ($key,16); # advance rd_key
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+ &cmp ($key,&DWP(8,"esp"));
+ &mov (&DWP(12,"esp"),$key);
&jb (&label("loop"));
}
else {
- &cmp ("esi",10);
+ &cmp ($acc,10);
&jle (&label("10rounds"));
- &cmp ("esi",12);
+ &cmp ($acc,12);
&jle (&label("12rounds"));
&set_label("14rounds");
for ($i=1;$i<3;$i++) {
- &encstep(0,"ebp",$s0,$s1,$s2,$s3);
- &encstep(1,"ebp",$s1,$s2,$s3,$s0);
- &encstep(2,"ebp",$s2,$s3,$s0,$s1);
- &encstep(3,"ebp",$s3,$s0,$s1,$s2);
- &xor ($s0,&DWP(16*$i+0,"edi"));
- &xor ($s1,&DWP(16*$i+4,"edi"));
- &xor ($s2,&DWP(16*$i+8,"edi"));
- &xor ($s3,&DWP(16*$i+12,"edi"));
+ if ($vertical_spin) {
+ &encvert("ebp",$s0,$s1,$s2,$s3);
+ } else {
+ &encstep(0,"ebp",$s0,$s1,$s2,$s3);
+ &encstep(1,"ebp",$s1,$s2,$s3,$s0);
+ &encstep(2,"ebp",$s2,$s3,$s0,$s1);
+ &encstep(3,"ebp",$s3,$s0,$s1,$s2);
+ }
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
}
- &add ("edi",32);
- &mov (&DWP(12,"esp"),"edi"); # advance rd_key
+ &add ($key,32);
+ &mov (&DWP(12,"esp"),$key); # advance rd_key
&set_label("12rounds");
for ($i=1;$i<3;$i++) {
- &encstep(0,"ebp",$s0,$s1,$s2,$s3);
- &encstep(1,"ebp",$s1,$s2,$s3,$s0);
- &encstep(2,"ebp",$s2,$s3,$s0,$s1);
- &encstep(3,"ebp",$s3,$s0,$s1,$s2);
- &xor ($s0,&DWP(16*$i+0,"edi"));
- &xor ($s1,&DWP(16*$i+4,"edi"));
- &xor ($s2,&DWP(16*$i+8,"edi"));
- &xor ($s3,&DWP(16*$i+12,"edi"));
+ if ($vertical_spin) {
+ &encvert("ebp",$s0,$s1,$s2,$s3);
+ } else {
+ &encstep(0,"ebp",$s0,$s1,$s2,$s3);
+ &encstep(1,"ebp",$s1,$s2,$s3,$s0);
+ &encstep(2,"ebp",$s2,$s3,$s0,$s1);
+ &encstep(3,"ebp",$s3,$s0,$s1,$s2);
+ }
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
}
- &add ("edi",32);
- &mov (&DWP(12,"esp"),"edi"); # advance rd_key
+ &add ($key,32);
+ &mov (&DWP(12,"esp"),$key); # advance rd_key
&set_label("10rounds");
for ($i=1;$i<10;$i++) {
- &encstep(0,"ebp",$s0,$s1,$s2,$s3);
- &encstep(1,"ebp",$s1,$s2,$s3,$s0);
- &encstep(2,"ebp",$s2,$s3,$s0,$s1);
- &encstep(3,"ebp",$s3,$s0,$s1,$s2);
- &xor ($s0,&DWP(16*$i+0,"edi"));
- &xor ($s1,&DWP(16*$i+4,"edi"));
- &xor ($s2,&DWP(16*$i+8,"edi"));
- &xor ($s3,&DWP(16*$i+12,"edi"));
+ if ($vertical_spin) {
+ &encvert("ebp",$s0,$s1,$s2,$s3);
+ } else {
+ &encstep(0,"ebp",$s0,$s1,$s2,$s3);
+ &encstep(1,"ebp",$s1,$s2,$s3,$s0);
+ &encstep(2,"ebp",$s2,$s3,$s0,$s1);
+ &encstep(3,"ebp",$s3,$s0,$s1,$s2);
+ }
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
}
}
&add ("ebp",4*1024); # skip to Te4
+ if ($vertical_spin) {
+ # "reincarnate" some registers for "horizontal" spin...
+ &mov ($s1="ebx",$key="esi");
+ &mov ($s2="ecx",$acc="edi");
+ }
&enclast(0,"ebp",$s0,$s1,$s2,$s3);
&enclast(1,"ebp",$s1,$s2,$s3,$s0);
&enclast(2,"ebp",$s2,$s3,$s0,$s1);
&enclast(3,"ebp",$s3,$s0,$s1,$s2);
&mov ("esp",&DWP(16,"esp")); # restore %esp
- &add ("edi",$small_footprint?16:160);
- &xor ($s0,&DWP(0,"edi"));
- &xor ($s1,&DWP(4,"edi"));
- &xor ($s2,&DWP(8,"edi"));
- &xor ($s3,&DWP(12,"edi"));
- &bswap ($s0);
- &bswap ($s1);
- &bswap ($s2);
- &bswap ($s3);
- &mov ("edi",&wparam(1)); # load out
- &mov (&DWP(0,"edi"),$s0); # write output data
- &mov (&DWP(4,"edi"),$s1);
- &mov (&DWP(8,"edi"),$s2);
- &mov (&DWP(12,"edi"),$s3);
+ &add ($key,$small_footprint?16:160);
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &mov ($acc,&wparam(1)); # load out
+ &mov (&DWP(0,$acc),$s0); # write output data
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
&pop ("edi");
&pop ("esi");
@@ -244,265 +364,265 @@ sub enclast()
&ret ();
&set_label("AES_Te",64); # Yes! I keep it in the code segment!
- &data_word(0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d);
- &data_word(0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554);
- &data_word(0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d);
- &data_word(0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a);
- &data_word(0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87);
- &data_word(0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b);
- &data_word(0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea);
- &data_word(0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b);
- &data_word(0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a);
- &data_word(0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f);
- &data_word(0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108);
- &data_word(0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f);
- &data_word(0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e);
- &data_word(0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5);
- &data_word(0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d);
- &data_word(0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f);
- &data_word(0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e);
- &data_word(0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb);
- &data_word(0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce);
- &data_word(0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497);
- &data_word(0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c);
- &data_word(0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed);
- &data_word(0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b);
- &data_word(0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a);
- &data_word(0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16);
- &data_word(0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594);
- &data_word(0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81);
- &data_word(0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3);
- &data_word(0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a);
- &data_word(0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504);
- &data_word(0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163);
- &data_word(0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d);
- &data_word(0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f);
- &data_word(0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739);
- &data_word(0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47);
- &data_word(0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395);
- &data_word(0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f);
- &data_word(0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883);
- &data_word(0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c);
- &data_word(0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76);
- &data_word(0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e);
- &data_word(0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4);
- &data_word(0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6);
- &data_word(0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b);
- &data_word(0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7);
- &data_word(0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0);
- &data_word(0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25);
- &data_word(0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818);
- &data_word(0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72);
- &data_word(0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651);
- &data_word(0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21);
- &data_word(0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85);
- &data_word(0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa);
- &data_word(0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12);
- &data_word(0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0);
- &data_word(0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9);
- &data_word(0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133);
- &data_word(0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7);
- &data_word(0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920);
- &data_word(0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a);
- &data_word(0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17);
- &data_word(0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8);
- &data_word(0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11);