author    Andy Polyakov <appro@openssl.org>    2007-07-13 17:39:40 +0000
committer Andy Polyakov <appro@openssl.org>    2007-07-13 17:39:40 +0000
commit    e1612ea59da067c76fa360cf976a6b38216cddf2 (patch)
tree      f58a3f58fb311e9af88a3671266262c5c036e4f2 /crypto/aes
parent    71f4ea44eb2880b8a84e683dbf62708e512febf4 (diff)
Add _x86_64_AES_[en|de]crypt_compact.
Diffstat (limited to 'crypto/aes')
-rwxr-xr-x  crypto/aes/asm/aes-x86_64.pl  1071
1 file changed, 952 insertions, 119 deletions
diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
index d02cd5bd2b..75e1749b64 100755
--- a/crypto/aes/asm/aes-x86_64.pl
+++ b/crypto/aes/asm/aes-x86_64.pl
@@ -2,11 +2,12 @@
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
-# Version 1.2.
+# Version 2.0.
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
@@ -17,17 +18,21 @@
#
# Performance in number of cycles per processed byte for 128-bit key:
#
-#                 ECB     CBC encrypt
-# AMD64           13.7    13.0(*)
-# EM64T           20.2    18.6(*)
-#
-# (*) CBC benchmarks are better than ECB thanks to custom ABI used
-# by the private block encryption function.
+#                 ECB encrypt   ECB decrypt   CBC large chunk
+# AMD64           33            41            13.0
+# EM64T           38            59            18.6
+# Core 2          30            43            14.5
$verticalspin=1; # unlike 32-bit version $verticalspin performs
# ~15% better on both AMD and Intel cores
$output=shift;
-open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
$code=".text\n";
@@ -35,9 +40,9 @@ $s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
-$acc0="%esi";
-$acc1="%edi";
-$acc2="%ebp";
+$acc0="%esi"; $mask80="%rsi";
+$acc1="%edi"; $maskfe="%rdi";
+$acc2="%ebp"; $mask1b="%rbp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
@@ -51,6 +56,8 @@ sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
$r =~ s/%[er]([sd]i)/%\1l/;
$r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
+sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
+ $r =~ s/%r([0-9]+)/%r\1d/; $r; }
sub _data_word()
{ my $i;
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -345,6 +352,235 @@ $code.=<<___;
.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___
+# it's possible to implement this by shifting tN by 8, filling least
+# significant byte with byte load and finally bswap-ing at the end,
+# but partial byte load kills Core 2...
+sub enccompactvert()
+{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
+
+$code.=<<___;
+ movzb `&lo("$s0")`,$t0
+ movzb `&lo("$s1")`,$t1
+ movzb `&lo("$s2")`,$t2
+ movzb ($sbox,$t0,1),$t0
+ movzb ($sbox,$t1,1),$t1
+ movzb ($sbox,$t2,1),$t2
+
+ movzb `&lo("$s3")`,$t3
+ movzb `&hi("$s1")`,$acc0
+ movzb `&hi("$s2")`,$acc1
+ movzb ($sbox,$t3,1),$t3
+ movzb ($sbox,$acc0,1),$t4 #$t0
+ movzb ($sbox,$acc1,1),$t5 #$t1
+
+ movzb `&hi("$s3")`,$acc2
+ movzb `&hi("$s0")`,$acc0
+ shr \$16,$s2
+ movzb ($sbox,$acc2,1),$acc2 #$t2
+ movzb ($sbox,$acc0,1),$acc0 #$t3
+ shr \$16,$s3
+
+ movzb `&lo("$s2")`,$acc1
+ shl \$8,$t4
+ shl \$8,$t5
+ movzb ($sbox,$acc1,1),$acc1 #$t0
+ xor $t4,$t0
+ xor $t5,$t1
+
+ movzb `&lo("$s3")`,$t4
+ shr \$16,$s0
+ shr \$16,$s1
+ movzb `&lo("$s0")`,$t5
+ shl \$8,$acc2
+ shl \$8,$acc0
+ movzb ($sbox,$t4,1),$t4 #$t1
+ movzb ($sbox,$t5,1),$t5 #$t2
+ xor $acc2,$t2
+ xor $acc0,$t3
+
+ movzb `&lo("$s1")`,$acc2
+ movzb `&hi("$s3")`,$acc0
+ shl \$16,$acc1
+ movzb ($sbox,$acc2,1),$acc2 #$t3
+ movzb ($sbox,$acc0,1),$acc0 #$t0
+ xor $acc1,$t0
+
+ movzb `&hi("$s0")`,$acc1
+ shr \$8,$s2
+ shr \$8,$s1
+ movzb ($sbox,$acc1,1),$acc1 #$t1
+ movzb ($sbox,$s2,1),$s3 #$t3
+ movzb ($sbox,$s1,1),$s2 #$t2
+ shl \$16,$t4
+ shl \$16,$t5
+ shl \$16,$acc2
+ xor $t4,$t1
+ xor $t5,$t2
+ xor $acc2,$t3
+
+ shl \$24,$acc0
+ shl \$24,$acc1
+ shl \$24,$s3
+ xor $acc0,$t0
+ shl \$24,$s2
+ xor $acc1,$t1
+ mov $t0,$s0
+ mov $t1,$s1
+ xor $t2,$s2
+ xor $t3,$s3
+___
+}
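
The comment above enccompactvert explains why it sticks to full-byte loads; what the block itself computes is SubBytes and ShiftRows fused into sixteen single-byte lookups from one 256-byte table, instead of the 4KB of T-tables the non-compact _x86_64_AES_encrypt dereferences. A minimal C sketch of that step, assuming OpenSSL's little-endian column-word state layout and a Te4[] S-box array (the helper and its names are illustrative, not code from this module):

#include <stdint.h>

extern const uint8_t Te4[256];              /* AES S-box, one copy */

void sub_shift(const uint32_t s[4], uint32_t t[4])
{
    /* column i of the new state takes row r from old column (i+r)%4 */
    for (int i = 0; i < 4; i++)
        t[i] = (uint32_t)Te4[ s[i]              & 0xff]
             | (uint32_t)Te4[(s[(i+1)&3] >>  8) & 0xff] <<  8
             | (uint32_t)Te4[(s[(i+2)&3] >> 16) & 0xff] << 16
             | (uint32_t)Te4[(s[(i+3)&3] >> 24) & 0xff] << 24;
}

Touching only 256 bytes of table per block is presumably the point of the compact path: a much flatter cache footprint than the T-table code, at the cycle cost visible in the benchmark table above.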
+
+sub enctransform_ref()
+{ my $sn = shift;
+ my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
+
+$code.=<<___;
+ mov $sn,$acc
+ and \$0x80808080,$acc
+ mov $acc,$tmp
+ shr \$7,$tmp
+ lea ($sn,$sn),$r2
+ sub $tmp,$acc
+ and \$0xfefefefe,$r2
+ and \$0x1b1b1b1b,$acc
+ mov $sn,$tmp
+ xor $acc,$r2
+
+ xor $r2,$sn
+ rol \$24,$sn
+ xor $r2,$sn
+ ror \$16,$tmp
+ xor $tmp,$sn
+ ror \$8,$tmp
+ xor $tmp,$sn
+___
+}
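
enctransform_ref is the classic SWAR xtime trick: the 0x80808080/0xfefefefe/0x1b1b1b1b masks double all four GF(2^8) bytes of a column at once, and the rotate/xor tail assembles MixColumns out of s and 2*s. A C sketch of the same word transform (rotr32 is a local helper, not something this module defines; note the asm's rol $24 is the same operation as the rotate-right by 8 here):

#include <stdint.h>

static uint32_t rotr32(uint32_t x, int n) { return x >> n | x << (32 - n); }

uint32_t mixcolumn(uint32_t s)
{
    uint32_t hi = s & 0x80808080;                /* high bit of each byte */
    uint32_t d  = ((s << 1) & 0xfefefefe)        /* byte-wise 2*s ...     */
                ^ ((hi - (hi >> 7)) & 0x1b1b1b1b); /* ... reduced mod the AES polynomial */

    /* each output byte becomes 2*a[i] ^ 3*a[i+1] ^ a[i+2] ^ a[i+3] */
    return d ^ rotr32(s ^ d, 8) ^ rotr32(s, 16) ^ rotr32(s, 24);
}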
+
+# unlike decrypt case it does not pay off to parallelize enctransform
+sub enctransform()
+{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
+
+$code.=<<___;
+ mov $s0,$acc0
+ mov $s1,$acc1
+ and \$0x80808080,$acc0
+ and \$0x80808080,$acc1
+ mov $acc0,$t0
+ mov $acc1,$t1
+ shr \$7,$t0
+ lea ($s0,$s0),$r20
+ shr \$7,$t1
+ lea ($s1,$s1),$r21
+ sub $t0,$acc0
+ sub $t1,$acc1
+ and \$0xfefefefe,$r20
+ and \$0xfefefefe,$r21
+ and \$0x1b1b1b1b,$acc0
+ and \$0x1b1b1b1b,$acc1
+ mov $s0,$t0
+ mov $s1,$t1
+ xor $acc0,$r20
+ xor $acc1,$r21
+
+ xor $r20,$s0
+ xor $r21,$s1
+ mov $s2,$acc0
+ mov $s3,$acc1
+ rol \$24,$s0
+ rol \$24,$s1
+ and \$0x80808080,$acc0
+ and \$0x80808080,$acc1
+ xor $r20,$s0
+ xor $r21,$s1
+ mov $acc0,$t2
+ mov $acc1,$t3
+ ror \$16,$t0
+ ror \$16,$t1
+ shr \$7,$t2
+ lea ($s2,$s2),$r20
+ xor $t0,$s0
+ xor $t1,$s1
+ shr \$7,$t3
+ lea ($s3,$s3),$r21
+ ror \$8,$t0
+ ror \$8,$t1
+ sub $t2,$acc0
+ sub $t3,$acc1
+ xor $t0,$s0
+ xor $t1,$s1
+
+ and \$0xfefefefe,$r20
+ and \$0xfefefefe,$r21
+ and \$0x1b1b1b1b,$acc0
+ and \$0x1b1b1b1b,$acc1
+ mov $s2,$t2
+ mov $s3,$t3
+ xor $acc0,$r20
+ xor $acc1,$r21
+
+ xor $r20,$s2
+ xor $r21,$s3
+ rol \$24,$s2
+ rol \$24,$s3
+ xor $r20,$s2
+ xor $r21,$s3
+ ror \$16,$t2
+ ror \$16,$t3
+ xor $t2,$s2
+ xor $t3,$s3
+ ror \$8,$t2
+ ror \$8,$t3
+ xor $t2,$s2
+ xor $t3,$s3
+
+___
+}
+
+$code.=<<___;
+.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
+.align 16
+_x86_64_AES_encrypt_compact:
+ lea 128($sbox),$acc0 # size optimization
+ mov 0-128($acc0),$acc1 # prefetch Te4
+ mov 32-128($acc0),$acc2
+ mov 64-128($acc0),$t0
+ mov 96-128($acc0),$t1
+ mov 128-128($acc0),$acc1
+ mov 160-128($acc0),$acc2
+ mov 192-128($acc0),$t0
+ mov 224-128($acc0),$t1
+
+ xor 0($key),$s0 # xor with key
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ lea 16($key),$key
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_loop_compact:
+___
+ &enccompactvert();
+ &enctransform();
+$code.=<<___;
+ xor 0($key),$s0
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ lea 16($key),$key
+ cmp 16(%rsp),$key
+ jne .Lenc_loop_compact
+___
+ &enccompactvert();
+$code.=<<___;
+ xor 0($key),$s0
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ .byte 0xf3,0xc3 # rep ret
+.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
+___
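
Assembled from those two pieces, _x86_64_AES_encrypt_compact has the textbook round shape: AddRoundKey, then rounds-1 iterations of SubBytes+ShiftRows, MixColumns, AddRoundKey, and a final round without MixColumns. A hedged C outline reusing the sketches above; rk_last stands in for the last-round-key address the wrapper pushes (the 16(%rsp) the loop compares against):

#include <stdint.h>

uint32_t mixcolumn(uint32_t s);                      /* sketch above */
void sub_shift(const uint32_t s[4], uint32_t t[4]);  /* sketch above */

void encrypt_compact(uint32_t s[4], const uint32_t *rk,
                     const uint32_t *rk_last)        /* rk + 4*rounds */
{
    uint32_t t[4];

    for (int i = 0; i < 4; i++) s[i] ^= rk[i];       /* AddRoundKey */
    rk += 4;
    while (rk != rk_last) {                          /* .Lenc_loop_compact */
        sub_shift(s, t);
        for (int i = 0; i < 4; i++) s[i] = mixcolumn(t[i]) ^ rk[i];
        rk += 4;
    }
    sub_shift(s, t);                                 /* last round: no MixColumns */
    for (int i = 0; i < 4; i++) s[i] = t[i] ^ rk[i];
}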
+
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
$code.=<<___;
.globl AES_encrypt
@@ -358,25 +594,49 @@ AES_encrypt:
push %r14
push %r15
+ # allocate frame "above" key schedule
+ mov %rsp,%rax
mov %rdx,$key
- mov %rdi,$inp
- mov %rsi,$out
+ lea -63(%rdx),%rcx
+ and \$-64,%rsp
+ sub %rsp,%rcx
+ neg %rcx
+ and \$0x3c0,%rcx
+ sub %rcx,%rsp
- .picmeup $sbox
- lea AES_Te-.($sbox),$sbox
+ push %rax # save real stack pointer
+ push %rsi # save out
- mov 0($inp),$s0
- mov 4($inp),$s1
- mov 8($inp),$s2
- mov 12($inp),$s3
+ mov 240($key),$rnds # load rounds
- call _x86_64_AES_encrypt
+ mov 0(%rdi),$s0 # load input vector
+ mov 4(%rdi),$s1
+ mov 8(%rdi),$s2
+ mov 12(%rdi),$s3
- mov $s0,0($out)
+ shl \$4,$rnds
+ lea ($key,$rnds),%rbp
+ push %rbp
+ push $key
+
+ # pick Te4 copy which can't "overlap" with stack frame or key schedule
+ .picmeup $sbox
+ lea AES_Te+2048-.($sbox),$sbox
+ lea 768-32(%rsp),%rbp
+ sub $sbox,%rbp
+ and \$0x300,%rbp
+ lea ($sbox,%rbp),$sbox
+
+ call _x86_64_AES_encrypt_compact
+
+ lea 16(%rsp),%rsp
+ pop $out # restore out
+ mov $s0,0($out) # write output vector
mov $s1,4($out)
mov $s2,8($out)
mov $s3,12($out)
+ mov (%rsp),%rsp
pop %r15
pop %r14
pop %r13
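
The rewritten AES_encrypt prologue packs two placement tricks into a few instructions. One reading of the arithmetic, modelled in C over uintptr_t addresses (this is an interpretation of the asm, not extracted code; the 768-32 bias accounts for the four 8-byte pushes that precede the Te4 selection):

#include <stdint.h>

/* align the frame to a cache line, then lower it by its distance
   (mod 1KB, in 64-byte lines) from the key schedule */
uintptr_t place_frame(uintptr_t rsp, uintptr_t key)
{
    uintptr_t r = rsp & ~(uintptr_t)63;             /* and  $-64,%rsp  */
    uintptr_t delta = (r - (key - 63)) & 0x3c0;     /* and $0x3c0,%rcx */
    return r - delta;                               /* sub  %rcx,%rsp  */
}

/* pick whichever of the four 256-byte Te4 copies sits in a different
   mod-1KB window than the stack frame */
uintptr_t pick_te4(uintptr_t te4 /* AES_Te+2048 */, uintptr_t rsp)
{
    return te4 + (((rsp + 768 - 32) - te4) & 0x300);
}

Both computations work mod 1KB in cache-line units; per the comments in the code, the goal is that the stack frame, the key schedule and the chosen Te4 copy never contend for the same L1 sets.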
@@ -453,19 +713,20 @@ sub declastvert()
{ my $t3="%r8d"; # zaps $inp!
$code.=<<___;
+ lea 2048($sbox),$sbox # size optimization
movzb `&lo("$s0")`,$acc0
movzb `&lo("$s1")`,$acc1
movzb `&lo("$s2")`,$acc2
- movzb 2048($sbox,$acc0,1),$t0
- movzb 2048($sbox,$acc1,1),$t1
- movzb 2048($sbox,$acc2,1),$t2
+ movzb ($sbox,$acc0,1),$t0
+ movzb ($sbox,$acc1,1),$t1
+ movzb ($sbox,$acc2,1),$t2
movzb `&lo("$s3")`,$acc0
movzb `&hi("$s3")`,$acc1
movzb `&hi("$s0")`,$acc2
- movzb 2048($sbox,$acc0,1),$t3
- movzb 2048($sbox,$acc1,1),$acc1 #$t0
- movzb 2048($sbox,$acc2,1),$acc2 #$t1
+ movzb ($sbox,$acc0,1),$t3
+ movzb ($sbox,$acc1,1),$acc1 #$t0
+ movzb ($sbox,$acc2,1),$acc2 #$t1
shl \$8,$acc1
shl \$8,$acc2
@@ -477,8 +738,8 @@ $code.=<<___;
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
shr \$16,$s0
- movzb 2048($sbox,$acc0,1),$acc0 #$t2
- movzb 2048($sbox,$acc1,1),$acc1 #$t3
+ movzb ($sbox,$acc0,1),$acc0 #$t2
+ movzb ($sbox,$acc1,1),$acc1 #$t3
shl \$8,$acc0
shl \$8,$acc1
@@ -490,9 +751,9 @@ $code.=<<___;
movzb `&lo("$s2")`,$acc0
movzb `&lo("$s3")`,$acc1
movzb `&lo("$s0")`,$acc2
- movzb 2048($sbox,$acc0,1),$acc0 #$t0
- movzb 2048($sbox,$acc1,1),$acc1 #$t1
- movzb 2048($sbox,$acc2,1),$acc2 #$t2
+ movzb ($sbox,$acc0,1),$acc0 #$t0
+ movzb ($sbox,$acc1,1),$acc1 #$t1
+ movzb ($sbox,$acc2,1),$acc2 #$t2
shl \$16,$acc0
shl \$16,$acc1
@@ -505,9 +766,9 @@ $code.=<<___;
movzb `&lo("$s1")`,$acc0
movzb `&hi("$s1")`,$acc1
movzb `&hi("$s2")`,$acc2
- movzb 2048($sbox,$acc0,1),$acc0 #$t3
- movzb 2048($sbox,$acc1,1),$acc1 #$t0
- movzb 2048($sbox,$acc2,1),$acc2 #$t1
+ movzb ($sbox,$acc0,1),$acc0 #$t3
+ movzb ($sbox,$acc1,1),$acc1 #$t0
+ movzb ($sbox,$acc2,1),$acc2 #$t1
shl \$16,$acc0
shl \$24,$acc1
@@ -520,8 +781,8 @@ $code.=<<___;
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
mov 16+12($key),$s3
- movzb 2048($sbox,$acc0,1),$acc0 #$t2
- movzb 2048($sbox,$acc1,1),$acc1 #$t3
+ movzb ($sbox,$acc0,1),$acc0 #$t2
+ movzb ($sbox,$acc1,1),$acc1 #$t3
mov 16+0($key),$s0
shl \$24,$acc0
@@ -532,6 +793,7 @@ $code.=<<___;
mov 16+4($key),$s1
mov 16+8($key),$s2
+ lea -2048($sbox),$sbox
xor $t0,$s0
xor $t1,$s1
xor $t2,$s2
@@ -659,6 +921,260 @@ $code.=<<___;
.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___
+sub deccompactvert()
+{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
+
+$code.=<<___;
+ movzb `&lo("$s0")`,$t0
+ movzb `&lo("$s1")`,$t1
+ movzb `&lo("$s2")`,$t2
+ movzb ($sbox,$t0,1),$t0
+ movzb ($sbox,$t1,1),$t1
+ movzb ($sbox,$t2,1),$t2
+
+ movzb `&lo("$s3")`,$t3
+ movzb `&hi("$s3")`,$acc0
+ movzb `&hi("$s0")`,$acc1
+ movzb ($sbox,$t3,1),$t3
+ movzb ($sbox,$acc0,1),$t4 #$t0
+ movzb ($sbox,$acc1,1),$t5 #$t1
+
+ movzb `&hi("$s1")`,$acc2
+ movzb `&hi("$s2")`,$acc0
+ shr \$16,$s2
+ movzb ($sbox,$acc2,1),$acc2 #$t2
+ movzb ($sbox,$acc0,1),$acc0 #$t3
+ shr \$16,$s3
+
+ movzb `&lo("$s2")`,$acc1
+ shl \$8,$t4
+ shl \$8,$t5
+ movzb ($sbox,$acc1,1),$acc1 #$t0
+ xor $t4,$t0
+ xor $t5,$t1
+
+ movzb `&lo("$s3")`,$t4
+ shr \$16,$s0
+ shr \$16,$s1
+ movzb `&lo("$s0")`,$t5
+ shl \$8,$acc2
+ shl \$8,$acc0
+ movzb ($sbox,$t4,1),$t4 #$t1
+ movzb ($sbox,$t5,1),$t5 #$t2
+ xor $acc2,$t2
+ xor $acc0,$t3
+
+ movzb `&lo("$s1")`,$acc2
+ movzb `&hi("$s1")`,$acc0
+ shl \$16,$acc1
+ movzb ($sbox,$acc2,1),$acc2 #$t3
+ movzb ($sbox,$acc0,1),$acc0 #$t0
+ xor $acc1,$t0
+
+ movzb `&hi("$s2")`,$acc1
+ shl \$16,$t4
+ shl \$16,$t5
+ movzb ($sbox,$acc1,1),$s1 #$t1
+ xor $t4,$t1
+ xor $t5,$t2
+
+ movzb `&hi("$s3")`,$acc1
+ shr \$8,$s0
+ shl \$16,$acc2
+ movzb ($sbox,$acc1,1),$s2 #$t2
+ movzb ($sbox,$s0,1),$s3 #$t3
+ xor $acc2,$t3
+
+ shl \$24,$acc0
+ shl \$24,$s1
+ shl \$24,$s2
+ xor $acc0,$t0
+ shl \$24,$s3
+ xor $t1,$s1
+ mov $t0,$s0
+ xor $t2,$s2
+ xor $t3,$s3
+___
+}
+
+# parallelized version! input is pair of 64-bit values: %rax=s1.s0
+# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
+# %ecx=s2 and %edx=s3.
+sub dectransform()
+{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
+ my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
+
+$code.=<<___;
+ mov $tp10,$acc0
+ mov $tp18,$acc8
+ and $mask80,$acc0
+ and $mask80,$acc8
+ mov $acc0,$tp40
+ mov $acc8,$tp48
+ shr \$7,$tp40
+ lea ($tp10,$tp10),$tp20
+ shr \$7,$tp48
+ lea ($tp18,$tp18),$tp28
+ sub $tp40,$acc0
+ sub $tp48,$acc8
+ and $maskfe,$tp20
+ and $maskfe,$tp28
+ and $mask1b,$acc0
+ and $mask1b,$acc8
+ xor $tp20,$acc0
+ xor $tp28,$acc8
+ mov $acc0,$tp20
+ mov $acc8,$tp28
+
+ and $mask80,$acc0
+ and $mask80,$acc8
+ mov $acc0,$tp80
+ mov $acc8,$tp88
+ shr \$7,$tp80
+ lea ($tp20,$tp20),$tp40
+ shr \$7,$tp88
+ lea ($tp28,$tp28),$tp48
+ sub $tp80,$acc0
+ sub $tp88,$acc8
+ and $maskfe,$tp40
+ and $maskfe,$tp48
+ and $mask1b,$acc0
+ and $mask1b,$acc8
+ xor $tp40,$acc0
+ xor $tp48,$acc8
+ mov $acc0,$tp40
+ mov $acc8,$tp48
+
+ and $mask80,$acc0
+ and $mask80,$acc8
+ mov $acc0,$tp80
+ mov $acc8,$tp88
+ shr \$7,$tp80
+ xor $tp10,$tp20 # tp2^=tp1
+ shr \$7,$tp88
+ xor $tp18,$tp28 # tp2^=tp1
+ sub $tp80,$acc0
+ sub $tp88,$acc8
+ lea ($tp40,$tp40),$tp80
+ lea ($tp48,$tp48),$tp88
+ xor $tp10,$tp40 # tp4^=tp1
+ xor $tp18,$tp48 # tp4^=tp1
+ and $maskfe,$tp80
+ and $maskfe,$tp88
+ and $mask1b,$acc0
+ and $mask1b,$acc8
+ xor $acc0,$tp80
+ xor $acc8,$tp88
+
+ xor $tp80,$tp10 # tp1^=tp8
+ xor $tp88,$tp18 # tp1^=tp8
+ xor $tp80,$tp20 # tp2^tp1^=tp8
+ xor $tp88,$tp28 # tp2^tp1^=tp8
+ mov $tp10,$acc0
+ mov $tp18,$acc8
+ xor $tp80,$tp40 # tp4^tp1^=tp8
+ xor $tp88,$tp48 # tp4^tp1^=tp8
+ shr \$32,$acc0
+ shr \$32,$acc8
+ xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
+ xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
+ rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
+ rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
+ xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
+ xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
+
+ rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
+ rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
+ xor `&LO("$tp80")`,`&LO("$tp10")`
+ xor `&LO("$tp88")`,`&LO("$tp18")`
+ shr \$32,$tp80
+ shr \$32,$tp88
+ xor `&LO("$tp80")`,`&LO("$acc0")`
+ xor `&LO("$tp88")`,`&LO("$acc8")`
+
+ mov $tp20,$tp80
+ mov $tp28,$tp88
+ shr \$32,$tp80
+ shr \$32,$tp88
+ rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
+ rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
+ rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
+ rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
+ xor `&LO("$tp20")`,`&LO("$tp10")`
+ xor `&LO("$tp28")`,`&LO("$tp18")`
+ mov $tp40,$tp20
+ mov $tp48,$tp28
+ xor `&LO("$tp80")`,`&LO("$acc0")`
+ xor `&LO("$tp88")`,`&LO("$acc8")`
+
+ shr \$32,$tp20
+ shr \$32,$tp28
+ rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
+ rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
+ rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
+ rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
+ xor `&LO("$tp40")`,`&LO("$tp10")`
+ xor `&LO("$tp48")`,`&LO("$tp18")`
+ xor `&LO("$tp20")`,`&LO("$acc0")`
+ xor `&LO("$tp28")`,`&LO("$acc8")`
+
+___
+}
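
dectransform is InvMixColumns built from the doubling chain tp2=2*s, tp4=4*s, tp8=8*s, run SWAR-style over two columns packed into each 64-bit register (hence the shr $32 shuffling around the 32-bit rotates). Per 32-bit column it reduces to the following C sketch; xtime4 repeats the mask trick from the encrypt side:

#include <stdint.h>

static uint32_t rotl32(uint32_t x, int n) { return x << n | x >> (32 - n); }

static uint32_t xtime4(uint32_t s)                  /* 2*s on 4 bytes */
{
    uint32_t hi = s & 0x80808080;
    return ((s << 1) & 0xfefefefe) ^ ((hi - (hi >> 7)) & 0x1b1b1b1b);
}

uint32_t inv_mixcolumn(uint32_t s)
{
    uint32_t tp2 = xtime4(s), tp4 = xtime4(tp2), tp8 = xtime4(tp4);
    uint32_t t9  = s ^ tp8;                         /*  9*s */
    uint32_t t11 = s ^ tp2 ^ tp8;                   /* 11*s */
    uint32_t t13 = s ^ tp4 ^ tp8;                   /* 13*s */
    uint32_t t14 = tp2 ^ tp4 ^ tp8;                 /* 14*s */

    /* byte i: 14*a[i] ^ 11*a[i+1] ^ 13*a[i+2] ^ 9*a[i+3] */
    return t14 ^ rotl32(t11, 24) ^ rotl32(t13, 16) ^ rotl32(t9, 8);
}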
+
+$code.=<<___;
+.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
+.align 16
+_x86_64_AES_decrypt_compact:
+ lea 128($sbox),$acc0 # size optimization
+ mov 0-128($acc0),$acc1 # prefetch Td4
+ mov 32-128($acc0),$acc2
+ mov 64-128($acc0),$t0
+ mov 96-128($acc0),$t1
+ mov 128-128($acc0),$acc1
+ mov 160-128($acc0),$acc2
+ mov 192-128($acc0),$t0
+ mov 224-128($acc0),$t1
+
+ xor 0($key),$s0 # xor with key
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ lea 16($key),$key
+
+ jmp .Ldec_compact_loop
+.align 16
+.Ldec_compact_loop:
+___
+ &deccompactvert();
+$code.=<<___;
+ mov 256+0($sbox),$mask80
+ shl \$32,%rbx
+ shl \$32,%rdx
+ mov 256+8($sbox),$maskfe
+ or %rbx,%rax
+ or %rdx,%rcx
+ mov 256+16($sbox),$mask1b
+___
+ &dectransform();
+$code.=<<___;
+ xor 0($key),$s0
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ lea 16($key),$key
+ cmp 16(%rsp),$key
+ jne .Ldec_compact_loop
+___
+ &deccompactvert();
+$code.=<<___;
+ xor 0($key),$s0
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ .byte 0xf3,0xc3 # rep ret
+.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
+___
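
Note the decrypt loop pulls $mask80/$maskfe/$mask1b from memory at 256+0($sbox), next to Td4, rather than using immediates: x86-64 ALU instructions take at most a sign-extended 32-bit immediate, so byte-replicated 64-bit masks must come from a register or a load. Their values, inferred from the 32-bit constants used elsewhere in this patch rather than from the (not shown) data section:

#include <stdint.h>

static const uint64_t mask80 = 0x8080808080808080ULL; /* high bit of each byte    */
static const uint64_t maskfe = 0xfefefefefefefefeULL; /* clears the <<1 carries   */
static const uint64_t mask1b = 0x1b1b1b1b1b1b1b1bULL; /* AES reduction polynomial */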
+
# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
$code.=<<___;
.globl AES_decrypt
@@ -672,37 +1188,51 @@ AES_decrypt:
push %r14
push %r15
+ # allocate frame "above" key schedule
+ mov %rsp,%rax
mov %rdx,$key
- mov %rdi,$inp
- mov %rsi,$out
+ lea -63(%rdx),%rcx
+ and \$-64,%rsp
+ sub %rsp,%rcx
+ neg %rcx
+ and \$0x3c0,%rcx
+ sub %rcx,%rsp
- .picmeup $sbox
- lea AES_Td-.($sbox),$sbox
-
- # prefetch Td4
- lea 2048+128($sbox),$sbox;
- mov 0-128($sbox),$s0
- mov 32-128($sbox),$s1
- mov 64-128($sbox),$s2
- mov 96-128($sbox),$s3
- mov 128-128($sbox),$s0
- mov 160-128($sbox),$s1
- mov 192-128($sbox),$s2
- mov 224-128($sbox),$s3
- lea -2048-128($sbox),$sbox;
-
- mov 0($inp),$s0
- mov 4($inp),$s1
- mov 8($inp),$s2
- mov 12($inp),$s3
-
- call _x86_64_AES_decrypt
+ push %rax # save real stack pointer
+ push %rsi # save out
+
+ mov 240($key),$rnds # load rounds
+
+ mov 0(%rdi),$s0 # load input vector
+ mov 4(%rdi),$s1
+ mov 8(%rdi),$s2
+ mov 12(%rdi),$s3
+
+ shl \$4,$rnds
+ lea ($key,$rnds),%rbp
+ push %rbp
+ push $key
+ # pick Td4 copy which can't "overlap" with stack frame or key schedule
+ .picmeup $sbox
+ lea AES_Td+2048-.($sbox),$sbox
+ lea 768-32(%rsp),%rbp
+ sub $sbox,%rbp
+ and \$0x300,%rbp
+ lea ($sbox,%rbp),$sbox
+ shr \$3,%rbp # recall "magic" constants!
+ add %rbp,$sbox
+
+ call _x86_64_AES_decrypt_compact
+
+ lea 16(%rsp),%rsp
+ pop $out # restore out
mov $s0,0($out)
mov $s1,4($out)
mov $s2,8($out)
mov $s3,12($out)
+ mov (%rsp),%rsp
pop %r15
pop %r14
pop %r13
@@ -718,27 +1248,26 @@ sub enckey()
{
$code.=<<___;
movz %dl,%esi # rk[i]>>0
- mov 2(%rbp,%rsi,8),%ebx
+ movzb -128(%rbp,%rsi),%ebx
movz %dh,%esi # rk[i]>>8
- and \$0xFF000000,%ebx
+ shl \$24,%ebx
xor %ebx,%eax
- mov 2(%rbp,%rsi,8),%ebx
+ movzb -128(%rbp,%rsi),%ebx
shr \$16,%edx
- and \$0x000000FF,%ebx
movz %dl,%esi # rk[i]>>16
xor %ebx,%eax
- mov 0(%rbp,%rsi,8),%ebx
+ movzb -128(%rbp,%rsi),%ebx
movz %dh,%esi # rk[i]>>24
- and \$0x0000FF00,%ebx
+ shl \$8,%ebx
xor %ebx,%eax
- mov 0(%rbp,%rsi,8),%ebx
- and \$0x00FF0000,%ebx
+ movzb -128(%rbp,%rsi),%ebx
+ shl \$16,%ebx
xor %ebx,%eax
- xor 2048(%rbp,%rcx,4),%eax # rcon
+ xor 1024-128(%rbp,%rcx,4),%eax # rcon
___
}
@@ -763,6 +1292,17 @@ AES_set_encrypt_key:
.picmeup %rbp
lea AES_Te-.(%rbp),%rbp
+ lea 2048+128(%rbp),%rbp
+
+ # prefetch Te4
+ mov 0-128(%rbp),%eax
+ mov 32-128(%rbp),%ebx
+ mov 64-128(%rbp),%r8d
+ mov 96-128(%rbp),%edx
+ mov 128-128(%rbp),%eax
+ mov 160-128(%rbp),%ebx
+ mov 192-128(%rbp),%r8d
+ mov 224-128(%rbp),%edx
cmp \$128,%ecx
je .L10rounds
@@ -900,24 +1440,23 @@ $code.=<<___;
mov %eax,%edx
mov 16(%rdi),%eax # rk[4]
movz %dl,%esi # rk[11]>>0
- mov 2(%rbp,%rsi,8),%ebx
+ movzb -128(%rbp,%rsi),%ebx
movz %dh,%esi # rk[11]>>8
- and \$0x000000FF,%ebx
xor %ebx,%eax
- mov 0(%rbp,%rsi,8),%ebx
+ movzb -128(%rbp,%rsi),%ebx
shr \$16,%edx
- and \$0x0000FF00,%ebx
+ shl \$8,%ebx
movz %dl,%esi # rk[11]>>16
xor %ebx,%eax
- mov 0(%rbp,%rsi,8),%ebx
+ movzb -128(%rbp,%rsi),%ebx
movz %dh,%esi # rk[11]>>24
- and \$0x00FF0000,%ebx
+ shl \$16,%ebx
xor %ebx,%eax
- mov 2(%rbp,%rsi,8),%ebx
- and \$0xFF000000,%ebx
+ movzb -128(%rbp,%rsi),%ebx
+ shl \$24,%ebx
xor %ebx,%eax
mov %eax,48(%rdi) # rk[12]
@@ -944,25 +1483,57 @@ $code.=<<___;
.size AES_set_encrypt_key,.-AES_set_encrypt_key
___
-sub deckey()
+sub deckey_ref()
{ my ($i,$ptr,$te,$td) = @_;
+ my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
$code.=<<___;
- mov $i($ptr),%eax
- mov %eax,%edx
- movz %ah,%ebx
- shr \$16,%edx
- and \$0xFF,%eax
- movzb 2($te,%rax,8),%rax
- movzb 2($te,%rbx,8),%rbx
- mov 0($td,%rax,8),%eax
- xor 3($td,%rbx,8),%eax
- movzb %dh,%ebx
- and \$0xFF,%edx
- movzb 2($te,%rdx,8),%rdx
- movzb 2($te,%rbx,8),%rbx
- xor 2($td,%rdx,8),%eax
- xor 1($td,%rbx,8),%eax
- mov %eax,$i($ptr)
+ mov $i($ptr),$tp1
+ mov $tp1,$acc
+ and \$0x80808080,$acc
+ mov $acc,$tp4
+ shr \$7,$tp4
+ lea 0($tp1,$tp1),$tp2
+ sub $tp4,$acc
+ and \$0xfefefefe,$tp2
+ and \$0x1b1b1b1b,$acc
+ xor $tp2,$acc
+ mov $acc,$tp2
+
+ and \$0x80808080,$acc
+ mov $acc,$tp8
+ shr \$7,$tp8
+ lea 0($tp2,$tp2),$tp4
+ sub $tp8,$acc
+ and \$0xfefefefe,$tp4
+ and \$0x1b1b1b1b,$acc
+ xor $tp1,$tp2 # tp2^tp1
+ xor $tp4,$acc
+ mov $acc,$tp4
+
+ and \$0x80808080,$acc
+ mov $acc,$tp8
+ shr \$7,$tp8
+ sub $tp8,$acc
+ lea 0($tp4,$tp4),$tp8
+ xor $tp1,$tp4 # tp4^tp1
+ and \$0xfefefefe,$tp8
+ and \$0x1b1b1b1b,$acc
+ xor $acc,$tp8
+
+ xor $tp8,$tp1 # tp1^tp8
+ rol \$8,$tp1 # ROTATE(tp1^tp8,8)
+ xor $tp8,$tp2 # tp2^tp1^tp8
+ xor $tp8,$tp4 # tp4^tp1^tp8
+ xor $tp2,$tp8
+ xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
+
+ xor $tp8,$tp1
+ rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
+ xor $tp2,$tp1
+ rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
+ xor $tp4,$tp1
+
+ mov $tp1,$i($ptr)
___
}
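
The rewritten deckey applies the same mask-trick InvMixColumns to one round-key word at a time, replacing the old Te-then-Td double lookup. In terms of the inv_mixcolumn() sketch above, the .Lpermute loop further down amounts to this (a sketch; the in-place layout is OpenSSL's 4-words-per-round schedule):

#include <stdint.h>

uint32_t inv_mixcolumn(uint32_t s);                 /* sketch above */

void invert_key_schedule(uint32_t *rk, int rounds)
{
    /* every round key except the first and the last */
    for (int r = 1; r < rounds; r++)
        for (int i = 0; i < 4; i++)
            rk[4*r + i] = inv_mixcolumn(rk[4*r + i]);
}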
@@ -973,19 +1544,22 @@ $code.=<<___;
.type AES_set_decrypt_key,\@function,3
.align 16
AES_set_decrypt_key:
- push %rdx
+ push %rdx # save key schedule
call AES_set_encrypt_key
cmp \$0,%eax
- je .Lproceed
- lea 24(%rsp),%rsp
- ret
-.Lproceed:
- mov (%rsp),%r8 # restore key schedule
- mov %rbx,(%rsp)
+ pop %r8 # restore key schedule
+ jne .Labort
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
- mov 240(%r8),%ecx # pull number of rounds
+ mov 240(%r8),%r14d # pull number of rounds
xor %rdi,%rdi
- lea (%rdi,%rcx,4),%rcx
+ lea (%rdi,%r14d,4),%rcx
mov %r8,%rsi
lea (%r8,%rcx,4),%rdi # pointer to last chunk
.align 4
@@ -1003,27 +1577,38 @@ AES_set_decrypt_key:
cmp %rsi,%rdi
jne .Linvert
- .picmeup %r9
- lea AES_Td-.(%r9),%rdi
- lea AES_Te-AES_Td(%rdi),%r9
+ .picmeup %rax
+ lea AES_Te+2048+1024-.(%rax),%rax # rcon
- mov %r8,%rsi
- mov 240(%r8),%ecx # pull number of rounds
- sub \$1,%ecx
+ mov 40(%rax),$mask80
+ mov 48(%rax),$maskfe
+ mov 56(%rax),$mask1b
+
+ mov %r8,$key
+ sub \$1,%r14d
.align 4
.Lpermute:
- lea 16(%rsi),%rsi
+ lea 16($key),$key
+ mov 0($key),%rax
+ mov 8($key),%rcx
___
- &deckey (0,"%rsi","%r9","%rdi");
- &deckey (4,"%rsi","%r9","%rdi");
- &deckey (8,"%rsi","%r9","%rdi");
- &deckey (12,"%rsi","%r9","%rdi");
+ &dectransform ();
$code.=<<___;
- sub \$1,%ecx
+ mov %eax,0($key)
+ mov %ebx,4($key)
+ mov %ecx,8($key)
+ mov %edx,12($key)
+ sub \$1,%r14d
jnz .Lpermute
xor %rax,%rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
pop %rbx
+.Labort:
ret
.size AES_set_decrypt_key,.-AES_set_decrypt_key
___
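
End to end, AES_set_decrypt_key is the standard equivalent-inverse-cipher schedule: build the encrypt schedule, reverse the order of the round keys (.Linvert), then InvMixColumns the middle ones (.Lpermute). A self-contained C sketch under those assumptions; set_encrypt_key() is a hypothetical stand-in for the routine earlier in this file, not its real signature:

#include <stdint.h>

int  set_encrypt_key(const uint8_t *userKey, int bits,
                     uint32_t *rk, int *rounds);    /* stand-in */
void invert_key_schedule(uint32_t *rk, int rounds); /* sketch above */

int set_decrypt_key_sketch(const uint8_t *userKey, int bits, uint32_t *rk)
{
    int rounds;
    if (set_encrypt_key(userKey, bits, rk, &rounds) != 0)
        return -1;                                  /* mirrors .Labort */

    for (int lo = 0, hi = rounds; lo < hi; lo++, hi--)  /* .Linvert */
        for (int i = 0; i < 4; i++) {
            uint32_t tmp = rk[4*lo + i];
            rk[4*lo + i] = rk[4*hi + i];
            rk[4*hi + i] = tmp;
        }
    invert_key_schedule(rk, rounds);                    /* .Lpermute */
    return 0;
}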
@@ -1461,11 +2046,145 @@ ___
&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
+
+#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0x