summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xConfigure4
-rw-r--r--TABLE76
-rw-r--r--crypto/camellia/Makefile5
-rw-r--r--crypto/camellia/asm/cmll-x86.pl1138
-rw-r--r--crypto/camellia/asm/cmll-x86_64.pl1082
5 files changed, 2265 insertions, 40 deletions
diff --git a/Configure b/Configure
index 87e5abbb26..d12a3051e7 100755
--- a/Configure
+++ b/Configure
@@ -116,11 +116,11 @@ my $tlib="-lnsl -lsocket";
my $bits1="THIRTY_TWO_BIT ";
my $bits2="SIXTY_FOUR_BIT ";
-my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:";
+my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o";
my $x86_elf_asm="$x86_asm:elf";
-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o";
my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
diff --git a/TABLE b/TABLE
index bc4a0f7c46..e2f5b01756 100644
--- a/TABLE
+++ b/TABLE
@@ -230,7 +230,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = a.out
$dso_scheme = dlfcn
$shared_target= bsd-shared
@@ -260,7 +260,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= bsd-shared
@@ -290,7 +290,7 @@ $rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
-$cmll_obj =
+$cmll_obj = cmll-x86_64.o cmll_misc.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= bsd-gcc-shared
@@ -320,7 +320,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = coff
$dso_scheme = dlfcn
$shared_target= cygwin-shared
@@ -380,7 +380,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = a.out
$dso_scheme =
$shared_target=
@@ -650,7 +650,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = win32n
$dso_scheme = win32
$shared_target=
@@ -920,7 +920,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = beos
$shared_target= beos-shared
@@ -950,7 +950,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = beos
$shared_target= beos-shared
@@ -980,7 +980,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= bsd-gcc-shared
@@ -1100,7 +1100,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = macosx
$dso_scheme = dlfcn
$shared_target= darwin-shared
@@ -1190,7 +1190,7 @@ $rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
-$cmll_obj =
+$cmll_obj = cmll-x86_64.o cmll_misc.o
$perlasm_scheme = macosx
$dso_scheme = dlfcn
$shared_target= darwin-shared
@@ -1250,7 +1250,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= bsd-shared
@@ -1490,7 +1490,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme =
$shared_target=
@@ -1520,7 +1520,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = macosx
$dso_scheme = dlfcn
$shared_target= darwin-shared
@@ -1640,7 +1640,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -1670,7 +1670,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -1760,7 +1760,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -1790,7 +1790,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -1850,7 +1850,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target=
@@ -1880,7 +1880,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target=
@@ -1910,7 +1910,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme =
$shared_target=
@@ -2060,7 +2060,7 @@ $rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
-$cmll_obj =
+$cmll_obj = cmll-x86_64.o cmll_misc.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -2090,7 +2090,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -2120,7 +2120,7 @@ $rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
-$cmll_obj =
+$cmll_obj = cmll-x86_64.o cmll_misc.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -2270,7 +2270,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme =
$shared_target=
@@ -2780,7 +2780,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -3110,7 +3110,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = a.out
$dso_scheme =
$shared_target=
@@ -3170,7 +3170,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -3260,7 +3260,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -3530,7 +3530,7 @@ $rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
-$cmll_obj =
+$cmll_obj = cmll-x86_64.o cmll_misc.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= linux-shared
@@ -3590,7 +3590,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = coff
$dso_scheme = win32
$shared_target= cygwin-shared
@@ -4190,7 +4190,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= svr3-shared
@@ -4220,7 +4220,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= svr3-shared
@@ -4460,7 +4460,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= solaris-shared
@@ -4550,7 +4550,7 @@ $rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
-$cmll_obj =
+$cmll_obj = cmll-x86_64.o cmll_misc.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= solaris-shared
@@ -4580,7 +4580,7 @@ $rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
-$cmll_obj =
+$cmll_obj = cmll-x86_64.o cmll_misc.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= solaris-shared
@@ -4820,7 +4820,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= svr5-shared
@@ -4850,7 +4850,7 @@ $rc4_obj = rc4-586.o
$rmd160_obj = rmd-586.o
$rc5_obj = rc5-586.o
$wp_obj = wp_block.o wp-mmx.o
-$cmll_obj =
+$cmll_obj = cmll-x86.o
$perlasm_scheme = elf
$dso_scheme = dlfcn
$shared_target= gnu-shared
diff --git a/crypto/camellia/Makefile b/crypto/camellia/Makefile
index 6154f81347..76331ff07a 100644
--- a/crypto/camellia/Makefile
+++ b/crypto/camellia/Makefile
@@ -44,6 +44,11 @@ lib: $(LIBOBJ)
$(RANLIB) $(LIB) || echo Never mind.
@touch lib
+cmll-x86.s: asm/cmll-x86.pl ../perlasm/x86asm.pl
+ $(PERL) asm/cmll-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+cmll-x86_64.s: asm/cmll-x86_64.pl
+ $(PERL) asm/cmll-x86_64.pl $(PERLASM_SCHEME) > $@
+
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/camellia/asm/cmll-x86.pl b/crypto/camellia/asm/cmll-x86.pl
new file mode 100644
index 0000000000..0812815bfb
--- /dev/null
+++ b/crypto/camellia/asm/cmll-x86.pl
@@ -0,0 +1,1138 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
+#
+# This module may be used under the terms of either the GNU General
+# Public License version 2 or later, the GNU Lesser General Public
+# License version 2.1 or later, the Mozilla Public License version
+# 1.1 or the BSD License. The exact terms of either license are
+# distributed along with this module. For further details see
+# http://www.openssl.org/~appro/camellia/.
+# ====================================================================
+
+# Performance in cycles per processed byte (less is better) in
+# 'openssl speed ...' benchmark:
+#
+# AMD K8 Core2 PIII P4
+# -evp camellia-128-ecb 21.5 22.8 27.0 28.9
+# + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
+# + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
+#
+# camellia-128-cbc 17.3 21.1 23.9 25.9
+#
+# 128-bit key setup 196 280 256 240 cycles/key
+# + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
+# + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
+#
+# Pairs of numbers in "+" rows represent performance improvement over
+# compiler generated position-independent code, PIC, and non-PIC
+# respectively. PIC results are of greater relevance, as this module
+# is position-independent, i.e. suitable for a shared library or PIE.
+# Position independence "costs" one register, which is why compilers
+# are so close with non-PIC results, they have an extra register to
+# spare. CBC results are better than ECB ones thanks to "zero-copy"
+# private _x86_* interface, and are ~30-40% better than with compiler
+# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
+# same CPU (where applicable).
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$OPENSSL=1;
+
+&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
+
+@T=("eax","ebx","ecx","edx");
+$idx="esi";
+$key="edi";
+$Tbl="ebp";
+
+# stack frame layout in _x86_Camellia_* routines, frame is allocated
+# by caller
+$__ra=&DWP(0,"esp"); # return address
+$__s0=&DWP(4,"esp"); # s0 backing store
+$__s1=&DWP(8,"esp"); # s1 backing store
+$__s2=&DWP(12,"esp"); # s2 backing store
+$__s3=&DWP(16,"esp"); # s3 backing store
+$__end=&DWP(20,"esp"); # pointer to end/start of key schedule
+
+# stack frame layout in Camellia_[en|crypt] routines, which differs from
+# above by 4 and overlaps by pointer to end/start of key schedule
+$_end=&DWP(16,"esp");
+$_esp=&DWP(20,"esp");
+
+# const unsigned int Camellia_SBOX[4][256];
+# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
+# and [2][] - with [3][]. This is done to optimize code size.
+$SBOX1_1110=0; # Camellia_SBOX[0]
+$SBOX4_4404=4; # Camellia_SBOX[1]
+$SBOX2_0222=2048; # Camellia_SBOX[2]
+$SBOX3_3033=2052; # Camellia_SBOX[3]
+&static_label("Camellia_SIGMA");
+&static_label("Camellia_SBOX");
+
+sub Camellia_Feistel {
+my $i=@_[0];
+my $seed=defined(@_[1])?@_[1]:0;
+my $scale=$seed<0?-8:8;
+my $frame=defined(@_[2])?@_[2]:0;
+my $j=($i&1)*2;
+my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
+
+ &xor ($t0,$idx); # t0^=key[0]
+ &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
+ &movz ($idx,&HB($t0)); # (t0>>8)&0xff
+ &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
+ &movz ($idx,&LB($t0)); # (t0>>0)&0xff
+ &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
+ &shr ($t0,16);
+ &movz ($idx,&LB($t1)); # (t1>>0)&0xff
+ &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
+ &movz ($idx,&HB($t0)); # (t0>>24)&0xff
+ &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
+ &movz ($idx,&HB($t1)); # (t1>>8)&0xff
+ &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
+ &shr ($t1,16);
+ &movz ($t0,&LB($t0)); # (t0>>16)&0xff
+ &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
+ &movz ($idx,&HB($t1)); # (t1>>24)&0xff
+ &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
+ &xor ($t2,$t3); # t2^=t3
+ &rotr ($t3,8); # t3=RightRotate(t3,8)
+ &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
+ &movz ($idx,&LB($t1)); # (t1>>16)&0xff
+ &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
+ &xor ($t3,$t0); # t3^=s3
+ &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
+ &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
+ &xor ($t3,$t2); # t3^=t2
+ &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
+ &xor ($t2,$t1); # t2^=s2
+ &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
+}
+
+# void Camellia_EncryptBlock_Rounds(
+# int grandRounds,
+# const Byte plaintext[],
+# const KEY_TABLE_TYPE keyTable,
+# Byte ciphertext[])
+&function_begin("Camellia_EncryptBlock_Rounds");
+ &mov ("eax",&wparam(0)); # load grandRounds
+ &mov ($idx,&wparam(1)); # load plaintext pointer
+ &mov ($key,&wparam(2)); # load key schedule pointer
+
+ &mov ("ebx","esp");
+ &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
+ &and ("esp",-64);
+
+ # place stack frame just "above mod 1024" the key schedule
+ # this ensures that cache associativity of 2 suffices
+ &lea ("ecx",&DWP(-64-63,$key));
+ &sub ("ecx","esp");
+ &neg ("ecx");
+ &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
+ &sub ("esp","ecx");
+ &add ("esp",4); # 4 is reserved for callee's return address
+
+ &shl ("eax",6);
+ &lea ("eax",&DWP(0,$key,"eax"));
+ &mov ($_esp,"ebx"); # save %esp
+ &mov ($_end,"eax"); # save keyEnd
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop($Tbl);
+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+ &mov (@T[0],&DWP(0,$idx)); # load plaintext
+ &mov (@T[1],&DWP(4,$idx));
+ &mov (@T[2],&DWP(8,$idx));
+ &bswap (@T[0]);
+ &mov (@T[3],&DWP(12,$idx));
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+
+ &call ("_x86_Camellia_encrypt");
+
+ &mov ("esp",$_esp);
+ &bswap (@T[0]);
+ &mov ($idx,&wparam(3)); # load ciphertext pointer
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+ &mov (&DWP(0,$idx),@T[0]); # write ciphertext
+ &mov (&DWP(4,$idx),@T[1]);
+ &mov (&DWP(8,$idx),@T[2]);
+ &mov (&DWP(12,$idx),@T[3]);
+&function_end("Camellia_EncryptBlock_Rounds");
+# V1.x API
+&function_begin_B("Camellia_EncryptBlock");
+ &mov ("eax",128);
+ &sub ("eax",&wparam(0)); # load keyBitLength
+ &mov ("eax",3);
+ &adc ("eax",0); # keyBitLength==128?3:4
+ &mov (&wparam(0),"eax");
+ &jmp (&label("Camellia_EncryptBlock_Rounds"));
+&function_end_B("Camellia_EncryptBlock");
+
+if ($OPENSSL) {
+# void Camellia_encrypt(
+# const unsigned char *in,
+# unsigned char *out,
+# const CAMELLIA_KEY *key)
+&function_begin("Camellia_encrypt");
+ &mov ($idx,&wparam(0)); # load plaintext pointer
+ &mov ($key,&wparam(2)); # load key schedule pointer
+
+ &mov ("ebx","esp");
+ &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
+ &and ("esp",-64);
+ &mov ("eax",&DWP(272,$key)); # load grandRounds counter
+
+ # place stack frame just "above mod 1024" the key schedule
+ # this ensures that cache associativity of 2 suffices
+ &lea ("ecx",&DWP(-64-63,$key));
+ &sub ("ecx","esp");
+ &neg ("ecx");
+ &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
+ &sub ("esp","ecx");
+ &add ("esp",4); # 4 is reserved for callee's return address
+
+ &shl ("eax",6);
+ &lea ("eax",&DWP(0,$key,"eax"));
+ &mov ($_esp,"ebx"); # save %esp
+ &mov ($_end,"eax"); # save keyEnd
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop($Tbl);
+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+ &mov (@T[0],&DWP(0,$idx)); # load plaintext
+ &mov (@T[1],&DWP(4,$idx));
+ &mov (@T[2],&DWP(8,$idx));
+ &bswap (@T[0]);
+ &mov (@T[3],&DWP(12,$idx));
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+
+ &call ("_x86_Camellia_encrypt");
+
+ &mov ("esp",$_esp);
+ &bswap (@T[0]);
+ &mov ($idx,&wparam(1)); # load ciphertext pointer
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+ &mov (&DWP(0,$idx),@T[0]); # write ciphertext
+ &mov (&DWP(4,$idx),@T[1]);
+ &mov (&DWP(8,$idx),@T[2]);
+ &mov (&DWP(12,$idx),@T[3]);
+&function_end("Camellia_encrypt");
+}
+
+&function_begin_B("_x86_Camellia_encrypt");
+ &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
+ &xor (@T[1],&DWP(4,$key));
+ &xor (@T[2],&DWP(8,$key));
+ &xor (@T[3],&DWP(12,$key));
+ &mov ($idx,&DWP(16,$key)); # prefetch key[4]
+
+ &mov ($__s0,@T[0]); # save s[0-3]
+ &mov ($__s1,@T[1]);
+ &mov ($__s2,@T[2]);
+ &mov ($__s3,@T[3]);
+
+&set_label("loop",16);
+ for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
+
+ &add ($key,16*4);
+ &cmp ($key,$__end);
+ &je (&label("done"));
+
+ # @T[0-1] are preloaded, $idx is preloaded with key[0]
+ &and ($idx,@T[0]);
+ &mov (@T[3],$__s3);
+ &rotl ($idx,1);
+ &mov (@T[2],@T[3]);
+ &xor (@T[1],$idx);
+ &or (@T[2],&DWP(12,$key));
+ &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
+ &xor (@T[2],$__s2);
+
+ &mov ($idx,&DWP(4,$key));
+ &mov ($__s2,@T[2]); # s2^=s3|key[3];
+ &or ($idx,@T[1]);
+ &and (@T[2],&DWP(8,$key));
+ &xor (@T[0],$idx);
+ &rotl (@T[2],1);
+ &mov ($__s0,@T[0]); # s0^=s1|key[1];
+ &xor (@T[3],@T[2]);
+ &mov ($idx,&DWP(16,$key)); # prefetch key[4]
+ &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
+ &jmp (&label("loop"));
+
+&set_label("done",8);
+ &mov (@T[2],@T[0]); # SwapHalf
+ &mov (@T[3],@T[1]);
+ &mov (@T[0],$__s2);
+ &mov (@T[1],$__s3);
+ &xor (@T[0],$idx); # $idx is preloaded with key[0]
+ &xor (@T[1],&DWP(4,$key));
+ &xor (@T[2],&DWP(8,$key));
+ &xor (@T[3],&DWP(12,$key));
+ &ret ();
+&function_end_B("_x86_Camellia_encrypt");
+
+# void Camellia_DecryptBlock_Rounds(
+# int grandRounds,
+# const Byte ciphertext[],
+# const KEY_TABLE_TYPE keyTable,
+# Byte plaintext[])
+&function_begin("Camellia_DecryptBlock_Rounds");
+ &mov ("eax",&wparam(0)); # load grandRounds
+ &mov ($idx,&wparam(1)); # load ciphertext pointer
+ &mov ($key,&wparam(2)); # load key schedule pointer
+
+ &mov ("ebx","esp");
+ &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
+ &and ("esp",-64);
+
+ # place stack frame just "above mod 1024" the key schedule
+ # this ensures that cache associativity of 2 suffices
+ &lea ("ecx",&DWP(-64-63,$key));
+ &sub ("ecx","esp");
+ &neg ("ecx");
+ &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
+ &sub ("esp","ecx");
+ &add ("esp",4); # 4 is reserved for callee's return address
+
+ &shl ("eax",6);
+ &mov (&DWP(4*4,"esp"),$key); # save keyStart
+ &lea ($key,&DWP(0,$key,"eax"));
+ &mov (&DWP(5*4,"esp"),"ebx");# save %esp
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop($Tbl);
+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+ &mov (@T[0],&DWP(0,$idx)); # load ciphertext
+ &mov (@T[1],&DWP(4,$idx));
+ &mov (@T[2],&DWP(8,$idx));
+ &bswap (@T[0]);
+ &mov (@T[3],&DWP(12,$idx));
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+
+ &call ("_x86_Camellia_decrypt");
+
+ &mov ("esp",&DWP(5*4,"esp"));
+ &bswap (@T[0]);
+ &mov ($idx,&wparam(3)); # load plaintext pointer
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+ &mov (&DWP(0,$idx),@T[0]); # write plaintext
+ &mov (&DWP(4,$idx),@T[1]);
+ &mov (&DWP(8,$idx),@T[2]);
+ &mov (&DWP(12,$idx),@T[3]);
+&function_end("Camellia_DecryptBlock_Rounds");
+# V1.x API
+&function_begin_B("Camellia_DecryptBlock");
+ &mov ("eax",128);
+ &sub ("eax",&wparam(0)); # load keyBitLength
+ &mov ("eax",3);
+ &adc ("eax",0); # keyBitLength==128?3:4
+ &mov (&wparam(0),"eax");
+ &jmp (&label("Camellia_DecryptBlock_Rounds"));
+&function_end_B("Camellia_DecryptBlock");
+
+if ($OPENSSL) {
+# void Camellia_decrypt(
+# const unsigned char *in,
+# unsigned char *out,
+# const CAMELLIA_KEY *key)
+&function_begin("Camellia_decrypt");
+ &mov ($idx,&wparam(0)); # load ciphertext pointer
+ &mov ($key,&wparam(2)); # load key schedule pointer
+
+ &mov ("ebx","esp");
+ &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
+ &and ("esp",-64);
+ &mov ("eax",&DWP(272,$key)); # load grandRounds counter
+
+ # place stack frame just "above mod 1024" the key schedule
+ # this ensures that cache associativity of 2 suffices
+ &lea ("ecx",&DWP(-64-63,$key));
+ &sub ("ecx","esp");
+ &neg ("ecx");
+ &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
+ &sub ("esp","ecx");
+ &add ("esp",4); # 4 is reserved for callee's return address
+
+ &shl ("eax",6);
+ &mov (&DWP(4*4,"esp"),$key); # save keyStart
+ &lea ($key,&DWP(0,$key,"eax"));
+ &mov (&DWP(5*4,"esp"),"ebx");# save %esp
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop($Tbl);
+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+ &mov (@T[0],&DWP(0,$idx)); # load ciphertext
+ &mov (@T[1],&DWP(4,$idx));
+ &mov (@T[2],&DWP(8,$idx));
+ &bswap (@T[0]);
+ &mov (@T[3],&DWP(12,$idx));
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+
+ &call ("_x86_Camellia_decrypt");
+
+ &mov ("esp",&DWP(5*4,"esp"));
+ &bswap (@T[0]);
+ &mov ($idx,&wparam(1)); # load plaintext pointer
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+ &mov (&DWP(0,$idx),@T[0]); # write plaintext
+ &mov (&DWP(4,$idx),@T[1]);
+ &mov (&DWP(8,$idx),@T[2]);
+ &mov (&DWP(12,$idx),@T[3]);
+&function_end("Camellia_decrypt");
+}
+
+&function_begin_B("_x86_Camellia_decrypt");
+ &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
+ &xor (@T[1],&DWP(4,$key));
+ &xor (@T[2],&DWP(8,$key));
+ &xor (@T[3],&DWP(12,$key));
+ &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
+
+ &mov ($__s0,@T[0]); # save s[0-3]
+ &mov ($__s1,@T[1]);
+ &mov ($__s2,@T[2]);
+ &mov ($__s3,@T[3]);
+
+&set_label("loop",16);
+ for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
+
+ &sub ($key,16*4);
+ &cmp ($key,$__end);
+ &je (&label("done"));
+
+ # @T[0-1] are preloaded, $idx is preloaded with key[2]
+ &and ($idx,@T[0]);
+ &mov (@T[3],$__s3);
+ &rotl ($idx,1);
+ &mov (@T[2],@T[3]);
+ &xor (@T[1],$idx);
+ &or (@T[2],&DWP(4,$key));
+ &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
+ &xor (@T[2],$__s2);
+
+ &mov ($idx,&DWP(12,$key));
+ &mov ($__s2,@T[2]); # s2^=s3|key[3];
+ &or ($idx,@T[1]);
+ &and (@T[2],&DWP(0,$key));
+ &xor (@T[0],$idx);
+ &rotl (@T[2],1);
+ &mov ($__s0,@T[0]); # s0^=s1|key[1];
+ &xor (@T[3],@T[2]);
+ &mov ($idx,&DWP(-8,$key)); # prefetch key[4]
+ &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
+ &jmp (&label("loop"));
+
+&set_label("done",8);
+ &mov (@T[2],@T[0]); # SwapHalf
+ &mov (@T[3],@T[1]);
+ &mov (@T[0],$__s2);
+ &mov (@T[1],$__s3);
+ &xor (@T[2],$idx); # $idx is preloaded with key[2]
+ &xor (@T[3],&DWP(12,$key));
+ &xor (@T[0],&DWP(0,$key));
+ &xor (@T[1],&DWP(4,$key));
+ &ret ();
+&function_end_B("_x86_Camellia_decrypt");
+
+# shld is very slow on Intel P4 family. Even on AMD it limits
+# instruction decode rate [because it's VectorPath] and consequently
+# performance. PIII, PM and Core[2] seem to be the only ones which
+# execute this code ~7% faster...
+sub __rotl128 {
+ my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
+
+ $rnd *= 2;
+ if ($rot) {
+ &mov ($idx,$i0);
+ &shld ($i0,$i1,$rot);
+ &shld ($i1,$i2,$rot);
+ &shld ($i2,$i3,$rot);
+ &shld ($i3,$idx,$rot);
+ }
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
+}
+
+# ... Implementing 128-bit rotate without shld gives >3x performance
+# improvement on P4, only ~7% degradation on other Intel CPUs and
+# not worse performance on AMD. This is therefore preferred.
+sub _rotl128 {
+ my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
+
+ $rnd *= 2;
+ if ($rot) {
+ &mov ($Tbl,$i0);
+ &shl ($i0,$rot);
+ &mov ($idx,$i1);
+ &shr ($idx,32-$rot);
+ &shl ($i1,$rot);
+ &or ($i0,$idx);
+ &mov ($idx,$i2);
+ &shl ($i2,$rot);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
+ &shr ($idx,32-$rot);
+ &or ($i1,$idx);
+ &shr ($Tbl,32-$rot);
+ &mov ($idx,$i3);
+ &shr ($idx,32-$rot);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
+ &shl ($i3,$rot);
+ &or ($i2,$idx);
+ &or ($i3,$Tbl);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
+ } else {
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
+ }
+}
+
+sub _saveround {
+my ($rnd,$key,@T)=@_;
+my $bias=int(@T[0])?shift(@T):0;
+
+ &mov (&DWP($bias+$rnd*8+0,$key),@T[0]);
+ &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1);
+ &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
+ &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
+}
+
+sub _loadround {
+my ($rnd,$key,@T)=@_;
+my $bias=int(@T[0])?shift(@T):0;
+
+ &mov (@T[0],&DWP($bias+$rnd*8+0,$key));
+ &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1);
+ &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2);
+ &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3);
+}
+
+# void Camellia_Ekeygen(
+# const int keyBitLength,
+# const Byte *rawKey,
+# KEY_TABLE_TYPE keyTable)
+&function_begin("Camellia_Ekeygen");
+{ my $step=0;
+
+ &stack_push(4); # place for s[0-3]
+
+ &mov ($Tbl,&wparam(0)); # load arguments
+ &mov ($idx,&wparam(1));
+ &mov ($key,&wparam(2));
+
+ &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
+ &mov (@T[1],&DWP(4,$idx));
+ &mov (@T[2],&DWP(8,$idx));
+ &mov (@T[3],&DWP(12,$idx));
+
+ &bswap (@T[0]);
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+
+ &_saveround (0,$key,@T); # KL<<<0
+
+ &cmp ($Tbl,128);
+ &je (&label("1st128"));
+
+ &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
+ &mov (@T[1],&DWP(20,$idx));
+ &cmp ($Tbl,192);
+ &je (&label("1st192"));
+ &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
+ &mov (@T[3],&DWP(28,$idx));
+ &jmp (&label("1st256"));
+&set_label("1st192",4);
+ &mov (@T[2],@T[0]);
+ &mov (@T[3],@T[1]);
+ &not (@T[2]);
+ &not (@T[3]);
+&set_label("1st256",4);
+ &bswap (@T[0]);
+ &bswap (@T[1]);
+ &bswap (@T[2]);
+ &bswap (@T[3]);
+
+ &_saveround (4,$key,@T); # temporary storage for KR!
+
+ &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
+ &xor (@T[1],&DWP(0*8+4,$key));
+ &xor (@T[2],&DWP(1*8+0,$key));
+ &xor (@T[3],&DWP(1*8+4,$key));
+
+&set_label("1st128",4);
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop($Tbl);
+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+ &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
+
+ &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
+ &mov (&swtmp(0),@T[0]); # save s[0-3]
+ &mov (&swtmp(1),@T[1]);
+ &mov (&swtmp(2),@T[2]);
+ &mov (&swtmp(3),@T[3]);
+ &Camellia_Feistel($step++);
+ &Camellia_Feistel($step++);
+ &mov (@T[2],&swtmp(2));
+ &mov (@T[3],&swtmp(3));
+
+ &mov ($idx,&wparam(2));
+ &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
+ &xor (@T[1],&DWP(0*8+4,$idx));
+ &xor (@T[2],&DWP(1*8+0,$idx));
+ &xor (@T[3],&DWP(1*8+4,$idx));
+
+ &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
+ &mov (&swtmp(0),@T[0]); # save s[0-3]
+ &mov (&swtmp(1),@T[1]);
+ &mov (&swtmp(2),@T[2]);
+ &mov (&swtmp(3),@T[3]);
+ &Camellia_Feistel($step++);
+ &Camellia_Feistel($step++);
+ &mov (@T[2],&swtmp(2));
+ &mov (@T[3],&swtmp(3));
+
+ &mov ($idx,&wparam(0));
+ &cmp ($idx,128);
+ &jne (&label("2nd256"));
+
+ &mov ($key,&wparam(2));
+ &lea ($key,&DWP(128,$key)); # size optimization
+
+ ####### process KA
+ &_saveround (2,$key,-128,@T); # KA<<<0
+ &_rotl128 (@T,1