diff options
-rwxr-xr-x | Configure | 4 | ||||
-rw-r--r-- | TABLE | 76 | ||||
-rw-r--r-- | crypto/camellia/Makefile | 5 | ||||
-rw-r--r-- | crypto/camellia/asm/cmll-x86.pl | 1138 | ||||
-rw-r--r-- | crypto/camellia/asm/cmll-x86_64.pl | 1082 |
5 files changed, 2265 insertions, 40 deletions
@@ -116,11 +116,11 @@ my $tlib="-lnsl -lsocket"; my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:"; +my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o"; my $x86_elf_asm="$x86_asm:elf"; -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:"; +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void"; my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void"; @@ -230,7 +230,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = a.out $dso_scheme = dlfcn $shared_target= bsd-shared @@ -260,7 +260,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= bsd-shared @@ -290,7 +290,7 @@ $rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o -$cmll_obj = +$cmll_obj = cmll-x86_64.o cmll_misc.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= 
bsd-gcc-shared @@ -320,7 +320,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = coff $dso_scheme = dlfcn $shared_target= cygwin-shared @@ -380,7 +380,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = a.out $dso_scheme = $shared_target= @@ -650,7 +650,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = win32n $dso_scheme = win32 $shared_target= @@ -920,7 +920,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = beos $shared_target= beos-shared @@ -950,7 +950,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = beos $shared_target= beos-shared @@ -980,7 +980,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= bsd-gcc-shared @@ -1100,7 +1100,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = macosx $dso_scheme = dlfcn $shared_target= darwin-shared @@ -1190,7 +1190,7 @@ $rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o -$cmll_obj = +$cmll_obj = cmll-x86_64.o cmll_misc.o $perlasm_scheme = macosx $dso_scheme = dlfcn $shared_target= darwin-shared @@ -1250,7 +1250,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= 
bsd-shared @@ -1490,7 +1490,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = $shared_target= @@ -1520,7 +1520,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = macosx $dso_scheme = dlfcn $shared_target= darwin-shared @@ -1640,7 +1640,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -1670,7 +1670,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -1760,7 +1760,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -1790,7 +1790,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -1850,7 +1850,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= @@ -1880,7 +1880,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= @@ -1910,7 +1910,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = $shared_target= @@ -2060,7 
+2060,7 @@ $rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o -$cmll_obj = +$cmll_obj = cmll-x86_64.o cmll_misc.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -2090,7 +2090,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -2120,7 +2120,7 @@ $rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o -$cmll_obj = +$cmll_obj = cmll-x86_64.o cmll_misc.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -2270,7 +2270,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = $shared_target= @@ -2780,7 +2780,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -3110,7 +3110,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = a.out $dso_scheme = $shared_target= @@ -3170,7 +3170,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -3260,7 +3260,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -3530,7 +3530,7 @@ $rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o -$cmll_obj = +$cmll_obj = cmll-x86_64.o cmll_misc.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= linux-shared @@ -3590,7 +3590,7 @@ $rc4_obj = rc4-586.o 
$rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = coff $dso_scheme = win32 $shared_target= cygwin-shared @@ -4190,7 +4190,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= svr3-shared @@ -4220,7 +4220,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= svr3-shared @@ -4460,7 +4460,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= solaris-shared @@ -4550,7 +4550,7 @@ $rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o -$cmll_obj = +$cmll_obj = cmll-x86_64.o cmll_misc.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= solaris-shared @@ -4580,7 +4580,7 @@ $rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o -$cmll_obj = +$cmll_obj = cmll-x86_64.o cmll_misc.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= solaris-shared @@ -4820,7 +4820,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= svr5-shared @@ -4850,7 +4850,7 @@ $rc4_obj = rc4-586.o $rmd160_obj = rmd-586.o $rc5_obj = rc5-586.o $wp_obj = wp_block.o wp-mmx.o -$cmll_obj = +$cmll_obj = cmll-x86.o $perlasm_scheme = elf $dso_scheme = dlfcn $shared_target= gnu-shared diff --git a/crypto/camellia/Makefile b/crypto/camellia/Makefile index 6154f81347..76331ff07a 100644 --- a/crypto/camellia/Makefile +++ b/crypto/camellia/Makefile @@ -44,6 +44,11 @@ lib: $(LIBOBJ) $(RANLIB) $(LIB) || echo Never mind. 
@touch lib +cmll-x86.s: asm/cmll-x86.pl ../perlasm/x86asm.pl + $(PERL) asm/cmll-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ +cmll-x86_64.s: asm/cmll-x86_64.pl + $(PERL) asm/cmll-x86_64.pl $(PERLASM_SCHEME) > $@ + files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/camellia/asm/cmll-x86.pl b/crypto/camellia/asm/cmll-x86.pl new file mode 100644 index 0000000000..0812815bfb --- /dev/null +++ b/crypto/camellia/asm/cmll-x86.pl @@ -0,0 +1,1138 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Copyright (c) 2008 Andy Polyakov <appro@openssl.org> +# +# This module may be used under the terms of either the GNU General +# Public License version 2 or later, the GNU Lesser General Public +# License version 2.1 or later, the Mozilla Public License version +# 1.1 or the BSD License. The exact terms of either license are +# distributed along with this module. For further details see +# http://www.openssl.org/~appro/camellia/. +# ==================================================================== + +# Performance in cycles per processed byte (less is better) in +# 'openssl speed ...' benchmark: +# +# AMD K8 Core2 PIII P4 +# -evp camellia-128-ecb 21.5 22.8 27.0 28.9 +# + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64% +# + over icc 8.0 +48/19% +21/15% +21/17% +55/37% +# +# camellia-128-cbc 17.3 21.1 23.9 25.9 +# +# 128-bit key setup 196 280 256 240 cycles/key +# + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40% +# + over icc 8.0 +18/3% +10/0% +10/3% +21/10% +# +# Pairs of numbers in "+" rows represent performance improvement over +# compiler generated position-independent code, PIC, and non-PIC +# respectively. PIC results are of greater relevance, as this module +# is position-independent, i.e. suitable for a shared library or PIE. +# Position independence "costs" one register, which is why compilers +# are so close with non-PIC results, they have an extra register to +# spare. 
CBC results are better than ECB ones thanks to "zero-copy" +# private _x86_* interface, and are ~30-40% better than with compiler +# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on +# same CPU (where applicable). + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$OPENSSL=1; + +&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386"); + +@T=("eax","ebx","ecx","edx"); +$idx="esi"; +$key="edi"; +$Tbl="ebp"; + +# stack frame layout in _x86_Camellia_* routines, frame is allocated +# by caller +$__ra=&DWP(0,"esp"); # return address +$__s0=&DWP(4,"esp"); # s0 backing store +$__s1=&DWP(8,"esp"); # s1 backing store +$__s2=&DWP(12,"esp"); # s2 backing store +$__s3=&DWP(16,"esp"); # s3 backing store +$__end=&DWP(20,"esp"); # pointer to end/start of key schedule + +# stack frame layout in Camellia_[en|de]crypt routines, which differs from +# above by 4 and overlaps by pointer to end/start of key schedule +$_end=&DWP(16,"esp"); +$_esp=&DWP(20,"esp"); + +# const unsigned int Camellia_SBOX[4][256]; +# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][], +# and [2][] - with [3][]. This is done to optimize code size.
+$SBOX1_1110=0; # Camellia_SBOX[0] +$SBOX4_4404=4; # Camellia_SBOX[1] +$SBOX2_0222=2048; # Camellia_SBOX[2] +$SBOX3_3033=2052; # Camellia_SBOX[3] +&static_label("Camellia_SIGMA"); +&static_label("Camellia_SBOX"); + +# Emit one Feistel round. +# $i - round index; its parity selects which pair of @T is t0/t1 +# $seed - byte offset of the round keys relative to $key, defaults to 0; +# a negative value walks the key schedule backwards +# $frame - byte offset of the s[0-3] backing store from %esp, defaults to 0 +# Expects $idx to hold the first key word of this round and leaves the +# first key word of the next round prefetched in $idx. +sub Camellia_Feistel { +my ($i,$seed,$frame)=@_; +$seed=0 unless defined($seed); +$frame=0 unless defined($frame); +my $scale=$seed<0?-8:8; +my $j=($i&1)*2; +# parenthesised list so that all four names are lexical, not just $t0 +my ($t0,$t1,$t2,$t3)=map { $T[($j+$_)%4] } (0..3); + + &xor ($t0,$idx); # t0^=key[0] + &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1] + &movz ($idx,&HB($t0)); # (t0>>8)&0xff + &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0] + &movz ($idx,&LB($t0)); # (t0>>0)&0xff + &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0] + &shr ($t0,16); + &movz ($idx,&LB($t1)); # (t1>>0)&0xff + &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1] + &movz ($idx,&HB($t0)); # (t0>>24)&0xff + &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0] + &movz ($idx,&HB($t1)); # (t1>>8)&0xff + &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1] + &shr ($t1,16); + &movz ($t0,&LB($t0)); # (t0>>16)&0xff + &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0] + &movz ($idx,&HB($t1)); # (t1>>24)&0xff + &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3" + &xor ($t2,$t3); # t2^=t3 + &rotr ($t3,8); # t3=RightRotate(t3,8) + &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1] + &movz ($idx,&LB($t1)); # (t1>>16)&0xff + &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2" + &xor ($t3,$t0); # t3^=s3 + &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1] + &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1] + &xor ($t3,$t2); # t3^=t2 + &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3 + &xor ($t2,$t1); # t2^=s2 + &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2 +} + +# void Camellia_EncryptBlock_Rounds( +# int grandRounds, +# const Byte plaintext[], +# const KEY_TABLE_TYPE keyTable, +# Byte ciphertext[])
+&function_begin("Camellia_EncryptBlock_Rounds"); + &mov ("eax",&wparam(0)); # load grandRounds + &mov ($idx,&wparam(1)); # load plaintext pointer + &mov ($key,&wparam(2)); # load key schedule pointer + + &mov ("ebx","esp"); + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra + &and ("esp",-64); + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ("ecx",&DWP(-64-63,$key)); + &sub ("ecx","esp"); + &neg ("ecx"); + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line + &sub ("esp","ecx"); + &add ("esp",4); # 4 is reserved for callee's return address + + &shl ("eax",6); + &lea ("eax",&DWP(0,$key,"eax")); + &mov ($_esp,"ebx"); # save %esp + &mov ($_end,"eax"); # save keyEnd + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov (@T[0],&DWP(0,$idx)); # load plaintext + &mov (@T[1],&DWP(4,$idx)); + &mov (@T[2],&DWP(8,$idx)); + &bswap (@T[0]); + &mov (@T[3],&DWP(12,$idx)); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &call ("_x86_Camellia_encrypt"); + + &mov ("esp",$_esp); + &bswap (@T[0]); + &mov ($idx,&wparam(3)); # load ciphertext pointer + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + &mov (&DWP(0,$idx),@T[0]); # write ciphertext + &mov (&DWP(4,$idx),@T[1]); + &mov (&DWP(8,$idx),@T[2]); + &mov (&DWP(12,$idx),@T[3]); +&function_end("Camellia_EncryptBlock_Rounds"); +# V1.x API +&function_begin_B("Camellia_EncryptBlock"); + &mov ("eax",128); + &sub ("eax",&wparam(0)); # load keyBitLength + &mov ("eax",3); + &adc ("eax",0); # keyBitLength==128?3:4 + &mov (&wparam(0),"eax"); + &jmp (&label("Camellia_EncryptBlock_Rounds")); +&function_end_B("Camellia_EncryptBlock"); + +if ($OPENSSL) { +# void Camellia_encrypt( +# const unsigned char *in, +# unsigned char *out, +# const CAMELLIA_KEY *key) +&function_begin("Camellia_encrypt"); + &mov ($idx,&wparam(0)); # load 
plaintext pointer + &mov ($key,&wparam(2)); # load key schedule pointer + + &mov ("ebx","esp"); + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra + &and ("esp",-64); + &mov ("eax",&DWP(272,$key)); # load grandRounds counter + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ("ecx",&DWP(-64-63,$key)); + &sub ("ecx","esp"); + &neg ("ecx"); + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line + &sub ("esp","ecx"); + &add ("esp",4); # 4 is reserved for callee's return address + + &shl ("eax",6); + &lea ("eax",&DWP(0,$key,"eax")); + &mov ($_esp,"ebx"); # save %esp + &mov ($_end,"eax"); # save keyEnd + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov (@T[0],&DWP(0,$idx)); # load plaintext + &mov (@T[1],&DWP(4,$idx)); + &mov (@T[2],&DWP(8,$idx)); + &bswap (@T[0]); + &mov (@T[3],&DWP(12,$idx)); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &call ("_x86_Camellia_encrypt"); + + &mov ("esp",$_esp); + &bswap (@T[0]); + &mov ($idx,&wparam(1)); # load ciphertext pointer + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + &mov (&DWP(0,$idx),@T[0]); # write ciphertext + &mov (&DWP(4,$idx),@T[1]); + &mov (&DWP(8,$idx),@T[2]); + &mov (&DWP(12,$idx),@T[3]); +&function_end("Camellia_encrypt"); +} + +&function_begin_B("_x86_Camellia_encrypt"); + &xor (@T[0],&DWP(0,$key)); # ^=key[0-3] + &xor (@T[1],&DWP(4,$key)); + &xor (@T[2],&DWP(8,$key)); + &xor (@T[3],&DWP(12,$key)); + &mov ($idx,&DWP(16,$key)); # prefetch key[4] + + &mov ($__s0,@T[0]); # save s[0-3] + &mov ($__s1,@T[1]); + &mov ($__s2,@T[2]); + &mov ($__s3,@T[3]); + +&set_label("loop",16); + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); } + + &add ($key,16*4); + &cmp ($key,$__end); + &je (&label("done")); + + # @T[0-1] are preloaded, $idx is preloaded with key[0] + &and ($idx,@T[0]); + &mov (@T[3],$__s3); + 
&rotl ($idx,1); + &mov (@T[2],@T[3]); + &xor (@T[1],$idx); + &or (@T[2],&DWP(12,$key)); + &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1); + &xor (@T[2],$__s2); + + &mov ($idx,&DWP(4,$key)); + &mov ($__s2,@T[2]); # s2^=s3|key[3]; + &or ($idx,@T[1]); + &and (@T[2],&DWP(8,$key)); + &xor (@T[0],$idx); + &rotl (@T[2],1); + &mov ($__s0,@T[0]); # s0^=s1|key[1]; + &xor (@T[3],@T[2]); + &mov ($idx,&DWP(16,$key)); # prefetch key[4] + &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1); + &jmp (&label("loop")); + +&set_label("done",8); + &mov (@T[2],@T[0]); # SwapHalf + &mov (@T[3],@T[1]); + &mov (@T[0],$__s2); + &mov (@T[1],$__s3); + &xor (@T[0],$idx); # $idx is preloaded with key[0] + &xor (@T[1],&DWP(4,$key)); + &xor (@T[2],&DWP(8,$key)); + &xor (@T[3],&DWP(12,$key)); + &ret (); +&function_end_B("_x86_Camellia_encrypt"); + +# void Camellia_DecryptBlock_Rounds( +# int grandRounds, +# const Byte ciphertext[], +# const KEY_TABLE_TYPE keyTable, +# Byte plaintext[]) +&function_begin("Camellia_DecryptBlock_Rounds"); + &mov ("eax",&wparam(0)); # load grandRounds + &mov ($idx,&wparam(1)); # load ciphertext pointer + &mov ($key,&wparam(2)); # load key schedule pointer + + &mov ("ebx","esp"); + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra + &and ("esp",-64); + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ("ecx",&DWP(-64-63,$key)); + &sub ("ecx","esp"); + &neg ("ecx"); + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line + &sub ("esp","ecx"); + &add ("esp",4); # 4 is reserved for callee's return address + + &shl ("eax",6); + &mov (&DWP(4*4,"esp"),$key); # save keyStart + &lea ($key,&DWP(0,$key,"eax")); + &mov (&DWP(5*4,"esp"),"ebx");# save %esp + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov (@T[0],&DWP(0,$idx)); # load ciphertext + &mov (@T[1],&DWP(4,$idx)); + &mov 
(@T[2],&DWP(8,$idx)); + &bswap (@T[0]); + &mov (@T[3],&DWP(12,$idx)); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &call ("_x86_Camellia_decrypt"); + + &mov ("esp",&DWP(5*4,"esp")); + &bswap (@T[0]); + &mov ($idx,&wparam(3)); # load plaintext pointer + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + &mov (&DWP(0,$idx),@T[0]); # write plaintext + &mov (&DWP(4,$idx),@T[1]); + &mov (&DWP(8,$idx),@T[2]); + &mov (&DWP(12,$idx),@T[3]); +&function_end("Camellia_DecryptBlock_Rounds"); +# V1.x API +&function_begin_B("Camellia_DecryptBlock"); + &mov ("eax",128); + &sub ("eax",&wparam(0)); # load keyBitLength + &mov ("eax",3); + &adc ("eax",0); # keyBitLength==128?3:4 + &mov (&wparam(0),"eax"); + &jmp (&label("Camellia_DecryptBlock_Rounds")); +&function_end_B("Camellia_DecryptBlock"); + +if ($OPENSSL) { +# void Camellia_decrypt( +# const unsigned char *in, +# unsigned char *out, +# const CAMELLIA_KEY *key) +&function_begin("Camellia_decrypt"); + &mov ($idx,&wparam(0)); # load ciphertext pointer + &mov ($key,&wparam(2)); # load key schedule pointer + + &mov ("ebx","esp"); + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra + &and ("esp",-64); + &mov ("eax",&DWP(272,$key)); # load grandRounds counter + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ("ecx",&DWP(-64-63,$key)); + &sub ("ecx","esp"); + &neg ("ecx"); + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line + &sub ("esp","ecx"); + &add ("esp",4); # 4 is reserved for callee's return address + + &shl ("eax",6); + &mov (&DWP(4*4,"esp"),$key); # save keyStart + &lea ($key,&DWP(0,$key,"eax")); + &mov (&DWP(5*4,"esp"),"ebx");# save %esp + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov (@T[0],&DWP(0,$idx)); # load ciphertext + &mov (@T[1],&DWP(4,$idx)); + &mov (@T[2],&DWP(8,$idx)); + &bswap (@T[0]); + &mov 
(@T[3],&DWP(12,$idx)); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &call ("_x86_Camellia_decrypt"); + + &mov ("esp",&DWP(5*4,"esp")); + &bswap (@T[0]); + &mov ($idx,&wparam(1)); # load plaintext pointer + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + &mov (&DWP(0,$idx),@T[0]); # write plaintext + &mov (&DWP(4,$idx),@T[1]); + &mov (&DWP(8,$idx),@T[2]); + &mov (&DWP(12,$idx),@T[3]); +&function_end("Camellia_decrypt"); +} + +# Private decryption core. State is passed in and returned in @T[0-3]; +# $key is stepped down by 16*4 bytes per pass until it equals $__end. +&function_begin_B("_x86_Camellia_decrypt"); + &xor (@T[0],&DWP(0,$key)); # ^=key[0-3] + &xor (@T[1],&DWP(4,$key)); + &xor (@T[2],&DWP(8,$key)); + &xor (@T[3],&DWP(12,$key)); + &mov ($idx,&DWP(-8,$key)); # prefetch key[-2] + + &mov ($__s0,@T[0]); # save s[0-3] + &mov ($__s1,@T[1]); + &mov ($__s2,@T[2]); + &mov ($__s3,@T[3]); + +&set_label("loop",16); + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); } + + &sub ($key,16*4); + &cmp ($key,$__end); + &je (&label("done")); + + # @T[0-1] are preloaded, $idx is preloaded with key[2] + &and ($idx,@T[0]); + &mov (@T[3],$__s3); + &rotl ($idx,1); + &mov (@T[2],@T[3]); + &xor (@T[1],$idx); + &or (@T[2],&DWP(4,$key)); + &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[2],1); + &xor (@T[2],$__s2); + + &mov ($idx,&DWP(12,$key)); + &mov ($__s2,@T[2]); # s2^=s3|key[1]; + &or ($idx,@T[1]); + &and (@T[2],&DWP(0,$key)); + &xor (@T[0],$idx); + &rotl (@T[2],1); + &mov ($__s0,@T[0]); # s0^=s1|key[3]; + &xor (@T[3],@T[2]); + &mov ($idx,&DWP(-8,$key)); # prefetch key[-2] + &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[0],1); + &jmp (&label("loop")); + +&set_label("done",8); + &mov (@T[2],@T[0]); # SwapHalf + &mov (@T[3],@T[1]); + &mov (@T[0],$__s2); + &mov (@T[1],$__s3); + &xor (@T[2],$idx); # $idx is preloaded with key[2] + &xor (@T[3],&DWP(12,$key)); + &xor (@T[0],&DWP(0,$key)); + &xor (@T[1],&DWP(4,$key)); + &ret (); +&function_end_B("_x86_Camellia_decrypt"); + +# shld is very slow on Intel P4 family.
Even on AMD it limits +# instruction decode rate [because it's VectorPath] and consequently +# performance. PIII, PM and Core[2] seem to be the only ones which +# execute this code ~7% faster... +sub __rotl128 { + my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; + + $rnd *= 2; + if ($rot) { + &mov ($idx,$i0); + &shld ($i0,$i1,$rot); + &shld ($i1,$i2,$rot); + &shld ($i2,$i3,$rot); + &shld ($i3,$idx,$rot); + } + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); +} + +# ... Implementing 128-bit rotate without shld gives >3x performance +# improvement on P4, only ~7% degradation on other Intel CPUs and +# not worse performance on AMD. This is therefore preferred. +sub _rotl128 { + my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; + + $rnd *= 2; + if ($rot) { + &mov ($Tbl,$i0); + &shl ($i0,$rot); + &mov ($idx,$i1); + &shr ($idx,32-$rot); + &shl ($i1,$rot); + &or ($i0,$idx); + &mov ($idx,$i2); + &shl ($i2,$rot); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); + &shr ($idx,32-$rot); + &or ($i1,$idx); + &shr ($Tbl,32-$rot); + &mov ($idx,$i3); + &shr ($idx,32-$rot); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); + &shl ($i3,$rot); + &or ($i2,$idx); + &or ($i3,$Tbl); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); + } else { + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); + } +} + +sub _saveround { +my ($rnd,$key,@T)=@_; +my $bias=int(@T[0])?shift(@T):0; + + &mov (&DWP($bias+$rnd*8+0,$key),@T[0]); + &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1); + &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if 
($#T>=2); + &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3); +} + +sub _loadround { +my ($rnd,$key,@T)=@_; +my $bias=int(@T[0])?shift(@T):0; + + &mov (@T[0],&DWP($bias+$rnd*8+0,$key)); + &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1); + &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2); + &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3); +} + +# void Camellia_Ekeygen( +# const int keyBitLength, +# const Byte *rawKey, +# KEY_TABLE_TYPE keyTable) +&function_begin("Camellia_Ekeygen"); +{ my $step=0; + + &stack_push(4); # place for s[0-3] + + &mov ($Tbl,&wparam(0)); # load arguments + &mov ($idx,&wparam(1)); + &mov ($key,&wparam(2)); + + &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits + &mov (@T[1],&DWP(4,$idx)); + &mov (@T[2],&DWP(8,$idx)); + &mov (@T[3],&DWP(12,$idx)); + + &bswap (@T[0]); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &_saveround (0,$key,@T); # KL<<<0 + + &cmp ($Tbl,128); + &je (&label("1st128")); + + &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits + &mov (@T[1],&DWP(20,$idx)); + &cmp ($Tbl,192); + &je (&label("1st192")); + &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits + &mov (@T[3],&DWP(28,$idx)); + &jmp (&label("1st256")); +&set_label("1st192",4); + &mov (@T[2],@T[0]); + &mov (@T[3],@T[1]); + &not (@T[2]); + &not (@T[3]); +&set_label("1st256",4); + &bswap (@T[0]); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &_saveround (4,$key,@T); # temporary storage for KR!
+ + &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL + &xor (@T[1],&DWP(0*8+4,$key)); + &xor (@T[2],&DWP(1*8+0,$key)); + &xor (@T[3],&DWP(1*8+4,$key)); + +&set_label("1st128",4); + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl)); + + &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0] + &mov (&swtmp(0),@T[0]); # save s[0-3] + &mov (&swtmp(1),@T[1]); + &mov (&swtmp(2),@T[2]); + &mov (&swtmp(3),@T[3]); + &Camellia_Feistel($step++); + &Camellia_Feistel($step++); + &mov (@T[2],&swtmp(2)); + &mov (@T[3],&swtmp(3)); + + &mov ($idx,&wparam(2)); + &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL + &xor (@T[1],&DWP(0*8+4,$idx)); + &xor (@T[2],&DWP(1*8+0,$idx)); + &xor (@T[3],&DWP(1*8+4,$idx)); + + &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4] + &mov (&swtmp(0),@T[0]); # save s[0-3] + &mov (&swtmp(1),@T[1]); + &mov (&swtmp(2),@T[2]); + &mov (&swtmp(3),@T[3]); + &Camellia_Feistel($step++); + &Camellia_Feistel($step++); + &mov (@T[2],&swtmp(2)); + &mov (@T[3],&swtmp(3)); + + &mov ($idx,&wparam(0)); + &cmp ($idx,128); + &jne (&label("2nd256")); + + &mov ($key,&wparam(2)); + &lea ($key,&DWP(128,$key)); # size optimization + + ####### process KA + &_saveround (2,$key,-128,@T); # KA<<<0 + &_rotl128 (@T,1 |