#!/usr/bin/env perl
# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================
# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#                         AMD K8    Core2     PIII      P4
# -evp camellia-128-ecb   21.5      22.8      27.0      28.9
# + over gcc 3.4.6        +90/11%   +70/10%   +53/4%    +160/64%
# + over icc 8.0          +48/19%   +21/15%   +21/17%   +55/37%
#
# camellia-128-cbc        17.3      21.1      23.9      25.9
#
# 128-bit key setup       196       280       256       240       cycles/key
# + over gcc 3.4.6        +30/0%    +17/11%   +11/0%    +63/40%
# + over icc 8.0          +18/3%    +10/0%    +10/3%    +21/10%
#
# Pairs of numbers in "+" rows represent performance improvement over
# compiler generated position-independent code, PIC, and non-PIC
# respectively. PIC results are of greater relevance, as this module
# is position-independent, i.e. suitable for a shared library or PIE.
# Position independence "costs" one register, which is why compilers
# are so close with non-PIC results, they have an extra register to
# spare. CBC results are better than ECB ones thanks to "zero-copy"
# private _x86_* interface, and are ~30-40% better than with compiler
# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
# same CPU (where applicable).
# Script bootstrap: locate and load the perlasm framework, then
# initialise it. The order of these statements matters.
#
# Capture the directory part of the path this script was invoked with:
# everything in $0 up to and including the last '/' or '\'. When $0 has
# no directory component the match fails and $dir simply receives
# whatever $1 holds at that point, which interpolates as "" below if it
# is undefined.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Let "require" find x86asm.pl either next to this script or in
# ../../perlasm relative to it.
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
# NOTE(review): $OPENSSL is not read in the visible part of this file;
# presumably it is consulted by x86asm.pl - confirm.
$OPENSSL=1;
# Arguments: first command-line argument, this module's name, and a
# flag that is true when the last command-line argument is "386".
# NOTE(review): presumably the output flavour and a 386-compatibility
# switch - verify against x86asm.pl.
&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
# Register assignment used by the code generators in this file.
#   @T   - the four 32-bit working registers; generator subs pick their
#          temporaries out of this list by index
#   $idx - scratch register receiving the zero-extended byte that
#          indexes the S-box tables
#   $key - base register for key material operands
#   $Tbl - base register for the S-box table lookups
@T=qw(eax ebx ecx edx);
($idx,$key,$Tbl)=("esi","edi","ebp");
# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller; every slot is a 4-byte operand addressed off esp, with
# the return address occupying offset 0
$__ra=&DWP(0,"esp"); # return address
$__s0=&DWP(4,"esp"); # s0 backing store
$__s1=&DWP(8,"esp"); # s1 backing store
$__s2=&DWP(12,"esp"); # s2 backing store
$__s3=&DWP(16,"esp"); # s3 backing store
$__end=&DWP(20,"esp"); # pointer to end/start of key schedule
# stack frame layout in Camellia_[en|de]crypt routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule, i.e.
# $_end at offset 16 here is the slot addressed as $__end at offset 20
# above
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp"); # NOTE(review): presumably the saved stack pointer - confirm against its users
# Byte offsets of the four logical S-box tables within Camellia_SBOX,
# nominally "const unsigned int Camellia_SBOX[4][256]". To keep the
# code small the tables are stored pairwise interleaved rather than
# back to back: [0] shares storage with [1], and [2] with [3]. The two
# members of a pair therefore start 4 bytes apart, and the lookups in
# this file index a pair with a scale factor of 8.
($SBOX1_1110,$SBOX4_4404)=(0,4);       # Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052); # Camellia_SBOX[2], Camellia_SBOX[3]
# Register the two data labels this module refers to, in the same
# order as before: the SIGMA constants first, then the S-box table.
foreach my $label ("Camellia_SIGMA","Camellia_SBOX") {
	&static_label($label);
}
sub Camellia_Feistel {
my $i=@_[0];
my $seed=defined(@_[1])?@_[1]:0;
my $scale=$seed<0?-8:8;
my $frame=defined(@_[2])?@_[2]:0;
my $j=($i&1)*2;
my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
&xor ($t0,$idx); # t0^=key[0]
&xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
&movz ($idx,&HB($t0)); # (t0>>8)&0xff
&mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
&movz ($idx,&LB($t0)); # (t0>>0)&0xff
&xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
&shr ($t0,16);
&movz ($idx,&LB($t1)); # (t1>>0)&0xff
&mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
&movz ($idx,&HB($t0)); # (t0>>24)&0xff
&xor ($t3,&DWP($SBOX1_1110,