#!/usr/bin/env perl
# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================
# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
# AMD64 Core2 EM64T
# -evp camellia-128-ecb 16.7 21.0 22.7
# + over gcc 3.4.6 +25% +5% 0%
#
# camellia-128-cbc 15.7 20.4 21.1
#
# 128-bit key setup 128 216 205 cycles/key
# + over gcc 3.4.6 +54% +39% +15%
#
# Numbers in "+" rows represent performance improvement over compiler
# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed. Improvement on
# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some of 64-bit operations in [32-bit] microcode.
# Command line: [flavour] [output]. A single argument that contains a dot
# is taken to be the output file name, in which case no flavour is set.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# $win64 is set for nasm/masm/mingw64 flavours, or when the output file
# name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate x86_64-xlate.pl: first in the directory this script was started
# from, then in ../../perlasm/ relative to that directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Everything written to STDOUT from here on is piped into the translator.
# Abort if the pipe cannot be started: otherwise STDOUT would be aliased to
# an unopened handle and the generated code would be lost without any error.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
# hi(REG): rewrite the first %eax/%rax, %ebx/%rbx, %ecx/%rcx or %edx/%rdx
# found in REG to its high-byte alias (%ah, %bh, %ch, %dh) and return the
# result. Operands that name none of these registers come back unchanged.
# Declared without a prototype because it takes one argument; the former
# empty prototype only worked because callers use the &hi(...) form.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; return $r; }
# lo(REG): rewrite a register name in REG to its low-byte alias and return
# the result:
#   %eax/%rax .. %edx/%rdx  ->  %al .. %dl
#   %esi/%rsi, %edi/%rdi    ->  %sil, %dil
#   %rN or %rNd             ->  %rNb
# Operands that match none of the patterns come back unchanged.
# Declared without a prototype because it takes one argument; the former
# empty prototype only worked because callers use the &lo(...) form.
sub lo { my $r=shift;
$r =~ s/%[er]([a-d])x/%${1}l/;
$r =~ s/%[er]([sd]i)/%${1}l/;
$r =~ s/%(r[0-9]+)[d]?/%${1}b/;
return $r; }
# Register assignment shared by the code templates in this file.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");	# 32-bit temporaries
@S=map("%r${_}d",8..11);				# %r8d..%r11d, the four block words
($i0,$i1)=("%esi","%edi");				# table indices
$Tbl="%rbp"; # size optimization
($inp,$out,$key,$keyend)=("%r12","%r13","%r14","%r15");
# 32-bit register used for the first argument: %ecx when $win64 is set,
# %edi otherwise.
$arg0d=($win64)?"%ecx":"%edi";
# const unsigned int Camellia_SBOX[4][256];
# Not laid out quite like that, though: to keep the code small,
# Camellia_SBOX[0][] is interleaved with [1][], and [2][] with [3][].
# The values below are the byte displacements of each table from $Tbl;
# the round code addresses them with an index scaled by 8.
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
# Camellia_Feistel(i[,seed]): append the assembly for one Feistel round
# to $code.
#
#   i     round index. Its parity selects which half of @S is the round
#         input: even i uses @S[0],@S[1] as input and updates @S[2],@S[3];
#         odd i uses @S[2],@S[3] as input and updates @S[0],@S[1].
#   seed  byte displacement added to the offset of the key words that are
#         prefetched for the next round; defaults to 0. A negative seed
#         makes the per-round step -8 instead of 8.
#
# The emitted code expects $t0/$t1 to hold the current round's key words
# already. It XORs the input half into them, combines eight table look-ups
# into $t2/$t3, loads the next key words into $t1/$t0 and finally folds
# $t2 and $t3 rotated right by 8 into the other half of the state.
# The back-quoted expressions are only interpolated here, not evaluated;
# presumably a later pass over $code evaluates them -- TODO confirm.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=8;
$scale=-8 if ($seed<0);
my $first=2*($i&1);
my ($s0,$s1,$s2,$s3)=@S[map { ($first+$_)%4 } (0..3)];
$code.=<<___;
xor $s0,$t0 # t0^=key[0]
xor $s1,$t1 # t1^=key[1]
movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
shr \$16,$t0
movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
shr \$16,$t1
xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
mov `$seed+($i+1)*$scale+4`($key),$t0
xor $t3,$t2 # t2^=t3
ror \$8,$t3 # t3=RightRotate(t3,8)
xor $t2,$s2
xor $t2,$s3
xor $t3,$s3
___
}
# Start of the generated assembly. The heredoc below emits three pieces.
# Note that any '#' text inside the heredoc is part of the emitted output,
# not a Perl comment.
#
# 1. Camellia_EncryptBlock (V1.x API entry point). Its first argument is
#    replaced by 3, plus the borrow produced by subtracting that argument
#    from 128 (so a value above 128 gives 4), before jumping to
#    .Lenc_rounds, the label placed directly in front of
#    Camellia_EncryptBlock_Rounds.
#
# 2. Camellia_EncryptBlock_Rounds (V2 entry point):
#
# void Camellia_EncryptBlock_Rounds(
# int grandRounds,
# const Byte plaintext[],
# const KEY_TABLE_TYPE keyTable,
# Byte ciphertext[])
#
#    Pushes %rbx, %rbp, %r13, %r14 and %r15, copies %rcx to $out and %rdx
#    to $key, sets $keyend to $key plus %edi shifted left by 6, and loads
#    the address of .LCamellia_SBOX into $Tbl. The four 32-bit words at
#    %rsi are loaded into @S and byte-swapped, _x86_64_Camellia_encrypt is
#    called, and @S is byte-swapped again and stored at $out. The saved
#    registers are then reloaded from the stack and the stack pointer is
#    moved up by 40 bytes. The "#mov %rsi,$inp" line is emitted as an
#    assembler comment only; the plaintext is read through %rsi directly.
#
# 3. The beginning of _x86_64_Camellia_encrypt. The first four key words
#    are XORed into @S with the words of each pair crossed: offset 0 goes
#    into @S[1] and offset 4 into @S[0], offset 8 into @S[3] and offset 12
#    into @S[2]. Then .Leloop opens by loading the key words at offsets 16
#    and 20 into $t1 and $t0. The loop body continues in the code that
#    follows this heredoc.
$code=<<___;
.text
# V1.x API
.globl Camellia_EncryptBlock
.type Camellia_EncryptBlock,\@abi-omnipotent
.align 16
Camellia_EncryptBlock:
movl \$128,%eax
subl $arg0d,%eax
movl \$3,$arg0d
adcl \$0,$arg0d # keyBitLength==128?3:4
jmp .Lenc_rounds
.size Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl Camellia_EncryptBlock_Rounds
.type Camellia_EncryptBlock_Rounds,\@function,4
.align 16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
push %rbx
push %rbp
push %r13
push %r14
push %r15
.Lenc_prologue:
#mov %rsi,$inp # put away arguments
mov %rcx,$out
mov %rdx,$key
shl \$6,%edi # process grandRounds
lea .LCamellia_SBOX(%rip),$Tbl
lea ($key,%rdi),$keyend
mov 0(%rsi),@S[0] # load plaintext
mov 4(%rsi),@S[1]
mov 8(%rsi),@S[2]
bswap @S[0]
mov 12(%rsi),@S[3]
bswap @S[1]
bswap @S[2]
bswap @S[3]
call _x86_64_Camellia_encrypt
bswap @S[0]
bswap @S[1]
bswap @S[2]
mov @S[0],0($out)
bswap @S[3]
mov @S[1],4($out)
mov @S[2],8($out)
mov @S[3],12($out)
mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%rbp
mov 32(%rsp),%rbx
lea 40(%rsp),%rsp
.Lenc_epilogue:
ret
.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
.type _x86_64_Camellia_encrypt,\@abi-omnipotent
.align 16
_x86_64_Camellia_encrypt:
xor 0($key),@S[1]
xor 4($key),@S[0] # ^=key[0-3]
xor 8($key),@S[3]
xor 12($key),@S[2]
.align 16
.Leloop:
mov 16($key),$t1 # prefetch key[4-5]
mov 20($key),$t0
___
for ($i=0