#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Needs more work: key setup, page boundaries, CBC routine...
#
# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
# 128-bit key, which is ~40% better than 64-bit code generated by gcc
# 4.0. But these are not the ones currently used! Their "compact"
# counterparts are used instead, for security reasons.
# ppc_AES_encrypt_compact runs at 1/2 of ppc_AES_encrypt speed, while
# ppc_AES_decrypt_compact runs at 1/3 of ppc_AES_decrypt speed.
# February 2010
#
# Rescheduling instructions to favour Power6 pipeline gives 10%
# performance improvement on the platform in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process a byte in 18 cycles, only in 23, because it fails to issue
# 4 load instructions in two cycles, only in 3. As a result, non-compact
# block subroutines are 25% slower than one would expect. Compact
# functions scale better, because they have a pure computational part,
# which scales perfectly with clock frequency. To be specific,
# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
# ppc_AES_decrypt_compact operates at 55 (in 64-bit build).
# The first command-line argument names the target flavour.  It selects the
# pointer size in bytes ($SIZE_T) and the matching store-with-update ($STU),
# load ($POP) and store ($PUSH) mnemonics.  A flavour mentioning neither
# "64" nor "32" is rejected.
$flavour = shift;
if    ($flavour =~ /64/) { ($SIZE_T,$STU,$POP,$PUSH) = (8,"stdu","ld","std"); }
elsif ($flavour =~ /32/) { ($SIZE_T,$STU,$POP,$PUSH) = (4,"stwu","lwz","stw"); }
else                     { die "nonsense $flavour"; }
# Locate ppc-xlate.pl: first next to this script, then in ../../perlasm/
# relative to it.  $dir keeps its trailing path separator; it is undefined
# (and interpolates as an empty string) when $0 has no directory part.
# The captures are taken in list context so that a failed match can never
# leave a stale $1 from an earlier regex in $dir.
($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything written to STDOUT through the translator, passing it the
# flavour followed by the next command-line argument.
# NOTE(review): that argument is presumably the output file name - confirm
# against ppc-xlate.pl.
# The call is parenthesised and checked with low-precedence "or": with "||"
# the die was attached to the (always true) command string, so a failed
# open could never be reported.
open(STDOUT,"| $^X $xlate $flavour ".shift(@ARGV)) or die "can't call $xlate: $!";
# Frame size in bytes: 32 slots of $SIZE_T bytes each.
# NOTE(review): presumably the stack frame reserved by the routines emitted
# later in the file - confirm against the code that uses $FRAME.
$FRAME=32*$SIZE_T;
# Append one ".long" directive to $code for every argument, each holding
# the 32-bit value twice (formatted as 0x%08x,0x%08x).
#   Arguments: list of integers; processing stops at the first undefined
#              element (or when the list is exhausted).
#   Side effect: extends the global $code.
# No prototype is declared on purpose: the sub consumes a list, and an empty
# "()" prototype would make every call written without the "&" sigil a
# compile-time error.  Calls of the form &_data_word(...) are unaffected.
sub _data_word
{
    while (defined(my $word = shift)) {
        $code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
    }
}
# Register allocation.  Every variable holds a register name as a plain
# string; the names are interpolated into the assembly templates appended
# to $code.
($sp,$toc)       = ("r1","r2");
($inp,$out,$key) = ("r3","r4","r5");

# Table pointers.  Note that $Tbl0 shares r3 with $inp and $Tbl3 shares r2
# with $toc.
($Tbl0,$Tbl1,$Tbl2,$Tbl3) = ("r3","r6","r7","r2");

# $s0..$s3 take r8..r11 and $t0..$t3 take r12..r15.
($s0,$s1,$s2,$s3) = map { "r$_" } 8..11;
($t0,$t1,$t2,$t3) = map { "r$_" } 12..15;

# $acc00..$acc15 take r16..r31, one register each, in order.
($acc00,$acc01,$acc02,$acc03,
 $acc04,$acc05,$acc06,$acc07,
 $acc08,$acc09,$acc10,$acc11,
 $acc12,$acc13,$acc14,$acc15) = map { "r$_" } 16..31;
# Stay away from the TLS pointer: the 64-bit flavour must not use r13 and
# the 32-bit flavour must not use r2.  The sanity checks die if the
# allocation above no longer places the offending register where this
# fix-up expects it.
if ($SIZE_T==8) {
    $t1 eq "r13" or die;
    $t1 = "r0";			# r0 takes over from r13
} else {
    $Tbl3 eq "r2" or die;
    $Tbl3 = $t0;		# $Tbl3 moves to the register $t0 had so far
    $t0 = "r0";			# and $t0 falls back to r0
}

# The two mask names are aliases for the registers of $Tbl2 and $Tbl3.
($mask80,$mask1b) = ($Tbl2,$Tbl3);
# Start of the generated assembly: two position-independent helper stubs,
# LAES_Te and LAES_Td.  Each one saves LR in r0, executes "bcl 20,31,$+4"
# so that LR receives the address of the following instruction, copies
# that address into $Tbl0, adds a constant displacement, restores LR and
# returns.  The caller therefore gets in $Tbl0 an address computed relative
# to the stub itself.
#
# Displacement arithmetic (4-byte instructions; the captured address is
# stub+8):
#   LAES_Te: +(128-8)             -> LAES_Te+128
#   LAES_Td: +(128-8-32+2048+256) -> LAES_Te+128+2048+256, since LAES_Td
#            sits 32 bytes after LAES_Te
# The .space fillers pad the two 24-byte stubs to 32 and 128 bytes in total.
# NOTE(review): what is stored at the resulting addresses is not visible in
# this part of the file; the inline comments call it the "1st data entry".
# NOTE(review): the backtick-quoted expressions are not evaluated here;
# presumably a later pass over $code evaluates them - confirm.
# "\$+4" is escaped so that the heredoc emits a literal "$+4" instead of
# interpolating a Perl variable.
$code.=<<___;
.machine "any"
.text
.align 7
LAES_Te:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry
addi $Tbl0,$Tbl0,`128-8`
mtlr r0
blr
.space `32-24`
LAES_Td:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
addi $Tbl0,$Tbl0,`128-8-32+2048+256`
mtlr r0
blr
.space `128-32-24`
___