#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# Version 1.1
#
# The major reason for undertaken effort was to mitigate the hazard of
# cache-timing attack. This is [currently and initially!] addressed in
# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
# 2. References to them are scheduled for L2 cache latency, meaning
# that the tables don't have to reside in L1 cache. Once again, this
# is an initial draft and one should expect more countermeasures to
# be implemented...
#
# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
# round.
#
# Even though performance was not the primary goal [on the contrary,
# extra shifts "induced" by compressed S-box and longer loop epilogue
# "induced" by scheduling for L2 have negative effect on performance],
# the code turned out to run in ~23 cycles per processed byte en-/
# decrypted with 128-bit key. This is pretty good result for code
# with mentioned qualities and UltraSPARC core. Compared to Sun C
# generated code my encrypt procedure runs just few percents faster,
# while decrypt one - whole 50% faster [yes, Sun C failed to generate
# optimal decrypt procedure]. Compared to GNU C generated code both
# procedures are more than 60% faster:-)
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$locals=16;
$acc0="%l0";
$acc1="%o0";
$acc2="%o1";
$acc3="%o2";
$acc4="%l1";
$acc5="%o3";
$acc6="%o4";
$acc7="%o5";
$acc8="%l2";
$acc9="%o7";
$acc10="%g1";
$acc11="%g2";
$acc12="%l3";
$acc13="%g3";
$acc14="%g4";
$acc15="%g5";
$t0="%l4";
$t1="%l5";
$t2="%l6";
$t3="%l7";
$s0="%i0";
$s1="%i1";
$s2="%i2";
$s3="%i3";
$tbl="%i4";
$key="%i5";
$rounds="%i7"; # aliases with return address, which is off-loaded to stack
sub _data_word()
{ my $i;
while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
}
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.section ".text",#alloc,#execinstr
.align 256
AES_Te:
___
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,