// Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// What's wrong with compiler generated code? Compiler never uses
// variable 'shr' which is pairable with 'extr'/'dep' instructions.
// Then it uses 'zxt' which is an I-type, but can be replaced with
// 'and' which in turn can be assigned to M-port [there're double as
// much M-ports as there're I-ports on Itanium 2]. By sacrificing few
// registers for small constants (255, 24 and 16) to be used with
// 'shr' and 'and' instructions I can achieve better ILP, Intruction
// Level Parallelism, and performance. This code outperforms GCC 3.3
// generated code by over factor of 2 (two), GCC 3.4 - by 70% and
// HP C - by 40%. Measured best-case scenario, i.e. aligned
// big-endian input, ECB timing on Itanium 2 is (18 + 13*rounds)
// ticks per block, or 9.25 CPU cycles per byte for 128 bit key.
// Version 1.2 mitigates the hazard of cache-timing attacks by
// a) compressing S-boxes from 8KB to 2KB+256B, b) scheduling
// references to S-boxes for L2 cache latency, c) prefetching T[ed]4
// prior last round. As result performance dropped to (26 + 15*rounds)
// ticks per block or 11 cycles per byte processed with 128-bit key.
// This is ~16% deterioration. For reference Itanium 2 L1 cache has
// 64 bytes line size and L2 - 128 bytes...
.ident "aes-ia64.S, version 1.2"
.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
.explicit
.text
rk0=r8; rk1=r9;
pfssave=r2;
lcsave=r10;
prsave=r3;
maskff=r11;
twenty4=r14;
sixteen=r15;
te00=r16; te11=r17; te22=r18; te33=r19;
te01=r20; te12=r21; te23=r22; te30=r23;
te02=r24; te13=r25; te20=r26; te31=r27;
te03=r28; te10=r29; te21=r30; te32=r31;
// these are rotating...
t0=r32; s0=r33;
t1=r34; s1=r35;
t2=r36; s2=r37;
t3=r38; s3=r39;
te0=r40; te1=r41; te2=r42; te3=r43;
#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP addp4
#else
# define ADDP add
#endif
// Offsets from Te0
#define TE0 0
#define TE2 2
#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
#define TE1 3
#define TE3 1
#else
#define TE1 1
#define TE3 3
#endif
// This implies that AES_KEY comprises 32-bit key schedule elements
// even on LP64 platforms.
#ifndef KSZ
# define KSZ 4
# define LDKEY ld4
#endif
.proc _ia64_AES_encrypt#
// Input: rk0-rk1
// te0
// te3 as AES_KEY->rounds!!!
// s0-s3
// maskff,twenty4,sixteen
// Output: r16,r20,r24,r28 as s0-s3
// Clobber: r16-r31,rk0-rk1,r32-r43
.align 32
_ia64_AES_encrypt:
.prologue
.altrp b6
.body
{ .mmi; alloc r16=ar.pfs,12,0,0,8
LDKEY t0=[rk0],2*KSZ
mov pr.rot=1<<16 }
{ .mmi; LDKEY t1=[rk1],2*KSZ
add te1=TE1,te0
add te3=-3,te3 };;
{ .mib; LDKEY t2=[rk0],2*KSZ
mov ar.ec=2 }
{ .mib; LDKEY t3=[rk1],2*KSZ
add te2=TE2,te0
brp.loop.imp .Le_top,.Le_end-16 };;
{ .mmi; xor s0=s0,t0
xor s1=s1,t1
mov ar.lc=te3 }
{ .mmi; xor s2=s2,t2
xor s3=s3,t3
add te3=TE3,te0 };;
.align 32
.Le_top:
{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
(p0) and te33=s3,maskff // 0/0:s3&0xff
(p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
(p0) and te30=s0,maskff // 0/1:s0&0xff
(p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
(p0) shladd te33=te33,3,te3 // 1/0:te0+s0>>24
(p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
(p0) shladd te30=te30,3,te3 // 1/1:te3+s0
(p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
{ .mmi; (p0) ld4 te33=[te33] // 2/0:te3[s3&0xff]
(p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
(p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
{ .mmi; (p0) ld4 te30=[te30] // 2/1:te3[s0]
(p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
(p0) shr.u