crypto/sha/asm/sha512-ia64.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# SHA256/512_Transform for Itanium.
#
# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
# faster than gcc and >60%(!) faster than code generated by HP-UX
# compiler (yes, HP-UX is generating slower code, because unlike gcc,
# it failed to deploy "shift right pair," 'shrp' instruction, which
# substitutes for 64-bit rotate).
#
# 924 cycles long sha256_block outperforms gcc by over factor of 2(!)
# and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost
# this one big time). Note that "formally" 924 is about 100 cycles
# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
# are spent on extra work to provide for 32-bit rotations. 32-bit
# rotations are still handled by 'shrp' instruction and for this
# reason lower 32 bits are deposited to upper half of 64-bit register
# prior to 'shrp' issue. And in order to minimize the amount of such
# operations, X[16] values are *maintained* with copies of lower
# halves in upper halves, which is why you'll spot such instructions
# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
# 32-bit unsigned right shift," 'pshr4.u' instructions here.
#
# Rules of engagement.
#
# There is only one integer shifter meaning that if I have two rotate,
# deposit or extract instructions in adjacent bundles, they shall
# split [at run-time if they have to]. But note that variable and
# parallel shifts are performed by multi-media ALU and *are* pairable
# with rotates [and alike]. On the backside MMALU is rather slow: it
# takes 2 extra cycles before the result of integer operation is
# available *to* MMALU and 2(*) extra cycles before the result of MM
# operation is available "back" *to* integer ALU, not to mention that
# MMALU itself has 2 cycles latency. However! I explicitly scheduled
# these MM instructions to avoid MM stalls, so that all these extra
# latencies get "hidden" in instruction-level parallelism.
#
# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
#     for 2 in order to provide for best *overall* performance,
#     because on Itanium 1 stall on MM result is accompanied by
#     pipeline flush, which takes 6 cycles:-(
#
# Resulting performance numbers for 900MHz Itanium 2 system:
#
# The 'numbers' are in 1000s of bytes per second processed.
# type     16 bytes    64 bytes   256 bytes  1024 bytes  8192 bytes
# sha1(*)   6210.14k   20376.30k   52447.83k   85870.05k  105478.12k
# sha256    7476.45k   20572.05k   41538.34k   56062.29k   62093.18k
# sha512    4996.56k   20026.28k   47597.20k   85278.79k  111501.31k
#
# (*) SHA1 numbers are for HP-UX compiler and are presented purely
#     for reference purposes. I bet it can be improved too...
#
# To generate code, pass the file name with either 256 or 512 in its
# name and compiler flags.

# Command line: <output file> [compiler flags ...]
#
# The first argument names the file to generate and at the same time
# selects the flavour: the name has to contain "512" or "256" followed
# by a ".s" or ".asm" extension. Remaining arguments are scanned for
# the few compiler flags that influence code generation.
$output=shift;

# Per-flavour parameters:
#   $SZ/$BITS	word size in bytes/bits;
#   $LDW/$STW	load/store mnemonics for one word;
#   $ADD/$SHRU	add and unsigned right shift mnemonics used on X[];
#   $TABLE	symbol name of the round constants table;
#   $func	name of the generated function;
#   @Sigma0/@Sigma1/@sigma0/@sigma1	rotate/shift amounts;
#   $rounds	number of rounds.
#
# Note that the extension is matched with an alternation, (?:s|asm).
# A bracketed [s|asm] would be a character class accepting any single
# one of 's', '|', 'a' or 'm'.
if ($output =~ /512.*\.(?:s|asm)/) {
	$SZ=8;
	$BITS=8*$SZ;
	$LDW="ld8";
	$STW="st8";
	$ADD="add";
	$SHRU="shr.u";
	$TABLE="K512";
	$func="sha512_block";
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} elsif ($output =~ /256.*\.(?:s|asm)/) {
	$SZ=4;
	$BITS=8*$SZ;
	$LDW="ld4";
	$STW="st4";
	$ADD="padd4";
	$SHRU="pshr4.u";
	$TABLE="K256";
	$func="sha256_block";
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
} else { die "nonsense $output"; }

# open() has to be parenthesized here: in the list operator form,
# 'open STDOUT,">$output" || die ...', the || binds to the (always
# true) string and failure to open the file would pass unnoticed.
open(STDOUT,">$output") || die "can't open $output: $!";

# $ADDP is the instruction used to copy pointer arguments. On HP-UX it
# is addp4, unless a 64-bit data model is requested with +DD64 or
# -mlp64. Everywhere else it is plain add.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }
} else { $ADDP="add"; }
# Explicit -DB_ENDIAN/-DL_ENDIAN wins [last one mentioned, if both are
# present], otherwise endianness of the machine running this script is
# assumed.
for (@ARGV)  {	$big_endian=1 if (/\-DB_ENDIAN/);
		$big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {	$big_endian=(unpack('L',pack('N',1))==1);  }

# Preamble of the generated module, common to both flavours. It emits:
# - assembler aliases for the registers used throughout;
# - the $func entry point, which allocates the register frame, keeps
#   ar.pfs, ar.lc and pr in r2, r3 and prsave, copies the three
#   arguments with $ADDP/mov and fetches the address of $TABLE through
#   the \@ltoff linkage table entry;
# - loads of the working variables A-H from ctx, $SZ bytes apart;
# - the head of .L_outer, which sets ar.lc=15 and ar.ec=1 for the
#   .L_first16 loop and declares X[16] as rotating registers.
# p15/p14 are set depending on whether r35 is non-zero; per the inline
# note they are used in sha256_block.
# Everything between the ___ markers is emitted verbatim after
# interpolation of the Perl variables.
$code=<<___;
.ident  \"$output, version 1.0\"
.ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit
.text

prsave=r14;
K=r15;
A=r16;	B=r17;	C=r18;	D=r19;
E=r20;	F=r21;	G=r22;	H=r23;
T1=r24;	T2=r25;
s0=r26;	s1=r27;	t0=r28;	t1=r29;
Ktbl=r30;
ctx=r31;	// 1st arg
input=r48;	// 2nd arg
num=r49;	// 3rd arg
sgm0=r50;	sgm1=r51;	// small constants

// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
.global	$func#
.proc	$func#
.align	32
$func:
	.prologue
	.save	ar.pfs,r2
{ .mmi;	alloc	r2=ar.pfs,3,17,0,16
	$ADDP	ctx=0,r32		// 1st arg
	.save	ar.lc,r3
	mov	r3=ar.lc	}
{ .mmi;	$ADDP	input=0,r33		// 2nd arg
	addl	Ktbl=\@ltoff($TABLE#),gp
	.save	pr,prsave
	mov	prsave=pr	};;

	.body
{ .mii;	ld8	Ktbl=[Ktbl]
	mov	num=r34		};;	// 3rd arg

{ .mib;	add	r8=0*$SZ,ctx
	add	r9=1*$SZ,ctx
	brp.loop.imp	.L_first16,.L_first16_ctop
				}
{ .mib;	add	r10=2*$SZ,ctx
	add	r11=3*$SZ,ctx
	brp.loop.imp	.L_rest,.L_rest_ctop
				};;
// load A-H
{ .mmi;	$LDW	A=[r8],4*$SZ
	$LDW	B=[r9],4*$SZ
	mov	sgm0=$sigma0[2]	}
{ .mmi;	$LDW	C=[r10],4*$SZ
	$LDW	D=[r11],4*$SZ
	mov	sgm1=$sigma1[2]	};;
{ .mmi;	$LDW	E=[r8]
	$LDW	F=[r9]		}
{ .mmi;	$LDW	G=[r10]
	$LDW	H=[r11]
	cmp.ne	p15,p14=0,r35	};;	// used in sha256_block

.L_outer:
{ .mii;	mov	ar.lc=15
	mov	ar.ec=1		};;
.align	32
.L_first16:
.rotr	X[16]
___
# Head of a .L_first16 iteration, 32-bit flavour. The trailing 'if'
# guards the whole statement, i.e. both the $t0/$t1 assignments and
# the append to $code take place only when $BITS==32.
#
# Input word is fetched in one of two ways. Under p14 four bytes are
# loaded one by one with ld1 and merged with dep into X[15], first
# byte ending up in the most significant position. Under p15 the code
# branches to .L_host and loads the word with a single $LDW.
#
# After that $t1 is composed as E with its lower 32 bits replicated in
# the upper half (dep.z + zxt4 + or), $t0 gets A treated with mux2
# 0x44, and evaluation of Ch(e,f,g) and Maj(a,b,c) is started along
# with the first Sigma1 rotate.
#
# NOTE(review): _rotr is not an IA-64 mnemonic, presumably it's
# rewritten by code further down in this script - confirm.
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
{ .mib;	(p14)	add	r9=1,input
	(p14)	add	r10=2,input	}
{ .mib;	(p14)	add	r11=3,input
	(p15)	br.dptk.few	.L_host	};;
{ .mmi;	(p14)	ld1	r8=[input],$SZ
	(p14)	ld1	r9=[r9]		}
{ .mmi;	(p14)	ld1	r10=[r10]
	(p14)	ld1	r11=[r11]	};;
{ .mii;	(p14)	dep	r9=r8,r9,8,8
	(p14)	dep	r11=r10,r11,8,8	};;
{ .mib;	(p14)	dep	X[15]=r9,r11,16,16 };;
.L_host:
{ .mib;	(p15)	$LDW	X[15]=[input],$SZ	// X[i]=*input++
		dep.z	$t1=E,32,32	}
{ .mib;		$LDW	K=[Ktbl],$SZ
		zxt4	E=E		};;
{ .mmi;		or	$t1=$t1,E
		and	T1=F,E
		and	T2=A,B		}
{ .mmi;		andcm	r8=G,E
		and	r9=A,C
		mux2	$t0=A,0x44	};;	// copy lower half to upper
{ .mib;		xor	T1=T1,r8		// T1=((e & f) ^ (~e & g))
		_rotr	r11=$t1,$Sigma1[0] }	// ROTR(e,14)
{ .mib;		and	r10=B,C
		xor	T2=T2,r9	};;
___
# Head of a .L_first16 iteration, 64-bit flavour. The trailing 'if'
# guards the whole statement, i.e. both the $t0/$t1 assignments and
# the append to $code take place only when $BITS==64.
#
# No replicated copies are needed here, so $t0/$t1 simply alias A/E.
# The code loads next input word to X[15] and next constant to K,
# starts evaluation of Ch(e,f,g) and Maj(a,b,c), issues the first
# Sigma1 rotate and runs X[15] through mux1 \@rev.
#
# NOTE(review): per the inline note mux1 is "eliminated in
# big-endian", presumably by code further down in this script, same
# for rewriting of _rotr pseudo-instruction - confirm.
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
{ .mmi;		$LDW	X[15]=[input],$SZ	// X[i]=*input++
		and	T1=F,E
		and	T2=A,B		}
{ .mmi;		$LDW	K=[Ktbl],$SZ
		andcm	r8=G,E
		and	r9=A,C		};;
{ .mmi;		xor	T1=T1,r8		//T1=((e & f) ^ (~e & g))
		and	r10=B,C
		_rotr	r11=$t1,$Sigma1[0] }	// ROTR(e,14)
{ .mmi;		xor	T2=T2,r9
		mux1	X[15]=X[15],\@rev };;	// eliminated in big-endian
___
# Appended unconditionally, this chunk consists of two parts.
#
# 1. Remainder of a .L_first16 iteration shared by both flavours. It
#    picks up with $t0/$t1 as set by whichever flavour-specific head
#    was emitted above, completes T1=Ch(e,f,g)+h+K+Sigma1(e)+X[i],
#    rotates the working variables H<-G<-F<-E<-D<-C<-B<-A and at
#    .L_first16_ctop adds T1 to E and Sigma0(a) to A, before br.ctop
#    loops back to .L_first16.
#
# 2. Set-up for .L_rest loop, ar.lc=$rounds-17 and ar.ec=1, and
#    beginning of its body: X[15]+=X[15-9] is performed with $ADD,
#    shift parts of sigma0/sigma1 are computed with $SHRU by
#    sgm0/sgm1, rotate parts with _rotr, and Ch/Maj are started.
#
# Note that "$rounds-17" interpolates to e.g. "80-17", arithmetic is
# left to the assembler.
#
# NOTE(review): per the inline note mux2 X[15] is "eliminated in
# 64-bit", presumably by code further down in this script, same for
# rewriting of _rotr pseudo-instruction - confirm.
$code.=<<___;
{ .mib;		add	T1=T1,H			// T1=Ch(e,f,g)+h
		_rotr	r8=$t1,$Sigma1[1] }	// ROTR(e,18)
{ .mib;		xor	T2=T2,r10		// T2=((a & b) ^ (a & c) ^ (b & c))
		mov	H=G		};;
{ .mib;		xor	r11=r8,r11
		_rotr	r9=$t1,$Sigma1[2] }	// ROTR(e,41)
{ .mib;		mov	G=F
		mov	F=E		};;
{ .mib;		xor	r9=r9,r11		// r9=Sigma1(e)
		_rotr	r10=$t0,$Sigma0[0] }	// ROTR(a,28)
{ .mib;		add	T1=T1,K			// T1=Ch(e,f,g)+h+K512[i]
		mov	E=D		};;
{ .mib;		add	T1=T1,r9		// T1+=Sigma1(e)
		_rotr	r11=$t0,$Sigma0[1] }	// ROTR(a,34)
{ .mib;		mov	D=C
		mov	C=B		};;
{ .mib;		add	T1=T1,X[15]		// T1+=X[i]
		_rotr	r8=$t0,$Sigma0[2] }	// ROTR(a,39)
{ .mib;		xor	r10=r10,r11
		mux2	X[15]=X[15],0x44 };;	// eliminated in 64-bit
{ .mmi;		xor	r10=r8,r10		// r10=Sigma0(a)
		mov	B=A
		add	A=T1,T2		};;
.L_first16_ctop:
{ .mib;		add	E=E,T1
		add	A=A,r10			// T2=Maj(a,b,c)+Sigma0(a)
	br.ctop.sptk	.L_first16	};;

{ .mib;	mov	ar.lc=$rounds-17	}
{ .mib;	mov	ar.ec=1			};;
.align	32
.L_rest:
.rotr	X[16]
{ .mib;		$LDW	K=[Ktbl],$SZ
		_rotr	r8=X[15-1],$sigma0[0] }	// ROTR(s0,1)
{ .mib; 	$ADD	X[15]=X[15],X[15-9]	// X[i&0xF]+=X[(i+9)&0xF]
		$SHRU	s0=X[15-1],sgm0	};;	// s0=X[(i+1)&0xF]>>7
{ .mib;		and	T1=F,E
		_rotr	r9=X[15-1],$sigma0[1] }	// ROTR(s0,8)
{ .mib;		andcm	r10=G,E
		$SHRU	s1=X[15-14],sgm1 };;	// s1=X[(i+14)&0xF]>>6
{ .mmi;		xor	T1=T1,r10		// T1=((e & f) ^ (~e & g))
		xor	r9=r8,r9
		_rotr	r10=X[15-14],$sigma1[0] };;// ROTR(s1,19)
{ .mib;		and	T2=A,B		
		_rotr	r11=X[15-14],$sigma1[1] }// ROTR(s1,61)
{ .mib;		and	r8=A,C		};;
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
// I adhere to mmi; in order to hold Itanium 1 back and avoid 6 cycle
// pipeline flush in last bundle. Note that even on Itanium2 the
// latter stalls for one clock cycle...
{ .mmi;		xor	s0=s0,r9		// s0=sigma0(X[(i+1)&0xF])
		dep.z	$t1=E,32,32	}
{ .mmi;		xor	r10=r11,r10
		zxt4	E=E		};;
{ .mmi;		or	$t1=$t1,E
		xor	s1=s1,r10		// s1=sigma1(X[(i+14)&0xF])
		mux2	$t0=A,0x44	};;	/