1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
|
// ====================================================================
// Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
// project.
// ====================================================================
//
// Poly1305 for Itanium.
//
// January 2019
//
// Performance was reported to be ~2.1 cycles per byte on Itanium 2,
// with the exception of processors in the 95xx family, which have
// higher floating-point instruction latencies and deliver ~2.6 cpb.
// Comparison to compiler-generated code is not exactly fair, because
// of different radixes. But just for reference, it was observed to be
// >3x faster. Originally it was argued that a floating-point base 2^32
// implementation would be optimal. Upon closer look, the estimate for
// the integer base 2^64 implementation below turned out to be
// approximately the same on Itanium 2. But floating-point code would be
// larger and have higher overhead, which would negatively affect
// small-block performance...
// Platform abstraction macros.
//
// ADDP  - instruction used to form an address from a pointer argument.
//         On HP-UX builds without _LP64 it expands to addp4 (the IA-64
//         32-bit pointer add); everywhere else it is a plain add.
// RUM   - expands to rum on HP-UX only. The visible code issues
// SUM     "RUM 1<<1" with the note "go little-endian". SUM expands to sum
//         on HP-UX only; presumably it restores the previous byte order
//         on exit, but that use is outside this view - confirm.
//         On non-HP-UX targets both become nop, so the operand is
//         merely carried as the nop immediate.
#if defined(_HPUX_SOURCE)
# if !defined(_LP64)
# define ADDP addp4
# else
# define ADDP add
# endif
# define RUM rum
# define SUM sum
#else
# define ADDP add
# define RUM nop
# define SUM nop
#endif
// --------------------------------------------------------------------
// poly1305_init
//
// Inputs (the two stacked input registers declared by alloc):
//   r32 = ctx  - context, written as six 64-bit words:
//                  +0  h0   +8  h1   +16 h2   (accumulator, zeroed)
//                  +24 r0   +32 r1   +40 s1   (clamped key and helper)
//   r33 = key  - pointer to 16 key bytes, or NULL
//
// Behaviour:
//   Always zeroes h0..h2. If key is NULL it returns right after that.
//   Otherwise the 16 key bytes are read one at a time (ld1) and packed
//   in little-endian order into two 64-bit words. Bytes 3, 7, 11 and 15
//   are masked with 15 and bytes 4, 8 and 12 with -4, which is the
//   Poly1305 clamp of r. s1 is stored as r1 + (r1 >> 2).
//
// Returns: r8 = 0 on both paths.
// Scratch registers written: r2, r8-r11, r16-r31, p6, p7.
// --------------------------------------------------------------------
.text
.explicit
.global poly1305_init#
.proc poly1305_init#
.align 64
poly1305_init:
.prologue
.save ar.pfs,r2
{ .mmi; alloc r2=ar.pfs,2,0,0,0
cmp.eq p6,p7=0,r33 } // key == NULL?
// Convert ctx through ADDP and prepare the addresses of h1 and h2.
{ .mmi; ADDP r9=8,r32
ADDP r10=16,r32
ADDP r32=0,r32 };;
.body
// The post-increment by 24 leaves r32 pointing at ctx->r0 for the
// stores at the end of the routine.
{ .mmi; st8 [r32]=r0,24 // ctx->h0 = 0
st8 [r9]=r0 // ctx->h1 = 0
(p7) ADDP r8=0,r33 }
// p6 (key == NULL): return 0 now. p7: r8 already holds the key pointer.
{ .mib; st8 [r10]=r0 // ctx->h2 = 0
(p6) mov r8=0
(p6) br.ret.spnt b0 };;
// Four byte cursors at key+0..key+3, each advanced by 4 per load, so
// register r(16+i) receives key[i] for i = 0..15.
{ .mmi; ADDP r9=1,r33
ADDP r10=2,r33
ADDP r11=3,r33 };;
{ .mmi; ld1 r16=[r8],4 // load key, little-endian
ld1 r17=[r9],4 }
{ .mmi; ld1 r18=[r10],4
ld1 r19=[r11],4 };;
// The clamp masks are interleaved with the remaining loads:
// "and 15" on key[3], key[7], key[11], key[15] and
// "and -4" on key[4], key[8], key[12].
{ .mmi; ld1 r20=[r8],4
ld1 r21=[r9],4 }
{ .mmi; ld1 r22=[r10],4
ld1 r23=[r11],4
and r19=15,r19 };;
{ .mmi; ld1 r24=[r8],4
ld1 r25=[r9],4
and r20=-4,r20 }
{ .mmi; ld1 r26=[r10],4
ld1 r27=[r11],4
and r23=15,r23 };;
{ .mmi; ld1 r28=[r8],4
ld1 r29=[r9],4
and r24=-4,r24 }
{ .mmi; ld1 r30=[r10],4
ld1 r31=[r11],4
and r27=15,r27 };;
// Pack the bytes with dep: first pairs of bytes into 16-bit halves
// (position 8, length 8), then 16-bit pieces into r16 (key bytes 0..7)
// and r24 (key bytes 8..15) at bit positions 16, 32 and 48.
{ .mii; and r28=-4,r28
dep r16=r17,r16,8,8
dep r18=r19,r18,8,8 };;
{ .mii; and r31=15,r31
dep r16=r18,r16,16,16
dep r20=r21,r20,8,8 };;
{ .mii; dep r16=r20,r16,32,16
dep r22=r23,r22,8,8 };;
{ .mii; dep r16=r22,r16,48,16
dep r24=r25,r24,8,8 };;
{ .mii; dep r26=r27,r26,8,8
dep r28=r29,r28,8,8 };;
{ .mii; dep r24=r26,r24,16,16
dep r30=r31,r30,8,8 };;
// r16 is complete here; r24 is finished in the same bundle as the store.
{ .mii; st8 [r32]=r16,8 // ctx->r0
dep r24=r28,r24,32,16;;
dep r24=r30,r24,48,16 };;
// s1 = r1 + (r1 >> 2), computed while r1 is being stored.
{ .mii; st8 [r32]=r24,8 // ctx->r1
shr.u r25=r24,2;;
add r25=r25,r24 };;
{ .mib; st8 [r32]=r25 // ctx->s1
mov r8=0
br.ret.sptk b0 };;
.endp poly1305_init#
// Symbolic register names used by poly1305_blocks.
//   h0,h1,h2    - accumulator words, loaded from ctx+0, ctx+8 and ctx+16.
//   i0,i1       - the two 64-bit words of the current input block.
//   HF0..HF2    - floating-point copies of h0..h2 (made with setf.sig)
//                 that feed the xmpy multiplications.
//   RF0,RF1,SF1 - loaded with ldf8 from ctx+24, ctx+32 and ctx+40, i.e.
//                 the r0, r1 and s1 slots written by poly1305_init.
h0=r17; h1=r18; h2=r19;
i0=r20; i1=r21;
HF0=f8; HF1=f9; HF2=f10;
RF0=f11; RF1=f12; SF1=f13;
.global poly1305_blocks#
.proc poly1305_blocks#
.align 64
poly1305_blocks:
.prologue
.save ar.pfs,r2
{ .mii; alloc r2=ar.pfs,4,1,0,0
.save ar.lc,r3
mov r3=ar.lc
.save pr,r36
mov r36=pr }
.body
{ .mmi; ADDP r8=0,r32
ADDP r9=8,r32
and r29=7,r33 };;
{ .mmi; ld8 h0=[r8],16
ld8 h1=[r9],16
and r33=-8,r33 };;
{ .mmi; ld8 h2=[r8],16
ldf8 RF0=[r9],16
shr.u r34=r34,4 };;
{ .mmi; ldf8 RF1=[r8],-32
ldf8 SF1=[r9],-32
cmp.ltu p16,p17=1,r34 };;
{ .mmi;
(p16) add r34=-2,r34
(p17) mov r34=0
ADDP r10=0,r33 }
{ .mii; ADDP r11=8,r33
(p16) mov ar.ec=2
(p17) mov ar.ec=1 };;
{ .mib; RUM 1<<1 // go little-endian
mov ar.lc=r34
brp.loop.imp .Loop,.Lcend-16 }
{ .mmi; cmp.eq p8,p7=0,r29
cmp.eq p9,p0=1,r29
cmp.eq p10,p0=2,r29 }
{ .mmi; cmp.eq p11,p0=3,r29
cmp.eq p12,p0=4,r29
cmp.eq p13,p0=5,r29 }
{ .mmi; cmp.eq p14,p0=6,r29
cmp.eq p15,p0=7,r29
add r16=16,r10 };;
{ .mmb;
(p8) ld8 i0=[r10],16 // aligned input
(p8) ld8 i1=[r11],16
(p8) br.cond.sptk .Loop };;
// align first block
.pred.rel "mutex",p8,p9,p10,p11,p12,p13,p14,p15
{ .mmi; (p7) ld8 r14=[r10],24
(p7) ld8 r15=[r11],24 }
{ .mii; (p7) ld8 r16=[r16]
nop.i 0;;
(p15) shrp i0=r15,r14,56 }
{ .mii; (p15) shrp i1=r16,r15,56
(p14) shrp i0=r15,r14,48 }
{ .mii; (p14) shrp i1=r16,r15,48
(p13) shrp i0=r15,r14,40 }
{ .mii; (p13) shrp i1=r16,r15,40
(p12) shrp i0=r15,r14,32 }
{ .mii; (p12) shrp i1=r16,r15,32
(p11) shrp i0=r15,r14,24 }
{ .mii; (p11) shrp i1=r16,r15,24
(p10) shrp i0=r15,r14,16 }
{ .mii; (p10) shrp i1=r16,r15,16
(p9) shrp i0=r15,r14,8 }
{ .mii; (p9) shrp i1=r16,r15,8
mov r14=r16 };;
.Loop:
.pred.rel "mutex",p8,p9,p10,p11,p12,p13,p14,p15
{ .mmi; add h0=h0,i0
add h1=h1,i1
add h2=h2,r35 };;
{ .mmi; setf.sig HF0=h0
cmp.ltu p6,p0=h0,i0
cmp.ltu p7,p0=h1,i1 };;
{ .mmi; (p6) add h1=1,h1;;
setf.sig HF1=h1
(p6) cmp.eq.or p7,p0=0,h1 };;
{ .mmi; (p7) add h2=1,h2;;
setf.sig HF2=h2 };;
{ .mfi; (p16) ld8 r15=[r10],16
xmpy.lu f32=HF0,RF0 }
{ .mfi; (p16) ld8 r16=[r11],16
xmpy.hu f33=HF0,RF0 }
{ .mfi; xmpy.lu f36=HF0,RF1 }
{ .mfi; xmpy.hu f37=HF0,RF1 };;
{ .mfi; xmpy.lu f34=HF1,SF1
(p15) shrp i0=r15,r14,56 }
{ .mfi; xmpy.hu f35=HF1,SF1 }
{ .mfi; xmpy.lu f38=HF1,RF0
(p15) shrp i1=r16,r15,56 }
{ .mfi; xmpy.hu f39=HF1,RF0 }
{ .mfi; xmpy.lu f40=HF2,SF1
(p14) shrp i0=r15,r14,48 }
{ .mfi; xmpy.lu f41=HF2,RF0 };;
{ .mmi; getf.sig r22=f32
getf.sig r23=f33
(p14) shrp i1=r16,r15,48 }
{ .mmi; getf.sig r24=f34
getf.sig r25=f35
(p13) shrp i0=r15,r14,40 }
{ .mmi; getf.sig r26=f36
getf.sig r27=f37
(p13) shrp i1=r16,r15,40 }
{ .mmi; getf.sig r28=f38
getf.sig r29=f39
(p12) shrp i0=r15,r14,32 }
{ .mmi; getf.sig r30=f40
getf.sig r31=f41 };;
{ .mmi; add h0=r22,r24
add r23=r23,r25
(p12) shrp i1=r16,r15,32 }
{ .mmi; add h1=r26,r28
add r27=r27,r29
(p11) shrp i0=r15,r14,24 };;
{ .mmi; cmp.ltu p6,p0=h0,r24
cmp.ltu p7,p0=h1,r28
add r23=r23,r30 };;
{ .mmi; (p6) add r23=1,r23
(p7) add r27=1,r27
(p11) shrp i1=r16,r15,24 };;
{ .mm
|