From 480cd6ab6e994626177de701c418264257954b03 Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Mon, 15 Mar 2010 19:07:52 +0000
Subject: ghash-ia64.pl: new file, GHASH for Itanium.

ghash-x86_64.pl: minimize stack frame usage.
ghash-x86.pl: modulo-scheduling the MMX loop with respect to the input
vector results in up to 10% performance improvement.
---
 crypto/modes/asm/ghash-ia64.pl   | 228 +++++++++++++++++++++++++++++++++++++++
 crypto/modes/asm/ghash-x86.pl    |  67 ++++++++----
 crypto/modes/asm/ghash-x86_64.pl |  30 +++---
 3 files changed, 290 insertions(+), 35 deletions(-)
 create mode 100755 crypto/modes/asm/ghash-ia64.pl

diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl
new file mode 100755
index 0000000000..86c08c6477
--- /dev/null
+++ b/crypto/modes/asm/ghash-ia64.pl
@@ -0,0 +1,228 @@
+#!/usr/bin/env perl
+
# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" Galois field multiplication and
# streamed GHASH function. "4-bit" means that it uses 256 bytes
# per-key table [+128 bytes shared table]. Streamed GHASH performance
# was measured to be 6.35 cycles per processed byte on Itanium 2,
# which is >90% better than Microsoft compiler-generated code. Well,
# the number should have been ~6.5. The deviation has everything to
# do with the way performance is measured, namely as the difference
# between GCM and straightforward 128-bit counter mode. To anchor it
# to something else, the sha1-ia64.pl module processes one byte in
# 6.0 cycles. On Itanium GHASH should run at ~8.5 cycles per byte.

$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");

if ($^O eq "hpux") {
	$ADDP="addp4";
	for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) {	$big_endian=1 if (/\-DB_ENDIAN/);
		$big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
	{ $big_endian=(unpack('L',pack('N',1))==1); }

sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

# The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium,
# i.e. in a scalable manner;-) Naturally assuming data in L1 cache...
# A special note about the 'dep' instruction, which is used to
# construct &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned
# at a 128-byte boundary and the lower 7 bits of its address are
# guaranteed to be zero.
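#
# For reference, the loop below computes the equivalent of the
# following plain C, modelled on the portable 4-bit code in
# crypto/modes/gcm128.c. This is a sketch for orientation only, not
# part of the build: Xi is viewed as 16 big-endian bytes, Htable[i]
# holds i*H, and rem_4bit is the pre-shifted 128-byte table defined
# at the end of this file. The streamed gcm_ghash_4bit additionally
# XORs the input block into Xi's bytes on the fly.
#
#	typedef struct { u64 hi,lo; } u128;
#
#	u128   Z = Htable[Xi[15]&0xf];
#	size_t rem, nlo, nhi = Xi[15]&0xf0;
#	int    cnt = 15;
#
#	while (1) {
#		rem  = Z.lo&0xf;
#		Z.lo = (Z.hi<<60)|(Z.lo>>4);
#		Z.hi = (Z.hi>>4)^rem_4bit[rem];
#		Z.hi ^= Htable[nhi>>4].hi;
#		Z.lo ^= Htable[nhi>>4].lo;
#
#		if (--cnt<0) break;
#
#		nlo  = Xi[cnt];
#		nhi  = nlo&0xf0;
#		rem  = Z.lo&0xf;
#		Z.lo = (Z.hi<<60)|(Z.lo>>4);
#		Z.hi = (Z.hi>>4)^rem_4bit[rem];
#		Z.hi ^= Htable[nlo&0xf].hi;
#		Z.lo ^= Htable[nlo&0xf].lo;
#	}
#	// Z.hi:Z.lo is then stored back to Xi most significant byte
#	// first, which is what the mux1 @rev byte swaps below are for.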
+$code.=<<___; +$label: +{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8 + (p19) dep rem=Zlo,rem_4bitp,3,4 } +{ .mfi; (p19) xor Zhi=Zhi,Hhi + ($p17) xor xi[1]=xi[1],in[1] };; +{ .mfi; (p18) ld8 Hhi=[Hi[1]] + (p19) shrp Zlo=Zhi,Zlo,4 } +{ .mfi; (p19) ld8 rem=[rem] + (p18) and Hi[1]=mask0xf0,xi[2] };; +{ .mmi; ($p16) ld1 in[0]=[inp],-1 + (p18) xor Zlo=Zlo,Hlo + (p19) shr.u Zhi=Zhi,4 } +{ .mib; (p19) xor Hhi=Hhi,rem + (p18) add Hi[1]=Htbl,Hi[1] };; + +{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8 + (p18) dep rem=Zlo,rem_4bitp,3,4 } +{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0 + (p18) xor Zhi=Zhi,Hhi };; +{ .mfi; (p18) ld8 Hhi=[Hi[1]] + (p18) shrp Zlo=Zhi,Zlo,4 } +{ .mfi; (p18) ld8 rem=[rem] + (p17) and Hi[0]=mask0xf0,Hi[0] };; +{ .mmi; (p16) ld1 xi[0]=[Xi],-1 + (p18) xor Zlo=Zlo,Hlo + (p18) shr.u Zhi=Zhi,4 } +{ .mib; (p18) xor Hhi=Hhi,rem + (p17) add Hi[0]=Htbl,Hi[0] + br.ctop.sptk $label };; +___ +} + +$code=<<___; +.explicit +.text + +prevfs=r2; prevlc=r3; prevpr=r8; +mask0xf0=r21; +rem=r22; rem_4bitp=r23; +Xi=r24; Htbl=r25; +inp=r26; end=r27; +Hhi=r28; Hlo=r29; +Zhi=r30; Zlo=r31; + +.global gcm_gmult_4bit# +.proc gcm_gmult_4bit# +.align 128 +.skip 16;; // aligns loop body +gcm_gmult_4bit: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,2,6,0,8 + $ADDP Xi=15,in0 // &Xi[15] + mov rem_4bitp=ip } +{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo + .save ar.lc,prevlc + mov prevlc=ar.lc + .save pr,prevpr + mov prevpr=pr };; + + .body + .rotr in[3],xi[3],Hi[2] + +{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15] + mov mask0xf0=0xf0 + brp.loop.imp .Loop1,.Lend1-16};; +{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] + };; +{ .mii; shladd Hi[1]=xi[2],4,r0 + mov pr.rot=0x7<<16 + mov ar.lc=13 };; +{ .mii; and Hi[1]=mask0xf0,Hi[1] + mov ar.ec=3 + xor Zlo=Zlo,Zlo };; +{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo + add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp + xor Zhi=Zhi,Zhi };; +___ + &loop (".Loop1",1); +$code.=<<___; +.Lend1: +{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact +{ .mib; mux1 Zlo=Zlo,\@rev };; +{ .mib; mux1 Zhi=Zhi,\@rev };; +{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent + add Hhi=1,Xi };; // pipeline flush on Itanium +{ .mib; st8 [Hlo]=Zlo + mov pr=prevpr,-2 };; +{ .mib; st8 [Hhi]=Zhi + mov ar.lc=prevlc + br.ret.sptk.many b0 };; +.endp gcm_gmult_4bit# + +.global gcm_ghash_4bit# +.proc gcm_ghash_4bit# +.align 32;; +gcm_ghash_4bit: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,4,4,0,8 + $ADDP inp=15,in0 // &inp[15] + mov rem_4bitp=ip } +{ .mmi; $ADDP end=in1,in0 // &inp[len] + $ADDP Xi=15,in2 // &Xi[15] + .save ar.lc,prevlc + mov prevlc=ar.lc };; +{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo + mov mask0xf0=0xf0 + .save pr,prevpr + mov prevpr=pr } + + .body + .rotr in[3],xi[3],Hi[2] + +{ .mmi; ld1 in[2]=[inp],-1 // inp[15] + ld1 xi[2]=[Xi],-1 // Xi[15] + add end=-17,end };; +{ .mmi; ld1 in[1]=[inp],-1 // inp[14] + ld1 xi[1]=[Xi],-1 // Xi[14] + xor xi[2]=xi[2],in[2] };; +{ .mii; shladd Hi[1]=xi[2],4,r0 + mov pr.rot=0x7<<16 + mov ar.lc=13 };; +{ .mii; and Hi[1]=mask0xf0,Hi[1] + mov ar.ec=3 + xor Zlo=Zlo,Zlo };; +{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo + add rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp + xor Zhi=Zhi,Zhi };; +___ + &loop (".LoopN"); +$code.=<<___; +{ .mib; xor Zhi=Zhi,Hhi // modulo-scheduling artefact + extr.u xi[2]=Zlo,0,8 } // Xi[15] +{ .mib; cmp.ltu p6,p0=inp,end // are we done? 
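	// The ld1s above post-decrement inp, leaving it one byte below
	// the block just hashed, so the +32 below lands on byte 15 of
	// the next 16-byte block.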
+ add inp=32,inp // advance inp + clrrrb.pr };; +{ .mii; +(p6) ld1 in[2]=[inp],-1 // inp[15] +(p6) extr.u xi[1]=Zlo,8,8 // Xi[14] +(p6) mov ar.lc=13 };; +{ .mii; +(p6) ld1 in[1]=[inp],-1 // inp[14] +(p6) mov ar.ec=3 + mux1 Zlo=Zlo,\@rev };; +{ .mii; +(p6) xor xi[2]=xi[2],in[2] + mux1 Zhi=Zhi,\@rev };; +{ .mii; +(p6) shladd Hi[1]=xi[2],4,r0 + add Hlo=9,Xi // Xi is &Xi[-1] + add Hhi=1,Xi };; +{ .mii; +(p6) and Hi[1]=mask0xf0,Hi[1] +(p6) add Xi=14,Xi // &Xi[13] +(p6) mov pr.rot=0x7<<16 };; + +{ .mii; st8 [Hlo]=Zlo +(p6) xor Zlo=Zlo,Zlo +(p6) add Hi[1]=Htbl,Hi[1] };; +{ .mib; st8 [Hhi]=Zhi +(p6) xor Zhi=Zhi,Zhi +(p6) br.cond.dptk.many .LoopN };; + +{ .mib; mov pr=prevpr,-2 } +{ .mib; mov ar.lc=prevlc + br.ret.sptk.many b0 };; +.endp gcm_ghash_4bit# + +.align 128;; +.type rem_4bit#,\@object +rem_4bit: + data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 + data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 + data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 + data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 +.size rem_4bit#,128 +stringz "GHASH for IA64, CRYPTOGAMS by " +___ + +$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); + +print $code; +close STDOUT; diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl index 0222ede585..63e76c1da6 100644 --- a/crypto/modes/asm/ghash-x86.pl +++ b/crypto/modes/asm/ghash-x86.pl @@ -7,9 +7,11 @@ # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # +# March 2010 +# # The module implements "4-bit" Galois field multiplication and # streamed GHASH function. "4-bit" means that it uses 256 bytes -# per-key table [+128/256 bytes fixed table]. It has two code paths: +# per-key table [+64/128 bytes fixed table]. It has two code paths: # vanilla x86 and vanilla MMX. Former will be executed on 486 and # Pentium, latter on all others. Performance results are for streamed # GHASH subroutine and are expressed in cycles per processed byte, @@ -18,13 +20,13 @@ # gcc 2.95.3(*) MMX assembler x86 assembler # # Pentium 100/112(**) - 50 -# PIII 63 /77 17 24 -# P4 96 /122 33 84(***) -# Opteron 50 /71 22 30 -# Core2 63 /102 21 28 +# PIII 63 /77 16 24 +# P4 96 /122 30 84(***) +# Opteron 50 /71 21 30 +# Core2 63 /102 19 28 # # (*) gcc 3.4.x was observed to generate few percent slower code, -# which is one of reasons why 2.95.3 result were chosen; +# which is one of reasons why 2.95.3 results were chosen, # another reason is lack of 3.4.x results for older CPUs; # (**) second number is result for code compiled with -fPIC flag, # which is actually more relevant, because assembler code is @@ -32,8 +34,8 @@ # (***) see comment in non-MMX routine for further details; # # To summarize, it's 2-3 times faster than gcc-generated code. To -# anchor it to something else SHA1 assembler processes single byte -# in 11-13 cycles. +# anchor it to something else SHA1 assembler processes one byte in +# 11-13 cycles on contemporary x86 cores. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); @@ -52,13 +54,13 @@ $Htbl = "esi"; $unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse # than unrolled, which has to be weighted against - # almost 2x code size reduction. Well, *overall* - # code size. x86-specific code shrinks by 7.5x... + # 1.7x code size reduction. Well, *overall* 1.7x, + # x86-specific code itself shrinks by 2.5x... 
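#
# The rem_4bit reduction constants shared by these modules need not
# be tabulated by hand: dropping bit k of a nibble during a 4-bit
# right shift folds back 0xE100>>(3-k), 0xE1 being the bit-reflected
# GHASH reduction polynomial. In C (a sketch, not part of the build;
# the <<48 matches the data8 entries in ghash-ia64.pl above):
#
#	for (n=0; n<16; ++n) {
#		u16 r = 0;
#		for (k=0; k<4; ++k)
#			if (n&(1<<k)) r ^= (u16)(0xE100>>(3-k));
#		rem_4bit[n] = (u64)r<<48;
#	}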
sub mmx_loop() { -# MMX version performs 2.5 times better on P4 (see comment in non-MMX -# routine for further details), 35% better on Opteron and Core2, 40% -# better on PIII... In other words effort is considered to be well +# MMX version performs 2.8 times better on P4 (see comment in non-MMX +# routine for further details), 40% better on Opteron, 50% better +# on PIII and Core2... In other words effort is considered to be well # spent... my $inp = shift; my $rem_4bit = shift; @@ -74,7 +76,7 @@ sub mmx_loop() { &xor ($nlo,$nlo); # avoid partial register stalls on PIII &mov ($nhi,$Zll); &mov (&LB($nlo),&LB($nhi)); - &mov ($cnt,15); + &mov ($cnt,14); &shl (&LB($nlo),4); &and ($nhi,0xf0); &movq ($Zlo,&QWP(8,$Htbl,$nlo)); @@ -85,34 +87,59 @@ sub mmx_loop() { &set_label("mmx_loop",16); &psrlq ($Zlo,4); &and ($rem,0xf); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); &movq ($tmp,$Zhi); &psrlq ($Zhi,4); + &mov (&LB($nlo),&BP(0,$inp,$cnt)); &dec ($cnt); - &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); &psllq ($tmp,60); &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); &movd ($rem,$Zlo); &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &mov ($nhi,$nlo); &pxor ($Zlo,$tmp); &js (&label("mmx_break")); - &movz ($nhi,&BP(0,$inp,$cnt)); + &shl (&LB($nlo),4); + &and ($rem,0xf); &psrlq ($Zlo,4); - &mov (&LB($nlo),&LB($nhi)); + &and ($nhi,0xf0); &movq ($tmp,$Zhi); - &shl (&LB($nlo),4); &psrlq ($Zhi,4); - &and ($rem,0xf); &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); &psllq ($tmp,60); &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); &movd ($rem,$Zlo); &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); &pxor ($Zlo,$tmp); - &and ($nhi,0xf0); &jmp (&label("mmx_loop")); &set_label("mmx_break",16); + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,4); + &and ($rem,0xf); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &mov ($nhi,$nlo); + &pxor ($Zlo,$tmp); + &psrlq ($Zlo,32); # lower part of Zlo is already there &movd ($Zhl,$Zhi); &psrlq ($Zhi,32); diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl index 252835dbb1..e20767836f 100644 --- a/crypto/modes/asm/ghash-x86_64.pl +++ b/crypto/modes/asm/ghash-x86_64.pl @@ -7,9 +7,11 @@ # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # +# March 2010 +# # The module implements "4-bit" Galois field multiplication and # streamed GHASH function. "4-bit" means that it uses 256 bytes -# per-key table [+128 bytes fixed table]. Performance results are for +# per-key table [+128 bytes shared table]. Performance results are for # streamed GHASH subroutine and are expressed in cycles per processed # byte, less is better: # @@ -136,9 +138,8 @@ $code=<<___; .align 16 gcm_gmult_4bit: push %rbx - push %rbp - push %r12 - sub \$16,%rsp + push %rbp # %rbp and %r12 are pushed exclusively in + push %r12 # order to reuse Win64 exception handler... 
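	# Stack layout after the three pushes, now that the 16-byte
	# scratch area is gone:
	#
	#	 0(%rsp)	saved %r12
	#	 8(%rsp)	saved %rbp
	#	16(%rsp)	saved %rbx
	#	24(%rsp)	return address
	#
	# hence the 0/8/16(%rsp) restores, the "lea 24(%rsp),%rsp" in
	# the epilogues and the "lea 24(%rax),%rax" in se_handler below,
	# all of which were 16 bytes larger before this change.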
.Lgmult_prologue: movzb 15($Xi),$Zlo @@ -149,8 +150,8 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - mov 32(%rsp),%rbx - lea 40(%rsp),%rsp + mov 16(%rsp),%rbx + lea 24(%rsp),%rsp .Lgmult_epilogue: ret .size gcm_gmult_4bit,.-gcm_gmult_4bit @@ -174,7 +175,6 @@ gcm_ghash_4bit: push %rbx push %rbp push %r12 - sub \$16,%rsp .Lghash_prologue: mov 8($Xi),$Zlo @@ -186,11 +186,11 @@ gcm_ghash_4bit: xor 8($inp),$Zlo xor ($inp),$Zhi lea 16($inp),$inp - mov $Zlo,8(%rsp) - mov $Zhi,(%rsp) + mov $Zlo,8($Xi) + mov $Zhi,($Xi) shr \$56,$Zlo ___ - &loop ("%rsp"); + &loop ($Xi); $code.=<<___; cmp $len,$inp jb .Louter_loop @@ -198,10 +198,10 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - mov 16(%rsp),%r12 - mov 24(%rsp),%rbp - mov 32(%rsp),%rbx - lea 40(%rsp),%rsp + mov 0(%rsp),%r12 + mov 8(%rsp),%rbp + mov 16(%rsp),%rbx + lea 24(%rsp),%rsp .Lghash_epilogue: ret .size gcm_ghash_4bit,.-gcm_ghash_4bit @@ -259,7 +259,7 @@ se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue - lea 40(%rax),%rax # adjust "rsp" + lea 24(%rax),%rax # adjust "rsp" mov -8(%rax),%rbx mov -16(%rax),%rbp -- cgit v1.2.3
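For orientation, the gcm_ghash_4bit entry points in these modules
compute the equivalent of the following portable C, in the style of
crypto/modes/gcm128.c (a sketch only: gcm_gmult_4bit is the single
Xi*H multiplication in GF(2^128) sketched in the ghash-ia64.pl
comments above, and the assembly versions fuse the XOR of the input
block into the software-pipelined table walk instead of performing it
separately):

	void gcm_ghash_4bit(u8 Xi[16], const u128 Htable[16],
			    const u8 *inp, size_t len)
	{
		/* len is assumed to be a multiple of 16 */
		while (len>=16) {
			for (int i=0; i<16; ++i)
				Xi[i] ^= inp[i];	/* Xi ^= in */
			gcm_gmult_4bit(Xi,Htable);	/* Xi *= H  */
			inp += 16;
			len -= 16;
		}
	}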