GHASH assembler: new ghash-sparcv9.pl module and saner descriptions.

author: Andy Polyakov <appro@openssl.org> 2010-03-22 17:24:18 +0000
committer: Andy Polyakov <appro@openssl.org> 2010-03-22 17:24:18 +0000
commit: c3473126b1ed43e281f5fb2394f16bf8a5be2922 (patch)
tree: d59a6c1a8b2dbeb59ca185683279e066669124e5 /crypto
parent: 82f385d71d1e060422897e8155c0d741bd459c01 (diff)
4 files changed, 346 insertions, 22 deletions
diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl
index 86c08c6477..3fd1446cd9 100755
--- a/crypto/modes/asm/ghash-ia64.pl
+++ b/crypto/modes/asm/ghash-ia64.pl
@@ -9,16 +9,16 @@
 #
 # March 2010
 #
-# The module implements "4-bit" Galois field multiplication and
-# streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+128 bytes shared table]. Streamed GHASH performance
-# was measured to be 6.35 cycles per processed byte on Itanium 2,
-# which is >90% better than Microsoft compiler generated code. Well,
-# the number should have been ~6.5. The deviation has everything to do
-# with the way performance is measured, as difference between GCM and
-# straightforward 128-bit counter mode. To anchor to something else
-# sha1-ia64.pl module processes one byte in 6.0 cycles. On Itanium
-# GHASH should run at ~8.5 cycles per byte.
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
+# GHASH performance was measured to be 6.35 cycles per processed byte
+# on Itanium 2, which is >90% better than Microsoft compiler generated
+# code. Well, the number should have been ~6.5. The deviation has
+# everything to do with the way performance is measured: as difference
+# between GCM and straightforward 128-bit counter mode. To anchor to
+# something else sha1-ia64.pl module processes one byte in 6.0 cycles.
+# On Itanium GHASH should run at ~8.5 cycles per byte.
 
 $output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
 
diff --git a/crypto/modes/asm/ghash-sparcv9.pl b/crypto/modes/asm/ghash-sparcv9.pl
new file mode 100644
index 0000000000..47d7a1dca3
--- /dev/null
+++ b/crypto/modes/asm/ghash-sparcv9.pl
@@ -0,0 +1,324 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
+# and are expressed in cycles per processed byte, less is better:
+#
+#		gcc 3.3.x	cc 5.2		this assembler
+#
+# 32-bit build	81.0		48.6		11.8	(+586%/+311%)
+# 64-bit build	27.5		20.3		11.8	(+133%/+72%)
+#
+# I don't quite understand why difference between 32-bit and 64-bit
+# compiler-generated code is so big. Compilers *were* instructed to
+# generate code for UltraSPARC and should have used 64-bit registers
+# for Z vector (see C code) even in 32-bit build... Oh well, it only
+# means more impressive improvement coefficients for this assembler
+# module;-) Loops are aggressively modulo-scheduled in respect to
+# references to input data and Z.hi updates to achieve 12 cycles
+# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
+# cycles to process one byte [on UltraSPARC pre-Tx CPU].
+
+$bits=32;
+for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)  { $bias=2047; $frame=192; }
+else            { $bias=0;    $frame=112; }
+
+$output=shift;
+open STDOUT,">$output";
+
+$Zhi="%o0";	# 64-bit values
+$Zlo="%o1";
+$Thi="%o2";
+$Tlo="%o3";
+$rem="%o4";
+$tmp="%o5";
+
+$nhi="%l0";	# small values and pointers
+$nlo="%l1";
+$xi0="%l2";
+$xi1="%l3";
+$rem_4bit="%l4";
+$remi="%l5";
+$Htblo="%l6";
+$cnt="%l7";
+
+$inp="%i0";	# input arguments for gcm_ghash_4bit
+$len="%i1";
+$Xi="%i2";
+$Htbl="%i3";
+
+$code.=<<___;
+.section	".text",#alloc,#execinstr
+
+.align	64
+rem_4bit:
+	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+.type	rem_4bit,#object
+.size	rem_4bit,(.-rem_4bit)
+
+.globl	gcm_ghash_4bit
+.align	32
+gcm_ghash_4bit:
+	save	%sp,-$frame,%sp
+	ldub	[$inp+15],$nlo
+	ldub	[$Xi+15],$xi0
+	ldub	[$Xi+14],$xi1
+	add	$len,$inp,$len
+	add	$Htbl,8,$Htblo
+
+1:	call	.+8
+	add	%o7,rem_4bit-1b,$rem_4bit
+
+.Louter:
+	xor	$xi0,$nlo,$nlo
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	sll	$nlo,4,$nlo
+	ldx	[$Htblo+$nlo],$Zlo
+	ldx	[$Htbl+$nlo],$Zhi
+
+	ldub	[$inp+14],$nlo
+
+	ldx	[$Htblo+$nhi],$Tlo
+	and	$Zlo,0xf,$remi
+	ldx	[$Htbl+$nhi],$Thi
+	sll	$remi,3,$remi
+	ldx	[$rem_4bit+$remi],$rem
+	srlx	$Zlo,4,$Zlo
+	mov	13,$cnt
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+
+	xor	$xi1,$nlo,$nlo
+	and	$Zlo,0xf,$remi
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	ba	.Lghash_inner
+	sll	$nlo,4,$nlo
+.align	32
+.Lghash_inner:
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$inp+$cnt],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	ldub	[$Xi+$cnt],$xi1
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$xi1,$nlo,$nlo
+	srlx	$Zhi,4,$Zhi
+	and	$nlo,0xf0,$nhi
+	addcc	$cnt,-1,$cnt
+	xor	$Zlo,$tmp,$Zlo
+	and	$nlo,0x0f,$nlo
+	xor	$Tlo,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	blu	.Lghash_inner
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+
+	add	$inp,16,$inp
+	cmp	$inp,$len
+	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$inp+15],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+	srl	$Zlo,8,$xi1
+	and	$Zlo,0xff,$xi0
+	ba	.Louter
+	and	$xi1,0xff,$xi1
+.align	32
+.Ldone:
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+
+	ret
+	restore
+.type	gcm_ghash_4bit,#function
+.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
+___
+
+$Xi="%i0";	# input arguments for gcm_gmult_4bit
+$Htbl="%i1";
+undef $inp;
+undef $len;
+
+$code.=<<___;
+.globl	gcm_gmult_4bit
+.align	32
+gcm_gmult_4bit:
+	save	%sp,-$frame,%sp
+	ldub	[$Xi+15],$nlo
+	add	$Htbl,8,$Htblo
+
+1:	call	.+8
+	add	%o7,rem_4bit-1b,$rem_4bit
+
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	sll	$nlo,4,$nlo
+	ldx	[$Htblo+$nlo],$Zlo
+	ldx	[$Htbl+$nlo],$Zhi
+
+	ldub	[$Xi+14],$nlo
+
+	ldx	[$Htblo+$nhi],$Tlo
+	and	$Zlo,0xf,$remi
+	ldx	[$Htbl+$nhi],$Thi
+	sll	$remi,3,$remi
+	ldx	[$rem_4bit+$remi],$rem
+	srlx	$Zlo,4,$Zlo
+	mov	13,$cnt
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+
+	and	$Zlo,0xf,$remi
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	ba	.Lgmult_inner
+	sll	$nlo,4,$nlo
+.align	32
+.Lgmult_inner:
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$Xi+$cnt],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	srlx	$Zhi,4,$Zhi
+	and	$nlo,0xf0,$nhi
+	addcc	$cnt,-1,$cnt
+	xor	$Zlo,$tmp,$Zlo
+	and	$nlo,0x0f,$nlo
+	xor	$Tlo,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	blu	.Lgmult_inner
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+
+	ret
+	restore
+.type	gcm_gmult_4bit,#function
+.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
+.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index 63e76c1da6..13efbcef6a 100644
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -9,13 +9,13 @@
 #
 # March 2010
 #
-# The module implements "4-bit" Galois field multiplication and
-# streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+64/128 bytes fixed table]. It has two code paths:
-# vanilla x86 and vanilla MMX. Former will be executed on 486 and
-# Pentium, latter on all others. Performance results are for streamed
-# GHASH subroutine and are expressed in cycles per processed byte,
-# less is better:
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
+# code paths: vanilla x86 and vanilla MMX. Former will be executed on
+# 486 and Pentium, latter on all others. Performance results are for
+# streamed GHASH subroutine and are expressed in cycles per processed
+# byte, less is better:
 #
 #		gcc 2.95.3(*)	MMX assembler	x86 assembler
 #
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index e20767836f..1072979829 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -9,11 +9,11 @@
 #
 # March 2010
 #
-# The module implements "4-bit" Galois field multiplication and
-# streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+128 bytes shared table]. Performance results are for
-# streamed GHASH subroutine and are expressed in cycles per processed
-# byte, less is better:
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# results are for streamed GHASH subroutine and are expressed in
+# cycles per processed byte, less is better:
 #
 #		gcc 3.4.x	assembler
 #
author	Andy Polyakov <appro@openssl.org>	2010-03-22 17:24:18 +0000
committer	Andy Polyakov <appro@openssl.org>	2010-03-22 17:24:18 +0000
commit	c3473126b1ed43e281f5fb2394f16bf8a5be2922 (patch)
tree	d59a6c1a8b2dbeb59ca185683279e066669124e5 /crypto
parent	82f385d71d1e060422897e8155c0d741bd459c01 (diff)