diff options
author | Dr. Stephen Henson <steve@openssl.org> | 2011-08-11 22:36:19 +0000 |
---|---|---|
committer | Dr. Stephen Henson <steve@openssl.org> | 2011-08-11 22:36:19 +0000 |
commit | dc01af7723123aa9d426a58b34225224c4bc68ef (patch) | |
tree | 834d73a3027d11045d14c293859d66db5c720b52 /crypto/modes | |
parent | 5435d0412fff21450f6ba7b5dbfce1a537471588 (diff) |
Sync ASM/modes to add CCM and XTS modes and assembly language optimisation
(from HEAD, original by Andy).
Diffstat (limited to 'crypto/modes')
-rw-r--r-- | crypto/modes/Makefile | 75 | ||||
-rw-r--r-- | crypto/modes/asm/ghash-alpha.pl | 451 | ||||
-rw-r--r-- | crypto/modes/asm/ghash-armv4.pl | 429 | ||||
-rwxr-xr-x | crypto/modes/asm/ghash-ia64.pl | 463 | ||||
-rw-r--r-- | crypto/modes/asm/ghash-parisc.pl | 730 | ||||
-rw-r--r-- | crypto/modes/asm/ghash-s390x.pl | 262 | ||||
-rw-r--r-- | crypto/modes/asm/ghash-sparcv9.pl | 330 | ||||
-rw-r--r-- | crypto/modes/asm/ghash-x86.pl | 1342 | ||||
-rw-r--r-- | crypto/modes/asm/ghash-x86_64.pl | 805 | ||||
-rw-r--r-- | crypto/modes/cbc128.c | 10 | ||||
-rw-r--r-- | crypto/modes/ccm128.c | 441 | ||||
-rw-r--r-- | crypto/modes/cfb128.c | 11 | ||||
-rw-r--r-- | crypto/modes/ctr128.c | 17 | ||||
-rw-r--r-- | crypto/modes/cts128.c | 226 | ||||
-rw-r--r-- | crypto/modes/modes.h | 42 | ||||
-rw-r--r-- | crypto/modes/ofb128.c | 11 | ||||
-rw-r--r-- | crypto/modes/xts128.c | 187 |
17 files changed, 5774 insertions, 58 deletions
diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile index 007ceffdb6..c825b12f25 100644 --- a/crypto/modes/Makefile +++ b/crypto/modes/Makefile @@ -10,15 +10,21 @@ CFLAG=-g MAKEFILE= Makefile AR= ar r +MODES_ASM_OBJ= + CFLAGS= $(INCLUDES) $(CFLAG) +ASFLAGS= $(INCLUDES) $(ASFLAG) +AFLAGS= $(ASFLAGS) GENERAL=Makefile TEST= APPS= LIB=$(TOP)/libcrypto.a -LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c -LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o +LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \ + ccm128.c xts128.c +LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \ + ccm128.o xts128.o $(MODES_ASM_OBJ) SRC= $(LIBSRC) @@ -38,6 +44,24 @@ lib: $(LIBOBJ) $(RANLIB) $(LIB) || echo Never mind. @touch lib +ghash-ia64.s: asm/ghash-ia64.pl + $(PERL) asm/ghash-ia64.pl $@ $(CFLAGS) +ghash-x86.s: asm/ghash-x86.pl + $(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ +ghash-x86_64.s: asm/ghash-x86_64.pl + $(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@ +ghash-sparcv9.s: asm/ghash-sparcv9.pl + $(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS) +ghash-alpha.s: asm/ghash-alpha.pl + $(PERL) $< | $(CC) -E - | tee $@ > /dev/null +ghash-parisc.s: asm/ghash-parisc.pl + $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@ + +# GNU make "catch all" +ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ + +ghash-armv4.o: ghash-armv4.S + files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO @@ -71,12 +95,47 @@ dclean: mv -f Makefile.new $(MAKEFILE) clean: - rm -f *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff + rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff # DO NOT DELETE THIS LINE -- make depend depends on it. -cbc128.o: cbc128.c modes.h -cfb128.o: cfb128.c modes.h -ctr128.o: ctr128.c modes.h -cts128.o: cts128.c modes.h -ofb128.o: modes.h ofb128.c +cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h +ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h +cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h +ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h +cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h +gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h +ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c +xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c diff --git a/crypto/modes/asm/ghash-alpha.pl b/crypto/modes/asm/ghash-alpha.pl new file mode 100644 index 0000000000..6358b2750f --- /dev/null +++ b/crypto/modes/asm/ghash-alpha.pl @@ -0,0 +1,451 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Even though +# loops are aggressively modulo-scheduled in respect to references to +# Htbl and Z.hi updates for 8 cycles per byte, measured performance is +# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic +# scheduling "glitch," because uprofile(1) indicates uniform sample +# distribution, as if all instruction bundles execute in 1.5 cycles. +# Meaning that it could have been even faster, yet 12 cycles is ~60% +# better than gcc-generated code and ~80% than code generated by vendor +# compiler. + +$cnt="v0"; # $0 +$t0="t0"; +$t1="t1"; +$t2="t2"; +$Thi0="t3"; # $4 +$Tlo0="t4"; +$Thi1="t5"; +$Tlo1="t6"; +$rem="t7"; # $8 +################# +$Xi="a0"; # $16, input argument block +$Htbl="a1"; +$inp="a2"; +$len="a3"; +$nlo="a4"; # $20 +$nhi="a5"; +$Zhi="t8"; +$Zlo="t9"; +$Xhi="t10"; # $24 +$Xlo="t11"; +$remp="t12"; +$rem_4bit="AT"; # $28 + +{ my $N; + sub loop() { + + $N++; +$code.=<<___; +.align 4 + extbl $Xlo,7,$nlo + and $nlo,0xf0,$nhi + sll $nlo,4,$nlo + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Zlo,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Zhi,0($nlo) + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,6(zero) + extbl $Xlo,6,$nlo + + ldq $Tlo1,8($nhi) + s8addq $remp,$rem_4bit,$remp + ldq $Thi1,0($nhi) + srl $Zlo,4,$Zlo + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + and $nlo,0xf0,$nhi + + xor $Tlo1,$Zlo,$Zlo + sll $nlo,4,$nlo + xor $Thi1,$Zhi,$Zhi + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Tlo0,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Thi0,0($nlo) + +.Looplo$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xlo,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Looplo$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,7(zero) + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + unop + + +.Loophi$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Loophi$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + + xor $Tlo0,$Zlo,$Zlo + xor $Thi0,$Zhi,$Zhi + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + xor $t0,$Zlo,$Zlo + xor $rem,$Zhi,$Zhi +___ +}} + +$code=<<___; +#ifdef __linux__ +#include <asm/regdef.h> +#else +#include <asm.h> +#include <regdef.h> +#endif + +.text + +.set noat +.set noreorder +.globl gcm_gmult_4bit +.align 4 +.ent gcm_gmult_4bit +gcm_gmult_4bit: + .frame sp,0,ra + .prologue 0 + + ldq $Xlo,8($Xi) + ldq $Xhi,0($Xi) + + br $rem_4bit,.Lpic1 +.Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit) +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + ret (ra) +.end gcm_gmult_4bit +___ + +$inhi="s0"; +$inlo="s1"; + +$code.=<<___; +.globl gcm_ghash_4bit +.align 4 +.ent gcm_ghash_4bit +gcm_ghash_4bit: + lda sp,-32(sp) + stq ra,0(sp) + stq s0,8(sp) + stq s1,16(sp) + .mask 0x04000600,-32 + .frame sp,32,ra + .prologue 0 + + ldq_u $inhi,0($inp) + ldq_u $Thi0,7($inp) + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + ldq $Xhi,0($Xi) + ldq $Xlo,8($Xi) + + br $rem_4bit,.Lpic2 +.Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit) + +.Louter: + extql $inhi,$inp,$inhi + extqh $Thi0,$inp,$Thi0 + or $inhi,$Thi0,$inhi + lda $inp,16($inp) + + extql $inlo,$inp,$inlo + extqh $Tlo0,$inp,$Tlo0 + or $inlo,$Tlo0,$inlo + subq $len,16,$len + + xor $Xlo,$inlo,$Xlo + xor $Xhi,$inhi,$Xhi +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + beq $len,.Ldone + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + ldq_u $inhi,0($inp) + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + ldq_u $Thi0,7($inp) + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + br zero,.Louter + +.Ldone: + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + .set noreorder + /*ldq ra,0(sp)*/ + ldq s0,8(sp) + ldq s1,16(sp) + lda sp,32(sp) + ret (ra) +.end gcm_ghash_4bit + +.align 4 +rem_4bit: + .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 + .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 + .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 + .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 +.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 + +___ +$output=shift and open STDOUT,">$output"; +print $code; +close STDOUT; + diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl new file mode 100644 index 0000000000..d91586ee29 --- /dev/null +++ b/crypto/modes/asm/ghash-armv4.pl @@ -0,0 +1,429 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+32 bytes shared table]. There is no +# experimental performance data available yet. The only approximation +# that can be made at this point is based on code size. Inner loop is +# 32 instructions long and on single-issue core should execute in <40 +# cycles. Having verified that gcc 3.4 didn't unroll corresponding +# loop, this assembler loop body was found to be ~3x smaller than +# compiler-generated one... +# +# July 2010 +# +# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on +# Cortex A8 core and ~25 cycles per processed byte (which was observed +# to be ~3 times faster than gcc-generated code:-) +# +# February 2011 +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Cortex A8 core and ~23.5 cycles per byte. +# +# March 2011 +# +# Add NEON implementation featuring polynomial multiplication, i.e. no +# lookup tables involved. On Cortex A8 it was measured to process one +# byte in 15 cycles or 55% faster than integer-only code. + +# ==================================================================== +# Note about "528B" variant. In ARM case it makes lesser sense to +# implement it for following reasons: +# +# - performance improvement won't be anywhere near 50%, because 128- +# bit shift operation is neatly fused with 128-bit xor here, and +# "538B" variant would eliminate only 4-5 instructions out of 32 +# in the inner loop (meaning that estimated improvement is ~15%); +# - ARM-based systems are often embedded ones and extra memory +# consumption might be unappreciated (for so little improvement); +# +# Byte order [in]dependence. ========================================= +# +# Caller is expected to maintain specific *dword* order in Htable, +# namely with *least* significant dword of 128-bit value at *lower* +# address. This differs completely from C code and has everything to +# do with ldm instruction and order in which dwords are "consumed" by +# algorithm. *Byte* order within these dwords in turn is whatever +# *native* byte order on current platform. See gcm128.c for working +# example... + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$Xi="r0"; # argument block +$Htbl="r1"; +$inp="r2"; +$len="r3"; + +$Zll="r4"; # variables +$Zlh="r5"; +$Zhl="r6"; +$Zhh="r7"; +$Tll="r8"; +$Tlh="r9"; +$Thl="r10"; +$Thh="r11"; +$nlo="r12"; +################# r13 is stack pointer +$nhi="r14"; +################# r15 is program counter + +$rem_4bit=$inp; # used in gcm_gmult_4bit +$cnt=$len; + +sub Zsmash() { + my $i=12; + my @args=@_; + for ($Zll,$Zlh,$Zhl,$Zhh) { + $code.=<<___; +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev $_,$_ + str $_,[$Xi,#$i] +#elif defined(__ARMEB__) + str $_,[$Xi,#$i] +#else + mov $Tlh,$_,lsr#8 + strb $_,[$Xi,#$i+3] + mov $Thl,$_,lsr#16 + strb $Tlh,[$Xi,#$i+2] + mov $Thh,$_,lsr#24 + strb $Thl,[$Xi,#$i+1] + strb $Thh,[$Xi,#$i] +#endif +___ + $code.="\t".shift(@args)."\n"; + $i-=4; + } +} + +$code=<<___; +#include "arm_arch.h" + +.text +.code 32 + +.type rem_4bit,%object +.align 5 +rem_4bit: +.short 0x0000,0x1C20,0x3840,0x2460 +.short 0x7080,0x6CA0,0x48C0,0x54E0 +.short 0xE100,0xFD20,0xD940,0xC560 +.short 0x9180,0x8DA0,0xA9C0,0xB5E0 +.size rem_4bit,.-rem_4bit + +.type rem_4bit_get,%function +rem_4bit_get: + sub $rem_4bit,pc,#8 + sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit + b .Lrem_4bit_got + nop +.size rem_4bit_get,.-rem_4bit_get + +.global gcm_ghash_4bit +.type gcm_ghash_4bit,%function +gcm_ghash_4bit: + sub r12,pc,#8 + add $len,$inp,$len @ $len to point at the end + stmdb sp!,{r3-r11,lr} @ save $len/end too + sub r12,r12,#48 @ &rem_4bit + + ldmia r12,{r4-r11} @ copy rem_4bit ... + stmdb sp!,{r4-r11} @ ... to stack + + ldrb $nlo,[$inp,#15] + ldrb $nhi,[$Xi,#15] +.Louter: + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + add $Thh,$Htbl,$nhi + ldrb $nlo,[$inp,#14] + + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + ldrb $nhi,[$Xi,#14] + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 + +.Linner: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 + ldrplb $nlo,[$inp,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + ldrplb $Tll,[$Xi,$cnt] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tlh,[sp,$nhi] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eorpl $nlo,$nlo,$Tll + eor $Zhh,$Thh,$Zhh,lsr#4 + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] + bpl .Linner + + ldr $len,[sp,#32] @ re-load $len/end + add $inp,$inp,#16 + mov $nhi,$Zll +___ + &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); +$code.=<<___; + bne .Louter + + add sp,sp,#36 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_ghash_4bit,.-gcm_ghash_4bit + +.global gcm_gmult_4bit +.type gcm_gmult_4bit,%function +gcm_gmult_4bit: + stmdb sp!,{r4-r11,lr} + ldrb $nlo,[$Xi,#15] + b rem_4bit_get +.Lrem_4bit_got: + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + ldrb $nlo,[$Xi,#14] + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + and $nhi,$nlo,#0xf0 + eor $Zhh,$Zhh,$Tll,lsl#16 + and $nlo,$nlo,#0x0f + +.Loop: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 + ldrplb $nlo,[$Xi,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + bpl .Loop +___ + &Zsmash(); +$code.=<<___; +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ +{ +my $cnt=$Htbl; # $Htbl is used once in the very beginning + +my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); +my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); + +# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit +# in Zo. Or should I say "top bit", because GHASH is specified in +# reverse bit order? Otherwise straightforward 128-bt H by one input +# byte multiplication and modulo-reduction, times 16. + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } +sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + sub $Htbl,#16 @ point at H in GCM128_CTX + vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi + vmov.i32 $mod,#0xe1 @ our irreducible polynomial + vld1.64 `&Dlo("$IN")`,[$Xi,:64]! + vshr.u64 $mod,#32 + vldmia $Htbl,{$Hhi-$Hlo} @ load H + veor $zero,$zero +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $Qpost,$Qpost + veor $R,$R + mov $cnt,#16 + veor $Z,$Z + mov $len,#16 + veor $Zo,$Zo + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte + b .Linner_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi + vmov.i32 $mod,#0xe1 @ our irreducible polynomial + vld1.64 `&Dlo("$Z")`,[$Xi,:64]! + vshr.u64 $mod,#32 + vldmia $Xi,{$Hhi-$Hlo} @ load H + veor $zero,$zero + nop +#ifdef __ARMEL__ + vrev64.8 $Z,$Z +#endif +.Louter_neon: + vld1.64 `&Dhi($IN)`,[$inp]! @ load inp + veor $Qpost,$Qpost + vld1.64 `&Dlo($IN)`,[$inp]! + veor $R,$R + mov $cnt,#16 +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $Zo,$Zo + veor $IN,$Z @ inp^=Xi + veor $Z,$Z + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte +.Linner_neon: + subs $cnt,$cnt,#1 + vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] + vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] + vext.8 $IN,$zero,#1 @ IN>>=8 + + veor $Z,$Qpost @ modulo-scheduled part + vshl.i64 `&Dlo("$R")`,#48 + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte + veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` + + veor `&Dhi("$Z")`,`&Dlo("$R")` + vuzp.8 $Qlo,$Qhi + vsli.8 $Zo,$T,#1 @ compose the "carry" byte + vext.8 $Z,$zero,#1 @ Z>>=8 + + vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 + vshr.u8 $Zo,$T,#7 @ save Z's bottom bit + vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 + veor $Z,$Qhi + bne .Linner_neon + + veor $Z,$Qpost @ modulo-scheduled artefact + vshl.i64 `&Dlo("$R")`,#48 + veor `&Dhi("$Z")`,`&Dlo("$R")` + + @ finalization, normalize Z:Zo + vand $Zo,$mod @ suffices to mask the bit + vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 + vshl.i64 $Z,#1 + subs $len,#16 + vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 + bne .Louter_neon + +#ifdef __ARMEL__ + vrev64.8 $Z,$Z +#endif + sub $Xi,#16 + vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi + vst1.64 `&Dlo("$Z")`,[$Xi,:64] + + bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +___ +} +$code.=<<___; +.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +print $code; +close STDOUT; # enforce flush diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl new file mode 100755 index 0000000000..0354c95444 --- /dev/null +++ b/crypto/modes/asm/ghash-ia64.pl @@ -0,0 +1,463 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Streamed +# GHASH performance was measured to be 6.67 cycles per processed byte +# on Itanium 2, which is >90% better than Microsoft compiler generated +# code. To anchor to something else sha1-ia64.pl module processes one +# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per +# byte. + +# September 2010 +# +# It was originally thought that it makes lesser sense to implement +# "528B" variant on Itanium 2 for following reason. Because number of +# functional units is naturally limited, it appeared impossible to +# implement "528B" loop in 4 cycles, only in 5. This would mean that +# theoretically performance improvement couldn't be more than 20%. +# But occasionally you prove yourself wrong:-) I figured out a way to +# fold couple of instructions and having freed |