summaryrefslogtreecommitdiffstats
path: root/crypto/modes
diff options
context:
space:
mode:
authorDr. Stephen Henson <steve@openssl.org>2011-08-11 22:36:19 +0000
committerDr. Stephen Henson <steve@openssl.org>2011-08-11 22:36:19 +0000
commitdc01af7723123aa9d426a58b34225224c4bc68ef (patch)
tree834d73a3027d11045d14c293859d66db5c720b52 /crypto/modes
parent5435d0412fff21450f6ba7b5dbfce1a537471588 (diff)
Sync ASM/modes to add CCM and XTS modes and assembly language optimisation
(from HEAD, original by Andy).
Diffstat (limited to 'crypto/modes')
-rw-r--r--crypto/modes/Makefile75
-rw-r--r--crypto/modes/asm/ghash-alpha.pl451
-rw-r--r--crypto/modes/asm/ghash-armv4.pl429
-rwxr-xr-xcrypto/modes/asm/ghash-ia64.pl463
-rw-r--r--crypto/modes/asm/ghash-parisc.pl730
-rw-r--r--crypto/modes/asm/ghash-s390x.pl262
-rw-r--r--crypto/modes/asm/ghash-sparcv9.pl330
-rw-r--r--crypto/modes/asm/ghash-x86.pl1342
-rw-r--r--crypto/modes/asm/ghash-x86_64.pl805
-rw-r--r--crypto/modes/cbc128.c10
-rw-r--r--crypto/modes/ccm128.c441
-rw-r--r--crypto/modes/cfb128.c11
-rw-r--r--crypto/modes/ctr128.c17
-rw-r--r--crypto/modes/cts128.c226
-rw-r--r--crypto/modes/modes.h42
-rw-r--r--crypto/modes/ofb128.c11
-rw-r--r--crypto/modes/xts128.c187
17 files changed, 5774 insertions, 58 deletions
diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile
index 007ceffdb6..c825b12f25 100644
--- a/crypto/modes/Makefile
+++ b/crypto/modes/Makefile
@@ -10,15 +10,21 @@ CFLAG=-g
MAKEFILE= Makefile
AR= ar r
+MODES_ASM_OBJ=
+
CFLAGS= $(INCLUDES) $(CFLAG)
+ASFLAGS= $(INCLUDES) $(ASFLAG)
+AFLAGS= $(ASFLAGS)
GENERAL=Makefile
TEST=
APPS=
LIB=$(TOP)/libcrypto.a
-LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c
-LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o
+LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
+ ccm128.c xts128.c
+LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
+ ccm128.o xts128.o $(MODES_ASM_OBJ)
SRC= $(LIBSRC)
@@ -38,6 +44,24 @@ lib: $(LIBOBJ)
$(RANLIB) $(LIB) || echo Never mind.
@touch lib
+ghash-ia64.s: asm/ghash-ia64.pl
+ $(PERL) asm/ghash-ia64.pl $@ $(CFLAGS)
+ghash-x86.s: asm/ghash-x86.pl
+ $(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+ghash-x86_64.s: asm/ghash-x86_64.pl
+ $(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
+ghash-sparcv9.s: asm/ghash-sparcv9.pl
+ $(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
+ghash-alpha.s: asm/ghash-alpha.pl
+ $(PERL) $< | $(CC) -E - | tee $@ > /dev/null
+ghash-parisc.s: asm/ghash-parisc.pl
+ $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+
+# GNU make "catch all"
+ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+
+ghash-armv4.o: ghash-armv4.S
+
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
@@ -71,12 +95,47 @@ dclean:
mv -f Makefile.new $(MAKEFILE)
clean:
- rm -f *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
+ rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
# DO NOT DELETE THIS LINE -- make depend depends on it.
-cbc128.o: cbc128.c modes.h
-cfb128.o: cfb128.c modes.h
-ctr128.o: ctr128.c modes.h
-cts128.o: cts128.c modes.h
-ofb128.o: modes.h ofb128.c
+cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h
+ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h
+cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h
+ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h
+cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h
+gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h
+ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
+xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c
diff --git a/crypto/modes/asm/ghash-alpha.pl b/crypto/modes/asm/ghash-alpha.pl
new file mode 100644
index 0000000000..6358b2750f
--- /dev/null
+++ b/crypto/modes/asm/ghash-alpha.pl
@@ -0,0 +1,451 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% than code generated by vendor
+# compiler.
+
+$cnt="v0"; # $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3"; # $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7"; # $8
+#################
+$Xi="a0"; # $16, input argument block
+$Htbl="a1";
+$inp="a2";
+$len="a3";
+$nlo="a4"; # $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10"; # $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT"; # $28
+
+{ my $N;
+ sub loop() {
+
+ $N++;
+$code.=<<___;
+.align 4
+ extbl $Xlo,7,$nlo
+ and $nlo,0xf0,$nhi
+ sll $nlo,4,$nlo
+ and $nlo,0xf0,$nlo
+
+ addq $nlo,$Htbl,$nlo
+ ldq $Zlo,8($nlo)
+ addq $nhi,$Htbl,$nhi
+ ldq $Zhi,0($nlo)
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ lda $cnt,6(zero)
+ extbl $Xlo,6,$nlo
+
+ ldq $Tlo1,8($nhi)
+ s8addq $remp,$rem_4bit,$remp
+ ldq $Thi1,0($nhi)
+ srl $Zlo,4,$Zlo
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ and $nlo,0xf0,$nhi
+
+ xor $Tlo1,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ xor $Thi1,$Zhi,$Zhi
+ and $nlo,0xf0,$nlo
+
+ addq $nlo,$Htbl,$nlo
+ ldq $Tlo0,8($nlo)
+ addq $nhi,$Htbl,$nhi
+ ldq $Thi0,0($nlo)
+
+.Looplo$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subq $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xlo,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ bne $cnt,.Looplo$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ lda $cnt,7(zero)
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ unop
+
+
+.Loophi$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subq $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ bne $cnt,.Loophi$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo0,$Zlo,$Zlo
+ xor $Thi0,$Zhi,$Zhi
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ xor $t0,$Zlo,$Zlo
+ xor $rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.globl gcm_gmult_4bit
+.align 4
+.ent gcm_gmult_4bit
+gcm_gmult_4bit:
+ .frame sp,0,ra
+ .prologue 0
+
+ ldq $Xlo,8($Xi)
+ ldq $Xhi,0($Xi)
+
+ br $rem_4bit,.Lpic1
+.Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit)
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ stq $Xlo,8($Xi)
+ stq $Xhi,0($Xi)
+
+ ret (ra)
+.end gcm_gmult_4bit
+___
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.align 4
+.ent gcm_ghash_4bit
+gcm_ghash_4bit:
+ lda sp,-32(sp)
+ stq ra,0(sp)
+ stq s0,8(sp)
+ stq s1,16(sp)
+ .mask 0x04000600,-32
+ .frame sp,32,ra
+ .prologue 0
+
+ ldq_u $inhi,0($inp)
+ ldq_u $Thi0,7($inp)
+ ldq_u $inlo,8($inp)
+ ldq_u $Tlo0,15($inp)
+ ldq $Xhi,0($Xi)
+ ldq $Xlo,8($Xi)
+
+ br $rem_4bit,.Lpic2
+.Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit)
+
+.Louter:
+ extql $inhi,$inp,$inhi
+ extqh $Thi0,$inp,$Thi0
+ or $inhi,$Thi0,$inhi
+ lda $inp,16($inp)
+
+ extql $inlo,$inp,$inlo
+ extqh $Tlo0,$inp,$Tlo0
+ or $inlo,$Tlo0,$inlo
+ subq $len,16,$len
+
+ xor $Xlo,$inlo,$Xlo
+ xor $Xhi,$inhi,$Xhi
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+ beq $len,.Ldone
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+ ldq_u $inhi,0($inp)
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+ ldq_u $Thi0,7($inp)
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+ ldq_u $inlo,8($inp)
+ ldq_u $Tlo0,15($inp)
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ br zero,.Louter
+
+.Ldone:
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+
+ stq $Xlo,8($Xi)
+ stq $Xhi,0($Xi)
+
+ .set noreorder
+ /*ldq ra,0(sp)*/
+ ldq s0,8(sp)
+ ldq s1,16(sp)
+ lda sp,32(sp)
+ ret (ra)
+.end gcm_ghash_4bit
+
+.align 4
+rem_4bit:
+ .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+ .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+ .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+ .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
+
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 0000000000..d91586ee29
--- /dev/null
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,429 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+
+# ====================================================================
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+# bit shift operation is neatly fused with 128-bit xor here, and
+# "538B" variant would eliminate only 4-5 instructions out of 32
+# in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+# consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords in turn is whatever
+# *native* byte order on current platform. See gcm128.c for working
+# example...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$Xi="r0"; # argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4"; # variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp; # used in gcm_gmult_4bit
+$cnt=$len;
+
+sub Zsmash() {
+ my $i=12;
+ my @args=@_;
+ for ($Zll,$Zlh,$Zhl,$Zhh) {
+ $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev $_,$_
+ str $_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+ str $_,[$Xi,#$i]
+#else
+ mov $Tlh,$_,lsr#8
+ strb $_,[$Xi,#$i+3]
+ mov $Thl,$_,lsr#16
+ strb $Tlh,[$Xi,#$i+2]
+ mov $Thh,$_,lsr#24
+ strb $Thl,[$Xi,#$i+1]
+ strb $Thh,[$Xi,#$i]
+#endif
+___
+ $code.="\t".shift(@args)."\n";
+ $i-=4;
+ }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.type rem_4bit,%object
+.align 5
+rem_4bit:
+.short 0x0000,0x1C20,0x3840,0x2460
+.short 0x7080,0x6CA0,0x48C0,0x54E0
+.short 0xE100,0xFD20,0xD940,0xC560
+.short 0x9180,0x8DA0,0xA9C0,0xB5E0
+.size rem_4bit,.-rem_4bit
+
+.type rem_4bit_get,%function
+rem_4bit_get:
+ sub $rem_4bit,pc,#8
+ sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
+ b .Lrem_4bit_got
+ nop
+.size rem_4bit_get,.-rem_4bit_get
+
+.global gcm_ghash_4bit
+.type gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+ sub r12,pc,#8
+ add $len,$inp,$len @ $len to point at the end
+ stmdb sp!,{r3-r11,lr} @ save $len/end too
+ sub r12,r12,#48 @ &rem_4bit
+
+ ldmia r12,{r4-r11} @ copy rem_4bit ...
+ stmdb sp!,{r4-r11} @ ... to stack
+
+ ldrb $nlo,[$inp,#15]
+ ldrb $nhi,[$Xi,#15]
+.Louter:
+ eor $nlo,$nlo,$nhi
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ mov $cnt,#14
+
+ add $Zhh,$Htbl,$nlo,lsl#4
+ ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ add $Thh,$Htbl,$nhi
+ ldrb $nlo,[$inp,#14]
+
+ and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ add $nhi,$nhi,$nhi
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ ldrb $nhi,[$Xi,#14]
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ eor $nlo,$nlo,$nhi
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+ add $Thh,$Htbl,$nlo,lsl#4
+ and $nlo,$Zll,#0xf @ rem
+ subs $cnt,$cnt,#1
+ add $nlo,$nlo,$nlo
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ ldrplb $nlo,[$inp,$cnt]
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ add $nhi,$nhi,$nhi
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrplb $Tll,[$Xi,$cnt]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrh $Tlh,[sp,$nhi]
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eorpl $nlo,$nlo,$Tll
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ andpl $nhi,$nlo,#0xf0
+ andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Linner
+
+ ldr $len,[sp,#32] @ re-load $len/end
+ add $inp,$inp,#16
+ mov $nhi,$Zll
+___
+ &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+$code.=<<___;
+ bne .Louter
+
+ add sp,sp,#36
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global gcm_gmult_4bit
+.type gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+ stmdb sp!,{r4-r11,lr}
+ ldrb $nlo,[$Xi,#15]
+ b rem_4bit_get
+.Lrem_4bit_got:
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ mov $cnt,#14
+
+ add $Zhh,$Htbl,$nlo,lsl#4
+ ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ ldrb $nlo,[$Xi,#14]
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ add $nhi,$nhi,$nhi
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ and $nhi,$nlo,#0xf0
+ eor $Zhh,$Zhh,$Tll,lsl#16
+ and $nlo,$nlo,#0x0f
+
+.Loop:
+ add $Thh,$Htbl,$nlo,lsl#4
+ and $nlo,$Zll,#0xf @ rem
+ subs $cnt,$cnt,#1
+ add $nlo,$nlo,$nlo
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ ldrplb $nlo,[$Xi,$cnt]
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ add $nhi,$nhi,$nhi
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ andpl $nhi,$nlo,#0xf0
+ andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Loop
+___
+ &Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my $cnt=$Htbl; # $Htbl is used once in the very beginning
+
+my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
+my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
+
+# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
+# in Zo. Or should I say "top bit", because GHASH is specified in
+# reverse bit order? Otherwise straightforward 128-bt H by one input
+# byte multiplication and modulo-reduction, times 16.
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.global gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ sub $Htbl,#16 @ point at H in GCM128_CTX
+ vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+ vmov.i32 $mod,#0xe1 @ our irreducible polynomial
+ vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
+ vshr.u64 $mod,#32
+ vldmia $Htbl,{$Hhi-$Hlo} @ load H
+ veor $zero,$zero
+#ifdef __ARMEL__
+ vrev64.8 $IN,$IN
+#endif
+ veor $Qpost,$Qpost
+ veor $R,$R
+ mov $cnt,#16
+ veor $Z,$Z
+ mov $len,#16
+ veor $Zo,$Zo
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+ b .Linner_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.global gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
+ vmov.i32 $mod,#0xe1 @ our irreducible polynomial
+ vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
+ vshr.u64 $mod,#32
+ vldmia $Xi,{$Hhi-$Hlo} @ load H
+ veor $zero,$zero
+ nop
+#ifdef __ARMEL__
+ vrev64.8 $Z,$Z
+#endif
+.Louter_neon:
+ vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
+ veor $Qpost,$Qpost
+ vld1.64 `&Dlo($IN)`,[$inp]!
+ veor $R,$R
+ mov $cnt,#16
+#ifdef __ARMEL__
+ vrev64.8 $IN,$IN
+#endif
+ veor $Zo,$Zo
+ veor $IN,$Z @ inp^=Xi
+ veor $Z,$Z
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+.Linner_neon:
+ subs $cnt,$cnt,#1
+ vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
+ vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
+ vext.8 $IN,$zero,#1 @ IN>>=8
+
+ veor $Z,$Qpost @ modulo-scheduled part
+ vshl.i64 `&Dlo("$R")`,#48
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+ veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
+
+ veor `&Dhi("$Z")`,`&Dlo("$R")`
+ vuzp.8 $Qlo,$Qhi
+ vsli.8 $Zo,$T,#1 @ compose the "carry" byte
+ vext.8 $Z,$zero,#1 @ Z>>=8
+
+ vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
+ vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
+ vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
+ veor $Z,$Qhi
+ bne .Linner_neon
+
+ veor $Z,$Qpost @ modulo-scheduled artefact
+ vshl.i64 `&Dlo("$R")`,#48
+ veor `&Dhi("$Z")`,`&Dlo("$R")`
+
+ @ finalization, normalize Z:Zo
+ vand $Zo,$mod @ suffices to mask the bit
+ vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
+ vshl.i64 $Z,#1
+ subs $len,#16
+ vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
+ bne .Louter_neon
+
+#ifdef __ARMEL__
+ vrev64.8 $Z,$Z
+#endif
+ sub $Xi,#16
+ vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
+ vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+
+ bx lr
+.size gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl
new file mode 100755
index 0000000000..0354c95444
--- /dev/null
+++ b/crypto/modes/asm/ghash-ia64.pl
@@ -0,0 +1,463 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
+# GHASH performance was measured to be 6.67 cycles per processed byte
+# on Itanium 2, which is >90% better than Microsoft compiler generated
+# code. To anchor to something else sha1-ia64.pl module processes one
+# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
+# byte.
+
+# September 2010
+#
+# It was originally thought that it makes lesser sense to implement
+# "528B" variant on Itanium 2 for following reason. Because number of
+# functional units is naturally limited, it appeared impossible to
+# implement "528B" loop in 4 cycles, only in 5. This would mean that
+# theoretically performance improvement couldn't be more than 20%.
+# But occasionally you prove yourself wrong:-) I figured out a way to
+# fold couple of instructions and having freed