From 925596f85be423ab24be2a481a0c37fc3ab88472 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Thu, 5 May 2011 21:57:11 +0000
Subject: ARM assembler pack: engage newly introduced armv4-gf2m module.

---
 crypto/bn/Makefile          |  3 +++
 crypto/bn/asm/armv4-gf2m.pl | 35 ++++++++++++++++++++++++-----------
 crypto/bn/bn_gf2m.c         |  5 ++++-
 3 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'crypto/bn')

diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
index 74bc4f721f..18d704bc97 100644
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile
@@ -120,6 +120,9 @@ alpha-mont.s:	asm/alpha-mont.pl
 
 # GNU make "catch all"
 %-mont.s:	asm/%-mont.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
+%-gf2m.S:	asm/%-gf2m.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
+
+armv4-gf2m.o:	armv4-gf2m.S
 
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 67ec4b2c14..4fe9db9894 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -21,13 +21,8 @@
 # runs in even less cycles, ~30, improvement is measurable only on
 # longer keys. One has to optimize code elsewhere to get NEON glow...
 
-$a="r1";
-$b="r0";
-
-($a0,$a1,$a2,$a12,$a4,$a14)=
-($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
-
-$mask="r12";
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
 
 sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
 sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
@@ -67,9 +62,21 @@ mul_1x1_neon:
 	bx	lr
 .size	mul_1x1_neon,.-mul_1x1_neon
 #endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
 
-.align	5
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
 .type	mul_1x1_ialu,%function
+.align	5
 mul_1x1_ialu:
 	mov	$a0,#0
 	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
@@ -147,7 +154,15 @@ mul_1x1_ialu:
 
 	mov	pc,lr
 .size	mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void	bn_GF2m_mul_2x2(BN_ULONG *r,
+#	BN_ULONG a1,BN_ULONG a0,
+#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
+
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
 
+$code.=<<___;
 .global	bn_GF2m_mul_2x2
 .type	bn_GF2m_mul_2x2,%function
 .align	5
@@ -157,9 +172,7 @@ bn_GF2m_mul_2x2:
 .Lpic:	ldr	r12,[pc,r12]
 	tst	r12,#1
 	beq	.Lialu
-___
-($A1,$B1,$A0,$B0,$A0B0,$A1B1)=map("d$_",(18..23));
-$code.=<<___;
+
 	veor	$A1,$A1
 	vmov.32	$B1,r3,r3		@ two copies of b1
 	vmov.32	${A1}[0],r1		@ a1
diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index 5a13515c36..19a101bccd 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c
@@ -126,6 +126,7 @@ static const BN_ULONG SQR_tb[16] =
     SQR_tb[(w) >>  4 & 0xF] <<  8 | SQR_tb[(w)       & 0xF]
 #endif
 
+#if !defined(OPENSSL_BN_ASM_GF2m)
 /* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
  * result is a polynomial r with degree < 2 * BN_BITS - 1
  * The caller MUST ensure that the variables have the right amount
@@ -220,7 +221,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c
 	r[2] ^= m1 ^ r[1] ^ r[3];  /* h0 ^= m1 ^ l1 ^ h1; */
 	r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0;  /* l1 ^= l0 ^ h0 ^ m0; */
 	}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif 
 
 /* Add polynomials a and b and store result in r; r could be a or b, a and b 
  * could be equal; r is the bitwise XOR of a and b.
-- 
cgit v1.2.3