Support for "multiply high" instruction, see BN_UMULT_HIGH comment in

crypto/bn/bn_lcl.h for further details. It should be noted that for the moment of this writing the code was tested only on Alpha. If compiled with DEC C the C implementation exhibits 12% performance improvement over the crypto/bn/asm/alpha.s (on EV56 box running AlphaLinux). GNU C is (unfortunately) 8% behind the assembler implementation. But it's OpenVMS Alpha users who *may* benefit most as 'apps/openssl speed rsa' exhibits 6 (six) times performance improvement over the original VMS bignum implementation. Where "*may*" means "as soon as code is enabled though #define SIXTY_FOUR_BIT and crypto/bn/asm/vms.mar is skipped."
author: Andy Polyakov <appro@openssl.org> 2000-02-02 16:18:12 +0000
committer: Andy Polyakov <appro@openssl.org> 2000-02-02 16:18:12 +0000
commit: fb81ac5e6be4041e59df9362cd63880b21e2cafc (patch)
tree: ca35a62e7e39a135465937ef7b37bcab9d15add2 /crypto
parent: 54a34aecc38b95115704565ac77a2715688be400 (diff)
3 files changed, 151 insertions, 37 deletions
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c
index 4d3da16a0c..a62fdcf07a 100644
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@@ -60,7 +60,7 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-#ifdef BN_LLONG 
+#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
 
 BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	{
@@ -69,18 +69,19 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	bn_check_num(num);
 	if (num <= 0) return(c1);
 
-	for (;;)
+	while (num&~3)
 		{
 		mul_add(rp[0],ap[0],w,c1);
-		if (--num == 0) break;
 		mul_add(rp[1],ap[1],w,c1);
-		if (--num == 0) break;
 		mul_add(rp[2],ap[2],w,c1);
-		if (--num == 0) break;
 		mul_add(rp[3],ap[3],w,c1);
-		if (--num == 0) break;
-		ap+=4;
-		rp+=4;
+		ap+=4; rp+=4; num-=4;
+		}
+	if (num)
+		{
+		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
+		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
+		mul_add(rp[2],ap[2],w,c1); return c1;
 		}
 	
 	return(c1);
@@ -93,19 +94,19 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	bn_check_num(num);
 	if (num <= 0) return(c1);
 
-	/* for (;;) */
-	while (1) /* circumvent egcs-1.1.2 bug */
+	while (num&~3)
 		{
 		mul(rp[0],ap[0],w,c1);
-		if (--num == 0) break;
 		mul(rp[1],ap[1],w,c1);
-		if (--num == 0) break;
 		mul(rp[2],ap[2],w,c1);
-		if (--num == 0) break;
 		mul(rp[3],ap[3],w,c1);
-		if (--num == 0) break;
-		ap+=4;
-		rp+=4;
+		ap+=4; rp+=4; num-=4;
+		}
+	if (num)
+		{
+		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
+		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
+		mul(rp[2],ap[2],w,c1);
 		}
 	return(c1);
 	} 
@@ -114,28 +115,19 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
         {
 	bn_check_num(n);
 	if (n <= 0) return;
-	for (;;)
+	while (n&~3)
 		{
-		BN_ULLONG t;
-
-		t=(BN_ULLONG)(a[0])*(a[0]);
-		r[0]=Lw(t); r[1]=Hw(t);
-		if (--n == 0) break;
-
-		t=(BN_ULLONG)(a[1])*(a[1]);
-		r[2]=Lw(t); r[3]=Hw(t);
-		if (--n == 0) break;
-
-		t=(BN_ULLONG)(a[2])*(a[2]);
-		r[4]=Lw(t); r[5]=Hw(t);
-		if (--n == 0) break;
-
-		t=(BN_ULLONG)(a[3])*(a[3]);
-		r[6]=Lw(t); r[7]=Hw(t);
-		if (--n == 0) break;
-
-		a+=4;
-		r+=8;
+		sqr(r[0],r[1],a[0]);
+		sqr(r[2],r[3],a[1]);
+		sqr(r[4],r[5],a[2]);
+		sqr(r[6],r[7],a[3]);
+		a+=4; r+=8; n-=4;
+		}
+	if (n)
+		{
+		sqr(r[0],r[1],a[0]); if (--n == 0) return;
+		sqr(r[2],r[3],a[1]); if (--n == 0) return;
+		sqr(r[4],r[5],a[2]);
 		}
 	}
 
@@ -460,6 +452,38 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 
 #define sqr_add_c2(a,i,j,c0,c1,c2) \
 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
+#elif defined(BN_UMULT_HIGH)
+
+#define mul_add_c(a,b,c0,c1,c2)	{	\
+	BN_ULONG ta=(a),tb=(b);		\
+	t1 = ta * tb;			\
+	t2 = BN_UMULT_HIGH(ta,tb);	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define mul_add_c2(a,b,c0,c1,c2) {	\
+	BN_ULONG ta=(a),tb=(b),t0;	\
+	t1 = BN_UMULT_HIGH(ta,tb);	\
+	t0 = ta * tb;			\
+	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
+	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define sqr_add_c(a,i,c0,c1,c2)	{	\
+	BN_ULONG ta=(a)[i];		\
+	t1 = ta * ta;			\
+	t2 = BN_UMULT_HIGH(ta,ta);	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define sqr_add_c2(a,i,j,c0,c1,c2)	\
+	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
 #else
 #define mul_add_c(a,b,c0,c1,c2) \
 	t1=LBITS(a); t2=HBITS(a); \
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index d8c31e1db0..39d7602c30 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -280,9 +280,14 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 		 */
 		rem=(n1-q*d0)&BN_MASK2;
 #endif
+#ifdef BN_UMULT_HIGH
+		t2l = d1 * q;
+		t2h = BN_UMULT_HIGH(d1,q);
+#else
 		t2l=LBITS(d1); t2h=HBITS(d1);
 		ql =LBITS(q);  qh =HBITS(q);
 		mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+#endif
 
 		for (;;)
 			{
diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
index 85a372695b..d24cffdfae 100644
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
@@ -86,6 +86,54 @@ extern "C" {
 #endif
 #endif
 
+#if !defined(NO_ASM) && !defined(PEDANTIC)
+/*
+ * BN_UMULT_HIGH section.
+ *
+ * No, I'm not trying to overwhelm you when stating that the
+ * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
+ * you to be impressed when I say that if the compiler doesn't
+ * support 2*N integer type, then you have to replace every N*N
+ * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
+ * and additions which unavoidably results in severe performance
+ * penalties. Of course provided that the hardware is capable of
+ * producing 2*N result... That's when you normally start
+ * considering assembler implementation. However! It should be
+ * pointed out that some CPUs (most notably Alpha, PowerPC and
+ * upcoming IA-64 family:-) provide *separate* instruction
+ * calculating the upper half of the product placing the result
+ * into a general purpose register. Now *if* the compiler supports
+ * inline assembler, then it's not impossible to implement the
+ * "bignum" routines (and have the compiler optimize 'em)
+ * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
+ * macro is about:-)
+ *
+ *					<appro@fy.chalmers.se>
+ */
+# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+#  if defined(__DECC)
+#   include <c_asm.h>
+#   define BN_UMULT_HIGH(a,b)	(BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
+#  elif defined(__GNUC__)
+#   define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret;		\
+	asm ("umulh	%1,%2,%0"	\
+	     : "=r"(ret)		\
+	     : "r"(a), "r"(b));		\
+	ret;			})
+#  endif	/* compiler */
+# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
+#  if defined(__GNUC__)
+#   define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret;		\
+	asm ("mulhdu	%0,%1,%2"	\
+	     : "=r"(ret)		\
+	     : "r"(a), "r"(b));		\
+	ret;			})
+#  endif	/* compiler */
+# endif		/* cpu */
+#endif		/* NO_ASM */
+
 /*************************************************************
  * Using the long long type
  */
@@ -151,6 +199,43 @@ extern "C" {
 	(c)= Hw(t); \
 	}
 
+#define sqr(r0,r1,a) { \
+	BN_ULLONG t; \
+	t=(BN_ULLONG)(a)*(a); \
+	(r0)=Lw(t); \
+	(r1)=Hw(t); ]
+	}
+
+#elif defined(BN_UMULT_HIGH)
+#define mul_add(r,a,w,c) {		\
+	BN_ULONG high,low,ret,tmp=(a);	\
+	ret =  (r);			\
+	high=  BN_UMULT_HIGH(w,tmp);	\
+	ret += (c);			\
+	low =  (w) * tmp;		\
+	(c) =  (ret<(c))?1:0;		\
+	(c) += high;			\
+	ret += low;			\
+	(c) += (ret<low)?1:0;		\
+	(r) =  ret;			\
+	}
+
+#define mul(r,a,w,c)	{		\
+	BN_ULONG high,low,ret,ta=(a);	\
+	low =  (w) * ta;		\
+	high=  BN_UMULT_HIGH(w,ta);	\
+	ret =  low + (c);		\
+	(c) =  high;			\
+	(c) += (ret<low)?1:0;		\
+	(r) =  ret;			\
+	}
+
+#define sqr(r0,r1,a)	{		\
+	BN_ULONG tmp=(a);		\
+	(r0) = tmp * tmp;		\
+	(r1) = BN_UMULT_HIGH(tmp,tmp);	\
+	}
+
 #else
 /*************************************************************
  * No long long type
author	Andy Polyakov <appro@openssl.org>	2000-02-02 16:18:12 +0000
committer	Andy Polyakov <appro@openssl.org>	2000-02-02 16:18:12 +0000
commit	fb81ac5e6be4041e59df9362cd63880b21e2cafc (patch)
tree	ca35a62e7e39a135465937ef7b37bcab9d15add2 /crypto
parent	54a34aecc38b95115704565ac77a2715688be400 (diff)