summaryrefslogtreecommitdiffstats
path: root/crypto/bn/bn_asm.c
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2000-02-02 16:18:12 +0000
committerAndy Polyakov <appro@openssl.org>2000-02-02 16:18:12 +0000
commitfb81ac5e6be4041e59df9362cd63880b21e2cafc (patch)
treeca35a62e7e39a135465937ef7b37bcab9d15add2 /crypto/bn/bn_asm.c
parent54a34aecc38b95115704565ac77a2715688be400 (diff)
Support for "multiply high" instruction, see BN_UMULT_HIGH comment in
crypto/bn/bn_lcl.h for further details. It should be noted that for the moment of this writing the code was tested only on Alpha. If compiled with DEC C the C implementation exhibits 12% performance improvement over the crypto/bn/asm/alpha.s (on EV56 box running AlphaLinux). GNU C is (unfortunately) 8% behind the assembler implementation. But it's OpenVMS Alpha users who *may* benefit most as 'apps/openssl speed rsa' exhibits 6 (six) times performance improvement over the original VMS bignum implementation. Where "*may*" means "as soon as code is enabled though #define SIXTY_FOUR_BIT and crypto/bn/asm/vms.mar is skipped."
Diffstat (limited to 'crypto/bn/bn_asm.c')
-rw-r--r--crypto/bn/bn_asm.c98
1 files changed, 61 insertions, 37 deletions
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c
index 4d3da16a0c..a62fdcf07a 100644
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@@ -60,7 +60,7 @@
#include "cryptlib.h"
#include "bn_lcl.h"
-#ifdef BN_LLONG
+#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
{
@@ -69,18 +69,19 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
bn_check_num(num);
if (num <= 0) return(c1);
- for (;;)
+ while (num&~3)
{
mul_add(rp[0],ap[0],w,c1);
- if (--num == 0) break;
mul_add(rp[1],ap[1],w,c1);
- if (--num == 0) break;
mul_add(rp[2],ap[2],w,c1);
- if (--num == 0) break;
mul_add(rp[3],ap[3],w,c1);
- if (--num == 0) break;
- ap+=4;
- rp+=4;
+ ap+=4; rp+=4; num-=4;
+ }
+ if (num)
+ {
+ mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
+ mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
+ mul_add(rp[2],ap[2],w,c1); return c1;
}
return(c1);
@@ -93,19 +94,19 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
bn_check_num(num);
if (num <= 0) return(c1);
- /* for (;;) */
- while (1) /* circumvent egcs-1.1.2 bug */
+ while (num&~3)
{
mul(rp[0],ap[0],w,c1);
- if (--num == 0) break;
mul(rp[1],ap[1],w,c1);
- if (--num == 0) break;
mul(rp[2],ap[2],w,c1);
- if (--num == 0) break;
mul(rp[3],ap[3],w,c1);
- if (--num == 0) break;
- ap+=4;
- rp+=4;
+ ap+=4; rp+=4; num-=4;
+ }
+ if (num)
+ {
+ mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
+ mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
+ mul(rp[2],ap[2],w,c1);
}
return(c1);
}
@@ -114,28 +115,19 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
{
bn_check_num(n);
if (n <= 0) return;
- for (;;)
+ while (n&~3)
{
- BN_ULLONG t;
-
- t=(BN_ULLONG)(a[0])*(a[0]);
- r[0]=Lw(t); r[1]=Hw(t);
- if (--n == 0) break;
-
- t=(BN_ULLONG)(a[1])*(a[1]);
- r[2]=Lw(t); r[3]=Hw(t);
- if (--n == 0) break;
-
- t=(BN_ULLONG)(a[2])*(a[2]);
- r[4]=Lw(t); r[5]=Hw(t);
- if (--n == 0) break;
-
- t=(BN_ULLONG)(a[3])*(a[3]);
- r[6]=Lw(t); r[7]=Hw(t);
- if (--n == 0) break;
-
- a+=4;
- r+=8;
+ sqr(r[0],r[1],a[0]);
+ sqr(r[2],r[3],a[1]);
+ sqr(r[4],r[5],a[2]);
+ sqr(r[6],r[7],a[3]);
+ a+=4; r+=8; n-=4;
+ }
+ if (n)
+ {
+ sqr(r[0],r[1],a[0]); if (--n == 0) return;
+ sqr(r[2],r[3],a[1]); if (--n == 0) return;
+ sqr(r[4],r[5],a[2]);
}
}
@@ -460,6 +452,38 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
+#elif defined(BN_UMULT_HIGH)
+
+#define mul_add_c(a,b,c0,c1,c2) { \
+ BN_ULONG ta=(a),tb=(b); \
+ t1 = ta * tb; \
+ t2 = BN_UMULT_HIGH(ta,tb); \
+ c0 += t1; t2 += (c0<t1)?1:0; \
+ c1 += t2; c2 += (c1<t2)?1:0; \
+ }
+
+#define mul_add_c2(a,b,c0,c1,c2) { \
+ BN_ULONG ta=(a),tb=(b),t0; \
+ t1 = BN_UMULT_HIGH(ta,tb); \
+ t0 = ta * tb; \
+ t2 = t1+t1; c2 += (t2<t1)?1:0; \
+ t1 = t0+t0; t2 += (t1<t0)?1:0; \
+ c0 += t1; t2 += (c0<t1)?1:0; \
+ c1 += t2; c2 += (c1<t2)?1:0; \
+ }
+
+#define sqr_add_c(a,i,c0,c1,c2) { \
+ BN_ULONG ta=(a)[i]; \
+ t1 = ta * ta; \
+ t2 = BN_UMULT_HIGH(ta,ta); \
+ c0 += t1; t2 += (c0<t1)?1:0; \
+ c1 += t2; c2 += (c1<t2)?1:0; \
+ }
+
+#define sqr_add_c2(a,i,j,c0,c1,c2) \
+ mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
#else
#define mul_add_c(a,b,c0,c1,c2) \
t1=LBITS(a); t2=HBITS(a); \