summaryrefslogtreecommitdiffstats
path: root/crypto
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2011-11-14 21:05:42 +0000
committerAndy Polyakov <appro@openssl.org>2011-11-14 21:05:42 +0000
commit9f1c5491d27465adf2c26fb94e309355da7cbc95 (patch)
tree9a118234f2aec799c636849d9ca5d06027ac1337 /crypto
parent70b52222f53c5cbc81b5d061c09d7f15196a46e1 (diff)
BN update from HEAD.
Diffstat (limited to 'crypto')
-rw-r--r--crypto/bn/bn_div.c272
-rw-r--r--crypto/bn/bn_exp.c240
-rw-r--r--crypto/bn/bn_gf2m.c106
-rw-r--r--crypto/bn/bn_lcl.h19
-rw-r--r--crypto/bn/bn_mont.c116
-rw-r--r--crypto/bn/bn_nist.c338
-rw-r--r--crypto/bn/bn_shift.c27
7 files changed, 654 insertions, 464 deletions
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 802a43d642..52b3304293 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -169,15 +169,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
#endif /* OPENSSL_NO_ASM */
-/* BN_div[_no_branch] computes dv := num / divisor, rounding towards
+/* BN_div computes dv := num / divisor, rounding towards
* zero, and sets up rm such that dv*divisor + rm = num holds.
* Thus:
* dv->neg == num->neg ^ divisor->neg (unless the result is zero)
* rm->neg == num->neg (unless the remainder is zero)
* If 'dv' or 'rm' is NULL, the respective value is not returned.
*/
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx);
int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_CTX *ctx)
{
@@ -186,6 +184,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_ULONG *resp,*wnump;
BN_ULONG d0,d1;
int num_n,div_n;
+ int no_branch=0;
/* Invalid zero-padding would have particularly bad consequences
* in the case of 'num', so don't just rely on bn_check_top() for this one
@@ -200,7 +199,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
{
- return BN_div_no_branch(dv, rm, num, divisor, ctx);
+ no_branch=1;
}
bn_check_top(dv);
@@ -214,7 +213,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
return(0);
}
- if (BN_ucmp(num,divisor) < 0)
+ if (!no_branch && BN_ucmp(num,divisor) < 0)
{
if (rm != NULL)
{ if (BN_copy(rm,num) == NULL) return(0); }
@@ -239,242 +238,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
norm_shift+=BN_BITS2;
if (!(BN_lshift(snum,num,norm_shift))) goto err;
snum->neg=0;
- div_n=sdiv->top;
- num_n=snum->top;
- loop=num_n-div_n;
- /* Lets setup a 'window' into snum
- * This is the part that corresponds to the current
- * 'area' being divided */
- wnum.neg = 0;
- wnum.d = &(snum->d[loop]);
- wnum.top = div_n;
- /* only needed when BN_ucmp messes up the values between top and max */
- wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
-
- /* Get the top 2 words of sdiv */
- /* div_n=sdiv->top; */
- d0=sdiv->d[div_n-1];
- d1=(div_n == 1)?0:sdiv->d[div_n-2];
-
- /* pointer to the 'top' of snum */
- wnump= &(snum->d[num_n-1]);
-
- /* Setup to 'res' */
- res->neg= (num->neg^divisor->neg);
- if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop;
- resp= &(res->d[loop-1]);
-
- /* space for temp */
- if (!bn_wexpand(tmp,(div_n+1))) goto err;
- if (BN_ucmp(&wnum,sdiv) >= 0)
+ if (no_branch)
{
- /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
- * bn_pollute) the const bignum arguments =>
- * clean the values between top and max again */
- bn_clear_top2max(&wnum);
- bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
- *resp=1;
- }
- else
- res->top--;
- /* if res->top == 0 then clear the neg value otherwise decrease
- * the resp pointer */
- if (res->top == 0)
- res->neg = 0;
- else
- resp--;
-
- for (i=0; i<loop-1; i++, wnump--, resp--)
- {
- BN_ULONG q,l0;
- /* the first part of the loop uses the top two words of
- * snum and sdiv to calculate a BN_ULONG q such that
- * | wnum - sdiv * q | < sdiv */
-#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
- BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
- q=bn_div_3_words(wnump,d1,d0);
-#else
- BN_ULONG n0,n1,rem=0;
-
- n0=wnump[0];
- n1=wnump[-1];
- if (n0 == d0)
- q=BN_MASK2;
- else /* n0 < d0 */
- {
-#ifdef BN_LLONG
- BN_ULLONG t2;
-
-#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
- q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
-#else
- q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
- fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- /*
- * rem doesn't have to be BN_ULLONG. The least we
- * know it's less that d0, isn't it?
- */
- rem=(n1-q*d0)&BN_MASK2;
-#endif
- t2=(BN_ULLONG)d1*q;
-
- for (;;)
- {
- if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
- break;
- q--;
- rem += d0;
- if (rem < d0) break; /* don't let rem overflow */
- t2 -= d1;
- }
-#else /* !BN_LLONG */
- BN_ULONG t2l,t2h;
-
- q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
- fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- rem=(n1-q*d0)&BN_MASK2;
-#endif
-
-#if defined(BN_UMULT_LOHI)
- BN_UMULT_LOHI(t2l,t2h,d1,q);
-#elif defined(BN_UMULT_HIGH)
- t2l = d1 * q;
- t2h = BN_UMULT_HIGH(d1,q);
-#else
+ /* Since we don't know whether snum is larger than sdiv,
+ * we pad snum with enough zeroes without changing its
+ * value.
+ */
+ if (snum->top <= sdiv->top+1)
{
- BN_ULONG ql, qh;
- t2l=LBITS(d1); t2h=HBITS(d1);
- ql =LBITS(q); qh =HBITS(q);
- mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+ if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+ for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+ snum->top = sdiv->top + 2;
}
-#endif
-
- for (;;)
- {
- if ((t2h < rem) ||
- ((t2h == rem) && (t2l <= wnump[-2])))
- break;
- q--;
- rem += d0;
- if (rem < d0) break; /* don't let rem overflow */
- if (t2l < d1) t2h--; t2l -= d1;
- }
-#endif /* !BN_LLONG */
- }
-#endif /* !BN_DIV3W */
-
- l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
- tmp->d[div_n]=l0;
- wnum.d--;
- /* ingore top values of the bignums just sub the two
- * BN_ULONG arrays with bn_sub_words */
- if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+ else
{
- /* Note: As we have considered only the leading
- * two BN_ULONGs in the calculation of q, sdiv * q
- * might be greater than wnum (but then (q-1) * sdiv
- * is less or equal than wnum)
- */
- q--;
- if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
- /* we can't have an overflow here (assuming
- * that q != 0, but if q == 0 then tmp is
- * zero anyway) */
- (*wnump)++;
+ if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+ snum->d[snum->top] = 0;
+ snum->top ++;
}
- /* store part of the result */
- *resp = q;
- }
- bn_correct_top(snum);
- if (rm != NULL)
- {
- /* Keep a copy of the neg flag in num because if rm==num
- * BN_rshift() will overwrite it.
- */
- int neg = num->neg;
- BN_rshift(rm,snum,norm_shift);
- if (!BN_is_zero(rm))
- rm->neg = neg;
- bn_check_top(rm);
- }
- BN_CTX_end(ctx);
- return(1);
-err:
- bn_check_top(rm);
- BN_CTX_end(ctx);
- return(0);
- }
-
-
-/* BN_div_no_branch is a special version of BN_div. It does not contain
- * branches that may leak sensitive information.
- */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx)
- {
- int norm_shift,i,loop;
- BIGNUM *tmp,wnum,*snum,*sdiv,*res;
- BN_ULONG *resp,*wnump;
- BN_ULONG d0,d1;
- int num_n,div_n;
-
- bn_check_top(dv);
- bn_check_top(rm);
- /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
- bn_check_top(divisor);
-
- if (BN_is_zero(divisor))
- {
- BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
- return(0);
- }
-
- BN_CTX_start(ctx);
- tmp=BN_CTX_get(ctx);
- snum=BN_CTX_get(ctx);
- sdiv=BN_CTX_get(ctx);
- if (dv == NULL)
- res=BN_CTX_get(ctx);
- else res=dv;
- if (sdiv == NULL || res == NULL) goto err;
-
- /* First we normalise the numbers */
- norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
- if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
- sdiv->neg=0;
- norm_shift+=BN_BITS2;
- if (!(BN_lshift(snum,num,norm_shift))) goto err;
- snum->neg=0;
-
- /* Since we don't know whether snum is larger than sdiv,
- * we pad snum with enough zeroes without changing its
- * value.
- */
- if (snum->top <= sdiv->top+1)
- {
- if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
- for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
- snum->top = sdiv->top + 2;
- }
- else
- {
- if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
- snum->d[snum->top] = 0;
- snum->top ++;
}
div_n=sdiv->top;
@@ -500,12 +282,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
/* Setup to 'res' */
res->neg= (num->neg^divisor->neg);
if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop-1;
+ res->top=loop-no_branch;
resp= &(res->d[loop-1]);
/* space for temp */
if (!bn_wexpand(tmp,(div_n+1))) goto err;
+ if (!no_branch)
+ {
+ if (BN_ucmp(&wnum,sdiv) >= 0)
+ {
+ /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
+ * bn_pollute) the const bignum arguments =>
+ * clean the values between top and max again */
+ bn_clear_top2max(&wnum);
+ bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+ *resp=1;
+ }
+ else
+ res->top--;
+ }
+
/* if res->top == 0 then clear the neg value otherwise decrease
* the resp pointer */
if (res->top == 0)
@@ -638,7 +435,7 @@ X) -> 0x%08X\n",
rm->neg = neg;
bn_check_top(rm);
}
- bn_correct_top(res);
+ if (no_branch) bn_correct_top(res);
BN_CTX_end(ctx);
return(1);
err:
@@ -646,5 +443,4 @@ err:
BN_CTX_end(ctx);
return(0);
}
-
#endif
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index d9b6c737fc..2abf6fd678 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -113,6 +113,18 @@
#include "cryptlib.h"
#include "bn_lcl.h"
+#include <stdlib.h>
+#ifdef _WIN32
+# include <malloc.h>
+# ifndef alloca
+# define alloca _alloca
+# endif
+#elif defined(__GNUC__)
+# ifndef alloca
+# define alloca(s) __builtin_alloca((s))
+# endif
+#endif
+
/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE 32
@@ -522,23 +534,17 @@ err:
* as cache lines are concerned. The following functions are used to transfer a BIGNUM
* from/to that table. */
-static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
{
size_t i, j;
- if (bn_wexpand(b, top) == NULL)
- return 0;
- while (b->top < top)
- {
- b->d[b->top++] = 0;
- }
-
+ if (top > b->top)
+ top = b->top; /* this works because 'buf' is explicitly zeroed */
for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
{
buf[j] = ((unsigned char*)b->d)[i];
}
- bn_correct_top(b);
return 1;
}
@@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
/* Given a pointer value, compute the next address that is a cache line multiple. */
#define MOD_EXP_CTIME_ALIGN(x_) \
- ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+ ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
/* This variant of BN_mod_exp_mont() uses fixed windows and the special
* precomputation memory layout to limit data-dependency to a minimum
@@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
- int i,bits,ret=0,idx,window,wvalue;
+ int i,bits,ret=0,window,wvalue;
int top;
- BIGNUM *r;
- const BIGNUM *aa;
BN_MONT_CTX *mont=NULL;
int numPowers;
unsigned char *powerbufFree=NULL;
int powerbufLen = 0;
unsigned char *powerbuf=NULL;
- BIGNUM *computeTemp=NULL, *am=NULL;
+ BIGNUM tmp, am;
bn_check_top(a);
bn_check_top(p);
@@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
return ret;
}
- /* Initialize BIGNUM context and allocate intermediate result */
BN_CTX_start(ctx);
- r = BN_CTX_get(ctx);
- if (r == NULL) goto err;
/* Allocate a montgomery context if it was not supplied by the caller.
* If this is not done, things will break in the montgomery part.
@@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/* Get the window size to use with size of p. */
window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+ if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
+#endif
/* Allocate a buffer large enough to hold all of the pre-computed
- * powers of a.
+ * powers of am, am itself and tmp.
*/
numPowers = 1 << window;
- powerbufLen = sizeof(m->d[0])*top*numPowers;
+ powerbufLen = sizeof(m->d[0])*(top*numPowers +
+ ((2*top)>numPowers?(2*top):numPowers));
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+ else
+#endif
if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
goto err;
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
memset(powerbuf, 0, powerbufLen);
- /* Initialize the intermediate result. Do this early to save double conversion,
- * once each for a^0 and intermediate result.
- */
- if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = NULL;
+#endif
- /* Initialize computeTemp as a^1 with montgomery precalcs */
- computeTemp = BN_CTX_get(ctx);
- am = BN_CTX_get(ctx);
- if (computeTemp==NULL || am==NULL) goto err;
+ /* lay down tmp and am right after powers table */
+ tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
+ am.d = tmp.d + top;
+ tmp.top = am.top = 0;
+ tmp.dmax = am.dmax = top;
+ tmp.neg = am.neg = 0;
+ tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+ /* prepare a^0 in Montgomery domain */
+#if 1
+ if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
+#else
+ tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
+ for (i=1;i<top;i++)
+ tmp.d[i] = (~m->d[i])&BN_MASK2;
+ tmp.top = top;
+#endif
+ /* prepare a^1 in Montgomery domain */
if (a->neg || BN_ucmp(a,m) >= 0)
{
- if (!BN_mod(am,a,m,ctx))
- goto err;
- aa= am;
+ if (!BN_mod(&am,a,m,ctx)) goto err;
+ if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err;
}
- else
- aa=a;
- if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
- if (!BN_copy(computeTemp, am)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+ else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+ /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+ * specifically optimization of cache-timing attack countermeasures
+ * and pre-computation optimization. */
+
+ /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+ * 512-bit RSA is hardly relevant, we omit it to spare size... */
+ if (window==5)
+ {
+ void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
+ const void *table,const BN_ULONG *np,
+ const BN_ULONG *n0,int num,int power);
+ void bn_scatter5(const BN_ULONG *inp,size_t num,
+ void *table,size_t power);
+ void bn_gather5(BN_ULONG *out,size_t num,
+ void *table,size_t power);
+
+ BN_ULONG *np=mont->N.d, *n0=mont->n0;
+
+ /* BN_to_montgomery can contaminate words above .top
+ * [in BN_DEBUG[_DEBUG] build]... */
+ for (i=am.top; i<top; i++) am.d[i]=0;
+ for (i=tmp.top; i<top; i++) tmp.d[i]=0;
+
+ bn_scatter5(tmp.d,top,powerbuf,0);
+ bn_scatter5(am.d,am.top,powerbuf,1);
+ bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2);
+
+#if 0
+ for (i=3; i<32; i++)
+ {
+ /* Calculate a^i = a^(i-1) * a */
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#else
+ /* same as above, but uses squaring for 1/2 of operations */
+ for (i=4; i<32; i*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+ for (i=3; i<8; i+=2)
+ {
+ int j;
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ for (j=2*i; j<32; j*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,j);
+ }
+ }
+ for (; i<16; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2*i);
+ }
+ for (; i<32; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#endif
+ bits--;
+ for (wvalue=0, i=bits%5; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ bn_gather5(tmp.d,top,powerbuf,wvalue);
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
+ {
+ for (wvalue=0, i=0; i<5; i++,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
+ }
+
+ tmp.top=top;
+ bn_correct_top(&tmp);
+ }
+ else
+#endif
+ {
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err;
/* If the window size is greater than 1, then calculate
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
@@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
*/
if (window > 1)
{
- for (i=2; i<numPowers; i++)
+ if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err;
+ for (i=3; i<numPowers; i++)
{
/* Calculate a^i = a^(i-1) * a */
- if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
+ if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
}
}
- /* Adjust the number of bits up to a multiple of the window size.
- * If the exponent length is not a multiple of the window size, then
- * this pads the most significant bits with zeros to normalize the
- * scanning loop to there's no special cases.
- *
- * * NOTE: Making the window size a power of two less than the native
- * * word size ensures that the padded bits won't go past the last
- * * word in the internal BIGNUM structure. Going past the end will
- * * still produce the correct result, but causes a different branch
- * * to be taken in the BN_is_bit_set function.
- */
- bits = ((bits+window-1)/window)*window;
- idx=bits-1; /* The top bit of the window */
-
- /* Scan the exponent one window at a time starting from the most
- * significant bits.
- */
- while (idx >= 0)
+ bits--;
+ for (wvalue=0, i=bits%window; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
{
wvalue=0; /* The 'value' of the window */
/* Scan the window, squaring the result as we go */
- for (i=0; i<window; i++,idx--)
+ for (i=0; i<window; i++,bits--)
{
- if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) goto err;
- wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
+ if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx)) goto err;
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
}
/* Fetch the appropriate pre-computed value from the pre-buf */
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err;
/* Multiply the result into the intermediate result */
- if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
+ if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err;
}
+ }
/* Convert the final result from montgomery to standard format */
- if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+ if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
ret=1;
err:
if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
if (powerbuf!=NULL)
{
OPENSSL_cleanse(powerbuf,powerbufLen);
- OPENSSL_free(powerbufFree);
+ if (powerbufFree) OPENSSL_free(powerbufFree);
}
- if (am!=NULL) BN_clear(am);
- if (computeTemp!=NULL) BN_clear(computeTemp);
BN_CTX_end(ctx);
return(ret);
}
@@ -988,4 +1095,3 @@ err:
bn_check_top(r);
return(ret);
}
-
diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index 573156accb..ffe1b096ba 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c
@@ -124,6 +124,7 @@ static const BN_ULONG SQR_tb[16] =
SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
#endif
+#if !defined(OPENSSL_BN_ASM_GF2m)
/* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
* result is a polynomial r with degree < 2 * BN_BITS - 1
* The caller MUST ensure that the variables have the right amount
@@ -218,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c
r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */
r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */
}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif
/* Add polynomials a and b and store result in r; r could be a or b, a and b
* could be equal; r is the bitwise XOR of a and b.
@@ -362,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
{
int ret = 0;
- const int max = BN_num_bits(p) + 1;
- int *arr=NULL;
+ int arr[6];
bn_check_top(a);
bn_check_top(p);
- if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
- ret = BN_GF2m_poly2arr(p, arr, max);
- if (!ret || ret > max)
+ ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+ if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
{
BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
- goto err;
+ return 0;
}
ret = BN_GF2m_mod_arr(r, a, arr);
bn_check_top(r);
-err:
- if (arr) OPENSSL_free(arr);
return ret;
}
@@ -531,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
BN_CTX_start(ctx);
- b = BN_CTX_get(ctx);
- c = BN_CTX_get(ctx);
- u = BN_CTX_get(ctx);
- v = BN_CTX_get(ctx);
- if (v == NULL) goto err;
+ if ((b = BN_CTX_get(ctx))==NULL) goto err;
+ if ((c = BN_CTX_get(ctx))==NULL) goto err;
+ if ((u = BN_CTX_get(ctx))==NULL) goto err;
+ if ((v = BN_CTX_get(ctx))==NULL) goto err;
- if (!BN_one(b)) goto err;
if (!BN_GF2m_mod(u, a, p)) goto err;
- if (!BN_copy(v, p)) goto err;
-
if (BN_is_zero(u)) goto err;
+ if (!BN_copy(v, p)) goto err;
+#if 0
+ if (!BN_one(b)) goto err;
+
while (1)
{
while (!BN_is_odd(u))
@@ -567,13 +566,86 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
if (!BN_GF2m_add(u, u, v)) goto err;
if (!BN_GF2m_add(b, b, c)) goto err;
}
+#else
+ {
+ int i, ubits = BN_num_bits(u),
+ vbits = BN_num_bits(v), /* v is copy of p */
+ top = p->top;
+ BN_ULONG *udp,*bdp,*vdp,*cdp;
+
+ bn_wexpand(u,top); udp = u->d;
+ for (i=u->top;i<top;i++) udp[i] = 0;
+ u->top = top;
+ bn_wexpand(b,top); bdp = b->d;
+ bdp[0] = 1;
+ for (i=1;i<top;i++) bdp[i] = 0;
+ b->top = top;
+ bn_wexpand(c,top); cdp = c->d;
+ for (i=0;i<top;i++) cdp[i] = 0;
+ c->top = top;
+ vdp = v->d; /* It pays off to "cache" *->d pointers, because
+ * it allows optimizer to be more aggressive.
+ * But we don't have to "cache" p->d, because *p
+ * is declared 'const'... */
+ while (1)
+ {
+ while (ubits && !(udp[0]&1))
+ {
+ BN_ULONG u0,u1,b0,b1,mask;
+ u0 = udp[0];
+ b0 = bdp[0];
+ mask = (BN_ULONG)0-(b0&1);
+ b0 ^= p->d[0]&mask;
+ for (i=0;i<top-1;i++)
+ {
+ u1 = udp[i+1];
+ udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+ u0 = u1;
+ b1 = bdp[i+1]^(p->d[i+1]&mask);
+ bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+ b0 = b1;
+ }
+ udp[i] = u0>>1;
+ bdp[i] = b0>>1;
+ ubits--;
+ }
+
+ if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+ if (ubits<vbits)
+ {
+ i = ubits; ubits = vbits; vbits = i;
+ tmp = u; u = v; v = tmp;
+ tmp = b; b = c; c = tmp;
+ udp = vdp; vdp = v->d;
+ bdp = cdp; cdp = c->d;
+ }
+ for(i=0;i<top;i++)
+ {
+ udp[i] ^= vdp[i];
+ bdp[i] ^= cdp[i];
+ }
+ if (ubits==vbits)
+ {
+ bn_correct_top(u);
+ ubits = BN_num_bits(u);
+ }
+ }
+ bn_correct_top(b);
+ }
+#endif
if (!BN_copy(r, b)) goto err;
bn_check_top(r);
ret = 1;
err:
+#ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */
+ bn_correct_top(c);
+ bn_correct_top(u);
+ bn_correct_top(v);
+#endif
BN_CTX_end(ctx);
return ret;
}
diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
index 8e5e98e3f2..d7dff0d90c 100644
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
@@ -238,7 +238,7 @@ extern "C" {
# if defined(__DECC)
# include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-# elif defined(__GNUC__)
+# elif defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
@@ -247,7 +247,7 @@ extern "C" {
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
@@ -257,7 +257,7 @@ extern "C" {
# endif /* compiler */
# elif (defined(__x86_64) || defined(__x86_64__)) && \
(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret,discard; \
asm ("mulq %3" \
@@ -280,6 +280,19 @@ extern "C" {
# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
# endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+# if defined(__GNUC__) && __GNUC__>=2
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret; \
+ asm ("dmultu %1,%2" \
+ : "=h"(ret) \
+ : "r"(a), "r"(b) : "l"); \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b) \
+ asm ("dmultu %2,%3" \
+ : "=l"(low),"=h"(high) \
+ : "r"(a), "r"(b));
+# endif
# endif /* cpu */
#endif /* OPENSSL_NO_ASM */
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index 1a866880f5..427b5cf4df 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -177,31 +177,26 @@ err:
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
{
BIGNUM *n;
- BN_ULONG *ap,*np,*rp,n0,v,*nrp;
- int al,nl,max,i,x,ri;
+ BN_ULONG *ap,*np,*rp,n0,v,carry;
+ int nl,max,i;
n= &(mont->N);
- /* mont->ri is the size of mont->N in bits (rounded up
- to the word size) */
- al=ri=mont->ri/BN_BITS2;
-
nl=n->top;
- if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+ if (nl == 0) { ret->top=0; return(1); }
- max=(nl+al+1); /* allow for overflow (no?) XXX */
+ max=(2*nl); /* carry is stored separately */
if (bn_wexpand(r,max) == NULL) return(0);
r->neg^=n->neg;
np=n->d;
rp=r->d;
- nrp= &(r->d[nl]);
/* clear the top words of T */
#if 1
for (i=r->top; i<max; i++) /* memset? XXX */
- r->d[i]=0;
+ rp[i]=0;
#else
- memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
+ memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
#endif
r->top=max;
@@ -210,7 +205,7 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
#ifdef BN_COUNT
fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
#endif
- for (i=0; i<nl; i++)
+ for (carry=0, i=0; i<nl; i++, rp++)
{
#ifdef __TANDEM
{
@@ -228,61 +223,33 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
#else
v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
#endif
- nrp++;
- rp++;
- if (((nrp[-1]+=v)&BN_MASK2) >= v)
- continue;
- else
- {
- if (((++nrp[0])&BN_MASK2) != 0) continue;
- if (((++nrp[1])&BN_MASK2) != 0) continue;
- for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
- }
- }
- bn_correct_top(r);
-
- /* mont->ri will be a multiple of the word size and below code
- * is kind of BN_rshift(ret,r,mont->ri) equivalent */
- if (r->top <= ri)
- {
- ret->top=0;
- return(1);
+ v = (v+carry+rp[nl])&BN_MASK2;
+ carry |= (v != rp[nl]);
+ carry &= (v <= rp[nl]);
+ rp[nl]=v;
}
- al=r->top-ri;
-#define BRANCH_FREE 1
-#if BRANCH_FREE
- if (bn_wexpand(ret,ri) == NULL) return(0);
- x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
- ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */
+ if (bn_wexpand(ret,nl) == NULL) return(0);
+ ret->top=nl;
ret->neg=r->neg;
rp=ret->d;
- ap=&(r->d[ri]);
+ ap=&(r->d[nl]);
+#define BRANCH_FREE 1
+#if BRANCH_FREE
{
- size_t m1,m2;
-
- v=bn_sub_words(rp,ap,np,ri);
- /* this ----------------^^ works even in al<ri case
- * thanks to zealous zeroing of top of the vector in the
- * beginning. */
+ BN_ULONG *nrp;
+ size_t m;
- /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
- /* in other words if subtraction result is real, then
+ v=bn_sub_words(rp,ap,np,nl)-carry;
+ /* if subtraction result is real, then
* trick unconditional memcpy below to perform in-place
* "refresh" instead of actual copy. */
- m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
- m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
- m1|=m2; /* (al!=ri) */
- m1|=(0-(size_t)v); /* (al!=ri || v) */
- m1&=~m2; /* (al!=ri || v) && !al>ri */
- nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1));
- }
+ m=(0-(size_t)v);
+ nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));