path: root/crypto/bn/bn_asm.c
author     Andy Polyakov <appro@openssl.org>  2005-10-04 06:19:29 +0000
committer  Andy Polyakov <appro@openssl.org>  2005-10-04 06:19:29 +0000
commit     e738280547e0f7e3cc5756a92ce3c926eb7736ce (patch)
tree       7f7eca3a84bb0f98d1970523ac9b32f9b1616295 /crypto/bn/bn_asm.c
parent     8265328def2eec75a8afda71b38195a342aeecb2 (diff)
Add a reference implementation for bn_[mul|sqr]_mont, new candidates
for assembler implementation.
Diffstat (limited to 'crypto/bn/bn_asm.c')
-rw-r--r--  crypto/bn/bn_asm.c | 126
1 file changed, 124 insertions(+), 2 deletions(-)
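
Background: bn_mul_mont computes a Montgomery product. Assuming OpenSSL's
usual convention that n0 is the low word of -np^-1 mod R, precomputed in
BN_MONT_CTX, the function returns

    rp = ap * bp * R^-1 mod np,   where R = 2^(num*BN_BITS2)

so a chain of multiplications can stay in Montgomery form and avoid a
division by np at every step.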
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c
index 99bc2de491..52af96d36b 100644
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@@ -820,18 +820,95 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
r[6]=c1;
r[7]=c2;
}
+
+#ifdef OPENSSL_BN_ASM_MONT
+/*
+ * This is essentially a reference implementation, which may or may not
+ * result in a performance improvement. E.g. on IA-32 it does give 40%
+ * faster rsa1024 private key operations and 10% faster rsa4096 ones,
+ * while on AMD64 it improves rsa1024 sign by only 10%, and *worsens*
+ * rsa4096 sign by 15%. Once again, it's a reference implementation,
+ * one to be used as a starting point for platform-specific assembler.
+ */
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, BN_ULONG n0, int num)
+ {
+ BN_ULONG c0,c1,ml,*tp;
+#ifdef mul64
+ BN_ULONG mh;
+#endif
+ volatile BN_ULONG *vp;
+ int i=0,j;
+
+ vp = tp = alloca((num+2)*sizeof(BN_ULONG));
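+ /* num+2 words: num product words plus a carry word and an overflow
+ * word; the volatile alias vp keeps the final wipe of this stack
+ * buffer from being optimized away */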
+
+ tp[num] = bn_mul_words(tp,ap,num,bp[0]);
+ tp[num+1] = 0;
+ goto enter;
+
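+ /* each pass accumulates one row ap*bp[i], then performs one
+ * Montgomery reduction step; the i=0 row was computed above, so
+ * the loop is entered past it */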
+ for(i=0;i<num;i++)
+ {
+ c0 = bn_mul_add_words(tp,ap,num,bp[i]);
+ c1 = (tp[num] + c0)&BN_MASK2;
+ tp[num] = c1;
+ tp[num+1] = (c1<c0?1:0);
+ enter:
+ c1 = tp[0];
+ ml = (c1*n0)&BN_MASK2;
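+ /* n0 == -np[0]^-1 mod 2^BN_BITS2, so ml*np[0] == -tp[0]
+ * mod 2^BN_BITS2 and the low word cancels below */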
+ c0 = 0;
+#ifdef mul64
+ mh = HBITS(ml);
+ ml = LBITS(ml);
+ mul_add(c1,np[0],ml,mh,c0);
+#else
+ mul_add(c1,ml,np[0],c0);
+#endif
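+ /* tp[0] has been reduced to 0 mod 2^BN_BITS2 and is dropped;
+ * storing to tp[j-1] shifts the rest down one word, i.e. divides
+ * by 2^BN_BITS2 */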
+ for(j=1;j<num;j++)
+ {
+ c1 = tp[j];
+#ifdef mul64
+ mul_add(c1,np[j],ml,mh,c0);
+#else
+ mul_add(c1,ml,np[j],c0);
+#endif
+ tp[j-1] = c1&BN_MASK2;
+ }
+ c1 = (tp[num] + c0)&BN_MASK2;
+ tp[num-1] = c1;
+ tp[num] = tp[num+1] + (c1<c0?1:0);
+ }
+
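+ /* the result may still be >= np; subtract np once if the quick
+ * top-word test suggests so. A borrow with no carry word set means
+ * tp was in fact < np, and the subtracted value is discarded. */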
+ if (tp[num]!=0 || tp[num-1]>=np[num-1])
+ {
+ c0 = bn_sub_words(rp,tp,np,num);
+ if (tp[num]!=0 || c0==0)
+ {
+ for(i=0;i<num+2;i++) vp[i] = 0;
+ return;
+ }
+ }
+ for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
+ vp[num] = 0;
+ vp[num+1] = 0;
+ }
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np, BN_ULONG n0, int num)
+ {
+ bn_mul_mont(rp,ap,ap,np,n0,num);
+ }
+#endif /* OPENSSL_BN_ASM_MONT */
+
#else /* !BN_MUL_COMBA */
/* hmm... is it faster just to do a multiply? */
#undef bn_sqr_comba4
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t[8];
bn_sqr_normal(r,a,4,t);
}
#undef bn_sqr_comba8
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t[16];
bn_sqr_normal(r,a,8,t);
@@ -857,4 +934,49 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
}
+#ifdef OPENSSL_BN_ASM_MONT
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, BN_ULONG n0, int num)
+ {
+ BN_ULONG c0,c1,*tp;
+ volatile BN_ULONG *vp;
+ int i=0,j;
+
+ vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+ for(i=0;i<=num;i++) tp[i]=0;
+
+ for(i=0;i<num;i++)
+ {
+ c0 = bn_mul_add_words(tp,ap,num,bp[i]);
+ c1 = tp[num] + c0;
+ tp[num] = c1;
+ tp[num+1] = (c1<c0?1:0);
+
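+ /* reduction step: adding np*(tp[0]*n0) zeroes the low word, and
+ * the word shift below divides by 2^BN_BITS2 */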
+ c0 = bn_mul_add_words(tp,np,num,tp[0]*n0);
+ c1 = tp[num] + c0;
+ tp[num] = c1;
+ tp[num+1] += (c1<c0?1:0);
+ for(j=0;j<=num;j++) tp[j]=tp[j+1];
+ }
+
+ if (tp[num]!=0 || tp[num-1]>=np[num-1])
+ {
+ c0 = bn_sub_words(rp,tp,np,num);
+ if (tp[num]!=0 || c0==0)
+ {
+ for(i=0;i<num+2;i++) vp[i] = 0;
+ return;
+ }
+ }
+ for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
+ vp[num] = 0;
+ vp[num+1] = 0;
+ }
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np, BN_ULONG n0, int num)
+ {
+ bn_mul_mont(rp,ap,ap,np,n0,num);
+ }
+#endif /* OPENSSL_BN_ASM_MONT */
+
#endif /* !BN_MUL_COMBA */
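
For illustration, here is a minimal single-word sketch (num == 1, 32-bit
words) of the same reduction; mont_n0 and mont_mul are hypothetical names
for this note only, not OpenSSL functions, and the code assumes a, b < n
with n odd. It mirrors the patch's steps: m = t*n0 cancels the low word,
the shift divides by 2^32, and one conditional subtraction brings the
result below n.

#include <stdint.h>
#include <stdio.h>

/* n0 = -n^-1 mod 2^32 for odd n, by Newton's iteration: x = n is
 * correct mod 2^3; each pass doubles the number of correct low bits,
 * 3 -> 6 -> 12 -> 24 -> 48 >= 32. */
static uint32_t mont_n0(uint32_t n)
	{
	uint32_t x = n;
	int i;
	for (i = 0; i < 4; i++)
		x *= 2 - n * x;	/* arithmetic is mod 2^32 */
	return 0 - x;		/* negate: -n^-1 mod 2^32 */
	}

/* Returns a*b*R^-1 mod n for R = 2^32, assuming a, b < n < 2^32. */
static uint32_t mont_mul(uint32_t a, uint32_t b, uint32_t n, uint32_t n0)
	{
	uint64_t t  = (uint64_t)a * b;
	uint32_t m  = (uint32_t)t * n0;	/* the patch's ml = tp[0]*n0 */
	uint64_t mn = (uint64_t)m * n;
	/* t + m*n can exceed 64 bits, so add with an explicit carry;
	 * the low 32 bits cancel by construction of m. */
	uint64_t carry = ((t & 0xffffffffu) + (mn & 0xffffffffu)) >> 32;
	uint64_t hi = (t >> 32) + (mn >> 32) + carry;	/* < 2n */
	return (uint32_t)(hi >= n ? hi - n : hi);	/* conditional subtraction */
	}

int main(void)
	{
	uint32_t n  = 0xfffffffb;	/* odd modulus, 2^32 - 5 */
	uint32_t n0 = mont_n0(n);
	uint32_t r2 = (uint32_t)((UINT64_MAX % n + 1) % n);	/* R^2 mod n */
	uint32_t a  = 123456789, b = 987654321;
	uint32_t am = mont_mul(a, r2, n, n0);	/* aR mod n  */
	uint32_t bm = mont_mul(b, r2, n, n0);	/* bR mod n  */
	uint32_t pm = mont_mul(am, bm, n, n0);	/* abR mod n */
	uint32_t p  = mont_mul(pm, 1, n, n0);	/* ab mod n  */
	printf("%u == %u\n", p, (uint32_t)(((uint64_t)a * b) % n));
	return 0;
	}

Round-tripping through R^2 mod n and a final multiplication by 1, as
main does, checks the arithmetic: both sides of the printf should agree.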