author     Andy Polyakov <appro@openssl.org>    2016-09-09 17:25:04 +0200
committer  Andy Polyakov <appro@openssl.org>    2016-10-24 20:00:40 +0200
commit     80d27cdb84985c697f8fabb7649abf1f54714d13 (patch)
tree       d9b1afe5cfdd008e5f027218f7c1e78fa722379f
parent     68f6d2a02c8cc30c5c737fc948b7cf023a234b47 (diff)
ppccap.c: engage new multiplication and squaring subroutines.

[And remove FPU multiplication subroutine.]

Reviewed-by: Rich Salz <rsalz@openssl.org>
-rw-r--r--   Configurations/00-base-templates.conf    2
-rw-r--r--   crypto/bn/asm/ppc-mont.pl                 3
-rw-r--r--   crypto/ppccap.c                          44
3 files changed, 16 insertions, 33 deletions
diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf
index 8bb4de75f9..ed50910eed 100644
--- a/Configurations/00-base-templates.conf
+++ b/Configurations/00-base-templates.conf
@@ -278,7 +278,7 @@
     ppc32_asm => {
         template        => 1,
         cpuid_asm_src   => "ppccpuid.s ppccap.c",
-        bn_asm_src      => "bn-ppc.s ppc-mont.s ppc64-mont.s",
+        bn_asm_src      => "bn-ppc.s ppc-mont.s",
         aes_asm_src     => "aes_core.c aes_cbc.c aes-ppc.s vpaes-ppc.s aesp8-ppc.s",
         sha1_asm_src    => "sha1-ppc.s sha256-ppc.s sha512-ppc.s sha256p8-ppc.s sha512p8-ppc.s",
         modes_asm_src   => "ghashp8-ppc.s",
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index ce0b061a7d..fdc049ae0e 100644
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -134,10 +134,7 @@ $code=<<___;
 .globl  .bn_mul_mont_int
 .align  5
 .bn_mul_mont_int:
-        cmpwi   $num,4
         mr      $rp,r3          ; $rp is reassigned
-        li      r3,0
-        bltlr
 ___
 $code.=<<___ if ($BNSZ==4);
         cmpwi   $num,32         ; longer key performance is not better
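For readers not fluent in PowerPC assembly: the three dropped instructions implemented an early bail-out for inputs of fewer than four limbs (r3 carries both the first argument and the return value in the PPC ABI, so "li r3,0" preloads a zero return and "bltlr" returns if the comparison flagged less-than). A minimal C sketch of the equivalent guard, which this commit relocates into the bn_mul_mont wrapper in ppccap.c; the function name below is hypothetical:

    /*
     * Rough C equivalent of the removed assembly prologue:
     *
     *     cmpwi   $num,4     ; compare num against 4
     *     li      r3,0       ; preload return value 0
     *     bltlr              ; return if num < 4
     */
    static int mont_prologue_sketch(int num)
    {
        if (num < 4)
            return 0;   /* decline; caller falls back to generic C code */
        /* ... the Montgomery multiplication proper would follow ... */
        return 1;
    }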
diff --git a/crypto/ppccap.c b/crypto/ppccap.c
index 28cfa199e5..b2b898e797 100644
--- a/crypto/ppccap.c
+++ b/crypto/ppccap.c
@@ -35,38 +35,24 @@ static sigset_t all_masked;
 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                 const BN_ULONG *np, const BN_ULONG *n0, int num)
 {
-    int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap,
-                          const BN_ULONG *bp, const BN_ULONG *np,
-                          const BN_ULONG *n0, int num);
     int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                         const BN_ULONG *np, const BN_ULONG *n0, int num);
+    int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                          const BN_ULONG *np, const BN_ULONG *n0, int num);
-    if (sizeof(size_t) == 4) {
-# if 1 || (defined(__APPLE__) && defined(__MACH__))
-        if (num >= 8 && (num & 3) == 0 && (OPENSSL_ppccap_P & PPC_FPU64))
-            return bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
-# else
-        /*
-         * boundary of 32 was experimentally determined on Linux 2.6.22,
-         * might have to be adjusted on AIX...
-         */
-        if (num >= 32 && (num & 3) == 0 && (OPENSSL_ppccap_P & PPC_FPU64)) {
-            sigset_t oset;
-            int ret;
-
-            sigprocmask(SIG_SETMASK, &all_masked, &oset);
-            ret = bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
-            sigprocmask(SIG_SETMASK, &oset, NULL);
-
-            return ret;
-        }
-# endif
-    } else if ((OPENSSL_ppccap_P & PPC_FPU64))
-        /*
-         * this is a "must" on POWER6, but run-time detection is not
-         * implemented yet...
-         */
-        return bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
+    if (num < 4)
+        return 0;
+
+    if ((num & 3) == 0)
+        return bn_mul4x_mont_int(rp, ap, bp, np, n0, num);
+
+    /*
+     * There used to be an [optional] call to bn_mul_mont_fpu64 here,
+     * but the subroutine above is faster on contemporary processors.
+     * That said, there may be older processors on which the FPU code
+     * path would still be faster, POWER6 perhaps, but there was no
+     * opportunity to verify this...
+     */
     return bn_mul_mont_int(rp, ap, bp, np, n0, num);
 }
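Taken together, the wrapper's new selection logic is small enough to sanity-check in isolation. A minimal, self-contained sketch; the stub routines are hypothetical stand-ins for the real assembly entry points, which take the full (rp, ap, bp, np, n0, num) argument list:

    #include <stdio.h>

    /* Hypothetical stand-ins for bn_mul_mont_int / bn_mul4x_mont_int. */
    static int stub_mont_int(int num) { printf("generic path, num=%d\n", num); return 1; }
    static int stub_mont_4x(int num)  { printf("4x path, num=%d\n", num); return 1; }

    /* Same selection logic as the new bn_mul_mont wrapper above. */
    static int dispatch(int num)
    {
        if (num < 4)
            return 0;                /* decline; BN layer falls back to C */
        if ((num & 3) == 0)          /* limb count is a multiple of 4 */
            return stub_mont_4x(num);
        return stub_mont_int(num);
    }

    int main(void)
    {
        int sizes[] = { 2, 4, 6, 8, 32, 33 };
        size_t i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
            if (!dispatch(sizes[i]))
                printf("declined, num=%d\n", sizes[i]);
        return 0;
    }

In practice the 4x path covers the common RSA/DH operand sizes: a 512-bit modulus with 32-bit limbs gives num = 16, a multiple of 4, while an odd limb count such as num = 33 still goes through bn_mul_mont_int.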