path: root/crypto/ec
author    Rohan McLure <rohanmclure@linux.ibm.com>    2023-07-12 12:25:22 +1000
committer Todd Short <todd.short@me.com>              2023-08-04 10:20:28 -0400
commit    01d901e470d9e035a3bd78e77b9438a4cc0da785 (patch)
tree      205d9a10c12dab419a43383bcaf2d231f275d7d2 /crypto/ec
parent    3e47a286dc3274bda72a196c3a4030a1fc8302f1 (diff)
ec: 56-bit Limb Solinas' Strategy for secp384r1
Adopt a 56-bit redundant-limb Solinas' reduction approach for efficient modular multiplication in P384. This has the effect of accelerating digital signing by 446% and verification by 106%. The implementation strategy and the names of methods are the same as those provided in ecp_nistp224 and ecp_nistp521.

As in commit 1036749883cc ("ec: Add run time code selection for p521 field operations"), allow for run time selection of the implementation of felem_{square,mul}, where an assembly implementation is declared to be present when ECP_NISTP384_ASM is defined.

Signed-off-by: Rohan McLure <rohanmclure@linux.ibm.com>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Shane Lontis <shane.lontis@oracle.com>
Reviewed-by: Dmitry Belyavskiy <beldmit@gmail.com>
Reviewed-by: Todd Short <todd.short@me.com>
(Merged from https://github.com/openssl/openssl/pull/21471)
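In brief, the speedup comes from the shape of the modulus: p384 = 2^384 - delta with delta = 2^128 + 2^96 - 2^32 + 1, so any partial result split as X + 2^384*Y can be replaced by the congruent and much smaller X + delta*Y. The new ecp_nistp384.c below applies this substitution repeatedly over 56-bit limbs; see its comments for the precise bounds.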
Diffstat (limited to 'crypto/ec')
-rw-r--r--    crypto/ec/build.info        2
-rw-r--r--    crypto/ec/ec_curve.c        4
-rw-r--r--    crypto/ec/ec_lib.c          8
-rw-r--r--    crypto/ec/ec_local.h       27
-rw-r--r--    crypto/ec/ecp_nistp384.c 1988
5 files changed, 2027 insertions, 2 deletions
diff --git a/crypto/ec/build.info b/crypto/ec/build.info
index 6fd73e4573..1fa60a1ded 100644
--- a/crypto/ec/build.info
+++ b/crypto/ec/build.info
@@ -74,7 +74,7 @@ IF[{- !$disabled{'ecx'} -}]
ENDIF
IF[{- !$disabled{'ec_nistp_64_gcc_128'} -}]
- $COMMON=$COMMON ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c
+ $COMMON=$COMMON ecp_nistp224.c ecp_nistp256.c ecp_nistp384.c ecp_nistp521.c ecp_nistputil.c
ENDIF
SOURCE[../../libcrypto]=$COMMON ec_ameth.c ec_pmeth.c \
diff --git a/crypto/ec/ec_curve.c b/crypto/ec/ec_curve.c
index 2bf6522e84..724525a479 100644
--- a/crypto/ec/ec_curve.c
+++ b/crypto/ec/ec_curve.c
@@ -2838,6 +2838,8 @@ static const ec_list_element curve_list[] = {
{NID_secp384r1, &_EC_NIST_PRIME_384.h,
# if defined(S390X_EC_ASM)
EC_GFp_s390x_nistp384_method,
+# elif !defined(OPENSSL_NO_EC_NISTP_64_GCC_128)
+ ossl_ec_GFp_nistp384_method,
# else
0,
# endif
@@ -2931,6 +2933,8 @@ static const ec_list_element curve_list[] = {
{NID_secp384r1, &_EC_NIST_PRIME_384.h,
# if defined(S390X_EC_ASM)
EC_GFp_s390x_nistp384_method,
+# elif !defined(OPENSSL_NO_EC_NISTP_64_GCC_128)
+ ossl_ec_GFp_nistp384_method,
# else
0,
# endif
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index 9e262b427d..e0d6cf7342 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -99,12 +99,16 @@ void EC_pre_comp_free(EC_GROUP *group)
case PCT_nistp256:
EC_nistp256_pre_comp_free(group->pre_comp.nistp256);
break;
+ case PCT_nistp384:
+ ossl_ec_nistp384_pre_comp_free(group->pre_comp.nistp384);
+ break;
case PCT_nistp521:
EC_nistp521_pre_comp_free(group->pre_comp.nistp521);
break;
#else
case PCT_nistp224:
case PCT_nistp256:
+ case PCT_nistp384:
case PCT_nistp521:
break;
#endif
@@ -188,12 +192,16 @@ int EC_GROUP_copy(EC_GROUP *dest, const EC_GROUP *src)
case PCT_nistp256:
dest->pre_comp.nistp256 = EC_nistp256_pre_comp_dup(src->pre_comp.nistp256);
break;
+ case PCT_nistp384:
+ dest->pre_comp.nistp384 = ossl_ec_nistp384_pre_comp_dup(src->pre_comp.nistp384);
+ break;
case PCT_nistp521:
dest->pre_comp.nistp521 = EC_nistp521_pre_comp_dup(src->pre_comp.nistp521);
break;
#else
case PCT_nistp224:
case PCT_nistp256:
+ case PCT_nistp384:
case PCT_nistp521:
break;
#endif
diff --git a/crypto/ec/ec_local.h b/crypto/ec/ec_local.h
index 92d2b6f570..7181090fca 100644
--- a/crypto/ec/ec_local.h
+++ b/crypto/ec/ec_local.h
@@ -203,6 +203,7 @@ struct ec_method_st {
*/
typedef struct nistp224_pre_comp_st NISTP224_PRE_COMP;
typedef struct nistp256_pre_comp_st NISTP256_PRE_COMP;
+typedef struct nistp384_pre_comp_st NISTP384_PRE_COMP;
typedef struct nistp521_pre_comp_st NISTP521_PRE_COMP;
typedef struct nistz256_pre_comp_st NISTZ256_PRE_COMP;
typedef struct ec_pre_comp_st EC_PRE_COMP;
@@ -264,12 +265,13 @@ struct ec_group_st {
*/
enum {
PCT_none,
- PCT_nistp224, PCT_nistp256, PCT_nistp521, PCT_nistz256,
+ PCT_nistp224, PCT_nistp256, PCT_nistp384, PCT_nistp521, PCT_nistz256,
PCT_ec
} pre_comp_type;
union {
NISTP224_PRE_COMP *nistp224;
NISTP256_PRE_COMP *nistp256;
+ NISTP384_PRE_COMP *nistp384;
NISTP521_PRE_COMP *nistp521;
NISTZ256_PRE_COMP *nistz256;
EC_PRE_COMP *ec;
@@ -332,6 +334,7 @@ static ossl_inline int ec_point_is_compat(const EC_POINT *point,
NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *);
NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *);
+NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *);
NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *);
NISTZ256_PRE_COMP *EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP *);
NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *);
@@ -340,6 +343,7 @@ EC_PRE_COMP *EC_ec_pre_comp_dup(EC_PRE_COMP *);
void EC_pre_comp_free(EC_GROUP *group);
void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *);
void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *);
+void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *);
void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *);
void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *);
void EC_ec_pre_comp_free(EC_PRE_COMP *);
@@ -551,6 +555,27 @@ int ossl_ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
int ossl_ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
int ossl_ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group);
+/* method functions in ecp_nistp384.c */
+int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group);
+int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
+ const BIGNUM *a, const BIGNUM *n,
+ BN_CTX *);
+int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
+ const EC_POINT *point,
+ BIGNUM *x, BIGNUM *y,
+ BN_CTX *ctx);
+int ossl_ec_GFp_nistp384_mul(const EC_GROUP *group, EC_POINT *r,
+ const BIGNUM *scalar, size_t num,
+ const EC_POINT *points[], const BIGNUM *scalars[],
+ BN_CTX *);
+int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
+ const BIGNUM *scalar, size_t num,
+ const EC_POINT *points[],
+ const BIGNUM *scalars[], BN_CTX *ctx);
+int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group);
+const EC_METHOD *ossl_ec_GFp_nistp384_method(void);
+
/* method functions in ecp_nistp521.c */
int ossl_ec_GFp_nistp521_group_init(EC_GROUP *group);
int ossl_ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
diff --git a/crypto/ec/ecp_nistp384.c b/crypto/ec/ecp_nistp384.c
new file mode 100644
index 0000000000..a0559487ed
--- /dev/null
+++ b/crypto/ec/ecp_nistp384.c
@@ -0,0 +1,1988 @@
+/*
+ * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/* Copyright 2023 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ *
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
+ * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
+ * acceleration of felem_{square,mul} as supported in these files.
+ */
+
+#include <openssl/e_os2.h>
+
+#include <string.h>
+#include <openssl/err.h>
+#include "ec_local.h"
+
+#include "internal/numbers.h"
+
+#ifndef INT128_MAX
+# error "Your compiler doesn't appear to support 128-bit integer types"
+#endif
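+
+/*
+ * Note: the widefelem arithmetic below relies on this 128-bit integer type,
+ * which is why the build system only compiles this file when the
+ * ec_nistp_64_gcc_128 option is enabled (see the build.info hunk above).
+ */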
+
+typedef uint8_t u8;
+typedef uint64_t u64;
+
+/*
+ * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
+ * can serialize an element of this field into 48 bytes. We call this an
+ * felem_bytearray.
+ */
+
+typedef u8 felem_bytearray[48];
+
+/*
+ * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
+ * These values are big-endian.
+ */
+static const felem_bytearray nistp384_curve_params[5] = {
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
+ {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
+ 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
+ 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
+ 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
+ {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
+ 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
+ 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
+ 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
+ {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
+ 0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
+ 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
+ 0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
+};
+
+/*-
+ * The representation of field elements.
+ * ------------------------------------
+ *
+ * We represent field elements with seven values. These values are either 64 or
+ * 128 bits and the field element represented is:
+ * v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336 (mod p)
+ * Each of the seven values is called a 'limb'. Since the limbs are spaced only
+ * 56 bits apart, but are greater than 56 bits in length, the most significant
+ * bits of each limb overlap with the least significant bits of the next
+ *
+ * This representation is considered to be 'redundant' in the sense that
+ * intermediate values can each contain more than a 56-bit value in each limb.
+ * Reduction causes all but the final limb to be reduced to contain a value less
+ * than 2^56, with the final value represented allowed to be larger than 2^384,
+ * inasmuch as we can be sure that arithmetic overflow remains impossible. The
+ * reduced value must of course be congruent to the unreduced value.
+ *
+ * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
+ * 'widefelem', featuring enough bits to store the result of a multiplication
+ * and even some further arithmetic without need for immediate reduction.
+ */
+
+#define NLIMBS 7
+
+typedef uint64_t limb;
+typedef uint128_t widelimb;
+typedef limb limb_aX __attribute((__aligned__(1)));
+typedef limb felem[NLIMBS];
+typedef widelimb widefelem[2*NLIMBS-1];
+
+static const limb bottom56bits = 0xffffffffffffff;
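+
+/*
+ * For illustration: in a fully reduced element, limb v[i] holds bits
+ * [56*i, 56*i + 55] of the value; e.g. 2^60 is represented as v[1] = 2^4
+ * with all other limbs zero, since 2^60 = 2^4 * 2^56.
+ */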
+
+/* Helper functions (de)serialising reduced field elements in little endian */
+static void bin48_to_felem(felem out, const u8 in[48])
+{
+ memset(out, 0, 56);
+ out[0] = (*((limb *) & in[0])) & bottom56bits;
+ out[1] = (*((limb_aX *) & in[7])) & bottom56bits;
+ out[2] = (*((limb_aX *) & in[14])) & bottom56bits;
+ out[3] = (*((limb_aX *) & in[21])) & bottom56bits;
+ out[4] = (*((limb_aX *) & in[28])) & bottom56bits;
+ out[5] = (*((limb_aX *) & in[35])) & bottom56bits;
+ memmove(&out[6], &in[42], 6);
+}
+
+static void felem_to_bin48(u8 out[48], const felem in)
+{
+ memset(out, 0, 48);
+ (*((limb *) & out[0])) |= (in[0] & bottom56bits);
+ (*((limb_aX *) & out[7])) |= (in[1] & bottom56bits);
+ (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits);
+ (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits);
+ (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits);
+ (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits);
+ memmove(&out[42], &in[6], 6);
+}
+
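+/*
+ * Since the limbs of a fully reduced element occupy disjoint bit ranges of
+ * the 48-byte string, felem_to_bin48 and bin48_to_felem are inverses of one
+ * another on such inputs.
+ */
+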
+/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
+static int BN_to_felem(felem out, const BIGNUM *bn)
+{
+ felem_bytearray b_out;
+ int num_bytes;
+
+ if (BN_is_negative(bn)) {
+ ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
+ return 0;
+ }
+ num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
+ if (num_bytes < 0) {
+ ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
+ return 0;
+ }
+ bin48_to_felem(out, b_out);
+ return 1;
+}
+
+/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
+static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
+{
+ felem_bytearray b_out;
+
+ felem_to_bin48(b_out, in);
+ return BN_lebin2bn(b_out, sizeof(b_out), out);
+}
+
+/*-
+ * Field operations
+ * ----------------
+ */
+
+static void felem_one(felem out)
+{
+ out[0] = 1;
+ memset(&out[1], 0, sizeof(limb) * (NLIMBS-1));
+}
+
+static void felem_assign(felem out, const felem in)
+{
+ memcpy(out, in, sizeof(felem));
+}
+
+/* felem_sum64 sets out = out + in. */
+static void felem_sum64(felem out, const felem in)
+{
+ unsigned int i;
+
+ for (i = 0; i < NLIMBS; i++)
+ out[i] += in[i];
+}
+
+/* felem_scalar sets out = in * scalar */
+static void felem_scalar(felem out, const felem in, limb scalar)
+{
+ unsigned int i;
+
+ for (i = 0; i < NLIMBS; i++)
+ out[i] = in[i] * scalar;
+}
+
+/* felem_scalar64 sets out = out * scalar */
+static void felem_scalar64(felem out, limb scalar)
+{
+ unsigned int i;
+
+ for (i = 0; i < NLIMBS; i++)
+ out[i] *= scalar;
+}
+
+/* felem_scalar128 sets out = out * scalar */
+static void felem_scalar128(widefelem out, limb scalar)
+{
+ unsigned int i;
+
+ for (i = 0; i < 2*NLIMBS-1; i++)
+ out[i] *= scalar;
+}
+
+/*-
+ * felem_neg sets |out| to |-in|
+ * On entry:
+ * in[i] < 2^60 - 2^29
+ * On exit:
+ * out[i] < 2^60
+ */
+static void felem_neg(felem out, const felem in)
+{
+ /*
+ * In order to prevent underflow, we add a multiple of p before subtracting.
+ * Use telescopic sums to represent 2^12 * p redundantly with each limb
+ * of the form 2^60 + ...
+ */
+ static const limb two60m52m4 = (((limb) 1) << 60)
+ - (((limb) 1) << 52)
+ - (((limb) 1) << 4);
+ static const limb two60p44m12 = (((limb) 1) << 60)
+ + (((limb) 1) << 44)
+ - (((limb) 1) << 12);
+ static const limb two60m28m4 = (((limb) 1) << 60)
+ - (((limb) 1) << 28)
+ - (((limb) 1) << 4);
+ static const limb two60m4 = (((limb) 1) << 60)
+ - (((limb) 1) << 4);
+
+ out[0] = two60p44m12 - in[0];
+ out[1] = two60m52m4 - in[1];
+ out[2] = two60m28m4 - in[2];
+ out[3] = two60m4 - in[3];
+ out[4] = two60m4 - in[4];
+ out[5] = two60m4 - in[5];
+ out[6] = two60m4 - in[6];
+}
+
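+/*-
+ * As a quick check that felem_neg subtracts from a multiple of p: summing
+ * the four constants with their limb weights telescopes to
+ * two60p44m12 + two60m52m4 * 2^56 + two60m28m4 * 2^112
+ * + two60m4 * (2^168 + 2^224 + 2^280 + 2^336)
+ * = 2^396 - 2^140 - 2^108 + 2^44 - 2^12 = 2^12 * p,
+ * so out == 2^12 * p - in, which is congruent to -in mod p.
+ */
+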
+/*-
+ * felem_diff64 subtracts |in| from |out|
+ * On entry:
+ * in[i] < 2^60 - 2^52 - 2^4
+ * On exit:
+ * out[i] < out_orig[i] + 2^60 + 2^44
+ */
+static void felem_diff64(felem out, const felem in)
+{
+ /*
+ * In order to prevent underflow, we add a multiple of p before subtracting.
+ * Use telescopic sums to represent 2^12 * p redundantly with each limb
+ * of the form 2^60 + ...
+ */
+
+ static const limb two60m52m4 = (((limb) 1) << 60)
+ - (((limb) 1) << 52)
+ - (((limb) 1) << 4);
+ static const limb two60p44m12 = (((limb) 1) << 60)
+ + (((limb) 1) << 44)
+ - (((limb) 1) << 12);
+ static const limb two60m28m4 = (((limb) 1) << 60)
+ - (((limb) 1) << 28)
+ - (((limb) 1) << 4);
+ static const limb two60m4 = (((limb) 1) << 60)
+ - (((limb) 1) << 4);
+
+ out[0] += two60p44m12 - in[0];
+ out[1] += two60m52m4 - in[1];
+ out[2] += two60m28m4 - in[2];
+ out[3] += two60m4 - in[3];
+ out[4] += two60m4 - in[4];
+ out[5] += two60m4 - in[5];
+ out[6] += two60m4 - in[6];
+}
+
+/*
+ * in[i] < 2^63
+ * out[i] < out_orig[i] + 2^64 + 2^48
+ */
+static void felem_diff_128_64(widefelem out, const felem in)
+{
+ /*
+ * In order to prevent underflow, we add a multiple of p before subtracting.
+ * Use telescopic sums to represent 2^16 * p redundantly with each limb
+ * of the form 2^64 + ...
+ */
+
+ static const widelimb two64m56m8 = (((widelimb) 1) << 64)
+ - (((widelimb) 1) << 56)
+ - (((widelimb) 1) << 8);
+ static const widelimb two64m32m8 = (((widelimb) 1) << 64)
+ - (((widelimb) 1) << 32)
+ - (((widelimb) 1) << 8);
+ static const widelimb two64m8 = (((widelimb) 1) << 64)
+ - (((widelimb) 1) << 8);
+ static const widelimb two64p48m16 = (((widelimb) 1) << 64)
+ + (((widelimb) 1) << 48)
+ - (((widelimb) 1) << 16);
+ unsigned int i;
+
+ out[0] += two64p48m16;
+ out[1] += two64m56m8;
+ out[2] += two64m32m8;
+ out[3] += two64m8;
+ out[4] += two64m8;
+ out[5] += two64m8;
+ out[6] += two64m8;
+
+ for (i = 0; i < NLIMBS; i++)
+ out[i] -= in[i];
+}
+
+/*
+ * in[i] < 2^127 - 2^119 - 2^71
+ * out[i] < out_orig[i] + 2^127 + 2^111
+ */
+static void felem_diff128(widefelem out, const widefelem in)
+{
+ /*
+ * In order to prevent underflow, we add a multiple of p before subtracting.
+ * Use telescopic sums to represent 2^415 * p redundantly with each limb
+ * of the form 2^127 + ...
+ */
+
+ static const widelimb two127 = ((widelimb) 1) << 127;
+ static const widelimb two127m71 = (((widelimb) 1) << 127)
+ - (((widelimb) 1) << 71);
+ static const widelimb two127p111m79m71 = (((widelimb) 1) << 127)
+ + (((widelimb) 1) << 111)
+ - (((widelimb) 1) << 79)
+ - (((widelimb) 1) << 71);
+ static const widelimb two127m119m71 = (((widelimb) 1) << 127)
+ - (((widelimb) 1) << 119)
+ - (((widelimb) 1) << 71);
+ static const widelimb two127m95m71 = (((widelimb) 1) << 127)
+ - (((widelimb) 1) << 95)
+ - (((widelimb) 1) << 71);
+ unsigned int i;
+
+ out[0] += two127;
+ out[1] += two127m71;
+ out[2] += two127m71;
+ out[3] += two127m71;
+ out[4] += two127m71;
+ out[5] += two127m71;
+ out[6] += two127p111m79m71;
+ out[7] += two127m119m71;
+ out[8] += two127m95m71;
+ out[9] += two127m71;
+ out[10] += two127m71;
+ out[11] += two127m71;
+ out[12] += two127m71;
+
+ for (i = 0; i < 2*NLIMBS-1; i++)
+ out[i] -= in[i];
+}
+
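+/*
+ * felem_square_ref computes out = in^2 as a 13-limb widefelem: column k of
+ * the result collects every product in[i]*in[j] with i + j == k, with the
+ * factor of two on the off-diagonal terms folded into inx2.
+ */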
+static void felem_square_ref(widefelem out, const felem in)
+{
+ felem inx2;
+ felem_scalar(inx2, in, 2);
+
+ out[0] = ((uint128_t) in[0]) * in[0];
+
+ out[1] = ((uint128_t) in[0]) * inx2[1];
+
+ out[2] = ((uint128_t) in[0]) * inx2[2]
+ + ((uint128_t) in[1]) * in[1];
+
+ out[3] = ((uint128_t) in[0]) * inx2[3]
+ + ((uint128_t) in[1]) * inx2[2];
+
+ out[4] = ((uint128_t) in[0]) * inx2[4]
+ + ((uint128_t) in[1]) * inx2[3]
+ + ((uint128_t) in[2]) * in[2];
+
+ out[5] = ((uint128_t) in[0]) * inx2[5]
+ + ((uint128_t) in[1]) * inx2[4]
+ + ((uint128_t) in[2]) * inx2[3];
+
+ out[6] = ((uint128_t) in[0]) * inx2[6]
+ + ((uint128_t) in[1]) * inx2[5]
+ + ((uint128_t) in[2]) * inx2[4]
+ + ((uint128_t) in[3]) * in[3];
+
+ out[7] = ((uint128_t) in[1]) * inx2[6]
+ + ((uint128_t) in[2]) * inx2[5]
+ + ((uint128_t) in[3]) * inx2[4];
+
+ out[8] = ((uint128_t) in[2]) * inx2[6]
+ + ((uint128_t) in[3]) * inx2[5]
+ + ((uint128_t) in[4]) * in[4];
+
+ out[9] = ((uint128_t) in[3]) * inx2[6]
+ + ((uint128_t) in[4]) * inx2[5];
+
+ out[10] = ((uint128_t) in[4]) * inx2[6]
+ + ((uint128_t) in[5]) * in[5];
+
+ out[11] = ((uint128_t) in[5]) * inx2[6];
+
+ out[12] = ((uint128_t) in[6]) * in[6];
+}
+
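+/*
+ * felem_mul_ref computes the plain schoolbook product out = in1 * in2,
+ * again as 13 columns indexed by i + j.
+ */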
+static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
+{
+ out[0] = ((uint128_t) in1[0]) * in2[0];
+
+ out[1] = ((uint128_t) in1[0]) * in2[1]
+ + ((uint128_t) in1[1]) * in2[0];
+
+ out[2] = ((uint128_t) in1[0]) * in2[2]
+ + ((uint128_t) in1[1]) * in2[1]
+ + ((uint128_t) in1[2]) * in2[0];
+
+ out[3] = ((uint128_t) in1[0]) * in2[3]
+ + ((uint128_t) in1[1]) * in2[2]
+ + ((uint128_t) in1[2]) * in2[1]
+ + ((uint128_t) in1[3]) * in2[0];
+
+ out[4] = ((uint128_t) in1[0]) * in2[4]
+ + ((uint128_t) in1[1]) * in2[3]
+ + ((uint128_t) in1[2]) * in2[2]
+ + ((uint128_t) in1[3]) * in2[1]
+ + ((uint128_t) in1[4]) * in2[0];
+
+ out[5] = ((uint128_t) in1[0]) * in2[5]
+ + ((uint128_t) in1[1]) * in2[4]
+ + ((uint128_t) in1[2]) * in2[3]
+ + ((uint128_t) in1[3]) * in2[2]
+ + ((uint128_t) in1[4]) * in2[1]
+ + ((uint128_t) in1[5]) * in2[0];
+
+ out[6] = ((uint128_t) in1[0]) * in2[6]
+ + ((uint128_t) in1[1]) * in2[5]
+ + ((uint128_t) in1[2]) * in2[4]
+ + ((uint128_t) in1[3]) * in2[3]
+ + ((uint128_t) in1[4]) * in2[2]
+ + ((uint128_t) in1[5]) * in2[1]
+ + ((uint128_t) in1[6]) * in2[0];
+
+ out[7] = ((uint128_t) in1[1]) * in2[6]
+ + ((uint128_t) in1[2]) * in2[5]
+ + ((uint128_t) in1[3]) * in2[4]
+ + ((uint128_t) in1[4]) * in2[3]
+ + ((uint128_t) in1[5]) * in2[2]
+ + ((uint128_t) in1[6]) * in2[1];
+
+ out[8] = ((uint128_t) in1[2]) * in2[6]
+ + ((uint128_t) in1[3]) * in2[5]
+ + ((uint128_t) in1[4]) * in2[4]
+ + ((uint128_t) in1[5]) * in2[3]
+ + ((uint128_t) in1[6]) * in2[2];
+
+ out[9] = ((uint128_t) in1[3]) * in2[6]
+ + ((uint128_t) in1[4]) * in2[5]
+ + ((uint128_t) in1[5]) * in2[4]
+ + ((uint128_t) in1[6]) * in2[3];
+
+ out[10] = ((uint128_t) in1[4]) * in2[6]
+ + ((uint128_t) in1[5]) * in2[5]
+ + ((uint128_t) in1[6]) * in2[4];
+
+ out[11] = ((uint128_t) in1[5]) * in2[6]
+ + ((uint128_t) in1[6]) * in2[5];
+
+ out[12] = ((uint128_t) in1[6]) * in2[6];
+}
+
+/*-
+ * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
+ * in[i] < 2^128 - 2^125
+ * out[i] < 2^56 for i < 6,
+ * out[6] <= 2^48
+ *
+ * The technique in use here stems from the format of the prime modulus:
+ * P384 = 2^384 - delta
+ *
+ * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
+ * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
+ * coefficients are still quite large, and so we repeatedly apply this
+ * technique on high-order bits in order to guarantee the desired bounds on
+ * the size of our output.
+ *
+ * The three phases of elimination are as follows:
+ * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
+ * [2]: Y = 2^8 (acc[8] | acc[7])
+ * [3]: Y = 2^48 (acc[6] >> 48)
+ * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
+ */
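+/*-
+ * For example, in phase [1] the code below eliminates in[12] (weight
+ * (2^56)^12 = 2^672 = 2^384 * 2^288) by writing in[12] = H * 2^32 + L and
+ * adding H to acc[8] and L * 2^24 to acc[7], which contributes
+ * in[12] * 2^416 in limb coordinates; together with the 2^384, -2^320 and
+ * 2^288 contributions that follow, this replaces in[12] * 2^672 with the
+ * congruent in[12] * (2^128 + 2^96 - 2^32 + 1) * 2^288.
+ */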
+static void felem_reduce(felem out, const widefelem in)
+{
+ /*
+ * In order to prevent underflow, we add a multiple of p before subtracting.
+ * Use telescopic sums to represent 2^76 * p redundantly with each limb
+ * of the form 2^124 + ...
+ */
+ static const widelimb two124m68 = (((widelimb) 1) << 124)
+ - (((widelimb) 1) << 68);
+ static const widelimb two124m116m68 = (((widelimb) 1) << 124)
+ - (((widelimb) 1) << 116)
+ - (((widelimb) 1) << 68);
+ static const widelimb two124p108m76 = (((widelimb) 1) << 124)
+ + (((widelimb) 1) << 108)
+ - (((widelimb) 1) << 76);
+ static const widelimb two124m92m68 = (((widelimb) 1) << 124)
+ - (((widelimb) 1) << 92)
+ - (((widelimb) 1) << 68);
+ widelimb temp, acc[9];
+ unsigned int i;
+
+ memcpy(acc, in, sizeof(widelimb) * 9);
+
+ acc[0] += two124p108m76;
+ acc[1] += two124m116m68;
+ acc[2] += two124m92m68;
+ acc[3] += two124m68;
+ acc[4] += two124m68;
+ acc[5] += two124m68;
+ acc[6] += two124m68;
+
+ /* [1]: Eliminate in[9], ..., in[12] */
+ acc[8] += in[12] >> 32;
+ acc[7] += (in[12] & 0xffffffff) << 24;
+ acc[7] += in[12] >> 8;
+ acc[6] += (in[12] & 0xff) << 48;
+ acc[6] -= in[12] >> 16;
+ acc[5] -= ((in[12] & 0xffff) << 40);
+ acc[6] += in[12] >> 48;
+ acc[5] += (in[12] & 0xffffffffffff) << 8;
+
+ acc[7] += in[11] >> 32;
+ acc[6] += (in[11] & 0xffffffff) << 24;
+ acc[6] += in[11] >> 8;
+ acc[5] += (in[11] & 0xff) << 48;
+ acc[5] -= in[11] >> 16;
+ acc[4] -= ((in[11] & 0xffff) << 40);
+ acc[5] += in[11] >> 48;
+ acc[4] += (in[11] & 0xffffffffffff) << 8;
+
+ acc[6] += in[10] >> 32;
+ acc[5] += (in[10] & 0xffffffff) << 24;
+ acc[5] += in[10] >> 8;
+ acc[4] += (in[10] & 0xff) << 48;
+ acc[4] -= in[10] >> 16;
+ acc[3] -= ((in[10] & 0xffff) << 40);
+ acc[4] += in[10] >> 48;
+ acc[3] += (in[10] & 0xffffffffffff) << 8;
+
+ acc[5] += in[9] >> 32;
+ acc[4] += (in[9] & 0xffffffff) << 24;
+ acc[4] += in[9] >> 8;
+ acc[3] += (in[9] & 0xff) << 48;
+ acc[3] -= in[9] >> 16;
+ acc[2] -= ((in[9] & 0xffff) << 40);
+ acc[3] += in[9] >> 48;
+ acc[2] += (in[9] & 0xffffffffffff) << 8;
+
+ /*
+ * [2]: Eliminate acc[7], acc[8], that is, the eighth and ninth limbs, as
+ * well as the contributions made from eliminating higher limbs.
+ * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
+ * acc[8] < in[8] + 2^96
+ */
+ acc[4] += acc[8] >> 32;
+ acc[3] += (acc[8] & 0xffffffff) << 24;
+ acc[3] += acc[8] >> 8;
+ acc[2] += (acc[8] & 0xff) << 48;
+ acc[2] -= acc[8] >> 16;
+ acc[1] -= ((acc[8] & 0xffff) << 40);
+ acc[2] += acc[8] >> 48;
+ acc[1] += (acc[8] & 0xffffffffffff) << 8;
+
+ acc[3] += acc[7] >> 32;
+ acc[2] += (acc[7] & 0xffffffff) << 24;
+ acc[2] += acc[7] >> 8;
+ acc[1] += (acc[7] & 0xff) << 48;
+ acc[1] -= acc[7] >> 16;
+ acc[0] -= ((acc[7] & 0xffff) << 40);
+ acc[1] += acc[7] >> 48;
+ acc[0] += (acc[7] & 0xffffffffffff) << 8;
+
+ /*-
+ * acc[k] < in[k] + 2^124 + 2^121
+ * < in[k] + 2^125
+ * < 2^128, for k <= 6
+ */
+
+ /*
+ * Carry 4 -> 5 -> 6
+ * This has the effect of ensuring that these more significant limbs
+ * will be small in value after eliminating high bits from acc[6].
+ */
+ acc[5] += acc[4] >> 56;
+ acc[4] &= 0x00ffffffffffffff;
+
+ acc[6] += acc[5] >> 56;
+ acc[5] &= 0x00ffffffffffffff;
+
+ /*-
+ * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
+ * < in[6] + 2^125
+ * < 2^128
+ */
+
+ /* [3]: Eliminate high bits of acc[6] */
+ temp = acc[6] >> 48;
+ acc[6] &= 0x0000ffffffffffff;
+
+ /* temp < 2^80 */
+
+ acc[3] += temp >> 40;
+ acc[2] += (temp & 0xffffffffff) << 16;
+ acc[2] += temp >> 16;
+ acc[1] += (temp & 0xffff) << 40;
+ acc[1] -= temp >> 24;
+ acc[0] -= (temp & 0xffffff) << 32;
+ acc[0] += temp;
+
+ /*-
+ * acc[k] < acc_old[k] + 2^64 + 2^56
+ * < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
+ */
+
+ /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
+ acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
+ acc[0] &= 0x00ffffffffffffff;
+
+ acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
+ acc[1] &= 0x00ffffffffffffff;
+
+ acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
+ acc[2] &= 0x00ffffffffffffff;
+
+ /*-
+ * acc[k] < acc_old[k] + 2^72 + 2^16
+ * < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
+ * < in[k] + 2^125
+ * < 2^128 , k < 4
+ */
+
+ acc[4] += acc[3] >> 56; /*-
+ * acc[4] < acc_old[4] + 2^72 + 2^16
+ * < 2^72 + 2^56 + 2^16
+ */
+ acc[3] &= 0x00ffffffffffffff;
+
+ acc[5] += acc[4] >> 56; /*-
+ * acc[5] < acc_old[5] + 2^16 + 1
+ * < 2^56 + 2^16 + 1
+ */
+ acc[4] &= 0x00ffffffffffffff;
+
+ acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1, i.e. acc[6] <= 2^48 */
+ acc[5] &= 0x00ffffffffffffff;
+
+ for (i = 0; i < NLIMBS; i++)
+ out[i] = acc[i];
+}
+
+#if defined(ECP_NISTP384_ASM)
+static void felem_square_wrapper(widefelem out, const felem in);
+static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
+
+static void (*felem_square_p)(widefelem out, const felem in) =
+ felem_square_wrapper;
+static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
+ felem_mul_wrapper;
+
+void p384_felem_square(widefelem out, const felem in);
+void p384_felem_mul(widefelem out, const felem in1, const felem in2);
+
+# if defined(_ARCH_PPC64)
+# include "crypto/ppc_arch.h"
+# endif
+
+static void felem_select(void)
+{
+ /* Default */
+ felem_square_p = felem_square_ref;
+ felem_mul_p = felem_mul_ref;
+}
+
+static void felem_square_wrapper(widefelem out, const felem in)
+{
+ felem_select();
+ felem_square_p(out, in);
+}
+
+static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
+{
+ felem_select();
+ felem_mul_p(out, in1, in2);
+}
+
+# define felem_square felem_square_p
+# define felem_mul felem_mul_p
+#else
+# define felem_square felem_square_ref
+# define felem_mul felem_mul_ref
+#endif
+
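+/*
+ * When ECP_NISTP384_ASM is defined, the first call to felem_square or
+ * felem_mul lands in the wrappers above, which invoke felem_select() once to
+ * bind felem_square_p/felem_mul_p; subsequent calls go straight to the
+ * selected routine. A platform-specific branch (for instance, testing
+ * OPENSSL_ppccap_P on _ARCH_PPC64 to pick p384_felem_square/p384_felem_mul
+ * before falling back to the reference implementations) would be added
+ * inside felem_select().
+ */
+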
+static ossl_inline void felem_square_reduce(felem out, const felem in)
+{
+ widefelem tmp;
+
+ felem_square(tmp, in);
+ felem_reduce(out, tmp);
+}
+
+static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2)
+{
+ widefelem tmp;
+
+ felem_mul(tmp, in1, in2);
+ felem_reduce(out, tmp);
+}
+
+/*-
+ * felem_inv calculates |out| = |in|^{-1}
+ *
+ * Based on Fermat's Little Theorem:
+ * a^p = a (mod p)
+ * a^{p-1} = 1 (mod p)
+ * a^{p-2} = a^{-1} (mod p)
+ */
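+/*-
+ * The addition chain below builds the exponent p - 2 out of blocks of
+ * consecutive one-bits: squaring k times shifts the exponent left by k, and
+ * multiplying by a saved power fills in low one-bits. E.g. after the first
+ * two steps ftmp = in^3, matching the exponent 2^1 + 2^0 noted alongside.
+ */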
+static void felem_inv(felem out, const felem in)
+{
+ felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
+ unsigned int i = 0;
+
+ felem_square_reduce(ftmp, in); /* 2^1 */
+ felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */
+ felem_assign(ftmp2, ftmp);
+
+ felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */
+ felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 + 2^0 */
+ felem_assign(ftmp3, ftmp);
+
+ for (i = 0; i < 3; i++)
+ felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
+ felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
+ felem_assign(ftmp4, ftmp);
+
+ for (i = 0; i < 6; i++)
+ felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
+ felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
+
+ for (i = 0; i < 3; i++)
+ felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
+ felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
+ felem_assign(ftmp5, ftmp);
+
+ for (i =