diff options
author | Rohan McLure <rohanmclure@linux.ibm.com> | 2023-07-12 12:25:22 +1000 |
---|---|---|
committer | Todd Short <todd.short@me.com> | 2023-08-04 10:20:28 -0400 |
commit | 01d901e470d9e035a3bd78e77b9438a4cc0da785 (patch) | |
tree | 205d9a10c12dab419a43383bcaf2d231f275d7d2 /crypto/ec | |
parent | 3e47a286dc3274bda72a196c3a4030a1fc8302f1 (diff) |
ec: 56-bit Limb Solinas' Strategy for secp384r1
Adopt a 56-bit redundant-limb Solinas' reduction approach for efficient
modular multiplication in P384. This has the affect of accelerating
digital signing by 446% and verification by 106%. The implementation
strategy and names of methods are the same as that provided in
ecp_nistp224 and ecp_nistp521.
As in Commit 1036749883cc ("ec: Add run time code selection for p521
field operations"), allow for run time selection of implementation for
felem_{square,mul}, where an assembly implementation is proclaimed to
be present when ECP_NISTP384_ASM is present.
Signed-off-by: Rohan McLure <rohanmclure@linux.ibm.com>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Shane Lontis <shane.lontis@oracle.com>
Reviewed-by: Dmitry Belyavskiy <beldmit@gmail.com>
Reviewed-by: Todd Short <todd.short@me.com>
(Merged from https://github.com/openssl/openssl/pull/21471)
Diffstat (limited to 'crypto/ec')
-rw-r--r-- | crypto/ec/build.info | 2 | ||||
-rw-r--r-- | crypto/ec/ec_curve.c | 4 | ||||
-rw-r--r-- | crypto/ec/ec_lib.c | 8 | ||||
-rw-r--r-- | crypto/ec/ec_local.h | 27 | ||||
-rw-r--r-- | crypto/ec/ecp_nistp384.c | 1988 |
5 files changed, 2027 insertions, 2 deletions
diff --git a/crypto/ec/build.info b/crypto/ec/build.info index 6fd73e4573..1fa60a1ded 100644 --- a/crypto/ec/build.info +++ b/crypto/ec/build.info @@ -74,7 +74,7 @@ IF[{- !$disabled{'ecx'} -}] ENDIF IF[{- !$disabled{'ec_nistp_64_gcc_128'} -}] - $COMMON=$COMMON ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c + $COMMON=$COMMON ecp_nistp224.c ecp_nistp256.c ecp_nistp384.c ecp_nistp521.c ecp_nistputil.c ENDIF SOURCE[../../libcrypto]=$COMMON ec_ameth.c ec_pmeth.c \ diff --git a/crypto/ec/ec_curve.c b/crypto/ec/ec_curve.c index 2bf6522e84..724525a479 100644 --- a/crypto/ec/ec_curve.c +++ b/crypto/ec/ec_curve.c @@ -2838,6 +2838,8 @@ static const ec_list_element curve_list[] = { {NID_secp384r1, &_EC_NIST_PRIME_384.h, # if defined(S390X_EC_ASM) EC_GFp_s390x_nistp384_method, +# elif !defined(OPENSSL_NO_EC_NISTP_64_GCC_128) + ossl_ec_GFp_nistp384_method, # else 0, # endif @@ -2931,6 +2933,8 @@ static const ec_list_element curve_list[] = { {NID_secp384r1, &_EC_NIST_PRIME_384.h, # if defined(S390X_EC_ASM) EC_GFp_s390x_nistp384_method, +# elif !defined(OPENSSL_NO_EC_NISTP_64_GCC_128) + ossl_ec_GFp_nistp384_method, # else 0, # endif diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c index 9e262b427d..e0d6cf7342 100644 --- a/crypto/ec/ec_lib.c +++ b/crypto/ec/ec_lib.c @@ -99,12 +99,16 @@ void EC_pre_comp_free(EC_GROUP *group) case PCT_nistp256: EC_nistp256_pre_comp_free(group->pre_comp.nistp256); break; + case PCT_nistp384: + ossl_ec_nistp384_pre_comp_free(group->pre_comp.nistp384); + break; case PCT_nistp521: EC_nistp521_pre_comp_free(group->pre_comp.nistp521); break; #else case PCT_nistp224: case PCT_nistp256: + case PCT_nistp384: case PCT_nistp521: break; #endif @@ -188,12 +192,16 @@ int EC_GROUP_copy(EC_GROUP *dest, const EC_GROUP *src) case PCT_nistp256: dest->pre_comp.nistp256 = EC_nistp256_pre_comp_dup(src->pre_comp.nistp256); break; + case PCT_nistp384: + dest->pre_comp.nistp384 = ossl_ec_nistp384_pre_comp_dup(src->pre_comp.nistp384); + break; case PCT_nistp521: dest->pre_comp.nistp521 = EC_nistp521_pre_comp_dup(src->pre_comp.nistp521); break; #else case PCT_nistp224: case PCT_nistp256: + case PCT_nistp384: case PCT_nistp521: break; #endif diff --git a/crypto/ec/ec_local.h b/crypto/ec/ec_local.h index 92d2b6f570..7181090fca 100644 --- a/crypto/ec/ec_local.h +++ b/crypto/ec/ec_local.h @@ -203,6 +203,7 @@ struct ec_method_st { */ typedef struct nistp224_pre_comp_st NISTP224_PRE_COMP; typedef struct nistp256_pre_comp_st NISTP256_PRE_COMP; +typedef struct nistp384_pre_comp_st NISTP384_PRE_COMP; typedef struct nistp521_pre_comp_st NISTP521_PRE_COMP; typedef struct nistz256_pre_comp_st NISTZ256_PRE_COMP; typedef struct ec_pre_comp_st EC_PRE_COMP; @@ -264,12 +265,13 @@ struct ec_group_st { */ enum { PCT_none, - PCT_nistp224, PCT_nistp256, PCT_nistp521, PCT_nistz256, + PCT_nistp224, PCT_nistp256, PCT_nistp384, PCT_nistp521, PCT_nistz256, PCT_ec } pre_comp_type; union { NISTP224_PRE_COMP *nistp224; NISTP256_PRE_COMP *nistp256; + NISTP384_PRE_COMP *nistp384; NISTP521_PRE_COMP *nistp521; NISTZ256_PRE_COMP *nistz256; EC_PRE_COMP *ec; @@ -332,6 +334,7 @@ static ossl_inline int ec_point_is_compat(const EC_POINT *point, NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *); NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *); +NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *); NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *); NISTZ256_PRE_COMP *EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP *); NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *); @@ -340,6 +343,7 @@ EC_PRE_COMP *EC_ec_pre_comp_dup(EC_PRE_COMP *); void EC_pre_comp_free(EC_GROUP *group); void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *); void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *); +void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *); void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *); void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *); void EC_ec_pre_comp_free(EC_PRE_COMP *); @@ -551,6 +555,27 @@ int ossl_ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, int ossl_ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx); int ossl_ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group); +/* method functions in ecp_nistp384.c */ +int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group); +int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p, + const BIGNUM *a, const BIGNUM *n, + BN_CTX *); +int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group, + const EC_POINT *point, + BIGNUM *x, BIGNUM *y, + BN_CTX *ctx); +int ossl_ec_GFp_nistp384_mul(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *scalar, size_t num, + const EC_POINT *points[], const BIGNUM *scalars[], + BN_CTX *); +int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *scalar, size_t num, + const EC_POINT *points[], + const BIGNUM *scalars[], BN_CTX *ctx); +int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx); +int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group); +const EC_METHOD *ossl_ec_GFp_nistp384_method(void); + /* method functions in ecp_nistp521.c */ int ossl_ec_GFp_nistp521_group_init(EC_GROUP *group); int ossl_ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, diff --git a/crypto/ec/ecp_nistp384.c b/crypto/ec/ecp_nistp384.c new file mode 100644 index 0000000000..a0559487ed --- /dev/null +++ b/crypto/ec/ecp_nistp384.c @@ -0,0 +1,1988 @@ +/* + * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* Copyright 2023 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>. + * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm + * acceleration of felem_{square,mul} as supported in these files. + */ + +#include <openssl/e_os2.h> + +#include <string.h> +#include <openssl/err.h> +#include "ec_local.h" + +#include "internal/numbers.h" + +#ifndef INT128_MAX +# error "Your compiler doesn't appear to support 128-bit integer types" +#endif + +typedef uint8_t u8; +typedef uint64_t u64; + +/* + * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We + * can serialize an element of this field into 48 bytes. We call this an + * felem_bytearray. + */ + +typedef u8 felem_bytearray[48]; + +/* + * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4. + * These values are big-endian. + */ +static const felem_bytearray nistp384_curve_params[5] = { + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC}, + {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */ + 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12, + 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D, + 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF}, + {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */ + 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98, + 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D, + 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7}, + {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */ + 0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C, + 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE, + 0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F}, +}; + +/*- + * The representation of field elements. + * ------------------------------------ + * + * We represent field elements with seven values. These values are either 64 or + * 128 bits and the field element represented is: + * v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336 (mod p) + * Each of the seven values is called a 'limb'. Since the limbs are spaced only + * 56 bits apart, but are greater than 56 bits in length, the most significant + * bits of each limb overlap with the least significant bits of the next + * + * This representation is considered to be 'redundant' in the sense that + * intermediate values can each contain more than a 56-bit value in each limb. + * Reduction causes all but the final limb to be reduced to contain a value less + * than 2^56, with the final value represented allowed to be larger than 2^384, + * inasmuch as we can be sure that arithmetic overflow remains impossible. The + * reduced value must of course be congruent to the unreduced value. + * + * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a + * 'widefelem', featuring enough bits to store the result of a multiplication + * and even some further arithmetic without need for immediate reduction. + */ + +#define NLIMBS 7 + +typedef uint64_t limb; +typedef uint128_t widelimb; +typedef limb limb_aX __attribute((__aligned__(1))); +typedef limb felem[NLIMBS]; +typedef widelimb widefelem[2*NLIMBS-1]; + +static const limb bottom56bits = 0xffffffffffffff; + +/* Helper functions (de)serialising reduced field elements in little endian */ +static void bin48_to_felem(felem out, const u8 in[48]) +{ + memset(out, 0, 56); + out[0] = (*((limb *) & in[0])) & bottom56bits; + out[1] = (*((limb_aX *) & in[7])) & bottom56bits; + out[2] = (*((limb_aX *) & in[14])) & bottom56bits; + out[3] = (*((limb_aX *) & in[21])) & bottom56bits; + out[4] = (*((limb_aX *) & in[28])) & bottom56bits; + out[5] = (*((limb_aX *) & in[35])) & bottom56bits; + memmove(&out[6], &in[42], 6); +} + +static void felem_to_bin48(u8 out[48], const felem in) +{ + memset(out, 0, 48); + (*((limb *) & out[0])) |= (in[0] & bottom56bits); + (*((limb_aX *) & out[7])) |= (in[1] & bottom56bits); + (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits); + (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits); + (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits); + (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits); + memmove(&out[42], &in[6], 6); +} + +/* BN_to_felem converts an OpenSSL BIGNUM into an felem */ +static int BN_to_felem(felem out, const BIGNUM *bn) +{ + felem_bytearray b_out; + int num_bytes; + + if (BN_is_negative(bn)) { + ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out)); + if (num_bytes < 0) { + ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + bin48_to_felem(out, b_out); + return 1; +} + +/* felem_to_BN converts an felem into an OpenSSL BIGNUM */ +static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) +{ + felem_bytearray b_out; + + felem_to_bin48(b_out, in); + return BN_lebin2bn(b_out, sizeof(b_out), out); +} + +/*- + * Field operations + * ---------------- + */ + +static void felem_one(felem out) +{ + out[0] = 1; + memset(&out[1], 0, sizeof(limb) * (NLIMBS-1)); +} + +static void felem_assign(felem out, const felem in) +{ + memcpy(out, in, sizeof(felem)); +} + +/* felem_sum64 sets out = out + in. */ +static void felem_sum64(felem out, const felem in) +{ + unsigned int i; + + for (i = 0; i < NLIMBS; i++) + out[i] += in[i]; +} + +/* felem_scalar sets out = in * scalar */ +static void felem_scalar(felem out, const felem in, limb scalar) +{ + unsigned int i; + + for (i = 0; i < NLIMBS; i++) + out[i] = in[i] * scalar; +} + +/* felem_scalar64 sets out = out * scalar */ +static void felem_scalar64(felem out, limb scalar) +{ + unsigned int i; + + for (i = 0; i < NLIMBS; i++) + out[i] *= scalar; +} + +/* felem_scalar128 sets out = out * scalar */ +static void felem_scalar128(widefelem out, limb scalar) +{ + unsigned int i; + + for (i = 0; i < 2*NLIMBS-1; i++) + out[i] *= scalar; +} + +/*- + * felem_neg sets |out| to |-in| + * On entry: + * in[i] < 2^60 - 2^29 + * On exit: + * out[i] < 2^60 + */ +static void felem_neg(felem out, const felem in) +{ + /* + * In order to prevent underflow, we add a multiple of p before subtracting. + * Use telescopic sums to represent 2^12 * p redundantly with each limb + * of the form 2^60 + ... + */ + static const limb two60m52m4 = (((limb) 1) << 60) + - (((limb) 1) << 52) + - (((limb) 1) << 4); + static const limb two60p44m12 = (((limb) 1) << 60) + + (((limb) 1) << 44) + - (((limb) 1) << 12); + static const limb two60m28m4 = (((limb) 1) << 60) + - (((limb) 1) << 28) + - (((limb) 1) << 4); + static const limb two60m4 = (((limb) 1) << 60) + - (((limb) 1) << 4); + + out[0] = two60p44m12 - in[0]; + out[1] = two60m52m4 - in[1]; + out[2] = two60m28m4 - in[2]; + out[3] = two60m4 - in[3]; + out[4] = two60m4 - in[4]; + out[5] = two60m4 - in[5]; + out[6] = two60m4 - in[6]; +} + +/*- + * felem_diff64 subtracts |in| from |out| + * On entry: + * in[i] < 2^60 - 2^52 - 2^4 + * On exit: + * out[i] < out_orig[i] + 2^60 + 2^44 + */ +static void felem_diff64(felem out, const felem in) +{ + /* + * In order to prevent underflow, we add a multiple of p before subtracting. + * Use telescopic sums to represent 2^12 * p redundantly with each limb + * of the form 2^60 + ... + */ + + static const limb two60m52m4 = (((limb) 1) << 60) + - (((limb) 1) << 52) + - (((limb) 1) << 4); + static const limb two60p44m12 = (((limb) 1) << 60) + + (((limb) 1) << 44) + - (((limb) 1) << 12); + static const limb two60m28m4 = (((limb) 1) << 60) + - (((limb) 1) << 28) + - (((limb) 1) << 4); + static const limb two60m4 = (((limb) 1) << 60) + - (((limb) 1) << 4); + + out[0] += two60p44m12 - in[0]; + out[1] += two60m52m4 - in[1]; + out[2] += two60m28m4 - in[2]; + out[3] += two60m4 - in[3]; + out[4] += two60m4 - in[4]; + out[5] += two60m4 - in[5]; + out[6] += two60m4 - in[6]; +} + +/* + * in[i] < 2^63 + * out[i] < out_orig[i] + 2^64 + 2^48 + */ +static void felem_diff_128_64(widefelem out, const felem in) +{ + /* + * In order to prevent underflow, we add a multiple of p before subtracting. + * Use telescopic sums to represent 2^16 * p redundantly with each limb + * of the form 2^64 + ... + */ + + static const widelimb two64m56m8 = (((widelimb) 1) << 64) + - (((widelimb) 1) << 56) + - (((widelimb) 1) << 8); + static const widelimb two64m32m8 = (((widelimb) 1) << 64) + - (((widelimb) 1) << 32) + - (((widelimb) 1) << 8); + static const widelimb two64m8 = (((widelimb) 1) << 64) + - (((widelimb) 1) << 8); + static const widelimb two64p48m16 = (((widelimb) 1) << 64) + + (((widelimb) 1) << 48) + - (((widelimb) 1) << 16); + unsigned int i; + + out[0] += two64p48m16; + out[1] += two64m56m8; + out[2] += two64m32m8; + out[3] += two64m8; + out[4] += two64m8; + out[5] += two64m8; + out[6] += two64m8; + + for (i = 0; i < NLIMBS; i++) + out[i] -= in[i]; +} + +/* + * in[i] < 2^127 - 2^119 - 2^71 + * out[i] < out_orig[i] + 2^127 + 2^111 + */ +static void felem_diff128(widefelem out, const widefelem in) +{ + /* + * In order to prevent underflow, we add a multiple of p before subtracting. + * Use telescopic sums to represent 2^415 * p redundantly with each limb + * of the form 2^127 + ... + */ + + static const widelimb two127 = ((widelimb) 1) << 127; + static const widelimb two127m71 = (((widelimb) 1) << 127) + - (((widelimb) 1) << 71); + static const widelimb two127p111m79m71 = (((widelimb) 1) << 127) + + (((widelimb) 1) << 111) + - (((widelimb) 1) << 79) + - (((widelimb) 1) << 71); + static const widelimb two127m119m71 = (((widelimb) 1) << 127) + - (((widelimb) 1) << 119) + - (((widelimb) 1) << 71); + static const widelimb two127m95m71 = (((widelimb) 1) << 127) + - (((widelimb) 1) << 95) + - (((widelimb) 1) << 71); + unsigned int i; + + out[0] += two127; + out[1] += two127m71; + out[2] += two127m71; + out[3] += two127m71; + out[4] += two127m71; + out[5] += two127m71; + out[6] += two127p111m79m71; + out[7] += two127m119m71; + out[8] += two127m95m71; + out[9] += two127m71; + out[10] += two127m71; + out[11] += two127m71; + out[12] += two127m71; + + for (i = 0; i < 2*NLIMBS-1; i++) + out[i] -= in[i]; +} + +static void felem_square_ref(widefelem out, const felem in) +{ + felem inx2; + felem_scalar(inx2, in, 2); + + out[0] = ((uint128_t) in[0]) * in[0]; + + out[1] = ((uint128_t) in[0]) * inx2[1]; + + out[2] = ((uint128_t) in[0]) * inx2[2] + + ((uint128_t) in[1]) * in[1]; + + out[3] = ((uint128_t) in[0]) * inx2[3] + + ((uint128_t) in[1]) * inx2[2]; + + out[4] = ((uint128_t) in[0]) * inx2[4] + + ((uint128_t) in[1]) * inx2[3] + + ((uint128_t) in[2]) * in[2]; + + out[5] = ((uint128_t) in[0]) * inx2[5] + + ((uint128_t) in[1]) * inx2[4] + + ((uint128_t) in[2]) * inx2[3]; + + out[6] = ((uint128_t) in[0]) * inx2[6] + + ((uint128_t) in[1]) * inx2[5] + + ((uint128_t) in[2]) * inx2[4] + + ((uint128_t) in[3]) * in[3]; + + out[7] = ((uint128_t) in[1]) * inx2[6] + + ((uint128_t) in[2]) * inx2[5] + + ((uint128_t) in[3]) * inx2[4]; + + out[8] = ((uint128_t) in[2]) * inx2[6] + + ((uint128_t) in[3]) * inx2[5] + + ((uint128_t) in[4]) * in[4]; + + out[9] = ((uint128_t) in[3]) * inx2[6] + + ((uint128_t) in[4]) * inx2[5]; + + out[10] = ((uint128_t) in[4]) * inx2[6] + + ((uint128_t) in[5]) * in[5]; + + out[11] = ((uint128_t) in[5]) * inx2[6]; + + out[12] = ((uint128_t) in[6]) * in[6]; +} + +static void felem_mul_ref(widefelem out, const felem in1, const felem in2) +{ + out[0] = ((uint128_t) in1[0]) * in2[0]; + + out[1] = ((uint128_t) in1[0]) * in2[1] + + ((uint128_t) in1[1]) * in2[0]; + + out[2] = ((uint128_t) in1[0]) * in2[2] + + ((uint128_t) in1[1]) * in2[1] + + ((uint128_t) in1[2]) * in2[0]; + + out[3] = ((uint128_t) in1[0]) * in2[3] + + ((uint128_t) in1[1]) * in2[2] + + ((uint128_t) in1[2]) * in2[1] + + ((uint128_t) in1[3]) * in2[0]; + + out[4] = ((uint128_t) in1[0]) * in2[4] + + ((uint128_t) in1[1]) * in2[3] + + ((uint128_t) in1[2]) * in2[2] + + ((uint128_t) in1[3]) * in2[1] + + ((uint128_t) in1[4]) * in2[0]; + + out[5] = ((uint128_t) in1[0]) * in2[5] + + ((uint128_t) in1[1]) * in2[4] + + ((uint128_t) in1[2]) * in2[3] + + ((uint128_t) in1[3]) * in2[2] + + ((uint128_t) in1[4]) * in2[1] + + ((uint128_t) in1[5]) * in2[0]; + + out[6] = ((uint128_t) in1[0]) * in2[6] + + ((uint128_t) in1[1]) * in2[5] + + ((uint128_t) in1[2]) * in2[4] + + ((uint128_t) in1[3]) * in2[3] + + ((uint128_t) in1[4]) * in2[2] + + ((uint128_t) in1[5]) * in2[1] + + ((uint128_t) in1[6]) * in2[0]; + + out[7] = ((uint128_t) in1[1]) * in2[6] + + ((uint128_t) in1[2]) * in2[5] + + ((uint128_t) in1[3]) * in2[4] + + ((uint128_t) in1[4]) * in2[3] + + ((uint128_t) in1[5]) * in2[2] + + ((uint128_t) in1[6]) * in2[1]; + + out[8] = ((uint128_t) in1[2]) * in2[6] + + ((uint128_t) in1[3]) * in2[5] + + ((uint128_t) in1[4]) * in2[4] + + ((uint128_t) in1[5]) * in2[3] + + ((uint128_t) in1[6]) * in2[2]; + + out[9] = ((uint128_t) in1[3]) * in2[6] + + ((uint128_t) in1[4]) * in2[5] + + ((uint128_t) in1[5]) * in2[4] + + ((uint128_t) in1[6]) * in2[3]; + + out[10] = ((uint128_t) in1[4]) * in2[6] + + ((uint128_t) in1[5]) * in2[5] + + ((uint128_t) in1[6]) * in2[4]; + + out[11] = ((uint128_t) in1[5]) * in2[6] + + ((uint128_t) in1[6]) * in2[5]; + + out[12] = ((uint128_t) in1[6]) * in2[6]; +} + +/*- + * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients. + * in[i] < 2^128 - 2^125 + * out[i] < 2^56 for i < 6, + * out[6] <= 2^48 + * + * The technique in use here stems from the format of the prime modulus: + * P384 = 2^384 - delta + * + * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting + * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These + * coefficients are still quite large, and so we repeatedly apply this + * technique on high-order bits in order to guarantee the desired bounds on + * the size of our output. + * + * The three phases of elimination are as follows: + * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9]) + * [2]: Y = 2^8 (acc[8] | acc[7]) + * [3]: Y = 2^48 (acc[6] >> 48) + * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d) + */ +static void felem_reduce(felem out, const widefelem in) +{ + /* + * In order to prevent underflow, we add a multiple of p before subtracting. + * Use telescopic sums to represent 2^76 * p redundantly with each limb + * of the form 2^124 + ... + */ + static const widelimb two124m68 = (((widelimb) 1) << 124) + - (((widelimb) 1) << 68); + static const widelimb two124m116m68 = (((widelimb) 1) << 124) + - (((widelimb) 1) << 116) + - (((widelimb) 1) << 68); + static const widelimb two124p108m76 = (((widelimb) 1) << 124) + + (((widelimb) 1) << 108) + - (((widelimb) 1) << 76); + static const widelimb two124m92m68 = (((widelimb) 1) << 124) + - (((widelimb) 1) << 92) + - (((widelimb) 1) << 68); + widelimb temp, acc[9]; + unsigned int i; + + memcpy(acc, in, sizeof(widelimb) * 9); + + acc[0] += two124p108m76; + acc[1] += two124m116m68; + acc[2] += two124m92m68; + acc[3] += two124m68; + acc[4] += two124m68; + acc[5] += two124m68; + acc[6] += two124m68; + + /* [1]: Eliminate in[9], ..., in[12] */ + acc[8] += in[12] >> 32; + acc[7] += (in[12] & 0xffffffff) << 24; + acc[7] += in[12] >> 8; + acc[6] += (in[12] & 0xff) << 48; + acc[6] -= in[12] >> 16; + acc[5] -= ((in[12] & 0xffff) << 40); + acc[6] += in[12] >> 48; + acc[5] += (in[12] & 0xffffffffffff) << 8; + + acc[7] += in[11] >> 32; + acc[6] += (in[11] & 0xffffffff) << 24; + acc[6] += in[11] >> 8; + acc[5] += (in[11] & 0xff) << 48; + acc[5] -= in[11] >> 16; + acc[4] -= ((in[11] & 0xffff) << 40); + acc[5] += in[11] >> 48; + acc[4] += (in[11] & 0xffffffffffff) << 8; + + acc[6] += in[10] >> 32; + acc[5] += (in[10] & 0xffffffff) << 24; + acc[5] += in[10] >> 8; + acc[4] += (in[10] & 0xff) << 48; + acc[4] -= in[10] >> 16; + acc[3] -= ((in[10] & 0xffff) << 40); + acc[4] += in[10] >> 48; + acc[3] += (in[10] & 0xffffffffffff) << 8; + + acc[5] += in[9] >> 32; + acc[4] += (in[9] & 0xffffffff) << 24; + acc[4] += in[9] >> 8; + acc[3] += (in[9] & 0xff) << 48; + acc[3] -= in[9] >> 16; + acc[2] -= ((in[9] & 0xffff) << 40); + acc[3] += in[9] >> 48; + acc[2] += (in[9] & 0xffffffffffff) << 8; + + /* + * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as + * well as the contributions made from eliminating higher limbs. + * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121 + * acc[8] < in[8] + 2^96 + */ + acc[4] += acc[8] >> 32; + acc[3] += (acc[8] & 0xffffffff) << 24; + acc[3] += acc[8] >> 8; + acc[2] += (acc[8] & 0xff) << 48; + acc[2] -= acc[8] >> 16; + acc[1] -= ((acc[8] & 0xffff) << 40); + acc[2] += acc[8] >> 48; + acc[1] += (acc[8] & 0xffffffffffff) << 8; + + acc[3] += acc[7] >> 32; + acc[2] += (acc[7] & 0xffffffff) << 24; + acc[2] += acc[7] >> 8; + acc[1] += (acc[7] & 0xff) << 48; + acc[1] -= acc[7] >> 16; + acc[0] -= ((acc[7] & 0xffff) << 40); + acc[1] += acc[7] >> 48; + acc[0] += (acc[7] & 0xffffffffffff) << 8; + + /*- + * acc[k] < in[k] + 2^124 + 2^121 + * < in[k] + 2^125 + * < 2^128, for k <= 6 + */ + + /* + * Carry 4 -> 5 -> 6 + * This has the effect of ensuring that these more significant limbs + * will be small in value after eliminating high bits from acc[6]. + */ + acc[5] += acc[4] >> 56; + acc[4] &= 0x00ffffffffffffff; + + acc[6] += acc[5] >> 56; + acc[5] &= 0x00ffffffffffffff; + + /*- + * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16 + * < in[6] + 2^125 + * < 2^128 + */ + + /* [3]: Eliminate high bits of acc[6] */ + temp = acc[6] >> 48; + acc[6] &= 0x0000ffffffffffff; + + /* temp < 2^80 */ + + acc[3] += temp >> 40; + acc[2] += (temp & 0xffffffffff) << 16; + acc[2] += temp >> 16; + acc[1] += (temp & 0xffff) << 40; + acc[1] -= temp >> 24; + acc[0] -= (temp & 0xffffff) << 32; + acc[0] += temp; + + /*- + * acc[k] < acc_old[k] + 2^64 + 2^56 + * < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4 + */ + + /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ + acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */ + acc[0] &= 0x00ffffffffffffff; + + acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */ + acc[1] &= 0x00ffffffffffffff; + + acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */ + acc[2] &= 0x00ffffffffffffff; + + /*- + * acc[k] < acc_old[k] + 2^72 + 2^16 + * < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17 + * < in[k] + 2^125 + * < 2^128 , k < 4 + */ + + acc[4] += acc[3] >> 56; /*- + * acc[4] < acc_old[4] + 2^72 + 2^16 + * < 2^72 + 2^56 + 2^16 + */ + acc[3] &= 0x00ffffffffffffff; + + acc[5] += acc[4] >> 56; /*- + * acc[5] < acc_old[5] + 2^16 + 1 + * < 2^56 + 2^16 + 1 + */ + acc[4] &= 0x00ffffffffffffff; + + acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */ + acc[5] &= 0x00ffffffffffffff; + + for (i = 0; i < NLIMBS; i++) + out[i] = acc[i]; +} + +#if defined(ECP_NISTP384_ASM) +static void felem_square_wrapper(widefelem out, const felem in); +static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2); + +static void (*felem_square_p)(widefelem out, const felem in) = + felem_square_wrapper; +static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = + felem_mul_wrapper; + +void p384_felem_square(widefelem out, const felem in); +void p384_felem_mul(widefelem out, const felem in1, const felem in2); + +# if defined(_ARCH_PPC64) +# include "crypto/ppc_arch.h" +# endif + +static void felem_select(void) +{ + /* Default */ + felem_square_p = felem_square_ref; + felem_mul_p = felem_mul_ref; +} + +static void felem_square_wrapper(widefelem out, const felem in) +{ + felem_select(); + felem_square_p(out, in); +} + +static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2) +{ + felem_select(); + felem_mul_p(out, in1, in2); +} + +# define felem_square felem_square_p +# define felem_mul felem_mul_p +#else +# define felem_square felem_square_ref +# define felem_mul felem_mul_ref +#endif + +static ossl_inline void felem_square_reduce(felem out, const felem in) +{ + widefelem tmp; + + felem_square(tmp, in); + felem_reduce(out, tmp); +} + +static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2) +{ + widefelem tmp; + + felem_mul(tmp, in1, in2); + felem_reduce(out, tmp); +} + +/*- + * felem_inv calculates |out| = |in|^{-1} + * + * Based on Fermat's Little Theorem: + * a^p = a (mod p) + * a^{p-1} = 1 (mod p) + * a^{p-2} = a^{-1} (mod p) + */ +static void felem_inv(felem out, const felem in) +{ + felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6; + unsigned int i = 0; + + felem_square_reduce(ftmp, in); /* 2^1 */ + felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */ + felem_assign(ftmp2, ftmp); + + felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */ + felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */ + felem_assign(ftmp3, ftmp); + + for (i = 0; i < 3; i++) + felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */ + felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */ + felem_assign(ftmp4, ftmp); + + for (i = 0; i < 6; i++) + felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */ + felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */ + + for (i = 0; i < 3; i++) + felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */ + felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */ + felem_assign(ftmp5, ftmp); + + for (i = |