summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Matt Caswell <matt@openssl.org> 2018-01-31 13:14:48 +0000
committer: Matt Caswell <matt@openssl.org> 2018-02-20 12:59:30 +0000
commit: 7e492f3372ed83af074a63d5920f13de7e3455b6 (patch)
tree: 83c2158a316d0ab8ddbf11e07efec583762ffb2c
parent: 0cdcdacc337005e08a906b2e07d4e44e3ee48138 (diff)
Remove curve448 architecture specific files
Remove all architecture-specific files except for the reference arch_32 version. These files provide architecture-specific performance optimisations. However, they have not been integrated yet. In order to avoid review issues they are removed for now. They may be reintroduced at a later time. Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de> (Merged from https://github.com/openssl/openssl/pull/5105)
-rw-r--r-- crypto/ec/curve448/arch_arm_32/arch_intrinsics.h 37
-rw-r--r-- crypto/ec/curve448/arch_arm_32/f_impl.c 846
-rw-r--r-- crypto/ec/curve448/arch_arm_32/f_impl.h 59
-rw-r--r-- crypto/ec/curve448/arch_neon/arch_intrinsics.h 36
-rw-r--r-- crypto/ec/curve448/arch_neon/f_impl.c 594
-rw-r--r-- crypto/ec/curve448/arch_neon/f_impl.h 65
-rw-r--r-- crypto/ec/curve448/arch_ref64/arch_intrinsics.h 31
-rw-r--r-- crypto/ec/curve448/arch_ref64/f_impl.c 308
-rw-r--r-- crypto/ec/curve448/arch_ref64/f_impl.h 49
-rw-r--r-- crypto/ec/curve448/arch_x86_64/arch_intrinsics.h 338
-rw-r--r-- crypto/ec/curve448/arch_x86_64/f_impl.c 308
-rw-r--r-- crypto/ec/curve448/arch_x86_64/f_impl.h 69
12 files changed, 0 insertions, 2740 deletions
diff --git a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
deleted file mode 100644
index a98ffe4d7e..0000000000
--- a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
-# define __ARCH_ARM_32_ARCH_INTRINSICS_H__
-
-# define ARCH_WORD_BITS 32
-
-static __inline__ __attribute((always_inline, unused))
-uint32_t word_is_zero(uint32_t a)
-{
- uint32_t ret;
-
- asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
- return ret;
-}
-
-static __inline__ __attribute((always_inline, unused))
-uint64_t widemul(uint32_t a, uint32_t b)
-{
- /*
- * Could be UMULL, but it's hard to express to CC that the registers must
- * be different
- */
- return ((uint64_t)a) * b;
-}
-
-#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
diff --git a/crypto/ec/curve448/arch_arm_32/f_impl.c b/crypto/ec/curve448/arch_arm_32/f_impl.c
deleted file mode 100644
index 8a2b0886b5..0000000000
--- a/crypto/ec/curve448/arch_arm_32/f_impl.c
+++ /dev/null
@@ -1,846 +0,0 @@
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#include "field.h"
-
-static inline void __attribute__ ((gnu_inline, always_inline))
- smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
-
-#ifdef __ARMEL__
- uint32_t lo = *acc, hi = (*acc) >> 32;
-
- __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
- : [lo]"+&r"(lo), [hi]"+&r"(hi)
- : [a]"r"(a), [b]"r"(b));
-
-
- *acc = lo + (((uint64_t)hi) << 32);
-#else
- *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b;
-#endif
-}
-
-static inline void __attribute__ ((gnu_inline, always_inline))
- smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
-#ifdef __ARMEL__
- uint32_t lo = *acc, hi = (*acc) >> 32;
-
- __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
- : [lo]"+&r"(lo), [hi]"+&r"(hi)
- : [a]"r"(a), [b]"r"(2 * b));
-
-
-
- *acc = lo + (((uint64_t)hi) << 32);
-#else
- *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
-#endif
-}
-
-static inline void __attribute__ ((gnu_inline, always_inline))
- smull(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
-#ifdef __ARMEL__
- uint32_t lo, hi;
-
- __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
- : [lo]"=&r"(lo), [hi]"=&r"(hi)
- : [a]"r"(a), [b]"r"(b));
-
- *acc = lo + (((uint64_t)hi) << 32);
-#else
- *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b;
-#endif
-}
-
-static inline void __attribute__ ((gnu_inline, always_inline))
- smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
-#ifdef __ARMEL__
- uint32_t lo, hi;
-
- __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
- : [lo]"=&r"(lo), [hi]"=&r"(hi)
- : [a]"r"(a), [b]"r"(2*b));
-
- *acc = lo + (((uint64_t)hi) << 32);
-#else
- *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
-#endif
-}
-
-void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
-{
-
- const uint32_t *a = as->limb, *b = bs->limb;
- uint32_t *c = cs->limb;
-
- uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
- uint32_t mask = (1 << 28) - 1;
-
- uint32_t aa[8], bm[8];
-
- int i;
- for (i = 0; i < 8; i++) {
- aa[i] = a[i] + a[i + 8];
- bm[i] = b[i] - b[i + 8];
- }
-
- uint32_t ax, bx;
- {
- /* t^3 terms */
- smull(&accum1, ax = aa[1], bx = b[15]);
- smull(&accum3, ax = aa[2], bx);
- smlal(&accum1, ax, bx = b[14]);
- smlal(&accum3, ax = aa[3], bx);
- smlal(&accum1, ax, bx = b[13]);
- smlal(&accum3, ax = aa[4], bx);
- smlal(&accum1, ax, bx = b[12]);
- smlal(&accum3, ax = aa[5], bx);
- smlal(&accum1, ax, bx = b[11]);
- smlal(&accum3, ax = aa[6], bx);
- smlal(&accum1, ax, bx = b[10]);
- smlal(&accum3, ax = aa[7], bx);
- smlal(&accum1, ax, bx = b[9]);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
- smlal(&accum2, ax = aa[0], bx);
- smlal(&accum0, ax, bx = b[8]);
- smlal(&accum2, ax = aa[1], bx);
-
- smlal(&accum0, ax = a[9], bx = b[7]);
- smlal(&accum2, ax = a[10], bx);
- smlal(&accum0, ax, bx = b[6]);
- smlal(&accum2, ax = a[11], bx);
- smlal(&accum0, ax, bx = b[5]);
- smlal(&accum2, ax = a[12], bx);
- smlal(&accum0, ax, bx = b[4]);
- smlal(&accum2, ax = a[13], bx);
- smlal(&accum0, ax, bx = b[3]);
- smlal(&accum2, ax = a[14], bx);
- smlal(&accum0, ax, bx = b[2]);
- smlal(&accum2, ax = a[15], bx);
- smlal(&accum0, ax, bx = b[1]);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
- smlal(&accum3, ax = a[8], bx);
- smlal(&accum1, ax, bx = b[0]);
- smlal(&accum3, ax = a[9], bx);
-
- smlal(&accum1, ax = a[1], bx = bm[7]);
- smlal(&accum3, ax = a[2], bx);
- smlal(&accum1, ax, bx = bm[6]);
- smlal(&accum3, ax = a[3], bx);
- smlal(&accum1, ax, bx = bm[5]);
- smlal(&accum3, ax = a[4], bx);
- smlal(&accum1, ax, bx = bm[4]);
- smlal(&accum3, ax = a[5], bx);
- smlal(&accum1, ax, bx = bm[3]);
- smlal(&accum3, ax = a[6], bx);
- smlal(&accum1, ax, bx = bm[2]);
- smlal(&accum3, ax = a[7], bx);
- smlal(&accum1, ax, bx = bm[1]);
-
- /* 1 terms */
- smlal(&accum2, ax = a[0], bx);
- smlal(&accum0, ax, bx = bm[0]);
- smlal(&accum2, ax = a[1], bx);
-
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[0] = ((uint32_t)(accum0)) & mask;
- c[1] = ((uint32_t)(accum2)) & mask;
- c[8] = ((uint32_t)(accum1)) & mask;
- c[9] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
- /* t^3 terms */
- smull(&accum1, ax = aa[3], bx = b[15]);
- smull(&accum3, ax = aa[4], bx);
- smlal(&accum1, ax, bx = b[14]);
- smlal(&accum3, ax = aa[5], bx);
- smlal(&accum1, ax, bx = b[13]);
- smlal(&accum3, ax = aa[6], bx);
- smlal(&accum1, ax, bx = b[12]);
- smlal(&accum3, ax = aa[7], bx);
- smlal(&accum1, ax, bx = b[11]);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
- smlal(&accum2, ax = aa[0], bx);
- smlal(&accum0, ax, bx = b[10]);
- smlal(&accum2, ax = aa[1], bx);
- smlal(&accum0, ax, bx = b[9]);
- smlal(&accum2, ax = aa[2], bx);
- smlal(&accum0, ax, bx = b[8]);
- smlal(&accum2, ax = aa[3], bx);
-
- smlal(&accum0, ax = a[11], bx = b[7]);
- smlal(&accum2, ax = a[12], bx);
- smlal(&accum0, ax, bx = b[6]);
- smlal(&accum2, ax = a[13], bx);
- smlal(&accum0, ax, bx = b[5]);
- smlal(&accum2, ax = a[14], bx);
- smlal(&accum0, ax, bx = b[4]);
- smlal(&accum2, ax = a[15], bx);
- smlal(&accum0, ax, bx = b[3]);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
- smlal(&accum3, ax = a[8], bx);
- smlal(&accum1, ax, bx = b[2]);
- smlal(&accum3, ax = a[9], bx);
- smlal(&accum1, ax, bx = b[1]);
- smlal(&accum3, ax = a[10], bx);
- smlal(&accum1, ax, bx = b[0]);
- smlal(&accum3, ax = a[11], bx);
-
- smlal(&accum1, ax = a[3], bx = bm[7]);
- smlal(&accum3, ax = a[4], bx);
- smlal(&accum1, ax, bx = bm[6]);
- smlal(&accum3, ax = a[5], bx);
- smlal(&accum1, ax, bx = bm[5]);
- smlal(&accum3, ax = a[6], bx);
- smlal(&accum1, ax, bx = bm[4]);
- smlal(&accum3, ax = a[7], bx);
- smlal(&accum1, ax, bx = bm[3]);
-
- /* 1 terms */
- smlal(&accum2, ax = a[0], bx);
- smlal(&accum0, ax, bx = bm[2]);
- smlal(&accum2, ax = a[1], bx);
- smlal(&accum0, ax, bx = bm[1]);
- smlal(&accum2, ax = a[2], bx);
- smlal(&accum0, ax, bx = bm[0]);
- smlal(&accum2, ax = a[3], bx);
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[2] = ((uint32_t)(accum0)) & mask;
- c[3] = ((uint32_t)(accum2)) & mask;
- c[10] = ((uint32_t)(accum1)) & mask;
- c[11] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
-
- /* t^3 terms */
- smull(&accum1, ax = aa[5], bx = b[15]);
- smull(&accum3, ax = aa[6], bx);
- smlal(&accum1, ax, bx = b[14]);
- smlal(&accum3, ax = aa[7], bx);
- smlal(&accum1, ax, bx = b[13]);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
-
- smlal(&accum2, ax = aa[0], bx);
- smlal(&accum0, ax, bx = b[12]);
- smlal(&accum2, ax = aa[1], bx);
- smlal(&accum0, ax, bx = b[11]);
- smlal(&accum2, ax = aa[2], bx);
- smlal(&accum0, ax, bx = b[10]);
- smlal(&accum2, ax = aa[3], bx);
- smlal(&accum0, ax, bx = b[9]);
- smlal(&accum2, ax = aa[4], bx);
- smlal(&accum0, ax, bx = b[8]);
- smlal(&accum2, ax = aa[5], bx);
-
- smlal(&accum0, ax = a[13], bx = b[7]);
- smlal(&accum2, ax = a[14], bx);
- smlal(&accum0, ax, bx = b[6]);
- smlal(&accum2, ax = a[15], bx);
- smlal(&accum0, ax, bx = b[5]);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
-
- smlal(&accum3, ax = a[8], bx);
- smlal(&accum1, ax, bx = b[4]);
- smlal(&accum3, ax = a[9], bx);
- smlal(&accum1, ax, bx = b[3]);
- smlal(&accum3, ax = a[10], bx);
- smlal(&accum1, ax, bx = b[2]);
- smlal(&accum3, ax = a[11], bx);
- smlal(&accum1, ax, bx = b[1]);
- smlal(&accum3, ax = a[12], bx);
- smlal(&accum1, ax, bx = b[0]);
- smlal(&accum3, ax = a[13], bx);
-
- smlal(&accum1, ax = a[5], bx = bm[7]);
- smlal(&accum3, ax = a[6], bx);
- smlal(&accum1, ax, bx = bm[6]);
- smlal(&accum3, ax = a[7], bx);
- smlal(&accum1, ax, bx = bm[5]);
-
- /* 1 terms */
-
- smlal(&accum2, ax = a[0], bx);
- smlal(&accum0, ax, bx = bm[4]);
- smlal(&accum2, ax = a[1], bx);
- smlal(&accum0, ax, bx = bm[3]);
- smlal(&accum2, ax = a[2], bx);
- smlal(&accum0, ax, bx = bm[2]);
- smlal(&accum2, ax = a[3], bx);
- smlal(&accum0, ax, bx = bm[1]);
- smlal(&accum2, ax = a[4], bx);
- smlal(&accum0, ax, bx = bm[0]);
- smlal(&accum2, ax = a[5], bx);
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[4] = ((uint32_t)(accum0)) & mask;
- c[5] = ((uint32_t)(accum2)) & mask;
- c[12] = ((uint32_t)(accum1)) & mask;
- c[13] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
-
- /* t^3 terms */
- smull(&accum1, ax = aa[7], bx = b[15]);
- accum0 = accum1;
-
- /* t^2 terms */
-
- smull(&accum2, ax = aa[0], bx);
- smlal(&accum0, ax, bx = b[14]);
- smlal(&accum2, ax = aa[1], bx);
- smlal(&accum0, ax, bx = b[13]);
- smlal(&accum2, ax = aa[2], bx);
- smlal(&accum0, ax, bx = b[12]);
- smlal(&accum2, ax = aa[3], bx);
- smlal(&accum0, ax, bx = b[11]);
- smlal(&accum2, ax = aa[4], bx);
- smlal(&accum0, ax, bx = b[10]);
- smlal(&accum2, ax = aa[5], bx);
- smlal(&accum0, ax, bx = b[9]);
- smlal(&accum2, ax = aa[6], bx);
- smlal(&accum0, ax, bx = b[8]);
- smlal(&accum2, ax = aa[7], bx);
-
- smlal(&accum0, ax = a[15], bx = b[7]);
-
- /* t terms */
- accum1 += accum0;
- accum3 = accum2;
-
- smlal(&accum3, ax = a[8], bx);
- smlal(&accum1, ax, bx = b[6]);
- smlal(&accum3, ax = a[9], bx);
- smlal(&accum1, ax, bx = b[5]);
- smlal(&accum3, ax = a[10], bx);
- smlal(&accum1, ax, bx = b[4]);
- smlal(&accum3, ax = a[11], bx);
- smlal(&accum1, ax, bx = b[3]);
- smlal(&accum3, ax = a[12], bx);
- smlal(&accum1, ax, bx = b[2]);
- smlal(&accum3, ax = a[13], bx);
- smlal(&accum1, ax, bx = b[1]);
- smlal(&accum3, ax = a[14], bx);
- smlal(&accum1, ax, bx = b[0]);
- smlal(&accum3, ax = a[15], bx);
-
- smlal(&accum1, ax = a[7], bx = bm[7]);
-
- /* 1 terms */
-
- smlal(&accum2, ax = a[0], bx);
- smlal(&accum0, ax, bx = bm[6]);
- smlal(&accum2, ax = a[1], bx);
- smlal(&accum0, ax, bx = bm[5]);
- smlal(&accum2, ax = a[2], bx);
- smlal(&accum0, ax, bx = bm[4]);
- smlal(&accum2, ax = a[3], bx);
- smlal(&accum0, ax, bx = bm[3]);
- smlal(&accum2, ax = a[4], bx);
- smlal(&accum0, ax, bx = bm[2]);
- smlal(&accum2, ax = a[5], bx);
- smlal(&accum0, ax, bx = bm[1]);
- smlal(&accum2, ax = a[6], bx);
- smlal(&accum0, ax, bx = bm[0]);
- smlal(&accum2, ax = a[7], bx);
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[6] = ((uint32_t)(accum0)) & mask;
- c[7] = ((uint32_t)(accum2)) & mask;
- c[14] = ((uint32_t)(accum1)) & mask;
- c[15] = ((uint32_t)(accum3)) & mask;
-
- accum0 = accum2 >> 28;
- accum1 = accum3 >> 28;
- }
-
- accum0 += accum1;
- accum0 += c[8];
- accum1 += c[0];
- c[8] = ((uint32_t)(accum0)) & mask;
- c[0] = ((uint32_t)(accum1)) & mask;
-
- accum0 >>= 28;
- accum1 >>= 28;
- c[9] += ((uint32_t)(accum0));
- c[1] += ((uint32_t)(accum1));
-}
-
-void gf_sqr(gf_s * __restrict__ cs, const gf as)
-{
- const uint32_t *a = as->limb;
- uint32_t *c = cs->limb;
-
- uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
- uint32_t mask = (1 << 28) - 1;
-
- uint32_t bm[8];
-
- int i;
- for (i = 0; i < 8; i++) {
- bm[i] = a[i] - a[i + 8];
- }
-
- uint32_t ax, bx;
- {
- /* t^3 terms */
- smull2(&accum1, ax = a[9], bx = a[15]);
- smull2(&accum3, ax = a[10], bx);
- smlal2(&accum1, ax, bx = a[14]);
- smlal2(&accum3, ax = a[11], bx);
- smlal2(&accum1, ax, bx = a[13]);
- smlal2(&accum3, ax = a[12], bx);
- smlal(&accum1, ax, ax);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
- smlal2(&accum2, ax = a[8], a[9]);
- smlal(&accum0, ax, ax);
-
- smlal2(&accum0, ax = a[1], bx = a[7]);
- smlal2(&accum2, ax = a[2], bx);
- smlal2(&accum0, ax, bx = a[6]);
- smlal2(&accum2, ax = a[3], bx);
- smlal2(&accum0, ax, bx = a[5]);
- smlal2(&accum2, ax = a[4], bx);
- smlal(&accum0, ax, ax);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
- smlal2(&accum3, ax = a[0], bx = a[1]);
- smlal(&accum1, ax, ax);
-
- accum1 = -accum1;
- accum3 = -accum3;
- accum2 = -accum2;
- accum0 = -accum0;
-
- smlal2(&accum1, ax = bm[1], bx = bm[7]);
- smlal2(&accum3, ax = bm[2], bx);
- smlal2(&accum1, ax, bx = bm[6]);
- smlal2(&accum3, ax = bm[3], bx);
- smlal2(&accum1, ax, bx = bm[5]);
- smlal2(&accum3, ax = bm[4], bx);
- smlal(&accum1, ax, ax);
-
- /* 1 terms */
- smlal2(&accum2, ax = bm[0], bx = bm[1]);
- smlal(&accum0, ax, ax);
-
- tmp = -accum3;
- accum3 = tmp - accum2;
- accum2 = tmp;
- tmp = -accum1;
- accum1 = tmp - accum0;
- accum0 = tmp;
-
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[0] = ((uint32_t)(accum0)) & mask;
- c[1] = ((uint32_t)(accum2)) & mask;
- c[8] = ((uint32_t)(accum1)) & mask;
- c[9] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
- /* t^3 terms */
- smull2(&accum1, ax = a[11], bx = a[15]);
- smull2(&accum3, ax = a[12], bx);
- smlal2(&accum1, ax, bx = a[14]);
- smlal2(&accum3, ax = a[13], bx);
- smlal(&accum1, ax, ax);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
- smlal2(&accum2, ax = a[8], bx = a[11]);
- smlal2(&accum0, ax, bx = a[10]);
- smlal2(&accum2, ax = a[9], bx);
- smlal(&accum0, ax, ax);
-
- smlal2(&accum0, ax = a[3], bx = a[7]);
- smlal2(&accum2, ax = a[4], bx);
- smlal2(&accum0, ax, bx = a[6]);
- smlal2(&accum2, ax = a[5], bx);
- smlal(&accum0, ax, ax);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
- smlal2(&accum3, ax = a[0], bx = a[3]);
- smlal2(&accum1, ax, bx = a[2]);
- smlal2(&accum3, ax = a[1], bx);
- smlal(&accum1, ax, ax);
-
- accum1 = -accum1;
- accum3 = -accum3;
- accum2 = -accum2;
- accum0 = -accum0;
-
- smlal2(&accum1, ax = bm[3], bx = bm[7]);
- smlal2(&accum3, ax = bm[4], bx);
- smlal2(&accum1, ax, bx = bm[6]);
- smlal2(&accum3, ax = bm[5], bx);
- smlal(&accum1, ax, ax);
-
- /* 1 terms */
- smlal2(&accum2, ax = bm[0], bx = bm[3]);
- smlal2(&accum0, ax, bx = bm[2]);
- smlal2(&accum2, ax = bm[1], bx);
- smlal(&accum0, ax, ax);
-
- tmp = -accum3;
- accum3 = tmp - accum2;
- accum2 = tmp;
- tmp = -accum1;
- accum1 = tmp - accum0;
- accum0 = tmp;
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[2] = ((uint32_t)(accum0)) & mask;
- c[3] = ((uint32_t)(accum2)) & mask;
- c[10] = ((uint32_t)(accum1)) & mask;
- c[11] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
-
- /* t^3 terms */
- smull2(&accum1, ax = a[13], bx = a[15]);
- smull2(&accum3, ax = a[14], bx);
- smlal(&accum1, ax, ax);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
-
- smlal2(&accum2, ax = a[8], bx = a[13]);
- smlal2(&accum0, ax, bx = a[12]);
- smlal2(&accum2, ax = a[9], bx);
- smlal2(&accum0, ax, bx = a[11]);
- smlal2(&accum2, ax = a[10], bx);
- smlal(&accum0, ax, ax);
-
- smlal2(&accum0, ax = a[5], bx = a[7]);
- smlal2(&accum2, ax = a[6], bx);
- smlal(&accum0, ax, ax);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
-
- smlal2(&accum3, ax = a[0], bx = a[5]);
- smlal2(&accum1, ax, bx = a[4]);
- smlal2(&accum3, ax = a[1], bx);
- smlal2(&accum1, ax, bx = a[3]);
- smlal2(&accum3, ax = a[2], bx);
- smlal(&accum1, ax, ax);
-
- accum1 = -accum1;
- accum3 = -accum3;
- accum2 = -accum2;
- accum0 = -accum0;
-
- smlal2(&accum1, ax = bm[5], bx = bm[7]);
- smlal2(&accum3, ax = bm[6], bx);
- smlal(&accum1, ax, ax);
-
- /* 1 terms */
-
- smlal2(&accum2, ax = bm[0], bx = bm[5]);
- smlal2(&accum0, ax, bx = bm[4]);
- smlal2(&accum2, ax = bm[1], bx);
- smlal2(&accum0, ax, bx = bm[3]);
- smlal2(&accum2, ax = bm[2], bx);
- smlal(&accum0, ax, ax);
-
- tmp = -accum3;
- accum3 = tmp - accum2;
- accum2 = tmp;
- tmp = -accum1;
- accum1 = tmp - accum0;
- accum0 = tmp;
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[4] = ((uint32_t)(accum0)) & mask;
- c[5] = ((uint32_t)(accum2)) & mask;
- c[12] = ((uint32_t)(accum1)) & mask;
- c[13] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
-
- /* t^3 terms */
- smull(&accum1, ax = a[15], bx = a[15]);
- accum0 = accum1;
-
- /* t^2 terms */
-
- smull2(&accum2, ax = a[8], bx);
- smlal2(&accum0, ax, bx = a[14]);
- smlal2(&accum2, ax = a[9], bx);
- smlal2(&accum0, ax, bx = a[13]);
- smlal2(&accum2, ax = a[10], bx);
- smlal2(&accum0, ax, bx = a[12]);
- smlal2(&accum2, ax = a[11], bx);
- smlal(&accum0, ax, ax);
-
- smlal(&accum0, ax = a[7], bx = a[7]);
-
- /* t terms */
- accum1 += accum0;
- accum3 = accum2;
-
- smlal2(&accum3, ax = a[0], bx);
- smlal2(&accum1, ax, bx = a[6]);
- smlal2(&accum3, ax = a[1], bx);
- smlal2(&accum1, ax, bx = a[5]);
- smlal2(&accum3, ax = a[2], bx);
- smlal2(&accum1, ax, bx = a[4]);
- smlal2(&accum3, ax = a[3], bx);
- smlal(&accum1, ax, ax);
-
- accum1 = -accum1;
- accum3 = -accum3;
- accum2 = -accum2;
- accum0 = -accum0;
-
- bx = bm[7];
- smlal(&accum1, bx, bx);
-
- /* 1 terms */
-
- smlal2(&accum2, ax = bm[0], bx);
- smlal2(&accum0, ax, bx = bm[6]);
- smlal2(&accum2, ax = bm[1], bx);
- smlal2(&accum0, ax, bx = bm[5]);
- smlal2(&accum2, ax = bm[2], bx);
- smlal2(&accum0, ax, bx = bm[4]);
- smlal2(&accum2, ax = bm[3], bx);
- smlal(&accum0, ax, ax);
-
- tmp = -accum3;
- accum3 = tmp - accum2;
- accum2 = tmp;
- tmp = -accum1;
- accum1 = tmp - accum0;
- accum0 = tmp;
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[6] = ((uint32_t)(accum0)) & mask;
- c[7] = ((uint32_t)(accum2)) & mask;
- c[14] = ((uint32_t)(accum1)) & mask;
- c[15] = ((uint32_t)(accum3)) & mask;
-
- accum0 = accum2 >> 28;
- accum1 = accum3 >> 28;
- }
-
- accum0 += accum1;
- accum0 += c[8];
- accum1 += c[0];
- c[8] = ((uint32_t)(accum0)) & mask;
- c[0] = ((uint32_t)(accum1)) & mask;
-
- accum0 >>= 28;
- accum1 >>= 28;
- c[9] += ((uint32_t)(accum0));
- c[1] += ((uint32_t)(accum1));
-}
-
-void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
-{
- uint32_t mask = (1ull << 28) - 1;
- const uint32_t *a = as->limb;
- uint32_t *c = cs->limb;
- uint64_t accum0, accum8;
- int i;
- uint32_t c0, c8, n0, n8;
-
- assert(b <= mask);
-
- c0 = a[0];
- c8 = a[8];
- accum0 = widemul(b, c0);
- accum8 = widemul(b, c8);
-
- c[0] = accum0 & mask;
- accum0 >>= 28;
- c[8] = accum8 & mask;
- accum8 >>= 28;
-
- i = 1;
- {
- n0 = a[i];
- n8 = a[i + 8];
- smlal(&accum0, b, n0);
- smlal(&accum8, b, n8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- c0 = a[i];
- c8 = a[i + 8];
- smlal(&accum0, b, c0);
- smlal(&accum8, b, c8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- n0 = a[i];
- n8 = a[i + 8];
- smlal(&accum0, b, n0);
- smlal(&accum8, b, n8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- c0 = a[i];
- c8 = a[i + 8];
- smlal(&accum0, b, c0);
- smlal(&accum8, b, c8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- n0 = a[i];
- n8 = a[i + 8];
- smlal(&accum0, b, n0);
- smlal(&accum8, b, n8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- c0 = a[i];
- c8 = a[i + 8];
- smlal(&accum0, b, c0);
- smlal(&accum8, b, c8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- n0 = a[i];
- n8 = a[i + 8];
- smlal(&accum0, b, n0);
- smlal(&accum8, b, n8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
-
- accum0 += accum8 + c[8];
- c[8] = accum0 & mask;
- c[9] += accum0 >> 28;
-
- accum8 += c[0];
- c[0] = accum8 & mask;
- c[1] += accum8 >> 28;
-}
diff --git a/crypto/ec/curve448/arch_arm_32/f_impl.h b/crypto/ec/curve448/arch_arm_32/f_impl.h
deleted file mode 100644
index af90c74f69..0000000000
--- a/crypto/ec/curve448/arch_arm_32/f_impl.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014-2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#define GF_HEADROOM 2
-#define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
-#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
- {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
-
-#define LIMB_PLACE_VALUE(i) 28
-
-void gf_add_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
- ((uint32xn_t *) out)[i] =
- ((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
- }
-}
-
-void gf_sub_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
- ((uint32xn_t *) out)[i] =
- ((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
- }
-}
-
-void gf_bias(gf a, int amt)
-{
- uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
- uint32x4_t lo = { co1, co1, co1, co1 }, hi = {
- co2, co1, co1, co1};
- uint32x4_t *aa = (uint32x4_t *) a;
-
- aa[0] += lo;
- aa[1] += lo;
- aa[2] += hi;
- aa[3] += lo;
-}
-
-void gf_weak_reduce(gf a)
-{
- uint64_t mask = (1ull << 28) - 1;
- uint64_t tmp = a->limb[15] >> 28;
-
- a->limb[8] += tmp;
- for (unsigned int i = 15; i > 0; i--) {
- a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
- }
- a->limb[0] = (a->limb[0] & mask) + tmp;
-}
diff --git a/crypto/ec/curve448/arch_neon/arch_intrinsics.h b/crypto/ec/curve448/arch_neon/arch_intrinsics.h
deleted file mode 100644
index 17db426433..0000000000
--- a/crypto/ec/curve448/arch_neon/arch_intrinsics.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2016 Cryptography Research, Inc.
- *
- * Licensed unde