author     Matt Caswell <matt@openssl.org>  2017-12-04 11:38:58 +0000
committer  Matt Caswell <matt@openssl.org>  2018-02-20 12:59:30 +0000
commit     205fd6388175704bd7597dbfb571c84f868ce6da (patch)
tree       4a5a69b4f40a22b614bdfae9924679cafe263cb6 /crypto
parent     1308e022e1a62214b9e7f8ec92ca7045e70af3a2 (diff)
Run util/openssl-format-source on the Curve448 code
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
(Merged from https://github.com/openssl/openssl/pull/5105)
Diffstat (limited to 'crypto')
-rw-r--r--  crypto/ec/curve448/arch_32/arch_intrinsics.h     |   19
-rw-r--r--  crypto/ec/curve448/arch_32/f_impl.c              |  100
-rw-r--r--  crypto/ec/curve448/arch_32/f_impl.h              |   31
-rw-r--r--  crypto/ec/curve448/arch_arm_32/arch_intrinsics.h |   26
-rw-r--r--  crypto/ec/curve448/arch_arm_32/f_impl.c          |  446
-rw-r--r--  crypto/ec/curve448/arch_arm_32/f_impl.h          |   52
-rw-r--r--  crypto/ec/curve448/arch_neon/arch_intrinsics.h   |   26
-rw-r--r--  crypto/ec/curve448/arch_neon/f_impl.c            | 1093
-rw-r--r--  crypto/ec/curve448/arch_neon/f_impl.h            |   55
-rw-r--r--  crypto/ec/curve448/arch_ref64/arch_intrinsics.h  |   21
-rw-r--r--  crypto/ec/curve448/arch_ref64/f_impl.c           |  172
-rw-r--r--  crypto/ec/curve448/arch_ref64/f_impl.h           |   34
-rw-r--r--  crypto/ec/curve448/arch_x86_64/arch_intrinsics.h |  525
-rw-r--r--  crypto/ec/curve448/arch_x86_64/f_impl.c          |  111
-rw-r--r--  crypto/ec/curve448/arch_x86_64/f_impl.h          |   63
-rw-r--r--  crypto/ec/curve448/constant_time.h               |  175
-rw-r--r--  crypto/ec/curve448/curve448.c                    |  860
-rw-r--r--  crypto/ec/curve448/curve448_tables.c             | 1689
-rw-r--r--  crypto/ec/curve448/curve448_test.c               |   58
-rw-r--r--  crypto/ec/curve448/curve448utils.h               |   67
-rw-r--r--  crypto/ec/curve448/ed448.h                       |  149
-rw-r--r--  crypto/ec/curve448/eddsa.c                       |  333
-rw-r--r--  crypto/ec/curve448/f_arithmetic.c                |   62
-rw-r--r--  crypto/ec/curve448/f_field.h                     |  132
-rw-r--r--  crypto/ec/curve448/f_generic.c                   |  104
-rw-r--r--  crypto/ec/curve448/field.h                       |   91
-rw-r--r--  crypto/ec/curve448/point_448.h                   |  174
-rw-r--r--  crypto/ec/curve448/scalar.c                      |  206
-rw-r--r--  crypto/ec/curve448/word.h                        |  372
29 files changed, 4178 insertions(+), 3068 deletions(-)
diff --git a/crypto/ec/curve448/arch_32/arch_intrinsics.h b/crypto/ec/curve448/arch_32/arch_intrinsics.h
index 4e6aac2889..33439822fe 100644
--- a/crypto/ec/curve448/arch_32/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_32/arch_intrinsics.h
@@ -11,20 +11,21 @@
*/
#ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__
-#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__
+# define __ARCH_ARCH_32_ARCH_INTRINSICS_H__
-#define ARCH_WORD_BITS 32
+# define ARCH_WORD_BITS 32
-static __inline__ __attribute((always_inline,unused))
-uint32_t word_is_zero(uint32_t a) {
+static __inline__ __attribute((always_inline, unused))
+uint32_t word_is_zero(uint32_t a)
+{
/* let's hope the compiler isn't clever enough to optimize this. */
- return (((uint64_t)a)-1)>>32;
+ return (((uint64_t)a) - 1) >> 32;
}
-static __inline__ __attribute((always_inline,unused))
-uint64_t widemul(uint32_t a, uint32_t b) {
+static __inline__ __attribute((always_inline, unused))
+uint64_t widemul(uint32_t a, uint32_t b)
+{
return ((uint64_t)a) * b;
}
-#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
-
+#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
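
The reformatted word_is_zero() above is a branch-free zero test: widening a to 64 bits and subtracting 1 borrows into the top word only when a == 0, so the shift yields an all-ones mask for zero and 0 otherwise. A minimal standalone sketch of the trick (the harness is illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Mirrors word_is_zero() from arch_32: returns 0xFFFFFFFF when a == 0
     * and 0 otherwise, with no data-dependent branch. */
    static uint32_t word_is_zero_demo(uint32_t a)
    {
        /* 0 - 1 wraps to 2^64 - 1, so bits 32..63 are all ones; for any
         * non-zero a, a - 1 still fits in 32 bits and the shift gives 0. */
        return (uint32_t)((((uint64_t)a) - 1) >> 32);
    }

    int main(void)
    {
        assert(word_is_zero_demo(0) == 0xFFFFFFFFu);
        assert(word_is_zero_demo(1) == 0);
        assert(word_is_zero_demo(0xFFFFFFFFu) == 0);
        return 0;
    }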
diff --git a/crypto/ec/curve448/arch_32/f_impl.c b/crypto/ec/curve448/arch_32/f_impl.c
index ca67d496df..76ec9711f0 100644
--- a/crypto/ec/curve448/arch_32/f_impl.c
+++ b/crypto/ec/curve448/arch_32/f_impl.c
@@ -14,84 +14,80 @@
#if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) && !I_HATE_UNROLLED_LOOPS) \
|| defined(DECAF_FORCE_UNROLL)
-#define REPEAT8(_x) _x _x _x _x _x _x _x _x
-#define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0)
+# define REPEAT8(_x) _x _x _x _x _x _x _x _x
+# define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0)
#else
-#define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
+# define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
#endif
-void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
+void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
+{
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
- uint32_t mask = (1<<28) - 1;
+ uint32_t mask = (1 << 28) - 1;
uint32_t aa[8], bb[8];
-
- int i,j;
- for (i=0; i<8; i++) {
- aa[i] = a[i] + a[i+8];
- bb[i] = b[i] + b[i+8];
- }
-
- FOR_LIMB(j,0,8,{
- accum2 = 0;
-
- FOR_LIMB (i,0,j+1,{
- accum2 += widemul(a[j-i],b[i]);
- accum1 += widemul(aa[j-i],bb[i]);
- accum0 += widemul(a[8+j-i], b[8+i]);
- });
-
- accum1 -= accum2;
- accum0 += accum2;
- accum2 = 0;
-
- FOR_LIMB (i,j+1,8,{
- accum0 -= widemul(a[8+j-i], b[i]);
- accum2 += widemul(aa[8+j-i], bb[i]);
- accum1 += widemul(a[16+j-i], b[8+i]);
- });
- accum1 += accum2;
- accum0 += accum2;
+ int i, j;
+ for (i = 0; i < 8; i++) {
+ aa[i] = a[i] + a[i + 8];
+ bb[i] = b[i] + b[i + 8];
+ }
- c[j] = ((uint32_t)(accum0)) & mask;
- c[j+8] = ((uint32_t)(accum1)) & mask;
+ FOR_LIMB(j, 0, 8, {
+ accum2 = 0;
+ FOR_LIMB(i, 0, j + 1, {
+ accum2 += widemul(a[j - i], b[i]);
+ accum1 += widemul(aa[j - i], bb[i]);
+ accum0 += widemul(a[8 + j - i], b[8 + i]);
+ }
+ ); accum1 -= accum2; accum0 += accum2;
+ accum2 = 0;
+ FOR_LIMB(i, j + 1, 8, {
+ accum0 -=
+ widemul(a[8 + j - i], b[i]);
+ accum2 +=
+ widemul(aa[8 + j - i],
+ bb[i]);
+ accum1 += widemul(a[16 + j - i], b[8 + i]);
+ }
+ );
+ accum1 += accum2;
+ accum0 += accum2;
+ c[j] = ((uint32_t)(accum0)) & mask;
+ c[j + 8] = ((uint32_t)(accum1)) & mask;
+ accum0 >>= 28; accum1 >>= 28;
+ });
- accum0 >>= 28;
- accum1 >>= 28;
- });
-
accum0 += accum1;
accum0 += c[8];
accum1 += c[0];
c[8] = ((uint32_t)(accum0)) & mask;
c[0] = ((uint32_t)(accum1)) & mask;
-
+
accum0 >>= 28;
accum1 >>= 28;
c[9] += ((uint32_t)(accum0));
c[1] += ((uint32_t)(accum1));
}
-void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
+{
const uint32_t *a = as->limb;
uint32_t *c = cs->limb;
uint64_t accum0 = 0, accum8 = 0;
- uint32_t mask = (1<<28)-1;
+ uint32_t mask = (1 << 28) - 1;
int i;
- assert(b<1<<28);
+ assert(b < 1 << 28);
- FOR_LIMB(i,0,8,{
- accum0 += widemul(b, a[i]);
- accum8 += widemul(b, a[i+8]);
-
- c[i] = accum0 & mask; accum0 >>= 28;
- c[i+8] = accum8 & mask; accum8 >>= 28;
- });
+ FOR_LIMB(i, 0, 8, {
+ accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]);
+ c[i] = accum0 & mask; accum0 >>= 28;
+ c[i + 8] = accum8 & mask; accum8 >>= 28;
+ });
accum0 += accum8 + c[8];
c[8] = accum0 & mask;
@@ -102,7 +98,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
c[1] += accum8 >> 28;
}
-void gf_sqr (gf_s *__restrict__ cs, const gf as) {
- gf_mul(cs,as,as); /* Performs better with a dedicated square */
+void gf_sqr(gf_s * __restrict__ cs, const gf as)
+{
+ gf_mul(cs, as, as); /* Performs better with a dedicated square */
}
-
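
gf_mul() above works in 16 limbs of radix 2^28 and exploits the shape of the field prime p = 2^448 - 2^224 - 1: writing a = a_lo + 2^224*a_hi, the congruence 2^448 ≡ 2^224 + 1 (mod p) folds the top half of the product back down, and the aa[]/bb[] sums are the one-level Karatsuba terms (a_lo + a_hi) and (b_lo + b_hi). A toy sketch of the same identity at a scaled-down prime q = 2^28 - 2^14 - 1 (the miniature parameters and helper names are ours, purely illustrative):

    #include <assert.h>
    #include <stdint.h>

    /* Miniature analogue of the Curve448 prime: q = 2^28 - 2^14 - 1, so
     * 2^28 ≡ 2^14 + 1 (mod q), just as 2^448 ≡ 2^224 + 1 (mod p). */
    #define Q ((1u << 28) - (1u << 14) - 1u)

    static uint32_t mulmod_karatsuba(uint32_t a, uint32_t b)
    {
        uint64_t alo = a & ((1u << 14) - 1), ahi = a >> 14;
        uint64_t blo = b & ((1u << 14) - 1), bhi = b >> 14;
        uint64_t lo = alo * blo, hi = ahi * bhi;
        /* Karatsuba: a_lo*b_hi + a_hi*b_lo = (a_lo+a_hi)(b_lo+b_hi) - lo - hi */
        uint64_t mid = (alo + ahi) * (blo + bhi) - lo - hi;

        /* 2^28 ≡ 2^14 + 1, so a*b ≡ lo + hi + 2^14*(mid + hi) (mod q) */
        return (uint32_t)((lo + hi + ((mid + hi) << 14)) % Q);
    }

    int main(void)
    {
        uint64_t a = 123456789 % Q, b = 987654321 % Q;

        assert(mulmod_karatsuba((uint32_t)a, (uint32_t)b) == (a * b) % Q);
        return 0;
    }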
diff --git a/crypto/ec/curve448/arch_32/f_impl.h b/crypto/ec/curve448/arch_32/f_impl.h
index 427e03de3f..25bfa1f79e 100644
--- a/crypto/ec/curve448/arch_32/f_impl.h
+++ b/crypto/ec/curve448/arch_32/f_impl.h
@@ -13,43 +13,46 @@
#define LIMB(x) (x)&((1<<28)-1), (x)>>28
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
{{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
-
+
#define LIMB_PLACE_VALUE(i) 28
-void gf_add_RAW (gf out, const gf a, const gf b) {
+void gf_add_RAW(gf out, const gf a, const gf b)
+{
unsigned int i;
- for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+ for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] + b->limb[i];
}
}
-void gf_sub_RAW (gf out, const gf a, const gf b) {
+void gf_sub_RAW(gf out, const gf a, const gf b)
+{
unsigned int i;
- for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+ for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
out->limb[i] = a->limb[i] - b->limb[i];
}
}
-void gf_bias (gf a, int amt) {
+void gf_bias(gf a, int amt)
+{
unsigned int i;
- uint32_t co1 = ((1<<28)-1)*amt, co2 = co1-amt;
+ uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt;
- for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) {
- a->limb[i] += (i==sizeof(*a)/sizeof(a->limb[0])/2) ? co2 : co1;
+ for (i = 0; i < sizeof(*a) / sizeof(a->limb[0]); i++) {
+ a->limb[i] += (i == sizeof(*a) / sizeof(a->limb[0]) / 2) ? co2 : co1;
}
}
-void gf_weak_reduce (gf a) {
- uint32_t mask = (1<<28) - 1;
+void gf_weak_reduce(gf a)
+{
+ uint32_t mask = (1 << 28) - 1;
uint32_t tmp = a->limb[15] >> 28;
unsigned int i;
a->limb[8] += tmp;
- for (i=15; i>0; i--) {
- a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
+ for (i = 15; i > 0; i--) {
+ a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
}
a->limb[0] = (a->limb[0] & mask) + tmp;
}
-
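
gf_bias() above adds a multiple of the prime so that a following gf_sub_RAW() cannot underflow: adding co1 = (2^28 - 1)*amt to every one of the 16 limbs contributes amt*(2^448 - 1), and using co2 = co1 - amt at the middle limb subtracts amt*2^224, so the total added is exactly amt*(2^448 - 2^224 - 1) = amt*p. A scaled-down check of that limb identity, using 4 limbs of radix 2^4 and the analogous prime 2^16 - 2^8 - 1 (miniature parameters ours):

    #include <assert.h>
    #include <stdint.h>

    #define RADIX  4                              /* toy limb size in bits */
    #define NLIMBS 4                              /* toy "field" is 16 bits */
    #define P      ((1u << 16) - (1u << 8) - 1u)  /* analogue of 2^448-2^224-1 */

    /* Value represented by a (possibly non-normalized) limb vector. */
    static uint32_t eval(const uint32_t limb[NLIMBS])
    {
        uint32_t v = 0;
        int i;

        for (i = NLIMBS - 1; i >= 0; i--)
            v = (v << RADIX) + limb[i];
        return v;
    }

    int main(void)
    {
        uint32_t limb[NLIMBS] = { 3, 7, 1, 9 };
        uint32_t amt = 2, before = eval(limb);
        uint32_t co1 = ((1u << RADIX) - 1) * amt, co2 = co1 - amt;
        int i;

        /* Same pattern as gf_bias(): co2 at the middle limb, co1 elsewhere. */
        for (i = 0; i < NLIMBS; i++)
            limb[i] += (i == NLIMBS / 2) ? co2 : co1;

        assert(eval(limb) - before == amt * P);   /* added exactly amt * p */
        return 0;
    }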
diff --git a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
index 9c3d481db6..73b82755c3 100644
--- a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
@@ -11,22 +11,26 @@
*/
#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
-#define __ARCH_ARM_32_ARCH_INTRINSICS_H__
+# define __ARCH_ARM_32_ARCH_INTRINSICS_H__
-#define ARCH_WORD_BITS 32
+# define ARCH_WORD_BITS 32
-static __inline__ __attribute((always_inline,unused))
-uint32_t word_is_zero(uint32_t a) {
+static __inline__ __attribute((always_inline, unused))
+uint32_t word_is_zero(uint32_t a)
+{
uint32_t ret;
- asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
+ asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret;
}
-static __inline__ __attribute((always_inline,unused))
-uint64_t widemul(uint32_t a, uint32_t b) {
- /* Could be UMULL, but it's hard to express to CC that the registers must be different */
- return ((uint64_t)a) * b;
+static __inline__ __attribute((always_inline, unused))
+uint64_t widemul(uint32_t a, uint32_t b)
+{
+ /*
+ * Could be UMULL, but it's hard to express to CC that the registers must
+ * be different
+ */
+ return ((uint64_t)a) * b;
}
-#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
-
+#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
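
The subs/sbc pair in the ARM word_is_zero() computes a - 1, clearing the carry flag exactly when the subtraction borrows (i.e. when a == 0); sbc %0, %0, %0 then evaluates to 0 - !C, giving all ones on borrow and zero otherwise, matching the generic arch_32 version. A portable model of that flag behaviour (function name ours, a sketch rather than the real constant-time code):

    #include <assert.h>
    #include <stdint.h>

    /* Models "subs r, a, #1; sbc r, r, r": on ARM, SUBS sets C unless
     * a - 1 borrows, and the borrow happens exactly when a == 0;
     * SBC r, r, r then computes r - r - !C = -(!C). */
    static uint32_t word_is_zero_arm_model(uint32_t a)
    {
        uint32_t c = (a != 0);      /* C flag after SUBS */

        return 0u - (1u - c);       /* SBC r, r, r */
    }

    int main(void)
    {
        assert(word_is_zero_arm_model(0) == 0xFFFFFFFFu);
        assert(word_is_zero_arm_model(1) == 0);
        assert(word_is_zero_arm_model(0xFFFFFFFFu) == 0);
        return 0;
    }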
diff --git a/crypto/ec/curve448/arch_arm_32/f_impl.c b/crypto/ec/curve448/arch_arm_32/f_impl.c
index b43e24db75..25e970389b 100644
--- a/crypto/ec/curve448/arch_arm_32/f_impl.c
+++ b/crypto/ec/curve448/arch_arm_32/f_impl.c
@@ -12,100 +12,89 @@
#include "f_field.h"
-static inline void __attribute__((gnu_inline,always_inline))
-smlal (
- uint64_t *acc,
- const uint32_t a,
- const uint32_t b
-) {
+static inline void __attribute__ ((gnu_inline, always_inline))
+ smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
+{
#ifdef __ARMEL__
- uint32_t lo = *acc, hi = (*acc)>>32;
-
- __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
- : [lo]"+&r"(lo), [hi]"+&r"(hi)
- : [a]"r"(a), [b]"r"(b));
-
- *acc = lo + (((uint64_t)hi)<<32);
+ uint32_t lo = *acc, hi = (*acc) >> 32;
+
+ __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
+ [hi] "+&r"(hi)
+ :[a] "r"(a),[b] "r"(b));
+
+ *acc = lo + (((uint64_t)hi) << 32);
#else
- *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
+ *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b;
#endif
}
-static inline void __attribute__((gnu_inline,always_inline))
-smlal2 (
- uint64_t *acc,
- const uint32_t a,
- const uint32_t b
-) {
+static inline void __attribute__ ((gnu_inline, always_inline))
+ smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
+{
#ifdef __ARMEL__
- uint32_t lo = *acc, hi = (*acc)>>32;
-
- __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
- : [lo]"+&r"(lo), [hi]"+&r"(hi)
- : [a]"r"(a), [b]"r"(2*b));
-
- *acc = lo + (((uint64_t)hi)<<32);
+ uint32_t lo = *acc, hi = (*acc) >> 32;
+
+ __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
+ [hi] "+&r"(hi)
+ :[a] "r"(a),[b] "r"(2 * b));
+
+ *acc = lo + (((uint64_t)hi) << 32);
#else
- *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
+ *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
#endif
}
-static inline void __attribute__((gnu_inline,always_inline))
-smull (
- uint64_t *acc,
- const uint32_t a,
- const uint32_t b
-) {
+static inline void __attribute__ ((gnu_inline, always_inline))
+ smull(uint64_t *acc, const uint32_t a, const uint32_t b)
+{
#ifdef __ARMEL__
uint32_t lo, hi;
-
- __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
- : [lo]"=&r"(lo), [hi]"=&r"(hi)
- : [a]"r"(a), [b]"r"(b));
-
- *acc = lo + (((uint64_t)hi)<<32);
+
+ __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo),
+ [hi] "=&r"(hi)
+ :[a] "r"(a),[b] "r"(b));
+
+ *acc = lo + (((uint64_t)hi) << 32);
#else
- *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
+ *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b;
#endif
}
-static inline void __attribute__((gnu_inline,always_inline))
-smull2 (
- uint64_t *acc,
- const uint32_t a,
- const uint32_t b
-) {
+static inline void __attribute__ ((gnu_inline, always_inline))
+ smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
+{
#ifdef __ARMEL__
uint32_t lo, hi;
-
+
__asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
- : [lo]"=&r"(lo), [hi]"=&r"(hi)
- : [a]"r"(a), [b]"r"(2*b));
-
- *acc = lo + (((uint64_t)hi)<<32);
+ : [lo] "=&r"(lo),[hi] "=&r"(hi)
+ : [a] "r"(a),[b] "r"(2 * b));
+
+ *acc = lo + (((uint64_t)hi) << 32);
#else
- *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
+ *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
#endif
}
-void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
-
+void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
+{
+
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
- uint32_t mask = (1<<28) - 1;
+ uint32_t mask = (1 << 28) - 1;
uint32_t aa[8], bm[8];
int i;
- for (i=0; i<8; i++) {
- aa[i] = a[i] + a[i+8];
- bm[i] = b[i] - b[i+8];
+ for (i = 0; i < 8; i++) {
+ aa[i] = a[i] + a[i + 8];
+ bm[i] = b[i] - b[i + 8];
}
- uint32_t ax,bx;
+ uint32_t ax, bx;
{
/* t^3 terms */
smull(&accum1, ax = aa[1], bx = b[15]);
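
The smlal()/smlal2()/smull()/smull2() helpers above accumulate signed 32x32 -> 64 products (SMLAL/SMULL on ARM, plain C elsewhere); the operands arrive as uint32_t but are reinterpreted as signed, which is what lets the wrapped-negative differences bm[i] = b[i] - b[i + 8] feed straight in. A minimal check of the portable branch's semantics (helper name ours):

    #include <assert.h>
    #include <stdint.h>

    /* Portable branch of smlal(): signed 32x32 -> 64 multiply-accumulate
     * into a uint64_t accumulator (unsigned wrapping is well-defined). */
    static void smlal_portable(uint64_t *acc, uint32_t a, uint32_t b)
    {
        *acc += (uint64_t)((int64_t)(int32_t)a * (int64_t)(int32_t)b);
    }

    int main(void)
    {
        uint64_t acc = 0;
        uint32_t neg3 = (uint32_t)-3;   /* a wrapped negative limb difference */

        smlal_portable(&acc, 5, neg3);  /* accumulates 5 * -3 = -15 */
        smlal_portable(&acc, 4, 5);     /* accumulates 4 *  5 = +20 */
        assert((int64_t)acc == 5);
        return 0;
    }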
@@ -121,15 +110,15 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum1, ax, bx = b[10]);
smlal(&accum3, ax = aa[7], bx);
smlal(&accum1, ax, bx = b[9]);
-
+
accum0 = accum1;
accum2 = accum3;
-
+
/* t^2 terms */
smlal(&accum2, ax = aa[0], bx);
smlal(&accum0, ax, bx = b[8]);
smlal(&accum2, ax = aa[1], bx);
-
+
smlal(&accum0, ax = a[9], bx = b[7]);
smlal(&accum2, ax = a[10], bx);
smlal(&accum0, ax, bx = b[6]);
@@ -143,14 +132,14 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum0, ax, bx = b[2]);
smlal(&accum2, ax = a[15], bx);
smlal(&accum0, ax, bx = b[1]);
-
+
/* t terms */
accum1 += accum0;
accum3 += accum2;
smlal(&accum3, ax = a[8], bx);
smlal(&accum1, ax, bx = b[0]);
smlal(&accum3, ax = a[9], bx);
-
+
smlal(&accum1, ax = a[1], bx = bm[7]);
smlal(&accum3, ax = a[2], bx);
smlal(&accum1, ax, bx = bm[6]);
@@ -164,20 +153,20 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum1, ax, bx = bm[2]);
smlal(&accum3, ax = a[7], bx);
smlal(&accum1, ax, bx = bm[1]);
-
+
/* 1 terms */
smlal(&accum2, ax = a[0], bx);
smlal(&accum0, ax, bx = bm[0]);
smlal(&accum2, ax = a[1], bx);
-
+
accum2 += accum0 >> 28;
accum3 += accum1 >> 28;
-
+
c[0] = ((uint32_t)(accum0)) & mask;
c[1] = ((uint32_t)(accum2)) & mask;
c[8] = ((uint32_t)(accum1)) & mask;
c[9] = ((uint32_t)(accum3)) & mask;
-
+
accumC0 = accum2 >> 28;
accumC1 = accum3 >> 28;
}
@@ -192,10 +181,10 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum1, ax, bx = b[12]);
smlal(&accum3, ax = aa[7], bx);
smlal(&accum1, ax, bx = b[11]);
-
+
accum0 = accum1;
accum2 = accum3;
-
+
/* t^2 terms */
smlal(&accum2, ax = aa[0], bx);
smlal(&accum0, ax, bx = b[10]);
@@ -204,7 +193,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum2, ax = aa[2], bx);
smlal(&accum0, ax, bx = b[8]);
smlal(&accum2, ax = aa[3], bx);
-
+
smlal(&accum0, ax = a[11], bx = b[7]);
smlal(&accum2, ax = a[12], bx);
smlal(&accum0, ax, bx = b[6]);
@@ -214,7 +203,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum0, ax, bx = b[4]);
smlal(&accum2, ax = a[15], bx);
smlal(&accum0, ax, bx = b[3]);
-
+
/* t terms */
accum1 += accum0;
accum3 += accum2;
@@ -225,7 +214,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum3, ax = a[10], bx);
smlal(&accum1, ax, bx = b[0]);
smlal(&accum3, ax = a[11], bx);
-
+
smlal(&accum1, ax = a[3], bx = bm[7]);
smlal(&accum3, ax = a[4], bx);
smlal(&accum1, ax, bx = bm[6]);
@@ -235,7 +224,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum1, ax, bx = bm[4]);
smlal(&accum3, ax = a[7], bx);
smlal(&accum1, ax, bx = bm[3]);
-
+
/* 1 terms */
smlal(&accum2, ax = a[0], bx);
smlal(&accum0, ax, bx = bm[2]);
@@ -244,34 +233,34 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum2, ax = a[2], bx);
smlal(&accum0, ax, bx = bm[0]);
smlal(&accum2, ax = a[3], bx);
-
+
accum0 += accumC0;
accum1 += accumC1;
accum2 += accum0 >> 28;
accum3 += accum1 >> 28;
-
+
c[2] = ((uint32_t)(accum0)) & mask;
c[3] = ((uint32_t)(accum2)) & mask;
c[10] = ((uint32_t)(accum1)) & mask;
c[11] = ((uint32_t)(accum3)) & mask;
-
+
accumC0 = accum2 >> 28;
accumC1 = accum3 >> 28;
}
{
-
+
/* t^3 terms */
smull(&accum1, ax = aa[5], bx = b[15]);
smull(&accum3, ax = aa[6], bx);
smlal(&accum1, ax, bx = b[14]);
smlal(&accum3, ax = aa[7], bx);
smlal(&accum1, ax, bx = b[13]);
-
+
accum0 = accum1;
accum2 = accum3;
-
+
/* t^2 terms */
-
+
smlal(&accum2, ax = aa[0], bx);
smlal(&accum0, ax, bx = b[12]);
smlal(&accum2, ax = aa[1], bx);
@@ -283,18 +272,17 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum2, ax = aa[4], bx);
smlal(&accum0, ax, bx = b[8]);
smlal(&accum2, ax = aa[5], bx);
-
-
+
smlal(&accum0, ax = a[13], bx = b[7]);
smlal(&accum2, ax = a[14], bx);
smlal(&accum0, ax, bx = b[6]);
smlal(&accum2, ax = a[15], bx);
smlal(&accum0, ax, bx = b[5]);
-
+
/* t terms */
accum1 += accum0;
accum3 += accum2;
-
+
smlal(&accum3, ax = a[8], bx);
smlal(&accum1, ax, bx = b[4]);
smlal(&accum3, ax = a[9], bx);
@@ -306,16 +294,15 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum3, ax = a[12], bx);
smlal(&accum1, ax, bx = b[0]);
smlal(&accum3, ax = a[13], bx);
-
-
+
smlal(&accum1, ax = a[5], bx = bm[7]);
smlal(&accum3, ax = a[6], bx);
smlal(&accum1, ax, bx = bm[6]);
smlal(&accum3, ax = a[7], bx);
smlal(&accum1, ax, bx = bm[5]);
-
+
/* 1 terms */
-
+
smlal(&accum2, ax = a[0], bx);
smlal(&accum0, ax, bx = bm[4]);
smlal(&accum2, ax = a[1], bx);
@@ -327,28 +314,28 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum2, ax = a[4], bx);
smlal(&accum0, ax, bx = bm[0]);
smlal(&accum2, ax = a[5], bx);
-
+
accum0 += accumC0;
accum1 += accumC1;
accum2 += accum0 >> 28;
accum3 += accum1 >> 28;
-
+
c[4] = ((uint32_t)(accum0)) & mask;
c[5] = ((uint32_t)(accum2)) & mask;
c[12] = ((uint32_t)(accum1)) & mask;
c[13] = ((uint32_t)(accum3)) & mask;
-
+
accumC0 = accum2 >> 28;
accumC1 = accum3 >> 28;
}
{
-
+
/* t^3 terms */
smull(&accum1, ax = aa[7], bx = b[15]);
accum0 = accum1;
-
+
/* t^2 terms */
-
+
smull(&accum2, ax = aa[0], bx);
smlal(&accum0, ax, bx = b[14]);
smlal(&accum2, ax = aa[1], bx);
@@ -364,14 +351,13 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum2, ax = aa[6], bx);
smlal(&accum0, ax, bx = b[8]);
smlal(&accum2, ax = aa[7], bx);
-
-
+
smlal(&accum0, ax = a[15], bx = b[7]);
-
+
/* t terms */
accum1 += accum0;
accum3 = accum2;
-
+
smlal(&accum3, ax = a[8], bx);
smlal(&accum1, ax, bx = b[6]);
smlal(&accum3, ax = a[9], bx);
@@ -387,12 +373,11 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum3, ax = a[14], bx);
smlal(&accum1, ax, bx = b[0]);
smlal(&accum3, ax = a[15], bx);
-
-
+
smlal(&accum1, ax = a[7], bx = bm[7]);
-
+
/* 1 terms */
-
+
smlal(&accum2, ax = a[0], bx);
smlal(&accum0, ax, bx = bm[6]);
smlal(&accum2, ax = a[1], bx);
@@ -408,17 +393,17 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
smlal(&accum2, ax = a[6], bx);
smlal(&accum0, ax, bx = bm[0]);
smlal(&accum2, ax = a[7], bx);
-
+
accum0 += accumC0;
accum1 += accumC1;
accum2 += accum0 >> 28;
accum3 += accum1 >> 28;
-
+
c[6] = ((uint32_t)(accum0)) & mask;
c[7] = ((uint32_t)(accum2)) & mask;
c[14] = ((uint32_t)(accum1)) & mask;
c[15] = ((uint32_t)(accum3)) & mask;
-
+
accum0 = accum2 >> 28;
accum1 = accum3 >> 28;
}
@@ -428,28 +413,29 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
accum1 += c[0];
c[8] = ((uint32_t)(accum0)) & mask;
c[0] = ((uint32_t)(accum1)) & mask;
-
+
accum0 >>= 28;
accum1 >>= 28;
c[9] += ((uint32_t)(accum0));
c[1] += ((uint32_t)(accum1));
}
-void gf_sqr (gf_s *__restrict__ cs, const gf as) {
+void gf_sqr(gf_s * __restrict__ cs, const gf as)
+{
const uint32_t *a = as->limb;
uint32_t *c = cs->limb;
uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
- uint32_t mask = (1<<28) - 1;
+ uint32_t mask = (1 << 28) - 1;
uint32_t bm[8];
-
+
int i;
- for (i=0; i<8; i++) {
- bm[i] = a[i] - a[i+8];
+ for (i = 0; i < 8; i++) {
+ bm[i] = a[i] - a[i + 8];
}
- uint32_t ax,bx;
+ uint32_t ax, bx;
{
/* t^3 terms */
smull2(&accum1, ax = a[9], bx = a[15]);
@@ -459,14 +445,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
smlal2(&accum1, ax, bx = a[13]);
smlal2(&accum3, ax = a[12], bx);
smlal(&accum1, ax, ax);
-
+
accum0 = accum1;
accum2 = accum3;
-
+
/* t^2 terms */
smlal2(&accum2, ax = a[8], a[9]);
smlal(&accum0, ax, ax);
-
+
smlal2(&accum0, ax = a[1], bx = a[7]);
smlal2(&accum2, ax = a[2], bx);
smlal2(&accum0, ax, bx = a[6]);
@@ -474,18 +460,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
smlal2(&accum0, ax, bx = a[5]);
smlal2(&accum2, ax = a[4], bx);
smlal(&accum0, ax, ax);
-
+
/* t terms */
accum1 += accum0;
accum3 += accum2;
smlal2(&accum3, ax = a[0], bx = a[1]);
smlal(&accum1, ax, ax);
-