diff options
author | Matt Caswell <matt@openssl.org> | 2017-12-04 11:38:58 +0000 |
---|---|---|
committer | Matt Caswell <matt@openssl.org> | 2018-02-20 12:59:30 +0000 |
commit | 205fd6388175704bd7597dbfb571c84f868ce6da (patch) | |
tree | 4a5a69b4f40a22b614bdfae9924679cafe263cb6 /crypto | |
parent | 1308e022e1a62214b9e7f8ec92ca7045e70af3a2 (diff) |
Run util/openssl-format-source on the Curve448 code
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
(Merged from https://github.com/openssl/openssl/pull/5105)
Diffstat (limited to 'crypto')
29 files changed, 4178 insertions, 3068 deletions
diff --git a/crypto/ec/curve448/arch_32/arch_intrinsics.h b/crypto/ec/curve448/arch_32/arch_intrinsics.h index 4e6aac2889..33439822fe 100644 --- a/crypto/ec/curve448/arch_32/arch_intrinsics.h +++ b/crypto/ec/curve448/arch_32/arch_intrinsics.h @@ -11,20 +11,21 @@ */ #ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__ -#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__ +# define __ARCH_ARCH_32_ARCH_INTRINSICS_H__ -#define ARCH_WORD_BITS 32 +# define ARCH_WORD_BITS 32 -static __inline__ __attribute((always_inline,unused)) -uint32_t word_is_zero(uint32_t a) { +static __inline__ __attribute((always_inline, unused)) +uint32_t word_is_zero(uint32_t a) +{ /* let's hope the compiler isn't clever enough to optimize this. */ - return (((uint64_t)a)-1)>>32; + return (((uint64_t)a) - 1) >> 32; } -static __inline__ __attribute((always_inline,unused)) -uint64_t widemul(uint32_t a, uint32_t b) { +static __inline__ __attribute((always_inline, unused)) +uint64_t widemul(uint32_t a, uint32_t b) +{ return ((uint64_t)a) * b; } -#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ - +#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ diff --git a/crypto/ec/curve448/arch_32/f_impl.c b/crypto/ec/curve448/arch_32/f_impl.c index ca67d496df..76ec9711f0 100644 --- a/crypto/ec/curve448/arch_32/f_impl.c +++ b/crypto/ec/curve448/arch_32/f_impl.c @@ -14,84 +14,80 @@ #if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) && !I_HATE_UNROLLED_LOOPS) \ || defined(DECAF_FORCE_UNROLL) -#define REPEAT8(_x) _x _x _x _x _x _x _x _x -#define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0) +# define REPEAT8(_x) _x _x _x _x _x _x _x _x +# define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0) #else -#define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0) +# define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0) #endif -void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { +void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs) +{ const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; uint64_t accum0 = 0, accum1 = 0, accum2 = 0; - uint32_t mask = (1<<28) - 1; + uint32_t mask = (1 << 28) - 1; uint32_t aa[8], bb[8]; - - int i,j; - for (i=0; i<8; i++) { - aa[i] = a[i] + a[i+8]; - bb[i] = b[i] + b[i+8]; - } - - FOR_LIMB(j,0,8,{ - accum2 = 0; - - FOR_LIMB (i,0,j+1,{ - accum2 += widemul(a[j-i],b[i]); - accum1 += widemul(aa[j-i],bb[i]); - accum0 += widemul(a[8+j-i], b[8+i]); - }); - - accum1 -= accum2; - accum0 += accum2; - accum2 = 0; - - FOR_LIMB (i,j+1,8,{ - accum0 -= widemul(a[8+j-i], b[i]); - accum2 += widemul(aa[8+j-i], bb[i]); - accum1 += widemul(a[16+j-i], b[8+i]); - }); - accum1 += accum2; - accum0 += accum2; + int i, j; + for (i = 0; i < 8; i++) { + aa[i] = a[i] + a[i + 8]; + bb[i] = b[i] + b[i + 8]; + } - c[j] = ((uint32_t)(accum0)) & mask; - c[j+8] = ((uint32_t)(accum1)) & mask; + FOR_LIMB(j, 0, 8, { + accum2 = 0; + FOR_LIMB(i, 0, j + 1, { + accum2 += widemul(a[j - i], b[i]); + accum1 += widemul(aa[j - i], bb[i]); + accum0 += widemul(a[8 + j - i], b[8 + i]); + } + ); accum1 -= accum2; accum0 += accum2; + accum2 = 0; + FOR_LIMB(i, j + 1, 8, { + accum0 -= + widemul(a[8 + j - i], b[i]); + accum2 += + widemul(aa[8 + j - i], + bb[i]); + accum1 += widemul(a[16 + j - i], b[8 + i]); + } + ); + accum1 += accum2; + accum0 += accum2; + c[j] = ((uint32_t)(accum0)) & mask; + c[j + 8] = ((uint32_t)(accum1)) & mask; + accum0 >>= 28; accum1 >>= 28; + }); - accum0 >>= 28; - accum1 >>= 28; - }); - accum0 += accum1; accum0 += c[8]; accum1 += c[0]; c[8] = ((uint32_t)(accum0)) & mask; c[0] = ((uint32_t)(accum1)) & mask; - + accum0 >>= 28; accum1 >>= 28; c[9] += ((uint32_t)(accum0)); c[1] += ((uint32_t)(accum1)); } -void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { +void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) +{ const uint32_t *a = as->limb; uint32_t *c = cs->limb; uint64_t accum0 = 0, accum8 = 0; - uint32_t mask = (1<<28)-1; + uint32_t mask = (1 << 28) - 1; int i; - assert(b<1<<28); + assert(b < 1 << 28); - FOR_LIMB(i,0,8,{ - accum0 += widemul(b, a[i]); - accum8 += widemul(b, a[i+8]); - - c[i] = accum0 & mask; accum0 >>= 28; - c[i+8] = accum8 & mask; accum8 >>= 28; - }); + FOR_LIMB(i, 0, 8, { + accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]); + c[i] = accum0 & mask; accum0 >>= 28; + c[i + 8] = accum8 & mask; accum8 >>= 28; + }); accum0 += accum8 + c[8]; c[8] = accum0 & mask; @@ -102,7 +98,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { c[1] += accum8 >> 28; } -void gf_sqr (gf_s *__restrict__ cs, const gf as) { - gf_mul(cs,as,as); /* Performs better with a dedicated square */ +void gf_sqr(gf_s * __restrict__ cs, const gf as) +{ + gf_mul(cs, as, as); /* Performs better with a dedicated square */ } - diff --git a/crypto/ec/curve448/arch_32/f_impl.h b/crypto/ec/curve448/arch_32/f_impl.h index 427e03de3f..25bfa1f79e 100644 --- a/crypto/ec/curve448/arch_32/f_impl.h +++ b/crypto/ec/curve448/arch_32/f_impl.h @@ -13,43 +13,46 @@ #define LIMB(x) (x)&((1<<28)-1), (x)>>28 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \ {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}} - + #define LIMB_PLACE_VALUE(i) 28 -void gf_add_RAW (gf out, const gf a, const gf b) { +void gf_add_RAW(gf out, const gf a, const gf b) +{ unsigned int i; - for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { + for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) { out->limb[i] = a->limb[i] + b->limb[i]; } } -void gf_sub_RAW (gf out, const gf a, const gf b) { +void gf_sub_RAW(gf out, const gf a, const gf b) +{ unsigned int i; - for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) { + for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) { out->limb[i] = a->limb[i] - b->limb[i]; } } -void gf_bias (gf a, int amt) { +void gf_bias(gf a, int amt) +{ unsigned int i; - uint32_t co1 = ((1<<28)-1)*amt, co2 = co1-amt; + uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt; - for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) { - a->limb[i] += (i==sizeof(*a)/sizeof(a->limb[0])/2) ? co2 : co1; + for (i = 0; i < sizeof(*a) / sizeof(a->limb[0]); i++) { + a->limb[i] += (i == sizeof(*a) / sizeof(a->limb[0]) / 2) ? co2 : co1; } } -void gf_weak_reduce (gf a) { - uint32_t mask = (1<<28) - 1; +void gf_weak_reduce(gf a) +{ + uint32_t mask = (1 << 28) - 1; uint32_t tmp = a->limb[15] >> 28; unsigned int i; a->limb[8] += tmp; - for (i=15; i>0; i--) { - a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); + for (i = 15; i > 0; i--) { + a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28); } a->limb[0] = (a->limb[0] & mask) + tmp; } - diff --git a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h index 9c3d481db6..73b82755c3 100644 --- a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h +++ b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h @@ -11,22 +11,26 @@ */ #ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__ -#define __ARCH_ARM_32_ARCH_INTRINSICS_H__ +# define __ARCH_ARM_32_ARCH_INTRINSICS_H__ -#define ARCH_WORD_BITS 32 +# define ARCH_WORD_BITS 32 -static __inline__ __attribute((always_inline,unused)) -uint32_t word_is_zero(uint32_t a) { +static __inline__ __attribute((always_inline, unused)) +uint32_t word_is_zero(uint32_t a) +{ uint32_t ret; - asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc"); + asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc"); return ret; } -static __inline__ __attribute((always_inline,unused)) -uint64_t widemul(uint32_t a, uint32_t b) { - /* Could be UMULL, but it's hard to express to CC that the registers must be different */ - return ((uint64_t)a) * b; +static __inline__ __attribute((always_inline, unused)) +uint64_t widemul(uint32_t a, uint32_t b) +{ + /* + * Could be UMULL, but it's hard to express to CC that the registers must + * be different + */ + return ((uint64_t)a) * b; } -#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ - +#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ diff --git a/crypto/ec/curve448/arch_arm_32/f_impl.c b/crypto/ec/curve448/arch_arm_32/f_impl.c index b43e24db75..25e970389b 100644 --- a/crypto/ec/curve448/arch_arm_32/f_impl.c +++ b/crypto/ec/curve448/arch_arm_32/f_impl.c @@ -12,100 +12,89 @@ #include "f_field.h" -static inline void __attribute__((gnu_inline,always_inline)) -smlal ( - uint64_t *acc, - const uint32_t a, - const uint32_t b -) { +static inline void __attribute__ ((gnu_inline, always_inline)) + smlal(uint64_t *acc, const uint32_t a, const uint32_t b) +{ #ifdef __ARMEL__ - uint32_t lo = *acc, hi = (*acc)>>32; - - __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" - : [lo]"+&r"(lo), [hi]"+&r"(hi) - : [a]"r"(a), [b]"r"(b)); - - *acc = lo + (((uint64_t)hi)<<32); + uint32_t lo = *acc, hi = (*acc) >> 32; + + __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo), + [hi] "+&r"(hi) + :[a] "r"(a),[b] "r"(b)); + + *acc = lo + (((uint64_t)hi) << 32); #else - *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; + *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b; #endif } -static inline void __attribute__((gnu_inline,always_inline)) -smlal2 ( - uint64_t *acc, - const uint32_t a, - const uint32_t b -) { +static inline void __attribute__ ((gnu_inline, always_inline)) + smlal2(uint64_t *acc, const uint32_t a, const uint32_t b) +{ #ifdef __ARMEL__ - uint32_t lo = *acc, hi = (*acc)>>32; - - __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" - : [lo]"+&r"(lo), [hi]"+&r"(hi) - : [a]"r"(a), [b]"r"(2*b)); - - *acc = lo + (((uint64_t)hi)<<32); + uint32_t lo = *acc, hi = (*acc) >> 32; + + __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo), + [hi] "+&r"(hi) + :[a] "r"(a),[b] "r"(2 * b)); + + *acc = lo + (((uint64_t)hi) << 32); #else - *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); + *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2); #endif } -static inline void __attribute__((gnu_inline,always_inline)) -smull ( - uint64_t *acc, - const uint32_t a, - const uint32_t b -) { +static inline void __attribute__ ((gnu_inline, always_inline)) + smull(uint64_t *acc, const uint32_t a, const uint32_t b) +{ #ifdef __ARMEL__ uint32_t lo, hi; - - __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" - : [lo]"=&r"(lo), [hi]"=&r"(hi) - : [a]"r"(a), [b]"r"(b)); - - *acc = lo + (((uint64_t)hi)<<32); + + __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo), + [hi] "=&r"(hi) + :[a] "r"(a),[b] "r"(b)); + + *acc = lo + (((uint64_t)hi) << 32); #else - *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; + *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b; #endif } -static inline void __attribute__((gnu_inline,always_inline)) -smull2 ( - uint64_t *acc, - const uint32_t a, - const uint32_t b -) { +static inline void __attribute__ ((gnu_inline, always_inline)) + smull2(uint64_t *acc, const uint32_t a, const uint32_t b) +{ #ifdef __ARMEL__ uint32_t lo, hi; - + __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]" - : [lo]"=&r"(lo), [hi]"=&r"(hi) - : [a]"r"(a), [b]"r"(2*b)); - - *acc = lo + (((uint64_t)hi)<<32); + : [lo] "=&r"(lo),[hi] "=&r"(hi) + : [a] "r"(a),[b] "r"(2 * b)); + + *acc = lo + (((uint64_t)hi) << 32); #else - *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2); + *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2); #endif } -void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { - +void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs) +{ + const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1; - uint32_t mask = (1<<28) - 1; + uint32_t mask = (1 << 28) - 1; uint32_t aa[8], bm[8]; int i; - for (i=0; i<8; i++) { - aa[i] = a[i] + a[i+8]; - bm[i] = b[i] - b[i+8]; + for (i = 0; i < 8; i++) { + aa[i] = a[i] + a[i + 8]; + bm[i] = b[i] - b[i + 8]; } - uint32_t ax,bx; + uint32_t ax, bx; { /* t^3 terms */ smull(&accum1, ax = aa[1], bx = b[15]); @@ -121,15 +110,15 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum1, ax, bx = b[10]); smlal(&accum3, ax = aa[7], bx); smlal(&accum1, ax, bx = b[9]); - + accum0 = accum1; accum2 = accum3; - + /* t^2 terms */ smlal(&accum2, ax = aa[0], bx); smlal(&accum0, ax, bx = b[8]); smlal(&accum2, ax = aa[1], bx); - + smlal(&accum0, ax = a[9], bx = b[7]); smlal(&accum2, ax = a[10], bx); smlal(&accum0, ax, bx = b[6]); @@ -143,14 +132,14 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum0, ax, bx = b[2]); smlal(&accum2, ax = a[15], bx); smlal(&accum0, ax, bx = b[1]); - + /* t terms */ accum1 += accum0; accum3 += accum2; smlal(&accum3, ax = a[8], bx); smlal(&accum1, ax, bx = b[0]); smlal(&accum3, ax = a[9], bx); - + smlal(&accum1, ax = a[1], bx = bm[7]); smlal(&accum3, ax = a[2], bx); smlal(&accum1, ax, bx = bm[6]); @@ -164,20 +153,20 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum1, ax, bx = bm[2]); smlal(&accum3, ax = a[7], bx); smlal(&accum1, ax, bx = bm[1]); - + /* 1 terms */ smlal(&accum2, ax = a[0], bx); smlal(&accum0, ax, bx = bm[0]); smlal(&accum2, ax = a[1], bx); - + accum2 += accum0 >> 28; accum3 += accum1 >> 28; - + c[0] = ((uint32_t)(accum0)) & mask; c[1] = ((uint32_t)(accum2)) & mask; c[8] = ((uint32_t)(accum1)) & mask; c[9] = ((uint32_t)(accum3)) & mask; - + accumC0 = accum2 >> 28; accumC1 = accum3 >> 28; } @@ -192,10 +181,10 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum1, ax, bx = b[12]); smlal(&accum3, ax = aa[7], bx); smlal(&accum1, ax, bx = b[11]); - + accum0 = accum1; accum2 = accum3; - + /* t^2 terms */ smlal(&accum2, ax = aa[0], bx); smlal(&accum0, ax, bx = b[10]); @@ -204,7 +193,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum2, ax = aa[2], bx); smlal(&accum0, ax, bx = b[8]); smlal(&accum2, ax = aa[3], bx); - + smlal(&accum0, ax = a[11], bx = b[7]); smlal(&accum2, ax = a[12], bx); smlal(&accum0, ax, bx = b[6]); @@ -214,7 +203,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum0, ax, bx = b[4]); smlal(&accum2, ax = a[15], bx); smlal(&accum0, ax, bx = b[3]); - + /* t terms */ accum1 += accum0; accum3 += accum2; @@ -225,7 +214,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum3, ax = a[10], bx); smlal(&accum1, ax, bx = b[0]); smlal(&accum3, ax = a[11], bx); - + smlal(&accum1, ax = a[3], bx = bm[7]); smlal(&accum3, ax = a[4], bx); smlal(&accum1, ax, bx = bm[6]); @@ -235,7 +224,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum1, ax, bx = bm[4]); smlal(&accum3, ax = a[7], bx); smlal(&accum1, ax, bx = bm[3]); - + /* 1 terms */ smlal(&accum2, ax = a[0], bx); smlal(&accum0, ax, bx = bm[2]); @@ -244,34 +233,34 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum2, ax = a[2], bx); smlal(&accum0, ax, bx = bm[0]); smlal(&accum2, ax = a[3], bx); - + accum0 += accumC0; accum1 += accumC1; accum2 += accum0 >> 28; accum3 += accum1 >> 28; - + c[2] = ((uint32_t)(accum0)) & mask; c[3] = ((uint32_t)(accum2)) & mask; c[10] = ((uint32_t)(accum1)) & mask; c[11] = ((uint32_t)(accum3)) & mask; - + accumC0 = accum2 >> 28; accumC1 = accum3 >> 28; } { - + /* t^3 terms */ smull(&accum1, ax = aa[5], bx = b[15]); smull(&accum3, ax = aa[6], bx); smlal(&accum1, ax, bx = b[14]); smlal(&accum3, ax = aa[7], bx); smlal(&accum1, ax, bx = b[13]); - + accum0 = accum1; accum2 = accum3; - + /* t^2 terms */ - + smlal(&accum2, ax = aa[0], bx); smlal(&accum0, ax, bx = b[12]); smlal(&accum2, ax = aa[1], bx); @@ -283,18 +272,17 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum2, ax = aa[4], bx); smlal(&accum0, ax, bx = b[8]); smlal(&accum2, ax = aa[5], bx); - - + smlal(&accum0, ax = a[13], bx = b[7]); smlal(&accum2, ax = a[14], bx); smlal(&accum0, ax, bx = b[6]); smlal(&accum2, ax = a[15], bx); smlal(&accum0, ax, bx = b[5]); - + /* t terms */ accum1 += accum0; accum3 += accum2; - + smlal(&accum3, ax = a[8], bx); smlal(&accum1, ax, bx = b[4]); smlal(&accum3, ax = a[9], bx); @@ -306,16 +294,15 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum3, ax = a[12], bx); smlal(&accum1, ax, bx = b[0]); smlal(&accum3, ax = a[13], bx); - - + smlal(&accum1, ax = a[5], bx = bm[7]); smlal(&accum3, ax = a[6], bx); smlal(&accum1, ax, bx = bm[6]); smlal(&accum3, ax = a[7], bx); smlal(&accum1, ax, bx = bm[5]); - + /* 1 terms */ - + smlal(&accum2, ax = a[0], bx); smlal(&accum0, ax, bx = bm[4]); smlal(&accum2, ax = a[1], bx); @@ -327,28 +314,28 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum2, ax = a[4], bx); smlal(&accum0, ax, bx = bm[0]); smlal(&accum2, ax = a[5], bx); - + accum0 += accumC0; accum1 += accumC1; accum2 += accum0 >> 28; accum3 += accum1 >> 28; - + c[4] = ((uint32_t)(accum0)) & mask; c[5] = ((uint32_t)(accum2)) & mask; c[12] = ((uint32_t)(accum1)) & mask; c[13] = ((uint32_t)(accum3)) & mask; - + accumC0 = accum2 >> 28; accumC1 = accum3 >> 28; } { - + /* t^3 terms */ smull(&accum1, ax = aa[7], bx = b[15]); accum0 = accum1; - + /* t^2 terms */ - + smull(&accum2, ax = aa[0], bx); smlal(&accum0, ax, bx = b[14]); smlal(&accum2, ax = aa[1], bx); @@ -364,14 +351,13 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum2, ax = aa[6], bx); smlal(&accum0, ax, bx = b[8]); smlal(&accum2, ax = aa[7], bx); - - + smlal(&accum0, ax = a[15], bx = b[7]); - + /* t terms */ accum1 += accum0; accum3 = accum2; - + smlal(&accum3, ax = a[8], bx); smlal(&accum1, ax, bx = b[6]); smlal(&accum3, ax = a[9], bx); @@ -387,12 +373,11 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum3, ax = a[14], bx); smlal(&accum1, ax, bx = b[0]); smlal(&accum3, ax = a[15], bx); - - + smlal(&accum1, ax = a[7], bx = bm[7]); - + /* 1 terms */ - + smlal(&accum2, ax = a[0], bx); smlal(&accum0, ax, bx = bm[6]); smlal(&accum2, ax = a[1], bx); @@ -408,17 +393,17 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum2, ax = a[6], bx); smlal(&accum0, ax, bx = bm[0]); smlal(&accum2, ax = a[7], bx); - + accum0 += accumC0; accum1 += accumC1; accum2 += accum0 >> 28; accum3 += accum1 >> 28; - + c[6] = ((uint32_t)(accum0)) & mask; c[7] = ((uint32_t)(accum2)) & mask; c[14] = ((uint32_t)(accum1)) & mask; c[15] = ((uint32_t)(accum3)) & mask; - + accum0 = accum2 >> 28; accum1 = accum3 >> 28; } @@ -428,28 +413,29 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { accum1 += c[0]; c[8] = ((uint32_t)(accum0)) & mask; c[0] = ((uint32_t)(accum1)) & mask; - + accum0 >>= 28; accum1 >>= 28; c[9] += ((uint32_t)(accum0)); c[1] += ((uint32_t)(accum1)); } -void gf_sqr (gf_s *__restrict__ cs, const gf as) { +void gf_sqr(gf_s * __restrict__ cs, const gf as) +{ const uint32_t *a = as->limb; uint32_t *c = cs->limb; uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp; - uint32_t mask = (1<<28) - 1; + uint32_t mask = (1 << 28) - 1; uint32_t bm[8]; - + int i; - for (i=0; i<8; i++) { - bm[i] = a[i] - a[i+8]; + for (i = 0; i < 8; i++) { + bm[i] = a[i] - a[i + 8]; } - uint32_t ax,bx; + uint32_t ax, bx; { /* t^3 terms */ smull2(&accum1, ax = a[9], bx = a[15]); @@ -459,14 +445,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum1, ax, bx = a[13]); smlal2(&accum3, ax = a[12], bx); smlal(&accum1, ax, ax); - + accum0 = accum1; accum2 = accum3; - + /* t^2 terms */ smlal2(&accum2, ax = a[8], a[9]); smlal(&accum0, ax, ax); - + smlal2(&accum0, ax = a[1], bx = a[7]); smlal2(&accum2, ax = a[2], bx); smlal2(&accum0, ax, bx = a[6]); @@ -474,18 +460,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum0, ax, bx = a[5]); smlal2(&accum2, ax = a[4], bx); smlal(&accum0, ax, ax); - + /* t terms */ accum1 += accum0; accum3 += accum2; smlal2(&accum3, ax = a[0], bx = a[1]); smlal(&accum1, ax, ax); - |