author     Matt Caswell <matt@openssl.org>    2017-12-04 13:30:53 +0000
committer  Matt Caswell <matt@openssl.org>    2018-02-20 12:59:30 +0000
commit     8d55f844b08199e0ac6a2ddc501de39f3237c5e9 (patch)
tree       171d30861a2e4a75cd71b9ebfd37849a22860bf3
parent     205fd6388175704bd7597dbfb571c84f868ce6da (diff)
Manual formatting tweaks to Curve448 code
Following running openssl-format-source there were a lot of manual tweaks
that were required.

Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
(Merged from https://github.com/openssl/openssl/pull/5105)
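For reference, here is a minimal stand-alone C sketch of the layout the manual
tweaks settle on in the hunks below: declarations grouped at the top of the
block, one blank line before the first statement, and no braces around
single-statement for loops. This is illustration only, not code from the
commit; the simplified 16x28-bit gf type and the name gf_weak_reduce_sketch
are assumptions made here.

/* Illustration only -- simplified stand-in for the real curve448 gf type. */
#include <stdint.h>

typedef struct {
    uint32_t limb[16];              /* sixteen 28-bit limbs held in 32-bit words */
} gf_s, gf[1];

static void gf_weak_reduce_sketch(gf a)
{
    uint32_t mask = (1 << 28) - 1;
    uint32_t tmp = a->limb[15] >> 28;
    unsigned int i;

    a->limb[8] += tmp;                    /* fold the top carry into limb 8 */
    for (i = 15; i > 0; i--)              /* single-statement loop, no braces */
        a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
    a->limb[0] = (a->limb[0] & mask) + tmp;
}

The arch_32 and arch_arm_32 hunks below apply this same shape to gf_bias,
gf_weak_reduce and gf_mulw_unsigned.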
-rw-r--r--  crypto/ec/curve448/arch_32/f_impl.c                 4
-rw-r--r--  crypto/ec/curve448/arch_32/f_impl.h                 6
-rw-r--r--  crypto/ec/curve448/arch_arm_32/arch_intrinsics.h    3
-rw-r--r--  crypto/ec/curve448/arch_arm_32/f_impl.c            33
-rw-r--r--  crypto/ec/curve448/arch_arm_32/f_impl.h            10
-rw-r--r--  crypto/ec/curve448/arch_neon/arch_intrinsics.h      2
-rw-r--r--  crypto/ec/curve448/arch_neon/f_impl.c            1069
-rw-r--r--  crypto/ec/curve448/arch_neon/f_impl.h              30
-rw-r--r--  crypto/ec/curve448/arch_ref64/f_impl.c              9
-rw-r--r--  crypto/ec/curve448/arch_ref64/f_impl.h             11
-rw-r--r--  crypto/ec/curve448/arch_x86_64/arch_intrinsics.h  495
-rw-r--r--  crypto/ec/curve448/arch_x86_64/f_impl.c             4
-rw-r--r--  crypto/ec/curve448/arch_x86_64/f_impl.h             9
-rw-r--r--  crypto/ec/curve448/constant_time.h                122
-rw-r--r--  crypto/ec/curve448/curve448.c                     118
-rw-r--r--  crypto/ec/curve448/curve448_tables.c             1689
-rw-r--r--  crypto/ec/curve448/curve448utils.h                 49
-rw-r--r--  crypto/ec/curve448/ed448.h                        289
-rw-r--r--  crypto/ec/curve448/eddsa.c                         76
-rw-r--r--  crypto/ec/curve448/f_generic.c                     25
-rw-r--r--  crypto/ec/curve448/field.h                         14
-rw-r--r--  crypto/ec/curve448/point_448.h                    287
-rw-r--r--  crypto/ec/curve448/scalar.c                        78
-rw-r--r--  crypto/ec/curve448/word.h                           9
24 files changed, 1693 insertions, 2748 deletions
diff --git a/crypto/ec/curve448/arch_32/f_impl.c b/crypto/ec/curve448/arch_32/f_impl.c
index 76ec9711f0..3d8a331e3e 100644
--- a/crypto/ec/curve448/arch_32/f_impl.c
+++ b/crypto/ec/curve448/arch_32/f_impl.c
@@ -24,13 +24,11 @@ void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
-
uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
uint32_t mask = (1 << 28) - 1;
-
uint32_t aa[8], bb[8];
-
int i, j;
+
for (i = 0; i < 8; i++) {
aa[i] = a[i] + a[i + 8];
bb[i] = b[i] + b[i + 8];
diff --git a/crypto/ec/curve448/arch_32/f_impl.h b/crypto/ec/curve448/arch_32/f_impl.h
index 25bfa1f79e..40a9fb93d9 100644
--- a/crypto/ec/curve448/arch_32/f_impl.h
+++ b/crypto/ec/curve448/arch_32/f_impl.h
@@ -39,9 +39,8 @@ void gf_bias(gf a, int amt)
unsigned int i;
uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt;
- for (i = 0; i < sizeof(*a) / sizeof(a->limb[0]); i++) {
+ for (i = 0; i < sizeof(*a) / sizeof(a->limb[0]); i++)
a->limb[i] += (i == sizeof(*a) / sizeof(a->limb[0]) / 2) ? co2 : co1;
- }
}
void gf_weak_reduce(gf a)
@@ -51,8 +50,7 @@ void gf_weak_reduce(gf a)
unsigned int i;
a->limb[8] += tmp;
- for (i = 15; i > 0; i--) {
+ for (i = 15; i > 0; i--)
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
- }
a->limb[0] = (a->limb[0] & mask) + tmp;
}
diff --git a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
index 73b82755c3..aa578a40e5 100644
--- a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
@@ -19,7 +19,8 @@ static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a)
{
uint32_t ret;
- asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
+
+ asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret;
}
diff --git a/crypto/ec/curve448/arch_arm_32/f_impl.c b/crypto/ec/curve448/arch_arm_32/f_impl.c
index 25e970389b..5956d6ceba 100644
--- a/crypto/ec/curve448/arch_arm_32/f_impl.c
+++ b/crypto/ec/curve448/arch_arm_32/f_impl.c
@@ -19,9 +19,10 @@ static inline void __attribute__ ((gnu_inline, always_inline))
#ifdef __ARMEL__
uint32_t lo = *acc, hi = (*acc) >> 32;
- __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
- [hi] "+&r"(hi)
- :[a] "r"(a),[b] "r"(b));
+ __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
+ : [lo]"+&r"(lo), [hi]"+&r"(hi)
+ : [a]"r"(a), [b]"r"(b));
+
*acc = lo + (((uint64_t)hi) << 32);
#else
@@ -35,9 +36,11 @@ static inline void __attribute__ ((gnu_inline, always_inline))
#ifdef __ARMEL__
uint32_t lo = *acc, hi = (*acc) >> 32;
- __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
- [hi] "+&r"(hi)
- :[a] "r"(a),[b] "r"(2 * b));
+ __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
+ : [lo]"+&r"(lo), [hi]"+&r"(hi)
+ : [a]"r"(a), [b]"r"(2 * b));
+
+
*acc = lo + (((uint64_t)hi) << 32);
#else
@@ -51,9 +54,9 @@ static inline void __attribute__ ((gnu_inline, always_inline))
#ifdef __ARMEL__
uint32_t lo, hi;
- __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo),
- [hi] "=&r"(hi)
- :[a] "r"(a),[b] "r"(b));
+ __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
+ : [lo]"=&r"(lo), [hi]"=&r"(hi)
+ : [a]"r"(a), [b]"r"(b));
*acc = lo + (((uint64_t)hi) << 32);
#else
@@ -68,8 +71,8 @@ static inline void __attribute__ ((gnu_inline, always_inline))
uint32_t lo, hi;
__asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
- : [lo] "=&r"(lo),[hi] "=&r"(hi)
- : [a] "r"(a),[b] "r"(2 * b));
+ : [lo]"=&r"(lo), [hi]"=&r"(hi)
+ : [a]"r"(a), [b]"r"(2*b));
*acc = lo + (((uint64_t)hi) << 32);
#else
@@ -729,16 +732,14 @@ void gf_sqr(gf_s * __restrict__ cs, const gf as)
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
uint32_t mask = (1ull << 28) - 1;
- assert(b <= mask);
-
const uint32_t *a = as->limb;
uint32_t *c = cs->limb;
-
uint64_t accum0, accum8;
-
int i;
-
uint32_t c0, c8, n0, n8;
+
+ assert(b <= mask);
+
c0 = a[0];
c8 = a[8];
accum0 = widemul(b, c0);
diff --git a/crypto/ec/curve448/arch_arm_32/f_impl.h b/crypto/ec/curve448/arch_arm_32/f_impl.h
index 9008619ce7..2e385d5214 100644
--- a/crypto/ec/curve448/arch_arm_32/f_impl.h
+++ b/crypto/ec/curve448/arch_arm_32/f_impl.h
@@ -23,10 +23,6 @@ void gf_add_RAW(gf out, const gf a, const gf b)
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
}
- /*
- * for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
- * out->limb[i] = a->limb[i] + b->limb[i]; }
- */
}
void gf_sub_RAW(gf out, const gf a, const gf b)
@@ -35,10 +31,6 @@ void gf_sub_RAW(gf out, const gf a, const gf b)
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
}
- /*
- * for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
- * out->limb[i] = a->limb[i] - b->limb[i]; }
- */
}
void gf_bias(gf a, int amt)
@@ -47,6 +39,7 @@ void gf_bias(gf a, int amt)
uint32x4_t lo = { co1, co1, co1, co1 }, hi = {
co2, co1, co1, co1};
uint32x4_t *aa = (uint32x4_t *) a;
+
aa[0] += lo;
aa[1] += lo;
aa[2] += hi;
@@ -57,6 +50,7 @@ void gf_weak_reduce(gf a)
{
uint64_t mask = (1ull << 28) - 1;
uint64_t tmp = a->limb[15] >> 28;
+
a->limb[8] += tmp;
for (unsigned int i = 15; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
diff --git a/crypto/ec/curve448/arch_neon/arch_intrinsics.h b/crypto/ec/curve448/arch_neon/arch_intrinsics.h
index 201b3735b3..3947b43485 100644
--- a/crypto/ec/curve448/arch_neon/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_neon/arch_intrinsics.h
@@ -19,7 +19,7 @@ static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a)
{
uint32_t ret;
- __asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
+ __asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret;
}
diff --git a/crypto/ec/curve448/arch_neon/f_impl.c b/crypto/ec/curve448/arch_neon/f_impl.c
index fe21aee128..7f6fda24f9 100644
--- a/crypto/ec/curve448/arch_neon/f_impl.c
+++ b/crypto/ec/curve448/arch_neon/f_impl.c
@@ -12,588 +12,583 @@
#include "f_field.h"
-static __inline__ uint64x2_t __attribute__ ((gnu_inline, always_inline, unused))
- xx_vaddup_u64(uint64x2_t x)
+static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
+xx_vaddup_u64(uint64x2_t x)
{
- __asm__("vadd.s64 %f0, %e0":"+w"(x));
+ __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
return x;
}
-static __inline__ int64x2_t __attribute__ ((gnu_inline, always_inline, unused))
- vrev128_s64(int64x2_t x)
+static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
+vrev128_s64(int64x2_t x)
{
- __asm__("vswp.s64 %e0, %f0":"+w"(x));
+ __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
return x;
}
-static __inline__ uint64x2_t __attribute__ ((gnu_inline, always_inline))
- vrev128_u64(uint64x2_t x)
+static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
+vrev128_u64(uint64x2_t x)
{
- __asm__("vswp.s64 %e0, %f0":"+w"(x));
+ __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
return x;
}
-static inline void __attribute__ ((gnu_inline, always_inline, unused))
- smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
+static inline void __attribute__((gnu_inline,always_inline,unused))
+smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
{
- *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b;
+ *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}
-static inline void __attribute__ ((gnu_inline, always_inline, unused))
- smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
+static inline void __attribute__((gnu_inline,always_inline,unused))
+smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
- *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b *2;
+ *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}
-static inline void __attribute__ ((gnu_inline, always_inline, unused))
- smull(uint64_t *acc, const uint32_t a, const uint32_t b)
+static inline void __attribute__((gnu_inline,always_inline,unused))
+smull(uint64_t *acc, const uint32_t a, const uint32_t b)
{
- *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b;
+ *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}
-static inline void __attribute__ ((gnu_inline, always_inline, unused))
- smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
+static inline void __attribute__((gnu_inline,always_inline,unused))
+smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
- *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b *2;
+ *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}
-void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
+void gf_mul(gf_s *__restrict__ cs, const gf as, const gf bs)
{
-#define _bl0 "q0"
-#define _bl0_0 "d0"
-#define _bl0_1 "d1"
-#define _bh0 "q1"
-#define _bh0_0 "d2"
-#define _bh0_1 "d3"
-#define _bs0 "q2"
-#define _bs0_0 "d4"
-#define _bs0_1 "d5"
-#define _bl2 "q3"
-#define _bl2_0 "d6"
-#define _bl2_1 "d7"
-#define _bh2 "q4"
-#define _bh2_0 "d8"
-#define _bh2_1 "d9"
-#define _bs2 "q5"
-#define _bs2_0 "d10"
-#define _bs2_1 "d11"
-
-#define _as0 "q6"
-#define _as0_0 "d12"
-#define _as0_1 "d13"
-#define _as2 "q7"
-#define _as2_0 "d14"
-#define _as2_1 "d15"
-#define _al0 "q8"
-#define _al0_0 "d16"
-#define _al0_1 "d17"
-#define _ah0 "q9"
-#define _ah0_0 "d18"
-#define _ah0_1 "d19"
-#define _al2 "q10"
-#define _al2_0 "d20"
-#define _al2_1 "d21"
-#define _ah2 "q11"
-#define _ah2_0 "d22"
-#define _ah2_1 "d23"
-
-#define _a0a "q12"
-#define _a0a_0 "d24"
-#define _a0a_1 "d25"
-#define _a0b "q13"
-#define _a0b_0 "d26"
-#define _a0b_1 "d27"
-#define _a1a "q14"
-#define _a1a_0 "d28"
-#define _a1a_1 "d29"
-#define _a1b "q15"
-#define _a1b_0 "d30"
-#define _a1b_1 "d31"
-#define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
-#define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
-#define VOP2(op,result,a) #op" "result", "a"\n\t"
-
- int32x2_t *vc = (int32x2_t *) cs->limb;
-
- __asm__ __volatile__("vld2.32 {" _al0_0 "," _al0_1 "," _ah0_0 "," _ah0_1
- "}, [%[a],:128]!" "\n\t" VOP3(vadd.i32, _as0, _al0,
- _ah0)
- "vld2.32 {" _bl0_0 "," _bl0_1 "," _bh0_0 "," _bh0_1
- "}, [%[b],:128]!" "\n\t" VOP3(vadd.i32, _bs0_1, _bl0_1,
- _bh0_1) VOP3(vsub.i32,
- _bs0_0,
- _bl0_0,
- _bh0_0)
- "vld2.32 {" _bl2_0 "," _bl2_1 "," _bh2_0 "," _bh2_1
- "}, [%[b],:128]!" "\n\t" VOP3(vadd.i32, _bs2, _bl2,
- _bh2)
- "vld2.32 {" _al2_0 "," _al2_1 "," _ah2_0 "," _ah2_1
- "}, [%[a],:128]!" "\n\t" VOP3(vadd.i32, _as2, _al2,
- _ah2)
- VMAC(vmull.s32, _a0b, _as0_1, _bs2_1,
- 0) VMAC(vmlal.s32, _a0b, _as2_0, _bs2_0,
- 0) VMAC(vmlal.s32, _a0b, _as2_1, _bs0_1,
- 0) VMAC(vmlal.s32, _a0b, _as0_0,
- _bh0_0, 0)
- VMAC(vmull.s32, _a1b, _as0_1, _bs2_1,
- 1) VMAC(vmlal.s32, _a1b, _as2_0, _bs2_0,
- 1) VMAC(vmlal.s32, _a1b, _as2_1, _bs0_1,
- 1) VMAC(vmlal.s32, _a1b, _as0_0,
- _bh0_0, 1)
- VOP2(vmov, _a0a, _a0b) VMAC(vmlal.s32, _a0a, _ah0_1,
- _bh2_1, 0) VMAC(vmlal.s32,
- _a0a,
- _ah2_0,
- _bh2_0,
- 0)
- VMAC(vmlal.s32, _a0a, _ah2_1, _bh0_1,
- 0) VMAC(vmlal.s32, _a0a, _ah0_0, _bl0_0, 0)
- VMAC(vmlsl.s32, _a0b, _al0_1, _bl2_1,
- 0) VMAC(vmlsl.s32, _a0b, _al2_0, _bl2_0,
- 0) VMAC(vmlsl.s32, _a0b, _al2_1, _bl0_1,
- 0) VMAC(vmlal.s32, _a0b, _al0_0,
- _bs0_0, 0)
- VOP2(vmov, _a1a, _a1b) VMAC(vmlal.s32, _a1a, _ah0_1,
- _bh2_1, 1) VMAC(vmlal.s32,
- _a1a,
- _ah2_0,
- _bh2_0,
- 1)
- VMAC(vmlal.s32, _a1a, _ah2_1, _bh0_1,
- 1) VMAC(vmlal.s32, _a1a, _ah0_0, _bl0_0, 1)
- VOP2(vswp, _a0b_1, _a0a_0)
- VMAC(vmlsl.s32, _a1b, _al0_1, _bl2_1, 1)
- VMAC(vmlsl.s32, _a1b, _al2_0, _bl2_0, 1)
- VMAC(vmlsl.s32, _a1b, _al2_1, _bl0_1, 1)
- VMAC(vmlal.s32, _a1b, _al0_0, _bs0_0, 1)
- VOP3(vsra.u64, _a0a, _a0b, "#28")
- VOP3(vsub.i32, _bs0_1, _bl0_1, _bh0_1)
- VOP2(vmovn.i64, _a0b_0, _a0b)
- VOP2(vswp, _a1b_1, _a1a_0)
- VOP3(vadd.i64, _a1b, _a0a, _a1b)
- VMAC(vmull.s32, _a0a, _as2_0, _bs2_1, 0)
- VOP2(vmovn.i64, _a0b_1, _a1b)
- VMAC(vmlal.s32, _a0a, _as2_1, _bs2_0, 0)
- VOP3(vsra.u64, _a1a, _a1b, "#28")
- VMAC(vmlal.s32, _a0a, _as0_0, _bh0_1, 0)
- VOP2(vbic.i32, _a0b, "#0xf0000000")
- VMAC(vmlal.s32, _a0a, _as0_1, _bh0_0, 0)
- "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
- VMAC(vmull.s32, _a1b, _as2_0, _bs2_1, 1)
- VMAC(vmlal.s32, _a1b, _as2_1, _bs2_0, 1)
- VMAC(vmlal.s32, _a1b, _as0_0, _bh0_1, 1)
- VMAC(vmlal.s32, _a1b, _as0_1, _bh0_0, 1)
- VOP2(vmov, _a0b_1, _a0a_1)
- VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
- VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
- VMAC(vmlal.s32, _a0a, _ah2_0, _bh2_1, 0)
- VMAC(vmlal.s32, _a0a, _ah2_1, _bh2_0, 0)
- VMAC(vmlal.s32, _a0a, _ah0_0, _bl0_1, 0)
- VMAC(vmlal.s32, _a0a, _ah0_1, _bl0_0, 0)
- VMAC(vmlsl.s32, _a0b, _al2_0, _bl2_1, 0)
- VMAC(vmlsl.s32, _a0b, _al2_1, _bl2_0, 0)
- VMAC(vmlal.s32, _a0b, _al0_0, _bs0_1, 0)
- VMAC(vmlal.s32, _a0b, _al0_1, _bs0_0, 0)
- VOP2(vmov, _a1a, _a1b)
- VMAC(vmlal.s32, _a1a, _ah2_0, _bh2_1, 1)
- VMAC(vmlal.s32, _a1a, _ah2_1, _bh2_0, 1)
- VMAC(vmlal.s32, _a1a, _ah0_0, _bl0_1, 1)
- VMAC(vmlal.s32, _a1a, _ah0_1, _bl0_0, 1)
- VOP2(vswp, _a0b_1, _a0a_0)
- VMAC(vmlsl.s32, _a1b, _al2_0, _bl2_1, 1)
- VMAC(vmlsl.s32, _a1b, _al2_1, _bl2_0, 1)
- VMAC(vmlal.s32, _a1b, _al0_0, _bs0_1, 1)
- VMAC(vmlal.s32, _a1b, _al0_1, _bs0_0, 1)
- VOP3(vsra.u64, _a0a, _a0b, "#28")
- VOP3(vsub.i32, _bs2_0, _bl2_0, _bh2_0)
- VOP2(vmovn.i64, _a0b_0, _a0b)
- VOP2(vswp, _a1b_1, _a1a_0)
- VOP3(vadd.i64, _a1b, _a0a, _a1b)
- VMAC(vmull.s32, _a0a, _as2_1, _bs2_1, 0)
- VOP2(vmovn.i64, _a0b_1, _a1b)
- VMAC(vmlal.s32, _a0a, _as0_0, _bh2_0, 0)
- VOP3(vsra.u64, _a1a, _a1b, "#28")
- VMAC(vmlal.s32, _a0a, _as0_1, _bh0_1, 0)
- VOP2(vbic.i32, _a0b, "#0xf0000000")
- VMAC(vmlal.s32, _a0a, _as2_0, _bh0_0, 0)
- "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
- VMAC(vmull.s32, _a1b, _as2_1, _bs2_1, 1)
- VMAC(vmlal.s32, _a1b, _as0_0, _bh2_0, 1)
- VMAC(vmlal.s32, _a1b, _as0_1, _bh0_1, 1)
- VMAC(vmlal.s32, _a1b, _as2_0, _bh0_0, 1)
- VOP2(vmov, _a0b_1, _a0a_1)
- VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
- VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
- VMAC(vmlal.s32, _a0a, _ah2_1, _bh2_1, 0)
- VMAC(vmlal.s32, _a0a, _ah0_0, _bl2_0, 0)
- VMAC(vmlal.s32, _a0a, _ah0_1, _bl0_1, 0)
- VMAC(vmlal.s32, _a0a, _ah2_0, _bl0_0, 0)
- VMAC(vmlsl.s32, _a0b, _al2_1, _bl2_1, 0)
- VMAC(vmlal.s32, _a0b, _al0_0, _bs2_0, 0)
- VMAC(vmlal.s32, _a0b, _al0_1, _bs0_1, 0)
- VMAC(vmlal.s32, _a0b, _al2_0, _bs0_0, 0)
- VOP2(vmov, _a1a, _a1b)
- VMAC(vmlal.s32, _a1a, _ah2_1, _bh2_1, 1)
- VMAC(vmlal.s32, _a1a, _ah0_0, _bl2_0, 1)
- VMAC(vmlal.s32, _a1a, _ah0_1, _bl0_1, 1)
- VMAC(vmlal.s32, _a1a, _ah2_0, _bl0_0, 1)
- VOP2(vswp, _a0b_1, _a0a_0)
- VMAC(vmlsl.s32, _a1b, _al2_1, _bl2_1, 1)
- VMAC(vmlal.s32, _a1b, _al0_0, _bs2_0, 1)
- VMAC(vmlal.s32, _a1b, _al0_1, _bs0_1, 1)
- VMAC(vmlal.s32, _a1b, _al2_0, _bs0_0, 1)
- VOP3(vsub.i32, _bs2_1, _bl2_1, _bh2_1)
- VOP3(vsra.u64, _a0a, _a0b, "#28")
- VOP2(vmovn.i64, _a0b_0, _a0b)
- VOP2(vswp, _a1b_1, _a1a_0)
- VOP3(vadd.i64, _a1b, _a0a, _a1b)
- VMAC(vmull.s32, _a0a, _as0_0, _bh2_1, 0)
- VOP2(vmovn.i64, _a0b_1, _a1b)
- VMAC(vmlal.s32, _a0a, _as0_1, _bh2_0, 0)
- VOP3(vsra.u64, _a1a, _a1b, "#28")
- VMAC(vmlal.s32, _a0a, _as2_0, _bh0_1, 0)
- VOP2(vbic.i32, _a0b, "#0xf0000000")
- VMAC(vmlal.s32, _a0a, _as2_1, _bh0_0, 0)
- "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
- VMAC(vmull.s32, _a1b, _as0_0, _bh2_1, 1)
- VMAC(vmlal.s32, _a1b, _as0_1, _bh2_0, 1)
- VMAC(vmlal.s32, _a1b, _as2_0, _bh0_1, 1)
- VMAC(vmlal.s32, _a1b, _as2_1, _bh0_0, 1)
- VOP2(vmov, _a0b_1, _a0a_1)
- VOP3(vadd.i64, _a0b_0, _a0a_0, _a1a_0)
- VOP3(vadd.i64, _a0a_0, _a0a_0, _a1a_1)
- VMAC(vmlal.s32, _a0a, _ah0_0, _bl2_1, 0)
- VMAC(vmlal.s32, _a0a, _ah0_1, _bl2_0, 0)
- VMAC(vmlal.s32, _a0a, _ah2_0, _bl0_1, 0)
- VMAC(vmlal.s32, _a0a, _ah2_1, _bl0_0, 0)
- VMAC(vmlal.s32, _a0b, _al0_0, _bs2_1, 0)
- VMAC(vmlal.s32, _a0b, _al0_1, _bs2_0, 0)
- VMAC(vmlal.s32, _a0b, _al2_0, _bs0_1, 0)
- VMAC(vmlal.s32, _a0b, _al2_1, _bs0_0, 0)
- VOP2(vmov, _a1a, _a1b)
- VMAC(vmlal.s32, _a1a, _ah0_0, _bl2_1, 1)
- VMAC(vmlal.s32, _a1a, _ah0_1, _bl2_0, 1)
- VMAC(vmlal.s32, _a1a, _ah2_0, _bl0_1, 1)
- VMAC(vmlal.s32, _a1a, _ah2_1, _bl0_0, 1)
- VOP2(vswp, _a0b_1, _a0a_0)
- VMAC(vmlal.s32, _a1b, _al0_0, _bs2_1, 1)
- VMAC(vmlal.s32, _a1b, _al0_1, _bs2_0, 1)
- VMAC(vmlal.s32, _a1b, _al2_0, _bs0_1, 1)
- VMAC(vmlal.s32, _a1b, _al2_1, _bs0_0, 1)
- VOP3(vsra.u64, _a0a, _a0b, "#28")
- VOP2(vmovn.i64, _a0b_0, _a0b)
- VOP2(vswp, _a1b_1, _a1a_0)
- VOP3(vadd.i64, _a0a, _a0a, _a1b)
- VOP2(vmovn.i64, _a0b_1, _a0a)
- VOP3(vsra.u64, _a1a, _a0a, "#28")
- VOP2(vbic.i32, _a0b, "#0xf0000000")
- VOP2(vswp, _a1a_0, _a1a_1)
- "vstmia %[c]!, {" _a0b_0 ", " _a0b_1 "}" "\n\t"
- "sub %[c], #64" "\n\t"
- VOP3(vadd.i64, _a1a_1, _a1a_1, _a1a_0)
- "vldmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
- "\n\t" VOP2(vaddw.s32, _a1a, _a0a_0) VOP2(vmovn.i64,
- _a0a_0,
- _a1a)
- VOP2(vshr.s64, _a1a, "#28")
- VOP2(vaddw.s32, _a1a, _a0a_1) VOP2(vmovn.i64, _a0a_1,
- _a1a) VOP2(vshr.s64,
- _a1a,
- "#28")
- VOP2(vbic.i32, _a0a, "#0xf0000000")
- VOP2(vaddw.s32, _a1a, _a0b_0)
- VOP2(vmovn.i64, _a0b_0, _a1a)
- "vstmia %[c], {" _a0a_0 ", " _a0a_1 ", " _a0b_0 "}"
- "\n\t":[a] "+r"(as)
- ,[b] "+r"(bs)
- ,[c] "+r"(vc)
-
- ::"q0", "q1", "q2", "q3",
- "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15", "memory");
+ #define _bl0 "q0"
+ #define _bl0_0 "d0"
+ #define _bl0_1 "d1"
+ #define _bh0 "q1"
+ #define _bh0_0 "d2"
+ #define _bh0_1 "d3"
+ #define _bs0 "q2"
+ #define _bs0_0 "d4"
+ #define _bs0_1 "d5"
+ #define _bl2 "q3"
+ #define _bl2_0 "d6"
+ #define _bl2_1 "d7"
+ #define _bh2 "q4"
+ #define _bh2_0 "d8"
+ #define _bh2_1 "d9"
+ #define _bs2 "q5"
+ #define _bs2_0 "d10"
+ #define _bs2_1 "d11"
+
+ #define _as0 "q6"
+ #define _as0_0 "d12"
+ #define _as0_1 "d13"
+ #define _as2 "q7"
+ #define _as2_0 "d14"
+ #define _as2_1 "d15"
+ #define _al0 "q8"
+ #define _al0_0 "d16"
+ #define _al0_1 "d17"
+ #define _ah0 "q9"
+ #define _ah0_0 "d18"
+ #define _ah0_1 "d19"
+ #define _al2 "q10"
+ #define _al2_0 "d20"
+ #define _al2_1 "d21"
+ #define _ah2 "q11"
+ #define _ah2_0 "d22"
+ #define _ah2_1 "d23"
+
+ #define _a0a "q12"
+ #define _a0a_0 "d24"
+ #define _a0a_1 "d25"
+ #define _a0b "q13"
+ #define _a0b_0 "d26"
+ #define _a0b_1 "d27"
+ #define _a1a "q14"
+ #define _a1a_0 "d28"
+ #define _a1a_1 "d29"
+ #define _a1b "q15"
+ #define _a1b_0 "d30"
+ #define _a1b_1 "d31"
+ #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
+ #define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
+ #define VOP2(op,result,a) #op" "result", "a"\n\t"
+
+ int32x2_t *vc = (int32x2_t*) cs->limb;
+
+ __asm__ __volatile__(
+
+ "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
+ VOP3(vadd.i32,_as0,_al0,_ah0)
+
+ "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
+ VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
+ VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)
+
+ "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
+ VOP3(vadd.i32,_bs2,_bl2,_bh2)
+
+ "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
+ VOP3(vadd.i32,_as2,_al2,_ah2)
+
+ VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
+ VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
+ VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
+ VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)
+
+ VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
+ VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
+ VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
+ VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)
+
+ VOP2(vmov,_a0a,_a0b)
+ VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
+ VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
+ VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
+ VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)
+
+ VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
+ VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
+ VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
+ VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)
+
+ VOP2(vmov,_a1a,_a1b)
+ VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
+ VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
+ VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
+ VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)
+
+ VOP2(vswp,_a0b_1,_a0a_0)
+
+ VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
+ VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
+ VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
+ VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)
+
+ VOP3(vsra.u64,_a0a,_a0b,"#28")
+ VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
+ VOP2(vmovn.i64,_a0b_0,_a0b)
+
+ VOP2(vswp,_a1b_1,_a1a_0)
+ VOP3(vadd.i64,_a1b,_a0a,_a1b)
+
+
+ VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
+ VOP2(vmovn.i64,_a0b_1,_a1b)
+ VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
+ VOP3(vsra.u64,_a1a,_a1b,"#28")
+ VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
+ VOP2(vbic.i32,_a0b,"#0xf0000000")
+ VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
+ "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
+
+ VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
+ VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
+ VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
+ VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)
+
+ VOP2(vmov,_a0b_1,_a0a_1)
+ VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
+ VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
+ VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
+ VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
+ VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
+ VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)
+
+ VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
+ VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
+ VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
+ VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)
+
+ VOP2(vmov,_a1a,_a1b)
+ VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
+ VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
+ VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
+ VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)
+
+ VOP2(vswp,_a0b_1,_a0a_0)
+
+ VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
+ VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
+ VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
+ VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)
+
+ VOP3(vsra.u64,_a0a,_a0b,"#28")
+ VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
+ VOP2(vmovn.i64,_a0b_0,_a0b)
+
+ VOP2(vswp,_a1b_1,_a1a_0)
+ VOP3(vadd.i64,_a1b,_a0a,_a1b)
+
+ VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
+ VOP2(vmovn.i64,_a0b_1,_a1b)
+ VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
+ VOP3(vsra.u64,_a1a,_a1b,"#28")
+ VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
+ VOP2(vbic.i32,_a0b,"#0xf0000000")
+ VMA