author     Andy Polyakov <appro@openssl.org>    2016-02-28 21:48:43 +0100
committer  Andy Polyakov <appro@openssl.org>    2016-03-02 13:11:38 +0100
commit     1ea8ae5090f557fea2e5b4d5758b10566825d74b (patch)
tree       79c0646cba72315661edfa581c4b3f0c3cd7844f /crypto/poly1305
parent     bdbd3aea590e45d52c7b120ea6eaff38295b5011 (diff)
poly1305/asm/poly1305-*.pl: flip horizontal add and reduction.
Strictly speaking, only the 32-bit AVX2 code path needs this, but I chose to harmonize all vector code paths.

RT#4346

Reviewed-by: Richard Levitte <levitte@openssl.org>
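What the flip amounts to: the vector code keeps the hash as five 26-bit limbs replicated across SIMD lanes, so the tail has to sum the lanes ("horizontal addition") and propagate carries between limbs ("lazy reduction"). After this patch the lane fold comes first, so the carry pass runs over the already-summed limbs and the stored, partially reduced state obeys the bounds the re-entry code assumes. A minimal scalar sketch of the reordered tail (illustration only, with simplified limb handling; not the OpenSSL code):

#include <stdint.h>

/* two lanes of five 26-bit limbs; fold lanes first, then carry */
static void tail(uint64_t h[5], const uint64_t lane1[5])
{
    uint64_t c;
    int i;

    /* horizontal addition: fold the second lane into the first */
    for (i = 0; i < 5; i++)
        h[i] += lane1[i];

    /* lazy reduction: one carry pass over the limbs; the carry
     * out of h[4] wraps to h[0] times 5, since 2^130 == 5 mod p */
    for (i = 0; i < 4; i++) {
        c = h[i] >> 26;
        h[i] &= 0x3ffffff;
        h[i + 1] += c;
    }
    c = h[4] >> 26;
    h[4] &= 0x3ffffff;
    h[0] += c * 5;
    /* the result may still be only partially reduced, as the
     * assembly comments note; full reduction happens at Final */
}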
Diffstat (limited to 'crypto/poly1305')
-rwxr-xr-x  crypto/poly1305/asm/poly1305-armv4.pl   | 18
-rwxr-xr-x  crypto/poly1305/asm/poly1305-armv8.pl   | 26
-rwxr-xr-x  crypto/poly1305/asm/poly1305-x86.pl     | 59
-rwxr-xr-x  crypto/poly1305/asm/poly1305-x86_64.pl  | 88
-rw-r--r--  crypto/poly1305/poly1305.c              | 31
5 files changed, 128 insertions(+), 94 deletions(-)
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl
index 86a6070bf4..06301aa2e1 100755
--- a/crypto/poly1305/asm/poly1305-armv4.pl
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -1057,6 +1057,15 @@ poly1305_blocks_neon:
.Lshort_tail:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing
vshr.u64 $T0,$D3,#26
@@ -1086,15 +1095,6 @@ poly1305_blocks_neon:
vadd.i64 $D1,$D1,$T0 @ h0 -> h1
vadd.i64 $D4,$D4,$T1 @ h3 -> h4
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 $D2#lo,$D2#lo,$D2#hi
- vadd.i64 $D0#lo,$D0#lo,$D0#hi
- vadd.i64 $D3#lo,$D3#lo,$D3#hi
- vadd.i64 $D1#lo,$D1#lo,$D1#hi
- vadd.i64 $D4#lo,$D4#lo,$D4#hi
-
cmp $len,#0
bne .Leven
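In the 32-bit NEON tail above, each accumulator $Dn is a 128-bit Q register whose two 64-bit halves hold per-lane sums, so the whole horizontal step is five vadd.i64 instructions adding the high D half into the low one. Roughly, in intrinsics (a sketch under that assumption, not the generated code):

#include <arm_neon.h>
#include <stdint.h>

/* fold the two 64-bit lanes of one accumulator, as the
 * vadd.i64 $Dn#lo,$Dn#lo,$Dn#hi instructions do above */
static uint64_t fold_lanes(uint64x2_t acc)
{
    return vgetq_lane_u64(acc, 0) + vgetq_lane_u64(acc, 1);
}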
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
index 79185d2bdd..f1359fd44a 100755
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -791,6 +791,19 @@ poly1305_blocks_neon:
.Lshort_tail:
////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC3,$ACC3,$ACC3
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC4,$ACC4,$ACC4
+ ldp d12,d13,[sp,#48]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d14,d15,[sp,#64]
+ addp $ACC2,$ACC2,$ACC2
+
+ ////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
ushr $T0.2d,$ACC3,#26
@@ -822,19 +835,6 @@ poly1305_blocks_neon:
add $ACC4,$ACC4,$T1.2d // h3 -> h4
////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp $ACC2,$ACC2,$ACC2
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp $ACC0,$ACC0,$ACC0
- ldp d10,d11,[sp,#32]
- addp $ACC1,$ACC1,$ACC1
- ldp d12,d13,[sp,#48]
- addp $ACC3,$ACC3,$ACC3
- ldp d14,d15,[sp,#64]
- addp $ACC4,$ACC4,$ACC4
-
- ////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
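The AArch64 tail performs the same fold with addp (pairwise add), interleaved with ldp loads that restore the callee-saved d8-d15 registers required by the AAPCS64; only the position relative to the reduction changes. One fold, sketched with intrinsics (illustrative, not the generated code):

#include <arm_neon.h>

/* ADDP Vd.2D,Vn.2D,Vm.2D adds adjacent lane pairs; with both
 * sources equal, each result lane holds lane0+lane1, matching
 * addp $ACCn,$ACCn,$ACCn above */
static uint64x2_t fold_lanes(uint64x2_t acc)
{
    return vpaddq_u64(acc, acc);
}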
diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl
index 7c1aee5fe0..fb9fa2bc34 100755
--- a/crypto/poly1305/asm/poly1305-x86.pl
+++ b/crypto/poly1305/asm/poly1305-x86.pl
@@ -536,6 +536,8 @@ my $base = shift; $base = "esp" if (!defined($base));
},"edx");
sub lazy_reduction {
+my $extra = shift;
+
################################################################
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
# and P. Schwabe
@@ -543,6 +545,7 @@ sub lazy_reduction {
&movdqa ($T0,$D3);
&pand ($D3,$MASK);
&psrlq ($T0,26);
+ &$extra () if (defined($extra));
&paddq ($T0,$D4); # h3 -> h4
&movdqa ($T1,$D0);
&pand ($D0,$MASK);
@@ -1091,21 +1094,21 @@ my $addr = shift;
&set_label("short_tail");
- &lazy_reduction ();
-
################################################################
# horizontal addition
+ &pshufd ($T1,$D4,0b01001110);
+ &pshufd ($T0,$D3,0b01001110);
+ &paddq ($D4,$T1);
+ &paddq ($D3,$T0);
&pshufd ($T1,$D0,0b01001110);
&pshufd ($T0,$D1,0b01001110);
- &paddd ($D0,$T1);
+ &paddq ($D0,$T1);
+ &paddq ($D1,$T0);
&pshufd ($T1,$D2,0b01001110);
- &paddd ($D1,$T0);
- &pshufd ($T0,$D3,0b01001110);
- &paddd ($D2,$T1);
- &pshufd ($T1,$D4,0b01001110);
- &paddd ($D3,$T0);
- &paddd ($D4,$T1);
+ #&paddq ($D2,$T1);
+
+ &lazy_reduction (sub { &paddq ($D2,$T1) });
&set_label("done");
&movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value
@@ -1113,8 +1116,8 @@ my $addr = shift;
&movd (&DWP(-16*3+4*2,"edi"),$D2);
&movd (&DWP(-16*3+4*3,"edi"),$D3);
&movd (&DWP(-16*3+4*4,"edi"),$D4);
-&set_label("nodata");
&mov ("esp","ebp");
+&set_label("nodata");
&function_end("_poly1305_blocks_sse2");
&align (32);
@@ -1435,7 +1438,7 @@ sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
&test ("eax","eax"); # is_base2_26?
&jz (&label("enter_blocks"));
-&set_label("enter_avx2",16);
+&set_label("enter_avx2");
&vzeroupper ();
&call (&label("pic_point"));
@@ -1731,31 +1734,31 @@ sub vlazy_reduction {
&vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); });
- &vlazy_reduction();
-
################################################################
# horizontal addition
+ &vpsrldq ($T0,$D4,8);
+ &vpsrldq ($T1,$D3,8);
+ &vpaddq ($D4,$D4,$T0);
&vpsrldq ($T0,$D0,8);
+ &vpaddq ($D3,$D3,$T1);
&vpsrldq ($T1,$D1,8);
&vpaddq ($D0,$D0,$T0);
&vpsrldq ($T0,$D2,8);
&vpaddq ($D1,$D1,$T1);
- &vpsrldq ($T1,$D3,8);
+ &vpermq ($T1,$D4,2); # keep folding
&vpaddq ($D2,$D2,$T0);
- &vpsrldq ($T0,$D4,8);
- &vpaddq ($D3,$D3,$T1);
- &vpermq ($T1,$D0,2); # keep folding
- &vpaddq ($D4,$D4,$T0);
+ &vpermq ($T0,$D3,2);
+ &vpaddq ($D4,$D4,$T1);
+ &vpermq ($T1,$D0,2);
+ &vpaddq ($D3,$D3,$T0);
&vpermq ($T0,$D1,2);
&vpaddq ($D0,$D0,$T1);
&vpermq ($T1,$D2,2);
&vpaddq ($D1,$D1,$T0);
- &vpermq ($T0,$D3,2);
&vpaddq ($D2,$D2,$T1);
- &vpermq ($T1,$D4,2);
- &vpaddq ($D3,$D3,$T0);
- &vpaddq ($D4,$D4,$T1);
+
+ &vlazy_reduction();
&cmp ("ecx",0);
&je (&label("done"));
@@ -1772,14 +1775,14 @@ sub vlazy_reduction {
&jmp (&label("even"));
&set_label("done",16);
- &vmovd (&DWP(-16*3+4*0,"edi"),"xmm0"); # store hash value
- &vmovd (&DWP(-16*3+4*1,"edi"),"xmm1");
- &vmovd (&DWP(-16*3+4*2,"edi"),"xmm2");
- &vmovd (&DWP(-16*3+4*3,"edi"),"xmm3");
- &vmovd (&DWP(-16*3+4*4,"edi"),"xmm4");
+ &vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
+ &vmovd (&DWP(-16*3+4*1,"edi"),&X($D1));
+ &vmovd (&DWP(-16*3+4*2,"edi"),&X($D2));
+ &vmovd (&DWP(-16*3+4*3,"edi"),&X($D3));
+ &vmovd (&DWP(-16*3+4*4,"edi"),&X($D4));
&vzeroupper ();
-&set_label("nodata");
&mov ("esp","ebp");
+&set_label("nodata");
&function_end("_poly1305_blocks_avx2");
}
&set_label("const_sse2",64);
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index b827d24b1a..2265664180 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -1198,6 +1198,20 @@ $code.=<<___;
.Lshort_tail_avx:
################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D4,$T4
+ vpsrldq \$8,$D3,$T3
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$D0,$T0
+ vpsrldq \$8,$D2,$T2
+ vpaddq $T3,$D3,$D3
+ vpaddq $T4,$D4,$D4
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$D2,$D2
+
+ ################################################################
# lazy reduction
vpsrlq \$26,$D3,$H3
@@ -1231,25 +1245,11 @@ $code.=<<___;
vpand $MASK,$D3,$D3
vpaddq $H3,$D4,$D4 # h3 -> h4
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D2,$T2
- vpsrldq \$8,$D0,$T0
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$D3,$T3
- vpsrldq \$8,$D4,$T4
- vpaddq $T2,$D2,$H2
- vpaddq $T0,$D0,$H0
- vpaddq $T1,$D1,$H1
- vpaddq $T3,$D3,$H3
- vpaddq $T4,$D4,$H4
-
- vmovd $H0,`4*0-48-64`($ctx) # save partially reduced
- vmovd $H1,`4*1-48-64`($ctx)
- vmovd $H2,`4*2-48-64`($ctx)
- vmovd $H3,`4*3-48-64`($ctx)
- vmovd $H4,`4*4-48-64`($ctx)
+ vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
+ vmovd $D1,`4*1-48-64`($ctx)
+ vmovd $D2,`4*2-48-64`($ctx)
+ vmovd $D3,`4*3-48-64`($ctx)
+ vmovd $D4,`4*4-48-64`($ctx)
___
$code.=<<___ if ($win64);
vmovdqa 0x50(%r11),%xmm6
@@ -1888,6 +1888,31 @@ $code.=<<___;
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$H2,$T2
+ vpsrldq \$8,$H3,$T3
+ vpsrldq \$8,$H4,$T4
+ vpsrldq \$8,$H0,$T0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+
+ vpermq \$0x2,$H3,$T3
+ vpermq \$0x2,$H4,$T4
+ vpermq \$0x2,$H0,$T0
+ vpermq \$0x2,$D1,$T1
+ vpermq \$0x2,$H2,$T2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+
+ ################################################################
# lazy reduction
vpsrlq \$26,$H3,$D3
@@ -1921,31 +1946,6 @@ $code.=<<___;
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$H2,$T2
- vpsrldq \$8,$H0,$T0
- vpsrldq \$8,$H1,$T1
- vpsrldq \$8,$H3,$T3
- vpsrldq \$8,$H4,$T4
- vpaddq $T2,$H2,$H2
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
-
- vpermq \$0x2,$H2,$T2
- vpermq \$0x2,$H0,$T0
- vpermq \$0x2,$H1,$T1
- vpermq \$0x2,$H3,$T3
- vpermq \$0x2,$H4,$T4
- vpaddq $T2,$H2,$H2
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
-
vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
vmovd %x#$H1,`4*1-48-64`($ctx)
vmovd %x#$H2,`4*2-48-64`($ctx)
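In the 64-bit AVX2 tail each YMM accumulator carries four 64-bit lane sums, so the fold takes two stages: vpsrldq $8 adds the high quadword of each 128-bit half, then vpermq $0x2 pulls the upper half's sum down to lane 0. In intrinsics (a sketch, not the generated code):

#include <immintrin.h>

/* two-stage fold of a 4x64-bit YMM accumulator: vpsrldq $8 adds
 * the odd lane of each 128-bit half, then vpermq $0x2 brings
 * lane 2 down so lane 0 holds the total */
static __m256i fold_lanes(__m256i d)
{
    d = _mm256_add_epi64(d, _mm256_srli_si256(d, 8));
    return _mm256_add_epi64(d, _mm256_permute4x64_epi64(d, 0x2));
}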
diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c
index 7c9f302bfc..303822e62a 100644
--- a/crypto/poly1305/poly1305.c
+++ b/crypto/poly1305/poly1305.c
@@ -668,6 +668,20 @@ static const struct poly1305_test poly1305_tests[] = {
"f248312e578d9d58f8b7bb4d19105431"
},
/*
+ * AVX2 in poly1305-x86.pl failed this with 176+32 split
+ */
+ {
+ "248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd"
+ "2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e8"
+ "74cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c"
+ "8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936a"
+ "ff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a37"
+ "09894e4eb0a4eedc4ae19468e66b81f2"
+ "71351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb",
+ "000102030405060708090a0b0c0d0e0f""00000000000000000000000000000000",
+ "bc939bc5281480fa99c6d68c258ec42f"
+ },
+ /*
* test vectors from Google
*/
{
@@ -844,6 +858,23 @@ int main()
printf("\n");
return 1;
}
+
+ for (half = 16; half < inlen; half += 16) {
+ Poly1305_Init(&poly1305, key);
+ Poly1305_Update(&poly1305, in, half);
+ Poly1305_Update(&poly1305, in+half, inlen-half);
+ Poly1305_Final(&poly1305, out);
+
+ if (memcmp(out, expected, sizeof(expected)) != 0) {
+ printf("Poly1305 test #%d/%d failed.\n", i, half);
+ printf("got: ");
+ hexdump(out, sizeof(out));
+ printf("\nexpected: ");
+ hexdump(expected, sizeof(expected));
+ printf("\n");
+ return 1;
+ }
+ }
}
free(in);
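The new loop replays every test vector through Poly1305_Update at each 16-byte split point, which forces the SIMD paths to save a partially reduced base 2^26 state and resume from it; for the 208-byte vector added above, the 176+32 split is the case the 32-bit AVX2 path failed. The same split in isolation (a hypothetical driver around the internal API; the include path is illustrative):

#include "internal/poly1305.h"   /* illustrative include path */

/* drive the 176+32 split that exposed the bug: the first call
 * runs the AVX2 bulk loop, the second resumes from the saved,
 * partially reduced base 2^26 state */
static void split_176_32(const unsigned char key[32],
                         const unsigned char in[208],
                         unsigned char out[16])
{
    POLY1305 poly1305;

    Poly1305_Init(&poly1305, key);
    Poly1305_Update(&poly1305, in, 176);
    Poly1305_Update(&poly1305, in + 176, 32);
    Poly1305_Final(&poly1305, out);
}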