path: root/crypto
author    Andy Polyakov <appro@openssl.org>    2015-01-23 17:27:10 +0100
committer Andy Polyakov <appro@openssl.org>    2015-04-20 15:43:05 +0200
commit    23f6eec71dbd472044db7dc854599f1de14a1f48 (patch)
tree      1a5a4d31fd89dfd3d2ecb3d698a777387672d6c0 /crypto
parent    313e6ec11fb8a7bda1676ce5804bee8755664141 (diff)
aes/asm/aesni-x86[_64].pl update.
This addresses
- request for improvement for faster key setup in RT#3576;
- clearing registers and stack in RT#3554 (this is more of a gesture to see
  if there will be some traction from compiler side);
- more commentary around input parameters handling and stack layout (desired
  when RT#3553 was reviewed);
- minor size and single block performance optimization (was lying around);

Reviewed-by: Matt Caswell <matt@openssl.org>
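[Editorial note] The "faster key setup" item refers to the new *_alt key-schedule paths added in this patch, which avoid aeskeygenassist on cores where it is slow and instead build each round key from a pshufb/aesenclast pair plus the usual shift-and-xor cascade, driven by the key_const table (byte-rotation mask, round constants 1 and 0x1b). For orientation only, a rough C-intrinsics sketch of the 128-bit case follows; the function names are invented for this note and are not OpenSSL API. Build with e.g. gcc -O2 -mssse3 -maes.

    #include <stdint.h>
    #include <tmmintrin.h>   /* _mm_shuffle_epi8 (SSSE3)        */
    #include <wmmintrin.h>   /* _mm_aesenclast_si128 (AES-NI)   */

    /* One expansion step: previous round key + round constant -> next round key. */
    static __m128i aes128_step(__m128i key, __m128i rcon)
    {
        /* Broadcast RotWord(last word of key) into all four lanes; with all
         * columns equal, ShiftRows inside AESENCLAST is a no-op, so the
         * instruction yields SubWord(RotWord(w3)) ^ rcon in every lane.
         * 0x0c0f0e0d is key_const[0] from the patch. */
        __m128i t = _mm_shuffle_epi8(key, _mm_set1_epi32(0x0c0f0e0d));
        t = _mm_aesenclast_si128(t, rcon);

        /* The pslldq/pxor cascade: key ^= key<<32 ^ key<<64 ^ key<<96. */
        __m128i s = _mm_slli_si128(key, 4);
        key = _mm_xor_si128(key, s);
        s = _mm_slli_si128(s, 4);
        key = _mm_xor_si128(key, s);
        s = _mm_slli_si128(s, 4);
        key = _mm_xor_si128(key, s);

        return _mm_xor_si128(key, t);
    }

    /* Expand a 16-byte user key into the 11 AES-128 round keys. */
    static void expand128_alt(const uint8_t userkey[16], __m128i rk[11])
    {
        __m128i key  = _mm_loadu_si128((const __m128i *)userkey);
        __m128i rcon = _mm_set1_epi32(1);            /* key_const[2]      */
        rk[0] = key;
        for (int i = 1; i <= 8; i++) {               /* rcon = 1,2,...,0x80 */
            key   = aes128_step(key, rcon);
            rcon  = _mm_slli_epi32(rcon, 1);
            rk[i] = key;
        }
        rcon   = _mm_set1_epi32(0x1b);               /* key_const[3]      */
        key    = aes128_step(key, rcon);
        rk[9]  = key;
        rcon   = _mm_slli_epi32(rcon, 1);            /* 0x36              */
        rk[10] = aes128_step(key, rcon);
    }

The point of the trick is that once RotWord(w3) is replicated into every lane, AESENCLAST performs exactly the SubWord-and-add-Rcon step of the FIPS-197 key schedule, with no dependence on the slower aeskeygenassist instruction.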
Diffstat (limited to 'crypto')
-rw-r--r--  crypto/aes/asm/aesni-x86.pl     319
-rw-r--r--  crypto/aes/asm/aesni-x86_64.pl  945
2 files changed, 1025 insertions, 239 deletions
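[Editorial note] The "clearing registers and stack" item shows up throughout both files as pxor of each xmm register with itself and movdqa/movaps of a zeroed register over stack scratch before the epilogue. A minimal, hypothetical C-intrinsics analogue of that pattern follows (names invented, not OpenSSL code); as the commit message itself says, at this level it is only "a gesture" unless the compiler is prevented from discarding the stores.

    #include <emmintrin.h>

    /* Zero the caller-provided 16-byte stack slots and the working zero
     * register before returning, mirroring the pxor/movdqa sequences the
     * patch adds at the tail of the bulk routines. */
    static void scrub_scratch(__m128i scratch[4])
    {
        __m128i zero = _mm_setzero_si128();          /* pxor  xmm0,xmm0      */
        for (int i = 0; i < 4; i++)
            _mm_store_si128(&scratch[i], zero);      /* movdqa xmm0 -> stack */
    #if defined(__GNUC__)
        /* Compiler barrier so the dead stores are not optimized away. */
        __asm__ __volatile__("" : : "r"(scratch) : "memory");
    #endif
    }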
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 3deb86aed6..847695fb1b 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -51,7 +51,7 @@
# Westmere 3.77/1.37 1.37 1.52 1.27
# * Bridge 5.07/0.98 0.99 1.09 0.91
# Haswell 4.44/0.80 0.97 1.03 0.72
-# Atom 5.77/3.56 3.67 4.03 3.46
+# Silvermont 5.77/3.56 3.67 4.03 3.46
# Bulldozer 5.80/0.98 1.05 1.24 0.93
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
@@ -65,6 +65,9 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
+&external_label("OPENSSL_ia32cap_P");
+&static_label("key_const");
+
if ($PREFIX eq "aesni") { $movekey=\&movups; }
else { $movekey=\&movups; }
@@ -181,7 +184,10 @@ sub aesni_generate1 # fully unrolled loop
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
+ &pxor ($rndkey0,$rndkey0); # clear register bank
+ &pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
+ &pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_encrypt");
@@ -197,7 +203,10 @@ sub aesni_generate1 # fully unrolled loop
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
+ &pxor ($rndkey0,$rndkey0); # clear register bank
+ &pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
+ &pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_decrypt");
@@ -349,17 +358,15 @@ sub aesni_generate6
&neg ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout5,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key,$rounds));
&add ($rounds,16);
- eval"&aes${p} ($inout3,$rndkey1)";
- eval"&aes${p} ($inout4,$rndkey1)";
- eval"&aes${p} ($inout5,$rndkey1)";
- &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
- &jmp (&label("_aesni_${p}rypt6_enter"));
+ &jmp (&label("_aesni_${p}rypt6_inner"));
&set_label("${p}6_loop",16);
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_inner");
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
@@ -615,6 +622,14 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x30,$out),$inout3);
&set_label("ecb_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");
######################################################################
@@ -704,6 +719,15 @@ if ($PREFIX eq "aesni") {
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movups (&QWP(0,$out),$cmac);
+
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");
&function_begin("aesni_ccm64_decrypt_blocks");
@@ -804,6 +828,15 @@ if ($PREFIX eq "aesni") {
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movups (&QWP(0,$out),$cmac);
+
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}
@@ -1053,6 +1086,17 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x30,$out),$inout3);
&set_label("ctr32_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(48,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(64,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
&mov ("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");
@@ -1394,6 +1438,20 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-16,$out),$inout0); # write output
&set_label("xts_enc_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm3","xmm3");
+ &movdqa (&QWP(16*1,"esp"),"xmm0");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(16*2,"esp"),"xmm0");
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(16*3,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(16*4,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
+ &movdqa (&QWP(16*5,"esp"),"xmm0");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_encrypt");
@@ -1756,6 +1814,20 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0); # write output
&set_label("xts_dec_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm3","xmm3");
+ &movdqa (&QWP(16*1,"esp"),"xmm0");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(16*2,"esp"),"xmm0");
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(16*3,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(16*4,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
+ &movdqa (&QWP(16*5,"esp"),"xmm0");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
@@ -1808,6 +1880,7 @@ if ($PREFIX eq "aesni") {
&add ($len,16);
&jnz (&label("cbc_enc_tail"));
&movaps ($ivec,$inout0);
+ &pxor ($inout0,$inout0);
&jmp (&label("cbc_ret"));
&set_label("cbc_enc_tail");
@@ -1871,7 +1944,7 @@ if ($PREFIX eq "aesni") {
&movaps ($inout0,$inout5);
&movaps ($ivec,$rndkey0);
&add ($len,0x50);
- &jle (&label("cbc_dec_tail_collected"));
+ &jle (&label("cbc_dec_clear_tail_collected"));
&movups (&QWP(0,$out),$inout0);
&lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
@@ -1910,10 +1983,14 @@ if ($PREFIX eq "aesni") {
&xorps ($inout4,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
&movups (&QWP(0x20,$out),$inout2);
+ &pxor ($inout2,$inout2);
&movups (&QWP(0x30,$out),$inout3);
+ &pxor ($inout3,$inout3);
&lea ($out,&DWP(0x40,$out));
&movaps ($inout0,$inout4);
+ &pxor ($inout4,$inout4);
&sub ($len,0x50);
&jmp (&label("cbc_dec_tail_collected"));
@@ -1933,6 +2010,7 @@ if ($PREFIX eq "aesni") {
&xorps ($inout1,$in0);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout1);
+ &pxor ($inout1,$inout1);
&lea ($out,&DWP(0x10,$out));
&movaps ($ivec,$in1);
&sub ($len,0x20);
@@ -1945,7 +2023,9 @@ if ($PREFIX eq "aesni") {
&xorps ($inout2,$in1);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout2);
+ &pxor ($inout2,$inout2);
&movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
&lea ($out,&DWP(0x20,$out));
&movups ($ivec,&QWP(0x20,$inp));
&sub ($len,0x30);
@@ -1961,29 +2041,44 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
+ &pxor ($inout2,$inout2);
&lea ($out,&DWP(0x30,$out));
&movaps ($inout0,$inout3);
+ &pxor ($inout3,$inout3);
&sub ($len,0x40);
+ &jmp (&label("cbc_dec_tail_collected"));
+&set_label("cbc_dec_clear_tail_collected",16);
+ &pxor ($inout1,$inout1);
+ &pxor ($inout2,$inout2);
+ &pxor ($inout3,$inout3);
+ &pxor ($inout4,$inout4);
&set_label("cbc_dec_tail_collected");
&and ($len,15);
&jnz (&label("cbc_dec_tail_partial"));
&movups (&QWP(0,$out),$inout0);
+ &pxor ($rndkey0,$rndkey0);
&jmp (&label("cbc_ret"));
&set_label("cbc_dec_tail_partial",16);
&movaps (&QWP(0,"esp"),$inout0);
+ &pxor ($rndkey0,$rndkey0);
&mov ("ecx",16);
&mov ($inp,"esp");
&sub ("ecx",$len);
&data_word(0xA4F3F689); # rep movsb
+ &movdqa (&QWP(0,"esp"),$inout0);
&set_label("cbc_ret");
&mov ("esp",&DWP(16,"esp")); # pull original %esp
&mov ($key_,&wparam(4));
+ &pxor ($inout0,$inout0);
+ &pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,$key_),$ivec); # output IV
+ &pxor ($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
@@ -2000,14 +2095,24 @@ if ($PREFIX eq "aesni") {
# $round rounds
&function_begin_B("_aesni_set_encrypt_key");
+ &push ("ebp");
+ &push ("ebx");
&test ("eax","eax");
&jz (&label("bad_pointer"));
&test ($key,$key);
&jz (&label("bad_pointer"));
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
+
+ &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &mov ("ebp",&DWP(4,"ebp"));
&lea ($key,&DWP(16,$key));
+ &and ("ebp",1<<28|1<<11); # AVX and XOP bits
&cmp ($rounds,256);
&je (&label("14rounds"));
&cmp ($rounds,192);
@@ -2016,6 +2121,9 @@ if ($PREFIX eq "aesni") {
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
+ &cmp ("ebp",1<<28);
+ &je (&label("10rounds_alt"));
+
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
@@ -2040,8 +2148,8 @@ if ($PREFIX eq "aesni") {
&call (&label("key_128"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(80,$key),$rounds);
- &xor ("eax","eax");
- &ret();
+
+ &jmp (&label("good_key"));
&set_label("key_128",16);
&$movekey (&QWP(0,$key),"xmm0");
@@ -2055,8 +2163,76 @@ if ($PREFIX eq "aesni") {
&xorps ("xmm0","xmm1");
&ret();
+&set_label("10rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x00,"ebx"));
+ &mov ($rounds,8);
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &movdqa ("xmm2","xmm0");
+ &movdqu (&DWP(-16,$key),"xmm0");
+
+&set_label("loop_key128");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(16,$key));
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(-16,$key),"xmm0");
+ &movdqa ("xmm2","xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key128"));
+
+ &movdqa ("xmm4",&QWP(0x30,"ebx"));
+
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(0,$key),"xmm0");
+
+ &movdqa ("xmm2","xmm0");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(16,$key),"xmm0");
+
+ &mov ($rounds,9);
+ &mov (&DWP(96,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
&set_label("12rounds",16);
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+ &cmp ("ebp",1<<28);
+ &je (&label("12rounds_alt"));
+
&mov ($rounds,11);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
@@ -2077,8 +2253,8 @@ if ($PREFIX eq "aesni") {
&call (&label("key_192b"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(48,$key),$rounds);
- &xor ("eax","eax");
- &ret();
+
+ &jmp (&label("good_key"));
&set_label("key_192a",16);
&$movekey (&QWP(0,$key),"xmm0");
@@ -2108,10 +2284,52 @@ if ($PREFIX eq "aesni") {
&lea ($key,&DWP(32,$key));
&jmp (&label("key_192b_warm"));
+&set_label("12rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x10,"ebx"));
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &mov ($rounds,8);
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+&set_label("loop_key192");
+ &movq (&QWP(0,$key),"xmm2");
+ &movdqa ("xmm1","xmm2");
+ &pshufb ("xmm2","xmm5");
+ &aesenclast ("xmm2","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(24,$key));
+
+ &movdqa ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm0","xmm3");
+
+ &pshufd ("xmm3","xmm0",0xff);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+
+ &pxor ("xmm0","xmm2");
+ &pxor ("xmm2","xmm3");
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key192"));
+
+ &mov ($rounds,11);
+ &mov (&DWP(32,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
&set_label("14rounds",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
- &mov ($rounds,13);
&lea ($key,&DWP(16,$key));
+ &cmp ("ebp",1<<28);
+ &je (&label("14rounds_alt"));
+
+ &mov ($rounds,13);
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
@@ -2143,7 +2361,8 @@ if ($PREFIX eq "aesni") {
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(16,$key),$rounds);
&xor ("eax","eax");
- &ret();
+
+ &jmp (&label("good_key"));
&set_label("key_256a",16);
&$movekey (&QWP(0,$key),"xmm2");
@@ -2169,11 +2388,77 @@ if ($PREFIX eq "aesni") {
&xorps ("xmm2","xmm1");
&ret();
+&set_label("14rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x00,"ebx"));
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &mov ($rounds,7);
+ &movdqu (&QWP(-32,$key),"xmm0");
+ &movdqa ("xmm1","xmm2");
+ &movdqu (&QWP(-16,$key),"xmm2");
+
+&set_label("loop_key256");
+ &pshufb ("xmm2","xmm5");
+ &aesenclast ("xmm2","xmm4");
+
+ &movdqa ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm0","xmm3");
+ &pslld ("xmm4",1);
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(0,$key),"xmm0");
+
+ &dec ($rounds);
+ &jz (&label("done_key256"));
+
+ &pshufd ("xmm2","xmm0",0xff);
+ &pxor ("xmm3","xmm3");
+ &aesenclast ("xmm2","xmm3");
+
+ &movdqa ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm1","xmm3");
+
+ &pxor ("xmm2","xmm1");
+ &movdqu (&QWP(16,$key),"xmm2");
+ &lea ($key,&DWP(32,$key));
+ &movdqa ("xmm1","xmm2");
+ &jmp (&label("loop_key256"));
+
+&set_label("done_key256");
+ &mov ($rounds,13);
+ &mov (&DWP(16,$key),$rounds);
+
+&set_label("good_key");
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &xor ("eax","eax");
+ &pop ("ebx");
+ &pop ("ebp");
+ &ret ();
+
&set_label("bad_pointer",4);
&mov ("eax",-1);
+ &pop ("ebx");
+ &pop ("ebp");
&ret ();
&set_label("bad_keybits",4);
+ &pxor ("xmm0","xmm0");
&mov ("eax",-2);
+ &pop ("ebx");
+ &pop ("ebp");
&ret ();
&function_end_B("_aesni_set_encrypt_key");
@@ -2223,10 +2508,18 @@ if ($PREFIX eq "aesni") {
&aesimc ("xmm0","xmm0");
&$movekey (&QWP(0,$key),"xmm0");
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
&xor ("eax","eax"); # return success
&set_label("dec_key_ret");
&ret ();
&function_end_B("${PREFIX}_set_decrypt_key");
+
+&set_label("key_const",64);
+&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
+&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
+&data_word(1,1,1,1);
+&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 5f6174635f..25ca574f6a 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -165,11 +165,11 @@
# Westmere 3.77/1.25 1.25 1.25 1.26
# * Bridge 5.07/0.74 0.75 0.90 0.85
# Haswell 4.44/0.63 0.63 0.73 0.63
-# Atom 5.75/3.54 3.56 4.12 3.87(*)
+# Silvermont 5.75/3.54 3.56 4.12 3.87(*)
# Bulldozer 5.77/0.70 0.72 0.90 0.70
#
-# (*) Atom ECB result is suboptimal because of penalties incurred
-# by operations on %xmm8-15. As ECB is not considered
+# (*) Atom Silvermont ECB result is suboptimal because of penalties
+# incurred by operations on %xmm8-15. As ECB is not considered
# critical, nothing was done to mitigate the problem.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
@@ -263,7 +263,10 @@ ${PREFIX}_encrypt:
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
movups $inout0,($out) # output
+ pxor $inout0,$inout0
ret
.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
@@ -276,7 +279,10 @@ ${PREFIX}_decrypt:
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
movups $inout0,($out) # output
+ pxor $inout0,$inout0
ret
.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
@@ -445,21 +451,18 @@ _aesni_${dir}rypt6:
pxor $rndkey0,$inout4
aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout5
+ $movkey ($key,%rax),$rndkey0
add \$16,%rax
- aes${dir} $rndkey1,$inout3
- aes${dir} $rndkey1,$inout4
- aes${dir} $rndkey1,$inout5
- $movkey -16($key,%rax),$rndkey0
jmp .L${dir}_loop6_enter
.align 16
.L${dir}_loop6:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
aes${dir} $rndkey1,$inout2
+.L${dir}_loop6_enter:
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
-.L${dir}_loop6_enter:
$movkey ($key,%rax),$rndkey1
add \$32,%rax
aes${dir} $rndkey0,$inout0
@@ -506,23 +509,18 @@ _aesni_${dir}rypt8:
lea 32($key,$rounds),$key
neg %rax # $rounds
aes${dir} $rndkey1,$inout0
- add \$16,%rax
pxor $rndkey0,$inout5
- aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout7
- aes${dir} $rndkey1,$inout2
- aes${dir} $rndkey1,$inout3
- aes${dir} $rndkey1,$inout4
- aes${dir} $rndkey1,$inout5
- aes${dir} $rndkey1,$inout6
- aes${dir} $rndkey1,$inout7
- $movkey -16($key,%rax),$rndkey0
- jmp .L${dir}_loop8_enter
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop8_inner
.align 16
.L${dir}_loop8:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
+.L${dir}_loop8_inner:
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
@@ -587,15 +585,15 @@ aesni_ecb_encrypt:
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
+ movaps %xmm6,(%rsp) # offload $inout4..7
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
- and \$-16,$len
- jz .Lecb_ret
+ and \$-16,$len # if ($len<16)
+ jz .Lecb_ret # return
mov 240($key),$rounds # key->rounds
$movkey ($key),$rndkey0
@@ -604,10 +602,10 @@ $code.=<<___;
test %r8d,%r8d # 5th argument
jz .Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
- cmp \$0x80,$len
- jb .Lecb_enc_tail
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_enc_tail # short input
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
@@ -615,14 +613,14 @@ $code.=<<___;
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
- sub \$0x80,$len
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
@@ -637,17 +635,17 @@ $code.=<<___;
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
+ lea 0x80($out),$out # $out+=8*16
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
+ lea 0x80($inp),$inp # $inp+=8*16
.Lecb_enc_loop8_enter:
call _aesni_encrypt8
sub \$0x80,$len
- jnc .Lecb_enc_loop8
+ jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
movups $inout1,0x10($out)
mov $rnds_,$rounds # restore $rounds
@@ -657,11 +655,11 @@ $code.=<<___;
movups $inout5,0x50($out)
movups $inout6,0x60($out)
movups $inout7,0x70($out)
- lea 0x80($out),$out
- add \$0x80,$len
- jz .Lecb_ret
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
-.Lecb_enc_tail:
+.Lecb_enc_tail: # $len is less than 8*16
movups ($inp),$inout0
cmp \$0x20,$len
jb .Lecb_enc_one
@@ -678,8 +676,9 @@ $code.=<<___;
movups 0x50($inp),$inout5
je .Lecb_enc_six
movdqu 0x60($inp),$inout6
+ xorps $inout7,$inout7
call _aesni_encrypt8
- movups $inout0,($out)
+ movups $inout0,($out) # store 7 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
@@ -692,25 +691,25 @@ $code.=<<___;
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
- movups $inout0,($out)
+ movups $inout0,($out) # store one output block
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
call _aesni_encrypt2
- movups $inout0,($out)
+ movups $inout0,($out) # store 2 output blocks
movups $inout1,0x10($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_three:
call _aesni_encrypt3
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_four:
call _aesni_encrypt4
- movups $inout0,($out)
+ movups $inout0,($out) # store 4 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
@@ -719,7 +718,7 @@ $code.=<<___;
.Lecb_enc_five:
xorps $inout5,$inout5
call _aesni_encrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 5 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
@@ -728,7 +727,7 @@ $code.=<<___;
.align 16
.Lecb_enc_six:
call _aesni_encrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 6 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
@@ -738,10 +737,10 @@ $code.=<<___;
#--------------------------- ECB DECRYPT ------------------------------#
.align 16
.Lecb_decrypt:
- cmp \$0x80,$len
- jb .Lecb_dec_tail
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_dec_tail # short input
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
@@ -749,14 +748,14 @@ $code.=<<___;
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
- sub \$0x80,$len
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
@@ -771,30 +770,38 @@ $code.=<<___;
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
+ lea 0x80($out),$out # $out+=8*16
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
+ lea 0x80($inp),$inp # $inp+=8*16
.Lecb_dec_loop8_enter:
call _aesni_decrypt8
$movkey ($key_),$rndkey0
sub \$0x80,$len
- jnc .Lecb_dec_loop8
+ jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
+ pxor $inout0,$inout0 # clear register bank
mov $key_,$key # restore $key
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
mov $rnds_,$rounds # restore $rounds
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
movups $inout6,0x60($out)
+ pxor $inout6,$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
- add \$0x80,$len
- jz .Lecb_ret
+ pxor $inout7,$inout7
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
.Lecb_dec_tail:
movups ($inp),$inout0
@@ -814,70 +821,107 @@ $code.=<<___;
je .Lecb_dec_six
movups 0x60($inp),$inout6
$movkey ($key),$rndkey0
+ xorps $inout7,$inout7
call _aesni_decrypt8
- movups $inout0,($out)
+ movups $inout0,($out) # store 7 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
movups $inout6,0x60($out)
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
- movups $inout0,($out)
+ movups $inout0,($out) # store one output block
+ pxor $inout0,$inout0 # clear register bank
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
call _aesni_decrypt2
- movups $inout0,($out)
+ movups $inout0,($out) # store 2 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
- movups $inout0,($out)
+ movups $inout0,($out) # store 4 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps $inout5,$inout5
call _aesni_decrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 5 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 6 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
.Lecb_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
lea 0x58(%rsp),%rsp
.Lecb_enc_ret:
___
@@ -911,10 +955,10 @@ aesni_ccm64_encrypt_blocks:
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
.Lccm64_enc_body:
___
$code.=<<___;
@@ -956,7 +1000,7 @@ $code.=<<___;
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
paddq $increment,$iv
- dec $len
+ dec $len # $len-- ($len is in blocks)
aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
@@ -965,16 +1009,26 @@ $code.=<<___;
movdqa $iv,$inout0
movups $in0,($out) # save output
pshufb $bswap_mask,$inout0
- lea 16($out),$out
- jnz .Lccm64_enc_outer
+ lea 16($out),$out # $out+=16
+ jnz .Lccm64_enc_outer # loop if ($len!=0)
- movups $inout1,($cmac)
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
lea 0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
@@ -991,10 +1045,10 @@ aesni_ccm64_decrypt_blocks:
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in8
+ movaps %xmm9,0x30(%rsp) # $increment
.Lccm64_dec_body:
___
$code.=<<___;
@@ -1015,7 +1069,7 @@ $code.=<<___;
mov \$16,$rounds
movups ($inp),$in0 # load inp
paddq $increment,$iv
- lea 16($inp),$inp
+ lea 16($inp),$inp # $inp+=16
sub %r10,%rax # twisted $rounds
lea 32($key_,$rnds_),$key # end of key schedule
mov %rax,%r10
@@ -1025,11 +1079,11 @@ $code.=<<___;
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
movups $in0,($out) # save output
- lea 16($out),$out
+ lea 16($out),$out # $out+=16
pshufb $bswap_mask,$inout0
- sub \$1,$len
- jz .Lccm64_dec_break
+ sub \$1,$len # $len-- ($len is in blocks)
+ jz .Lccm64_dec_break # if ($len==0) break
$movkey ($key_),$rndkey0
mov %r10,%rax
@@ -1049,13 +1103,13 @@