author    Andy Polyakov <appro@openssl.org>    2014-02-13 14:39:55 +0100
committer Andy Polyakov <appro@openssl.org>    2014-02-13 14:41:10 +0100
commit    fcc6f699e3e73e060e6c03a9a02f9797f0dff9e5
tree      b378eaba608837df785df17c8f0d7c5d5a97a043 /crypto/evp
parent    7078d93307d795cec577ec4a792b72fffed551ab
evp/e_aes_cbc_hmac_sha*.c: improve cache locality.
(cherry picked from commit 9587429fa07a34066107e926fbc8708220f058fa)
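
The change interleaves the SHA and AES passes over the payload in fixed-size chunks instead of running each pass over the whole buffer, so data hashed in one step is still resident in the L1 cache when it is encrypted in the next. Below is a minimal standalone sketch of that access pattern, not the patched code itself: hash_chunk() and encrypt_chunk() are hypothetical stand-ins for sha1_multi_block() and aesni_multi_cbc_encrypt(), which in the real function drive 4 or 8 TLS records at once.

/*
 * Minimal sketch of the chunked interleaving introduced by this patch.
 * hash_chunk() and encrypt_chunk() are hypothetical helpers; the stubs
 * below only mark where the multi-block SHA and AES-CBC calls would go.
 */
#include <stddef.h>

#define MAXCHUNKSIZE 2048               /* multiple of the 64-byte SHA block */

static void hash_chunk(const unsigned char *p, size_t n)
{ (void)p; (void)n; /* sha1_multi_block() over n bytes */ }

static void encrypt_chunk(const unsigned char *p, unsigned char *o, size_t n)
{ (void)p; (void)o; (void)n; /* aesni_multi_cbc_encrypt() over n bytes */ }

static void process(const unsigned char *inp, unsigned char *out, size_t len)
{
    size_t off = 0;

    /* move in short steps: each chunk is hashed and then immediately
     * encrypted while it is still hot in the L1 cache */
    while (len - off > MAXCHUNKSIZE) {
        hash_chunk(inp + off, MAXCHUNKSIZE);
        encrypt_chunk(inp + off, out + off, MAXCHUNKSIZE);
        off += MAXCHUNKSIZE;
    }
    hash_chunk(inp + off, len - off);           /* tail */
    encrypt_chunk(inp + off, out + off, len - off);
}

With 2048-byte chunks across up to 8 records, the working set of one step should stay around 16KB, which fits in a typical 32KB L1 data cache; hashing and encrypting whole fragments in separate passes would evict each record's data before the second pass touched it.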
Diffstat (limited to 'crypto/evp')
-rw-r--r--  crypto/evp/e_aes_cbc_hmac_sha1.c   | 84
-rw-r--r--  crypto/evp/e_aes_cbc_hmac_sha256.c | 84
2 files changed, 114 insertions(+), 54 deletions(-)
diff --git a/crypto/evp/e_aes_cbc_hmac_sha1.c b/crypto/evp/e_aes_cbc_hmac_sha1.c
index 09f928190d..0b6f292f62 100644
--- a/crypto/evp/e_aes_cbc_hmac_sha1.c
+++ b/crypto/evp/e_aes_cbc_hmac_sha1.c
@@ -205,13 +205,15 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
u32 d[32];
u8 c[128]; } blocks[8];
SHA1_MB_CTX *ctx;
- unsigned int frag, last, packlen, i, x4=4*n4x;
+ unsigned int frag, last, packlen, i, x4=4*n4x, minblocks, processed=0;
size_t ret = 0;
u8 *IVs;
#if defined(BSWAP8)
u64 seqnum;
#endif
+ RAND_bytes((IVs=blocks[0].c),16*x4); /* ask for IVs in bulk */
+
ctx = (SHA1_MB_CTX *)(storage+32-((size_t)storage%32)); /* align */
frag = (unsigned int)inp_len>>(1+n4x);
@@ -221,8 +223,21 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
last -= x4-1;
}
+ packlen = 5+16+((frag+20+16)&-16);
+
+ /* populate descriptors with pointers and IVs */
hash_d[0].ptr = inp;
- for (i=1;i<x4;i++) hash_d[i].ptr = hash_d[i-1].ptr+frag;
+ ciph_d[0].inp = inp;
+ ciph_d[0].out = out+5+16; /* 5+16 is place for header and explicit IV */
+ memcpy(ciph_d[0].out-16,IVs,16);
+ memcpy(ciph_d[0].iv,IVs,16); IVs += 16;
+
+ for (i=1;i<x4;i++) {
+ ciph_d[i].inp = hash_d[i].ptr = hash_d[i-1].ptr+frag;
+ ciph_d[i].out = ciph_d[i-1].out+packlen;
+ memcpy(ciph_d[i].out-16,IVs,16);
+ memcpy(ciph_d[i].iv,IVs,16); IVs+=16;
+ }
#if defined(BSWAP8)
memcpy(blocks[0].c,key->md.data,8);
@@ -268,6 +283,39 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
/* hash 13-byte headers and first 64-13 bytes of inputs */
sha1_multi_block(ctx,edges,n4x);
/* hash bulk inputs */
+#define MAXCHUNKSIZE 2048
+#if MAXCHUNKSIZE%64
+#error "MAXCHUNKSIZE is not divisible by 64"
+#elif MAXCHUNKSIZE
+ /* goal is to minimize pressure on L1 cache by moving
+ * in shorter steps, so that hashed data is still in
+ * the cache by the time we encrypt it */
+ minblocks = ((frag<=last ? frag : last)-(64-13))/64;
+ if (minblocks>MAXCHUNKSIZE/64) {
+ for (i=0;i<x4;i++) {
+ edges[i].ptr = hash_d[i].ptr;
+ edges[i].blocks = MAXCHUNKSIZE/64;
+ ciph_d[i].blocks = MAXCHUNKSIZE/16;
+ }
+ do {
+ sha1_multi_block(ctx,edges,n4x);
+ aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);
+
+ for (i=0;i<x4;i++) {
+ edges[i].ptr = hash_d[i].ptr += MAXCHUNKSIZE;
+ hash_d[i].blocks -= MAXCHUNKSIZE/64;
+ edges[i].blocks = MAXCHUNKSIZE/64;
+ ciph_d[i].inp += MAXCHUNKSIZE;
+ ciph_d[i].out += MAXCHUNKSIZE;
+ ciph_d[i].blocks = MAXCHUNKSIZE/16;
+ memcpy(ciph_d[i].iv,ciph_d[i].out-16,16);
+ }
+ processed += MAXCHUNKSIZE;
+ minblocks -= MAXCHUNKSIZE/64;
+ } while (minblocks>MAXCHUNKSIZE/64);
+ }
+#endif
+#undef MAXCHUNKSIZE
sha1_multi_block(ctx,hash_d,n4x);
memset(blocks,0,sizeof(blocks));
@@ -276,7 +324,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
off = hash_d[i].blocks*64;
const unsigned char *ptr = hash_d[i].ptr+off;
- off = len-(64-13)-off; /* remainder actually */
+ off = (len-processed)-(64-13)-off; /* remainder actually */
memcpy(blocks[i].c,ptr,off);
blocks[i].c[off]=0x80;
len += 64+13; /* 64 is HMAC header */
@@ -310,23 +358,14 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
/* finalize MACs */
sha1_multi_block(ctx,edges,n4x);
- packlen = 5+16+((frag+20+16)&-16);
-
- out += (packlen<<(1+n4x))-packlen;
- inp += (frag<<(1+n4x))-frag;
-
- RAND_bytes((IVs=blocks[0].c),16*x4); /* ask for IVs in bulk */
-
- for (i=x4-1;;i--) {
+ for (i=0;i<x4;i++) {
unsigned int len = (i==(x4-1)?last:frag), pad, j;
unsigned char *out0 = out;
- out += 5+16; /* place for header and explicit IV */
- ciph_d[i].inp = out;
- ciph_d[i].out = out;
+ memcpy(ciph_d[i].out,ciph_d[i].inp,len-processed);
+ ciph_d[i].inp = ciph_d[i].out;
- memmove(out,inp,len);
- out += len;
+ out += 5+16+len;
/* write MAC */
((u32 *)out)[0] = BSWAP4(ctx->A[i]);
@@ -342,7 +381,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
for (j=0;j<=pad;j++) *(out++) = pad;
len += pad+1;
- ciph_d[i].blocks = len/16;
+ ciph_d[i].blocks = (len-processed)/16;
len += 16; /* account for explicit iv */
/* arrange header */
@@ -352,17 +391,8 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
out0[3] = (u8)(len>>8);
out0[4] = (u8)(len);
- /* explicit iv */
- memcpy(ciph_d[i].iv, IVs, 16);
- memcpy(&out0[5], IVs, 16);
-
ret += len+5;
-
- if (i==0) break;
-
- out = out0-packlen;
- inp -= frag;
- IVs += 16;
+ inp += frag;
}
aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);
diff --git a/crypto/evp/e_aes_cbc_hmac_sha256.c b/crypto/evp/e_aes_cbc_hmac_sha256.c
index 95bdd42b13..c2c48f045c 100644
--- a/crypto/evp/e_aes_cbc_hmac_sha256.c
+++ b/crypto/evp/e_aes_cbc_hmac_sha256.c
@@ -201,13 +201,15 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
u32 d[32];
u8 c[128]; } blocks[8];
SHA256_MB_CTX *ctx;
- unsigned int frag, last, packlen, i, x4=4*n4x;
+ unsigned int frag, last, packlen, i, x4=4*n4x, minblocks, processed=0;
size_t ret = 0;
u8 *IVs;
#if defined(BSWAP8)
u64 seqnum;
#endif
+ RAND_bytes((IVs=blocks[0].c),16*x4); /* ask for IVs in bulk */
+
ctx = (SHA256_MB_CTX *)(storage+32-((size_t)storage%32)); /* align */
frag = (unsigned int)inp_len>>(1+n4x);
@@ -217,8 +219,21 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
last -= x4-1;
}
+ packlen = 5+16+((frag+32+16)&-16);
+
+ /* populate descriptors with pointers and IVs */
hash_d[0].ptr = inp;
- for (i=1;i<x4;i++) hash_d[i].ptr = hash_d[i-1].ptr+frag;
+ ciph_d[0].inp = inp;
+ ciph_d[0].out = out+5+16; /* 5+16 is place for header and explicit IV */
+ memcpy(ciph_d[0].out-16,IVs,16);
+ memcpy(ciph_d[0].iv,IVs,16); IVs += 16;
+
+ for (i=1;i<x4;i++) {
+ ciph_d[i].inp = hash_d[i].ptr = hash_d[i-1].ptr+frag;
+ ciph_d[i].out = ciph_d[i-1].out+packlen;
+ memcpy(ciph_d[i].out-16,IVs,16);
+ memcpy(ciph_d[i].iv,IVs,16); IVs+=16;
+ }
#if defined(BSWAP8)
memcpy(blocks[0].c,key->md.data,8);
@@ -267,6 +282,39 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
/* hash 13-byte headers and first 64-13 bytes of inputs */
sha256_multi_block(ctx,edges,n4x);
/* hash bulk inputs */
+#define MAXCHUNKSIZE 2048
+#if MAXCHUNKSIZE%64
+#error "MAXCHUNKSIZE is not divisible by 64"
+#elif MAXCHUNKSIZE
+ /* goal is to minimize pressure on L1 cache by moving
+ * in shorter steps, so that hashed data is still in
+ * the cache by the time we encrypt it */
+ minblocks = ((frag<=last ? frag : last)-(64-13))/64;
+ if (minblocks>MAXCHUNKSIZE/64) {
+ for (i=0;i<x4;i++) {
+ edges[i].ptr = hash_d[i].ptr;
+ edges[i].blocks = MAXCHUNKSIZE/64;
+ ciph_d[i].blocks = MAXCHUNKSIZE/16;
+ }
+ do {
+ sha256_multi_block(ctx,edges,n4x);
+ aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);
+
+ for (i=0;i<x4;i++) {
+ edges[i].ptr = hash_d[i].ptr += MAXCHUNKSIZE;
+ hash_d[i].blocks -= MAXCHUNKSIZE/64;
+ edges[i].blocks = MAXCHUNKSIZE/64;
+ ciph_d[i].inp += MAXCHUNKSIZE;
+ ciph_d[i].out += MAXCHUNKSIZE;
+ ciph_d[i].blocks = MAXCHUNKSIZE/16;
+ memcpy(ciph_d[i].iv,ciph_d[i].out-16,16);
+ }
+ processed += MAXCHUNKSIZE;
+ minblocks -= MAXCHUNKSIZE/64;
+ } while (minblocks>MAXCHUNKSIZE/64);
+ }
+#endif
+#undef MAXCHUNKSIZE
sha256_multi_block(ctx,hash_d,n4x);
memset(blocks,0,sizeof(blocks));
@@ -275,7 +323,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
off = hash_d[i].blocks*64;
const unsigned char *ptr = hash_d[i].ptr+off;
- off = len-(64-13)-off; /* remainder actually */
+ off = (len-processed)-(64-13)-off; /* remainder actually */
memcpy(blocks[i].c,ptr,off);
blocks[i].c[off]=0x80;
len += 64+13; /* 64 is HMAC header */
@@ -312,23 +360,14 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
/* finalize MACs */
sha256_multi_block(ctx,edges,n4x);
- packlen = 5+16+((frag+32+16)&-16);
-
- out += (packlen<<(1+n4x))-packlen;
- inp += (frag<<(1+n4x))-frag;
-
- RAND_bytes((IVs=blocks[0].c),16*x4); /* ask for IVs in bulk */
-
- for (i=x4-1;;i--) {
+ for (i=0;i<x4;i++) {
unsigned int len = (i==(x4-1)?last:frag), pad, j;
unsigned char *out0 = out;
- out += 5+16; /* place for header and explicit IV */
- ciph_d[i].inp = out;
- ciph_d[i].out = out;
+ memcpy(ciph_d[i].out,ciph_d[i].inp,len-processed);
+ ciph_d[i].inp = ciph_d[i].out;
- memmove(out,inp,len);
- out += len;
+ out += 5+16+len;
/* write MAC */
((u32 *)out)[0] = BSWAP4(ctx->A[i]);
@@ -347,7 +386,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
for (j=0;j<=pad;j++) *(out++) = pad;
len += pad+1;
- ciph_d[i].blocks = len/16;
+ ciph_d[i].blocks = (len-processed)/16;
len += 16; /* account for explicit iv */
/* arrange header */
@@ -357,17 +396,8 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
out0[3] = (u8)(len>>8);
out0[4] = (u8)(len);
- /* explicit iv */
- memcpy(ciph_d[i].iv, IVs, 16);
- memcpy(&out0[5], IVs, 16);
-
ret += len+5;
-
- if (i==0) break;
-
- out = out0-packlen;
- inp -= frag;
- IVs += 16;
+ inp += frag;
}
aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);
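
A second, smaller reorganization is visible in both files: explicit IVs are now drawn with a single bulk RAND_bytes() call at the top of the function and copied into the descriptors while they are populated, replacing the per-record copies in the old backwards loop. A hedged sketch of that pattern follows; everything except RAND_bytes() itself (the struct, setup_ivs(), MAX_LANES) is a simplified stand-in, and error handling is elided as it is in the patched code.

#include <string.h>
#include <openssl/rand.h>

#define MAX_LANES 8                     /* x4 = 4*n4x is at most 8 */

typedef struct {                        /* simplified CIPH_DESC stand-in */
    unsigned char iv[16];               /* CBC chaining value            */
    unsigned char *out;                 /* ciphertext start, past the
                                           5-byte header + explicit IV   */
} lane_desc;

static void setup_ivs(lane_desc *d, unsigned int x4)
{
    unsigned char IVs[16 * MAX_LANES];
    unsigned int i;

    RAND_bytes(IVs, 16 * x4);           /* one call for all records */

    for (i = 0; i < x4; i++) {
        memcpy(d[i].iv, IVs + 16 * i, 16);       /* seed cipher state   */
        memcpy(d[i].out - 16, IVs + 16 * i, 16); /* explicit IV on wire */
    }
}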