summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2010-07-14 08:43:38 +0000
committerAndy Polyakov <appro@openssl.org>2010-07-14 08:43:38 +0000
commit26064d7f77ac20b61a35d758046eeb2344745e12 (patch)
tree6da299ae1f4cb0f0d2e1429e991b70574de1271b
parentc981086d40b6cc7f59ca1273c92d77e2dfacd156 (diff)
aes-s390x.pl: revisit buffer allocation and add performance data.
-rw-r--r--crypto/aes/asm/aes-s390x.pl35
1 files changed, 22 insertions, 13 deletions
diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl
index 38e18b2452..5ffacb8cf4 100644
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl
@@ -44,7 +44,7 @@
# Unlike previous version hardware support detection takes place only
# at the moment of key schedule setup, which is denoted in key->rounds.
# This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives incredible performance improvement,
# it was measured to be ~6.6x. It's less than previously mentioned 8x,
@@ -52,7 +52,13 @@
# May 2010.
#
-# Add AES_ctr32_encrypt.
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that exact throughput value depends on current stack
+# frame alignment within 4KB page. In worst case you get ~75% of the
+# maximum, but *on average* it would be as much as ~98%. Meaning that
+# worst case is unlikely, it's like hitting a ravine on a plateau.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -1367,24 +1373,27 @@ $code.=<<___ if (!$softonly);
lg $iv0,0($ivp) # load ivec
lg $ivp,8($ivp)
- # prepare and allocate stack frame
- lghi $s0,-272 # guarantee at least 256-bytes buffer
+ # prepare and allocate stack frame at the top of 4K page
+ # with 1K reserved for eventual signal handling
+ lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
lghi $s1,-4096
- lgr $fp,$sp
algr $s0,$sp
+ lgr $fp,$sp
ngr $s0,$s1 # align at page boundary
- la $sp,0($s0) # alloca
- stg $fp,0($s0) # back-chain
-
- # calculate resultant buffer size
- la $s0,16($s0) # buffer starts at offset of 16
- slgr $fp,$s0
- srlg $fp,$fp,4 # $fp is buffer length in blocks, minimum 16
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+ lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+ # buffer size is at least 256 and at most 3072+256-16
+
+ la $sp,1024($s0) # alloca
+ srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
+ stg $s2,0($sp) # back-chain
stg $fp,8($sp)
slgr $len,$fp
brc 1,.Lctr32_hw_loop # not zero, no borrow
- algr $fp,$len
+ algr $fp,$len # input is shorter than allocated buffer
lghi $len,0
stg $fp,8($sp)