Fix s390x bugs and correct performance coefficients.

author: Andy Polyakov <appro@openssl.org> 2007-05-02 11:44:02 +0000
committer: Andy Polyakov <appro@openssl.org> 2007-05-02 11:44:02 +0000
commit: 251718e4c1d764324c121de039efb619cd4d077c (patch)
tree: 37f47ab4fd098e6fbd2b2fa09ff3b1a5bdc73f9d /crypto
parent: c504a5e78386aa9f02462d18a90da759f9131321 (diff)
3 files changed, 5 insertions, 3 deletions
diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl
index 4a9713aea1..3fcb9a6c29 100644
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl
@@ -23,7 +23,7 @@
 # for CBC is not utilized, nor multiple blocks are ever processed.
 # Then software key schedule can be postponed till hardware support
 # detection... Performance improvement over assembler is reportedly
-# ~2.5x, but can reach >15x [naturally on larger chunks] if proper
+# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
 # support is implemented.
 
 $t1="%r0";
diff --git a/crypto/sha/asm/sha1-s390x.pl b/crypto/sha/asm/sha1-s390x.pl
index 9f4436d525..5c36436d45 100644
--- a/crypto/sha/asm/sha1-s390x.pl
+++ b/crypto/sha/asm/sha1-s390x.pl
@@ -13,7 +13,7 @@
 #
 # Performance is >30% better than gcc 3.3 generated code. But the real
 # twist is that SHA1 hardware support is detected and utilized. In
-# which case performance can reach further >8x for larger chunks.
+# which case performance can reach further >4.5x for larger chunks.
 
 $kimdfunc=1;	# magic function code for kimd instruction
 
@@ -160,6 +160,7 @@ $code.=<<___ if ($kimdfunc);
 	lgr	%r2,$inp
 	sllg	%r3,$len,6
 	.long	0xb93e0002	# kimd %r0,%r2
+	brc	1,.-4		# cc=3 means "partial completion": re-issue kimd until done
 	br	%r14
 .Lsoftware:
 ___
diff --git a/crypto/sha/asm/sha512-s390x.pl b/crypto/sha/asm/sha512-s390x.pl
index 67a17d3808..5dd17473fa 100644
--- a/crypto/sha/asm/sha512-s390x.pl
+++ b/crypto/sha/asm/sha512-s390x.pl
@@ -16,7 +16,7 @@
 # "pathologically" high, in particular in comparison to other SHA
 # modules). But the real twist is that it detects if hardware support
 # for SHA256 is available and in such case utilizes it. Then the
-# performance can reach >12x of assembler one for larger chunks.
+# performance can reach >6.5x of assembler one for larger chunks.
 #
 # sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
 
@@ -219,6 +219,7 @@ $code.=<<___ if ($kimdfunc);
 	lgr	%r2,$inp
 	sllg	%r3,$len,`log(16*$SZ)/log(2)`
 	.long	0xb93e0002	# kimd %r0,%r2
+	brc	1,.-4		# cc=3 means "partial completion": re-issue kimd until done
 	br	%r14
 .Lsoftware:
 ___
author	Andy Polyakov <appro@openssl.org>	2007-05-02 11:44:02 +0000
committer	Andy Polyakov <appro@openssl.org>	2007-05-02 11:44:02 +0000
commit	251718e4c1d764324c121de039efb619cd4d077c (patch)
tree	37f47ab4fd098e6fbd2b2fa09ff3b1a5bdc73f9d /crypto
parent	c504a5e78386aa9f02462d18a90da759f9131321 (diff)