diff options
author:    Andy Polyakov <appro@openssl.org>   2010-07-13 14:03:31 +0000
committer: Andy Polyakov <appro@openssl.org>   2010-07-13 14:03:31 +0000
commit:    2d22e08083d031eb63cada59aa95afe5279008d3 (patch)
tree:      5971d660922f8ef8c58bc18ab4fc6f1a22346ed5 /crypto/modes/asm/ghash-armv4.pl
parent:    0852f90c300405c79c2af5c549e74d0d4a8f664c (diff)
ARM assembler pack: reschedule instructions for dual-issue pipeline.
Modest improvement coefficients mean that code already had some
parallelism and there was not very much room for improvement. Special
thanks to Ted Krovetz for benchmarking the code with such patience.
Diffstat (limited to 'crypto/modes/asm/ghash-armv4.pl')
-rw-r--r--   crypto/modes/asm/ghash-armv4.pl | 30
1 file changed, 18 insertions, 12 deletions
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 45d79b6000..2036f46f40 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -19,6 +19,12 @@
 # loop, this assembler loop body was found to be ~3x smaller than
 # compiler-generated one...
 #
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
 # Note about "528B" variant. In ARM case it makes lesser sense to
 # implement it for following reasons:
 #
@@ -123,12 +129,12 @@ gcm_ghash_4bit:
 	add	$Zhh,$Htbl,$nlo,lsl#4
 	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	add	$Thh,$Htbl,$nhi
 	ldrb	$nlo,[$inp,#14]
-	add	$Thh,$Htbl,$nhi
 	and	$nhi,$Zll,#0xf		@ rem
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	mov	$nhi,$nhi,lsl#1
+	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
@@ -139,15 +145,15 @@ gcm_ghash_4bit:
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
 	eor	$nlo,$nlo,$nhi
-	eor	$Zhh,$Zhh,$Tll,lsl#16
 	and	$nhi,$nlo,#0xf0
 	and	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16

 .Loop:
 	add	$Thh,$Htbl,$nlo,lsl#4
 	subs	$cnt,$cnt,#1
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	and	$nlo,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	add	$nlo,$nlo,$nlo
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
@@ -161,22 +167,22 @@ gcm_ghash_4bit:
 	add	$Thh,$Htbl,$nhi
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
-	ldrplb	$nhi,[$Xi,$cnt]
 	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrplb	$nhi,[$Xi,$cnt]
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eor	$Zhh,$Thh,$Zhh,lsr#4
 	eorpl	$nlo,$nlo,$nhi
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	eor	$Zhh,$Thh,$Zhh,lsr#4
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	bpl	.Loop

 	ldr	$len,[sp,#32]		@ re-load $len/end
@@ -212,7 +218,7 @@ gcm_gmult_4bit:
 	add	$Thh,$Htbl,$nhi
 	and	$nhi,$Zll,#0xf		@ rem
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	mov	$nhi,$nhi,lsl#1
+	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
@@ -228,8 +234,8 @@ gcm_gmult_4bit:
 .Loop2:
 	add	$Thh,$Htbl,$nlo,lsl#4
 	subs	$cnt,$cnt,#1
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	and	$nlo,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	add	$nlo,$nlo,$nlo
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
@@ -243,8 +249,8 @@ gcm_gmult_4bit:
 	add	$Thh,$Htbl,$nhi
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
@@ -255,8 +261,8 @@ gcm_gmult_4bit:
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
 	andpl	$nhi,$nlo,#0xf0
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	bpl	.Loop2
 ___
 &Zsmash();