diff options
author:    Andy Polyakov <appro@openssl.org>   2010-07-13 14:03:31 +0000
committer: Andy Polyakov <appro@openssl.org>   2010-07-13 14:03:31 +0000
commit:    2d22e08083d031eb63cada59aa95afe5279008d3 (patch)
tree:      5971d660922f8ef8c58bc18ab4fc6f1a22346ed5 /crypto/modes/asm/ghash-armv4.pl
parent:    0852f90c300405c79c2af5c549e74d0d4a8f664c (diff)
ARM assembler pack: reschedule instructions for dual-issue pipeline.
Modest improvement coefficients mean that code already had some
parallelism and there was not very much room for improvement. Special
thanks to Ted Krovetz for benchmarking the code with such patience.
Diffstat (limited to 'crypto/modes/asm/ghash-armv4.pl')
-rw-r--r--   crypto/modes/asm/ghash-armv4.pl | 30
1 file changed, 18 insertions, 12 deletions
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 45d79b6000..2036f46f40 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -19,6 +19,12 @@
 # loop, this assembler loop body was found to be ~3x smaller than
 # compiler-generated one...
 #
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
 # Note about "528B" variant. In ARM case it makes lesser sense to
 # implement it for following reasons:
 #
@@ -123,12 +129,12 @@ gcm_ghash_4bit:
 	add	$Zhh,$Htbl,$nlo,lsl#4
 	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	add	$Thh,$Htbl,$nhi
 	ldrb	$nlo,[$inp,#14]
-	add	$Thh,$Htbl,$nhi
 	and	$nhi,$Zll,#0xf		@ rem
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	mov	$nhi,$nhi,lsl#1
+	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
@@ -139,15 +145,15 @@ gcm_ghash_4bit:
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
 	eor	$nlo,$nlo,$nhi
-	eor	$Zhh,$Zhh,$Tll,lsl#16
 	and	$nhi,$nlo,#0xf0
 	and	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16

 .Loop:
 	add	$Thh,$Htbl,$nlo,lsl#4
 	subs	$cnt,$cnt,#1
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	and	$nlo,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	add	$nlo,$nlo,$nlo
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
@@ -161,22 +167,22 @@ gcm_ghash_4bit:
 	add	$Thh,$Htbl,$nhi
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
-	ldrplb	$nhi,[$Xi,$cnt]
 	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrplb	$nhi,[$Xi,$cnt]
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eor	$Zhh,$Thh,$Zhh,lsr#4
 	eorpl	$nlo,$nlo,$nhi
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	eor	$Zhh,$Thh,$Zhh,lsr#4
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	bpl	.Loop

 	ldr	$len,[sp,#32]		@ re-load $len/end
@@ -212,7 +218,7 @@ gcm_gmult_4bit:
 	add	$Thh,$Htbl,$nhi
 	and	$nhi,$Zll,#0xf		@ rem
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	mov	$nhi,$nhi,lsl#1
+	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
@@ -228,8 +234,8 @@ gcm_gmult_4bit:
 .Loop2:
 	add	$Thh,$Htbl,$nlo,lsl#4
 	subs	$cnt,$cnt,#1
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	and	$nlo,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	add	$nlo,$nlo,$nlo
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
@@ -243,8 +249,8 @@ gcm_gmult_4bit:
 	add	$Thh,$Htbl,$nhi
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
@@ -255,8 +261,8 @@ gcm_gmult_4bit:
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
 	andpl	$nhi,$nlo,#0xf0
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	bpl	.Loop2
 ___
 &Zsmash();