summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2010-07-13 14:03:31 +0000
committerAndy Polyakov <appro@openssl.org>2010-07-13 14:03:31 +0000
commit2d22e08083d031eb63cada59aa95afe5279008d3 (patch)
tree5971d660922f8ef8c58bc18ab4fc6f1a22346ed5
parent0852f90c300405c79c2af5c549e74d0d4a8f664c (diff)
ARM assembler pack: reschedule instructions for dual-issue pipeline.
Modest improvement coefficients mean that code already had some parallelism and there was not very much room for improvement. Special thanks to Ted Krovetz for benchmarking the code with such patience.
-rw-r--r--crypto/aes/asm/aes-armv4.pl393
-rw-r--r--crypto/modes/asm/ghash-armv4.pl30
-rw-r--r--crypto/sha/asm/sha256-armv4.pl31
-rw-r--r--crypto/sha/asm/sha512-armv4.pl30
4 files changed, 247 insertions, 237 deletions
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index 5a736744a9..c51ee1fbf6 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -16,12 +16,17 @@
# allows to merge logical or arithmetic operation with shift or rotate
# in one instruction and emit combined result every cycle. The module
# is endian-neutral. The performance is ~42 cycles/byte for 128-bit
-# key.
+# key [on single-issue Xscale PXA250 core].
# May 2007.
#
# AES_set_[en|de]crypt_key is added.
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 12% improvement on
+# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -167,24 +172,24 @@ AES_encrypt:
ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8
- orr $s0,$s0,$t2,lsl#16
- orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7]
+ orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6]
+ orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8
- orr $s1,$s1,$t2,lsl#16
- orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11]
+ orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10]
+ orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8
- orr $s2,$s2,$t2,lsl#16
- orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15]
+ orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14]
+ orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8
@@ -199,24 +204,24 @@ AES_encrypt:
mov $t3,$s0,lsr#8
strb $t1,[$rounds,#0]
strb $t2,[$rounds,#1]
- strb $t3,[$rounds,#2]
- strb $s0,[$rounds,#3]
mov $t1,$s1,lsr#24
+ strb $t3,[$rounds,#2]
mov $t2,$s1,lsr#16
+ strb $s0,[$rounds,#3]
mov $t3,$s1,lsr#8
strb $t1,[$rounds,#4]
strb $t2,[$rounds,#5]
- strb $t3,[$rounds,#6]
- strb $s1,[$rounds,#7]
mov $t1,$s2,lsr#24
+ strb $t3,[$rounds,#6]
mov $t2,$s2,lsr#16
+ strb $s1,[$rounds,#7]
mov $t3,$s2,lsr#8
strb $t1,[$rounds,#8]
strb $t2,[$rounds,#9]
- strb $t3,[$rounds,#10]
- strb $s2,[$rounds,#11]
mov $t1,$s3,lsr#24
+ strb $t3,[$rounds,#10]
mov $t2,$s3,lsr#16
+ strb $s2,[$rounds,#11]
mov $t3,$s3,lsr#8
strb $t1,[$rounds,#12]
strb $t2,[$rounds,#13]
@@ -233,141 +238,137 @@ AES_encrypt:
.align 2
_armv4_AES_encrypt:
str lr,[sp,#-4]! @ push lr
- ldr $t1,[$key],#16
- ldr $t2,[$key,#-12]
- ldr $t3,[$key,#-8]
- ldr $i1,[$key,#-4]
- ldr $rounds,[$key,#240-16]
+ ldmia $key!,{$t1-$i1}
eor $s0,$s0,$t1
+ ldr $rounds,[$key,#240-16]
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
sub $rounds,$rounds,#1
mov lr,#255
-.Lenc_loop:
+ and $i1,lr,$s0
and $i2,lr,$s0,lsr#8
and $i3,lr,$s0,lsr#16
- and $i1,lr,$s0
mov $s0,$s0,lsr#24
+.Lenc_loop:
ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
- ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
- ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
- ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
-
and $i1,lr,$s1,lsr#16 @ i0
+ ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
and $i2,lr,$s1
+ ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
and $i3,lr,$s1,lsr#8
+ ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
mov $s1,$s1,lsr#24
+
ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
- ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
eor $s0,$s0,$i1,ror#8
- eor $s1,$s1,$t1,ror#24
- eor $t2,$t2,$i2,ror#8
- eor $t3,$t3,$i3,ror#8
-
+ ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
and $i1,lr,$s2,lsr#8 @ i0
+ eor $t2,$t2,$i2,ror#8
and $i2,lr,$s2,lsr#16 @ i1
+ eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2
- mov $s2,$s2,lsr#24
+ eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
+ mov $s2,$s2,lsr#24
+
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
- ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16
- eor $s1,$s1,$i2,ror#8
- eor $s2,$s2,$t2,ror#16
- eor $t3,$t3,$i3,ror#16
-
+ ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
and $i1,lr,$s3 @ i0
+ eor $s1,$s1,$i2,ror#8
and $i2,lr,$s3,lsr#8 @ i1
+ eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2
- mov $s3,$s3,lsr#24
+ eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
+ mov $s3,$s3,lsr#24
+
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
- ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s0,$s0,$i1,ror#24
+ ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s1,$s1,$i2,ror#16
+ ldr $i1,[$key],#16
eor $s2,$s2,$i3,ror#8
+ ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8
- ldr $t1,[$key],#16
- ldr $t2,[$key,#-12]
- ldr $t3,[$key,#-8]
- ldr $i1,[$key,#-4]
- eor $s0,$s0,$t1
- eor $s1,$s1,$t2
- eor $s2,$s2,$t3
- eor $s3,$s3,$i1
+ ldr $t2,[$key,#-8]
+ eor $s0,$s0,$i1
+ ldr $t3,[$key,#-4]
+ and $i1,lr,$s0
+ eor $s1,$s1,$t1
+ and $i2,lr,$s0,lsr#8
+ eor $s2,$s2,$t2
+ and $i3,lr,$s0,lsr#16
+ eor $s3,$s3,$t3
+ mov $s0,$s0,lsr#24
subs $rounds,$rounds,#1
bne .Lenc_loop
add $tbl,$tbl,#2
- and $i1,lr,$s0
- and $i2,lr,$s0,lsr#8
- and $i3,lr,$s0,lsr#16
- mov $s0,$s0,lsr#24
ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
- ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
- ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
- ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
-
and $i1,lr,$s1,lsr#16 @ i0
+ ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
and $i2,lr,$s1
+ ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
and $i3,lr,$s1,lsr#8
+ ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
mov $s1,$s1,lsr#24
+
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
- ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
eor $s0,$i1,$s0,lsl#8
- eor $s1,$t1,$s1,lsl#24
- eor $t2,$i2,$t2,lsl#8
- eor $t3,$i3,$t3,lsl#8
-
+ ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
and $i1,lr,$s2,lsr#8 @ i0
+ eor $t2,$i2,$t2,lsl#8
and $i2,lr,$s2,lsr#16 @ i1
+ eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2
- mov $s2,$s2,lsr#24
+ eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
+ mov $s2,$s2,lsr#24
+
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
- ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8
- eor $s1,$s1,$i2,lsl#16
- eor $s2,$t2,$s2,lsl#24
- eor $t3,$i3,$t3,lsl#8
-
+ ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
and $i1,lr,$s3 @ i0
+ eor $s1,$s1,$i2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
+ eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2
- mov $s3,$s3,lsr#24
+ eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
+ mov $s3,$s3,lsr#24
+
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
- ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s0,$i1,$s0,lsl#8
+ ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
+ ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8
+ ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16
+ ldr $t2,[$key,#8]
eor $s3,$t3,$s3,lsl#24
+ ldr $t3,[$key,#12]
- ldr lr,[sp],#4 @ pop lr
- ldr $t1,[$key,#0]
- ldr $t2,[$key,#4]
- ldr $t3,[$key,#8]
- ldr $i1,[$key,#12]
- eor $s0,$s0,$t1
- eor $s1,$s1,$t2
- eor $s2,$s2,$t3
- eor $s3,$s3,$i1
+ eor $s0,$s0,$i1
+ eor $s1,$s1,$t1
+ eor $s2,$s2,$t2
+ eor $s3,$s3,$t3
sub $tbl,$tbl,#2
- mov pc,lr @ return
+ ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
.global AES_set_encrypt_key
@@ -402,31 +403,31 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8
- orr $s0,$s0,$t2,lsl#16
- orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7]
+ orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6]
+ orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8
- orr $s1,$s1,$t2,lsl#16
- orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11]
+ orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10]
+ orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8
- orr $s2,$s2,$t2,lsl#16
- orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15]
+ orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14]
+ orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8
- orr $s3,$s3,$t2,lsl#16
- orr $s3,$s3,$t3,lsl#24
str $s0,[$key],#16
+ orr $s3,$s3,$t2,lsl#16
str $s1,[$key,#-12]
+ orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8]
str $s3,[$key,#-4]
@@ -440,27 +441,26 @@ AES_set_encrypt_key:
.L128_loop:
and $t2,lr,$s3,lsr#24
and $i1,lr,$s3,lsr#16
- and $i2,lr,$s3,lsr#8
- and $i3,lr,$s3
ldrb $t2,[$tbl,$t2]
+ and $i2,lr,$s3,lsr#8
ldrb $i1,[$tbl,$i1]
+ and $i3,lr,$s3
ldrb $i2,[$tbl,$i2]
- ldrb $i3,[$tbl,$i3]
- ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24
+ ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16
+ ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8
eor $t2,$t2,$t1
eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
- eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
- eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
str $s0,[$key],#16
+ eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
str $s1,[$key,#-12]
+ eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
str $s2,[$key,#-8]
- str $s3,[$key,#-4]
-
subs $rounds,$rounds,#1
+ str $s3,[$key,#-4]
bne .L128_loop
sub r2,$key,#176
b .Ldone
@@ -471,16 +471,16 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#17]
ldrb $t3,[$rounds,#16]
orr $i2,$i2,$t1,lsl#8
- orr $i2,$i2,$t2,lsl#16
- orr $i2,$i2,$t3,lsl#24
ldrb $i3,[$rounds,#23]
+ orr $i2,$i2,$t2,lsl#16
ldrb $t1,[$rounds,#22]
+ orr $i2,$i2,$t3,lsl#24
ldrb $t2,[$rounds,#21]
ldrb $t3,[$rounds,#20]
orr $i3,$i3,$t1,lsl#8
orr $i3,$i3,$t2,lsl#16
- orr $i3,$i3,$t3,lsl#24
str $i2,[$key],#8
+ orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
teq lr,#192
@@ -494,27 +494,26 @@ AES_set_encrypt_key:
.L192_loop:
and $t2,lr,$i3,lsr#24
and $i1,lr,$i3,lsr#16
- and $i2,lr,$i3,lsr#8
- and $i3,lr,$i3
ldrb $t2,[$tbl,$t2]
+ and $i2,lr,$i3,lsr#8
ldrb $i1,[$tbl,$i1]
+ and $i3,lr,$i3
ldrb $i2,[$tbl,$i2]
- ldrb $i3,[$tbl,$i3]
- ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24
+ ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16
+ ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8
eor $i3,$t2,$t1
eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
- eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
- eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
str $s0,[$key],#24
+ eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
str $s1,[$key,#-20]
+ eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
str $s2,[$key,#-16]
- str $s3,[$key,#-12]
-
subs $rounds,$rounds,#1
+ str $s3,[$key,#-12]
subeq r2,$key,#216
beq .Ldone
@@ -532,16 +531,16 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#25]
ldrb $t3,[$rounds,#24]
orr $i2,$i2,$t1,lsl#8
- orr $i2,$i2,$t2,lsl#16
- orr $i2,$i2,$t3,lsl#24
ldrb $i3,[$rounds,#31]
+ orr $i2,$i2,$t2,lsl#16
ldrb $t1,[$rounds,#30]
+ orr $i2,$i2,$t3,lsl#24
ldrb $t2,[$rounds,#29]
ldrb $t3,[$rounds,#28]
orr $i3,$i3,$t1,lsl#8
orr $i3,$i3,$t2,lsl#16
- orr $i3,$i3,$t3,lsl#24
str $i2,[$key],#8
+ orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
mov $rounds,#14
@@ -553,52 +552,51 @@ AES_set_encrypt_key:
.L256_loop:
and $t2,lr,$i3,lsr#24
and $i1,lr,$i3,lsr#16
- and $i2,lr,$i3,lsr#8
- and $i3,lr,$i3
ldrb $t2,[$tbl,$t2]
+ and $i2,lr,$i3,lsr#8
ldrb $i1,[$tbl,$i1]
+ and $i3,lr,$i3
ldrb $i2,[$tbl,$i2]
- ldrb $i3,[$tbl,$i3]
- ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24
+ ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16
+ ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8
eor $i3,$t2,$t1
eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
- eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
- eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
str $s0,[$key],#32
+ eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
str $s1,[$key,#-28]
+ eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
str $s2,[$key,#-24]
- str $s3,[$key,#-20]
-
subs $rounds,$rounds,#1
+ str $s3,[$key,#-20]
subeq r2,$key,#256
beq .Ldone
and $t2,lr,$s3
and $i1,lr,$s3,lsr#8
- and $i2,lr,$s3,lsr#16
- and $i3,lr,$s3,lsr#24
ldrb $t2,[$tbl,$t2]
+ and $i2,lr,$s3,lsr#16
ldrb $i1,[$tbl,$i1]
+ and $i3,lr,$s3,lsr#24
ldrb $i2,[$tbl,$i2]
- ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i1,lsl#8
+ ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16
+ ldr $t1,[$key,#-48]
orr $t2,$t2,$i3,lsl#24
- ldr $t1,[$key,#-48]
ldr $i1,[$key,#-44]
ldr $i2,[$key,#-40]
- ldr $i3,[$key,#-36]
eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
+ ldr $i3,[$key,#-36]
eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
- eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
- eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
str $t1,[$key,#-16]
+ eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
str $i1,[$key,#-12]
+ eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
str $i2,[$key,#-8]
str $i3,[$key,#-4]
b .L256_loop
@@ -819,24 +817,24 @@ AES_decrypt:
ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8
- orr $s0,$s0,$t2,lsl#16
- orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7]
+ orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6]
+ orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8
- orr $s1,$s1,$t2,lsl#16
- orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11]
+ orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10]
+ orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8
- orr $s2,$s2,$t2,lsl#16
- orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15]
+ orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14]
+ orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8
@@ -851,24 +849,24 @@ AES_decrypt:
mov $t3,$s0,lsr#8
strb $t1,[$rounds,#0]
strb $t2,[$rounds,#1]
- strb $t3,[$rounds,#2]
- strb $s0,[$rounds,#3]
mov $t1,$s1,lsr#24
+ strb $t3,[$rounds,#2]
mov $t2,$s1,lsr#16
+ strb $s0,[$rounds,#3]
mov $t3,$s1,lsr#8
strb $t1,[$rounds,#4]
strb $t2,[$rounds,#5]
- strb $t3,[$rounds,#6]
- strb $s1,[$rounds,#7]
mov $t1,$s2,lsr#24
+ strb $t3,[$rounds,#6]
mov $t2,$s2,lsr#16
+ strb $s1,[$rounds,#7]
mov $t3,$s2,lsr#8
strb $t1,[$rounds,#8]
strb $t2,[$rounds,#9]
- strb $t3,[$rounds,#10]
- strb $s2,[$rounds,#11]
mov $t1,$s3,lsr#24
+ strb $t3,[$rounds,#10]
mov $t2,$s3,lsr#16
+ strb $s2,[$rounds,#11]
mov $t3,$s3,lsr#8
strb $t1,[$rounds,#12]
strb $t2,[$rounds,#13]
@@ -885,146 +883,143 @@ AES_decrypt:
.align 2
_armv4_AES_decrypt:
str lr,[sp,#-4]! @ push lr
- ldr $t1,[$key],#16
- ldr $t2,[$key,#-12]
- ldr $t3,[$key,#-8]
- ldr $i1,[$key,#-4]
- ldr $rounds,[$key,#240-16]
+ ldmia $key!,{$t1-$i1}
eor $s0,$s0,$t1
+ ldr $rounds,[$key,#240-16]
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
sub $rounds,$rounds,#1
mov lr,#255
-.Ldec_loop:
and $i1,lr,$s0,lsr#16
and $i2,lr,$s0,lsr#8
and $i3,lr,$s0
mov $s0,$s0,lsr#24
+.Ldec_loop:
ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
- ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
- ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
- ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
-
and $i1,lr,$s1 @ i0
+ ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
and $i2,lr,$s1,lsr#16
+ ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
and $i3,lr,$s1,lsr#8
+ ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
mov $s1,$s1,lsr#24
+
ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
- ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
eor $s0,$s0,$i1,ror#24
- eor $s1,$s1,$t1,ror#8
- eor $t2,$i2,$t2,ror#8
- eor $t3,$i3,$t3,ror#8
-
+ ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
and $i1,lr,$s2,lsr#8 @ i0
+ eor $t2,$i2,$t2,ror#8
and $i2,lr,$s2 @ i1
+ eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16
- mov $s2,$s2,lsr#24
+ eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
+ mov $s2,$s2,lsr#24
+
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
- ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16
- eor $s1,$s1,$i2,ror#24
- eor $s2,$s2,$t2,ror#8
- eor $t3,$i3,$t3,ror#8
-
+ ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
and $i1,lr,$s3,lsr#16 @ i0
+ eor $s1,$s1,$i2,ror#24
and $i2,lr,$s3,lsr#8 @ i1
+ eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2
- mov $s3,$s3,lsr#24
+ eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
+ mov $s3,$s3,lsr#24
+
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
- ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s0,$s0,$i1,ror#8
+ ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s1,$s1,$i2,ror#16
eor $s2,$s2,$i3,ror#24
+ ldr $i1,[$key],#16
eor $s3,$s3,$t3,ror#8
- ldr $t1,[$key],#16
- ldr $t2,[$key,#-12]
- ldr $t3,[$key,#-8]
- ldr $i1,[$key,#-4]
- eor $s0,$s0,$t1
- eor $s1,$s1,$t2
- eor $s2,$s2,$t3
- eor $s3,$s3,$i1
+ ldr $t1,[$key,#-12]
+ ldr $t2,[$key,#-8]
+ eor $s0,$s0,$i1
+ ldr $t3,[$key,#-4]
+ and $i1,lr,$s0,lsr#16
+ eor $s1,$s1,$t1
+ and $i2,lr,$s0,lsr#8
+ eor $s2,$s2,$t2
+ and $i3,lr,$s0
+ eor $s3,$s3,$t3
+ mov $s0,$s0,lsr#24
subs $rounds,$rounds,#1
bne .Ldec_loop
add $tbl,$tbl,#1024
- ldr $t1,[$tbl,#0] @ prefetch Td4
- ldr $t2,[$tbl,#32]
- ldr $t3,[$tbl,#64]
- ldr $i1,[$tbl,#96]
- ldr $i2,[$tbl,#128]
- ldr $i3,[$tbl,#160]
- ldr $t1,[$tbl,#192]
- ldr $t2,[$tbl,#224]
+ ldr $t2,[$tbl,#0] @ prefetch Td4
+ ldr $t3,[$tbl,#32]
+ ldr $t1,[$tbl,#64]
+ ldr $t2,[$tbl,#96]
+ ldr $t3,[$tbl,#128]
+ ldr $t1,[$tbl,#160]
+ ldr $t2,[$tbl,#192]
+ ldr $t3,[$tbl,#224]
- and $i1,lr,$s0,lsr#16
- and $i2,lr,$s0,lsr#8
- and $i3,lr,$s0
- ldrb $s0,[$tbl,$s0,lsr#24] @ Td4[s0>>24]
+ ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
- ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
- ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
-
and $i1,lr,$s1 @ i0
+ ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
and $i2,lr,$s1,lsr#16
+ ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i3,lr,$s1,lsr#8
+
ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
- ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
eor $s0,$i1,$s0,lsl#24
+ ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
eor $s1,$t1,$s1,lsl#8
- eor $t2,$t2,$i2,lsl#8
- eor $t3,$t3,$i3,lsl#8
-
and $i1,lr,$s2,lsr#8 @ i0
+ eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1
- and $i3,lr,$s2,lsr#16
+ eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
+ and $i3,lr,$s2,lsr#16
+
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
- ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s0,$s0,$i1,lsl#8
+ ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s1,$i2,$s1,lsl#16
- eor $s2,$t2,$s2,lsl#16
- eor $t3,$t3,$i3,lsl#16
-
and $i1,lr,$s3,lsr#16 @ i0
+ eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
- and $i3,lr,$s3 @ i2
+ eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
+ and $i3,lr,$s3 @ i2
+
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
+ ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8
+ ldr $t1,[$key,#4]
eor $s2,$i3,$s2,lsl#8
+ ldr $t2,[$key,#8]
eor $s3,$t3,$s3,lsl#24
+ ldr $t3,[$key,#12]
- ldr lr,[sp],#4 @ pop lr
- ldr $t1,[$key,#0]
- ldr $t2,[$key,#4]
- ldr $t3,[$key,#8]
- ldr $i1,[$key,#12]
- eor $s0,$s0,$t1
- eor $s1,$s1,$t2
- eor $s2,$s2,$t3
- eor $s3,$s3,$i1
+ eor $s0,$s0,$i1
+ eor $s1,$s1,$t1
+ eor $s2,$s2,$t2
+ eor $s3,$s3,$t3
sub $tbl,$tbl,#1024
- mov pc,lr @ return
+ ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 45d79b6000..2036f46f40 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -19,6 +19,12 @@
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
@@ -123,12 +129,12 @@ gcm_ghash_4bit:
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
- add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- mov $nhi,$nhi,lsl#1
+ add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
@@ -139,15 +145,15 @@ gcm_ghash_4bit:
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
- eor $Zhh,$Zhh,$Tll,lsl#16
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
@@ -161,22 +167,22 @@ gcm_ghash_4bit:
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
- ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
- eor $Zhh,$Thh,$Zhh,lsr#4
eorpl $nlo,$nlo,$nhi
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
ldr $len,[sp,#32] @ re-load $len/end
@@ -212,7 +218,7 @@ gcm_gmult_4bit:
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- mov $nhi,$nhi,lsl#1
+ add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
@@ -228,8 +234,8 @@ gcm_gmult_4bit:
.Loop2:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
@@ -243,8 +249,8 @@ gcm_gmult_4bit:
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
@@ -255,8 +261,8 @@ gcm_gmult_4bit:
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop2
___
&Zsmash();
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index db87434f91..492cb62bc0 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -11,7 +11,12 @@
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte.
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
- str $T1,[sp,#`$i%16`*4]
mov $t0,$e,ror#$Sigma1[0]
+ str $T1,[sp,#`$i%16`*4]
eor $t0,$t0,$e,ror#$Sigma1[1]
- eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
- add $T1,$T1,$t0
eor $t1,$f,$g
+ eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e
+ add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g)
- add $T1,$T1,$t1
add $T1,$T1,$h
- add $T1,$T1,$t2
mov $h,$a,ror#$Sigma0[0]
+ add $T1,$T1,$t1
eor $h,$h,$a,ror#$Sigma0[1]
+ add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
orr $t0,$a,$b
- and $t0,$t0,$c
and $t1,$a,$b
+ and $t0,$t0,$c
+ add $h,$h,$T1
orr $t0,$t0,$t1 @ Maj(a,b,c)
- add $h,$h,$t0
add $d,$d,$T1
- add $h,$h,$T1
+ add $h,$h,$t0
___
}
@@ -80,19 +85,19 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4]
ldr $T1,[sp,#`($i+0)%16`*4]
- ldr $inp,[sp,#`($i+9)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
+ ldr $inp,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
mov $t1,$t2,ror#$sigma1[0]
+ add $T1,$T1,$t0
eor $t1,$t1,$t2,ror#$sigma1[1]
+ add $T1,$T1,$inp
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
- add $T1,$T1,$t0
add $T1,$T1,$t1
- add $T1,$T1,$inp
___
&BODY_00_15(@_);
}
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 7d27f0b78d..3a35861ac6 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -10,7 +10,13 @@
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte.
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
# Byte order [in]dependence. =========================================
#
@@ -73,33 +79,31 @@ $code.=<<___;
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
- adc $Thi,$Thi,$t1 @ T += Sigma1(e)
- adds $Tlo,$Tlo,$t2
- adc $Thi,$Thi,$t3 @ T += h
-
ldr $t0,[sp,#$Foff+0] @ f.lo
+ adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
+ adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
+ adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
- str $Elo,[sp,#$Eoff+0]
- str $Ehi,[sp,#$Eoff+4]
- str $Alo,[sp,#$Aoff+0]
- str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
+ str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
+ str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
+ str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
+ str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
- eor $t1,$t1,$t3 @ Ch(e,f,g)
-
ldr $t2,[$Ktbl,#4] @ K[i].lo
+ eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#0] @ K[i].hi
- ldr $Elo,[sp,#$Doff+0] @ d.lo
- ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t0
+ ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
+ ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo