ARM assembler pack: reschedule instructions for dual-issue pipeline.

The modest improvement coefficients indicate that the code already had some parallelism, so there was not much room for further improvement. Special thanks to Ted Krovetz for benchmarking the code with such patience.
author: Andy Polyakov <appro@openssl.org> 2010-07-13 14:03:31 +0000
committer: Andy Polyakov <appro@openssl.org> 2010-07-13 14:03:31 +0000
commit: 2d22e08083d031eb63cada59aa95afe5279008d3 (patch)
tree: 5971d660922f8ef8c58bc18ab4fc6f1a22346ed5
parent: 0852f90c300405c79c2af5c549e74d0d4a8f664c (diff)
4 files changed, 247 insertions, 237 deletions
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index 5a736744a9..c51ee1fbf6 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -16,12 +16,17 @@
 # allows to merge logical or arithmetic operation with shift or rotate
 # in one instruction and emit combined result every cycle. The module
 # is endian-neutral. The performance is ~42 cycles/byte for 128-bit
-# key.
+# key [on single-issue XScale PXA250 core].
 
 # May 2007.
 #
 # AES_set_[en|de]crypt_key is added.
 
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 12% improvement on
+# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
@@ -167,24 +172,24 @@ AES_encrypt:
 	ldrb	$t2,[$rounds,#1]
 	ldrb	$t3,[$rounds,#0]
 	orr	$s0,$s0,$t1,lsl#8
-	orr	$s0,$s0,$t2,lsl#16
-	orr	$s0,$s0,$t3,lsl#24
 	ldrb	$s1,[$rounds,#7]
+	orr	$s0,$s0,$t2,lsl#16
 	ldrb	$t1,[$rounds,#6]
+	orr	$s0,$s0,$t3,lsl#24
 	ldrb	$t2,[$rounds,#5]
 	ldrb	$t3,[$rounds,#4]
 	orr	$s1,$s1,$t1,lsl#8
-	orr	$s1,$s1,$t2,lsl#16
-	orr	$s1,$s1,$t3,lsl#24
 	ldrb	$s2,[$rounds,#11]
+	orr	$s1,$s1,$t2,lsl#16
 	ldrb	$t1,[$rounds,#10]
+	orr	$s1,$s1,$t3,lsl#24
 	ldrb	$t2,[$rounds,#9]
 	ldrb	$t3,[$rounds,#8]
 	orr	$s2,$s2,$t1,lsl#8
-	orr	$s2,$s2,$t2,lsl#16
-	orr	$s2,$s2,$t3,lsl#24
 	ldrb	$s3,[$rounds,#15]
+	orr	$s2,$s2,$t2,lsl#16
 	ldrb	$t1,[$rounds,#14]
+	orr	$s2,$s2,$t3,lsl#24
 	ldrb	$t2,[$rounds,#13]
 	ldrb	$t3,[$rounds,#12]
 	orr	$s3,$s3,$t1,lsl#8
@@ -199,24 +204,24 @@ AES_encrypt:
 	mov	$t3,$s0,lsr#8
 	strb	$t1,[$rounds,#0]
 	strb	$t2,[$rounds,#1]
-	strb	$t3,[$rounds,#2]
-	strb	$s0,[$rounds,#3]
 	mov	$t1,$s1,lsr#24
+	strb	$t3,[$rounds,#2]
 	mov	$t2,$s1,lsr#16
+	strb	$s0,[$rounds,#3]
 	mov	$t3,$s1,lsr#8
 	strb	$t1,[$rounds,#4]
 	strb	$t2,[$rounds,#5]
-	strb	$t3,[$rounds,#6]
-	strb	$s1,[$rounds,#7]
 	mov	$t1,$s2,lsr#24
+	strb	$t3,[$rounds,#6]
 	mov	$t2,$s2,lsr#16
+	strb	$s1,[$rounds,#7]
 	mov	$t3,$s2,lsr#8
 	strb	$t1,[$rounds,#8]
 	strb	$t2,[$rounds,#9]
-	strb	$t3,[$rounds,#10]
-	strb	$s2,[$rounds,#11]
 	mov	$t1,$s3,lsr#24
+	strb	$t3,[$rounds,#10]
 	mov	$t2,$s3,lsr#16
+	strb	$s2,[$rounds,#11]
 	mov	$t3,$s3,lsr#8
 	strb	$t1,[$rounds,#12]
 	strb	$t2,[$rounds,#13]
@@ -233,141 +238,137 @@ AES_encrypt:
 .align	2
 _armv4_AES_encrypt:
 	str	lr,[sp,#-4]!		@ push lr
-	ldr	$t1,[$key],#16
-	ldr	$t2,[$key,#-12]
-	ldr	$t3,[$key,#-8]
-	ldr	$i1,[$key,#-4]
-	ldr	$rounds,[$key,#240-16]
+	ldmia	$key!,{$t1-$i1}
 	eor	$s0,$s0,$t1
+	ldr	$rounds,[$key,#240-16]
 	eor	$s1,$s1,$t2
 	eor	$s2,$s2,$t3
 	eor	$s3,$s3,$i1
 	sub	$rounds,$rounds,#1
 	mov	lr,#255
 
-.Lenc_loop:
+	and	$i1,lr,$s0
 	and	$i2,lr,$s0,lsr#8
 	and	$i3,lr,$s0,lsr#16
-	and	$i1,lr,$s0
 	mov	$s0,$s0,lsr#24
+.Lenc_loop:
 	ldr	$t1,[$tbl,$i1,lsl#2]	@ Te3[s0>>0]
-	ldr	$s0,[$tbl,$s0,lsl#2]	@ Te0[s0>>24]
-	ldr	$t2,[$tbl,$i2,lsl#2]	@ Te2[s0>>8]
-	ldr	$t3,[$tbl,$i3,lsl#2]	@ Te1[s0>>16]
-
 	and	$i1,lr,$s1,lsr#16	@ i0
+	ldr	$t2,[$tbl,$i2,lsl#2]	@ Te2[s0>>8]
 	and	$i2,lr,$s1
+	ldr	$t3,[$tbl,$i3,lsl#2]	@ Te1[s0>>16]
 	and	$i3,lr,$s1,lsr#8
+	ldr	$s0,[$tbl,$s0,lsl#2]	@ Te0[s0>>24]
 	mov	$s1,$s1,lsr#24
+
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te1[s1>>16]
-	ldr	$s1,[$tbl,$s1,lsl#2]	@ Te0[s1>>24]
 	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te3[s1>>0]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te2[s1>>8]
 	eor	$s0,$s0,$i1,ror#8
-	eor	$s1,$s1,$t1,ror#24
-	eor	$t2,$t2,$i2,ror#8
-	eor	$t3,$t3,$i3,ror#8
-
+	ldr	$s1,[$tbl,$s1,lsl#2]	@ Te0[s1>>24]
 	and	$i1,lr,$s2,lsr#8	@ i0
+	eor	$t2,$t2,$i2,ror#8
 	and	$i2,lr,$s2,lsr#16	@ i1
+	eor	$t3,$t3,$i3,ror#8
 	and	$i3,lr,$s2
-	mov	$s2,$s2,lsr#24
+	eor	$s1,$s1,$t1,ror#24
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te2[s2>>8]
+	mov	$s2,$s2,lsr#24
+
 	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
-	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te3[s2>>0]
 	eor	$s0,$s0,$i1,ror#16
-	eor	$s1,$s1,$i2,ror#8
-	eor	$s2,$s2,$t2,ror#16
-	eor	$t3,$t3,$i3,ror#16
-
+	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
 	and	$i1,lr,$s3		@ i0
+	eor	$s1,$s1,$i2,ror#8
 	and	$i2,lr,$s3,lsr#8	@ i1
+	eor	$t3,$t3,$i3,ror#16
 	and	$i3,lr,$s3,lsr#16	@ i2
-	mov	$s3,$s3,lsr#24
+	eor	$s2,$s2,$t2,ror#16
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te3[s3>>0]
+	mov	$s3,$s3,lsr#24
+
 	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te1[s3>>16]
-	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
 	eor	$s0,$s0,$i1,ror#24
+	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
 	eor	$s1,$s1,$i2,ror#16
+	ldr	$i1,[$key],#16
 	eor	$s2,$s2,$i3,ror#8
+	ldr	$t1,[$key,#-12]
 	eor	$s3,$s3,$t3,ror#8
 
-	ldr	$t1,[$key],#16
-	ldr	$t2,[$key,#-12]
-	ldr	$t3,[$key,#-8]
-	ldr	$i1,[$key,#-4]
-	eor	$s0,$s0,$t1
-	eor	$s1,$s1,$t2
-	eor	$s2,$s2,$t3
-	eor	$s3,$s3,$i1
+	ldr	$t2,[$key,#-8]
+	eor	$s0,$s0,$i1
+	ldr	$t3,[$key,#-4]
+	and	$i1,lr,$s0
+	eor	$s1,$s1,$t1
+	and	$i2,lr,$s0,lsr#8
+	eor	$s2,$s2,$t2
+	and	$i3,lr,$s0,lsr#16
+	eor	$s3,$s3,$t3
+	mov	$s0,$s0,lsr#24
 
 	subs	$rounds,$rounds,#1
 	bne	.Lenc_loop
 
 	add	$tbl,$tbl,#2
 
-	and	$i1,lr,$s0
-	and	$i2,lr,$s0,lsr#8
-	and	$i3,lr,$s0,lsr#16
-	mov	$s0,$s0,lsr#24
 	ldrb	$t1,[$tbl,$i1,lsl#2]	@ Te4[s0>>0]
-	ldrb	$s0,[$tbl,$s0,lsl#2]	@ Te4[s0>>24]
-	ldrb	$t2,[$tbl,$i2,lsl#2]	@ Te4[s0>>8]
-	ldrb	$t3,[$tbl,$i3,lsl#2]	@ Te4[s0>>16]
-
 	and	$i1,lr,$s1,lsr#16	@ i0
+	ldrb	$t2,[$tbl,$i2,lsl#2]	@ Te4[s0>>8]
 	and	$i2,lr,$s1
+	ldrb	$t3,[$tbl,$i3,lsl#2]	@ Te4[s0>>16]
 	and	$i3,lr,$s1,lsr#8
+	ldrb	$s0,[$tbl,$s0,lsl#2]	@ Te4[s0>>24]
 	mov	$s1,$s1,lsr#24
+
 	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s1>>16]
-	ldrb	$s1,[$tbl,$s1,lsl#2]	@ Te4[s1>>24]
 	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s1>>0]
 	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s1>>8]
 	eor	$s0,$i1,$s0,lsl#8
-	eor	$s1,$t1,$s1,lsl#24
-	eor	$t2,$i2,$t2,lsl#8
-	eor	$t3,$i3,$t3,lsl#8
-
+	ldrb	$s1,[$tbl,$s1,lsl#2]	@ Te4[s1>>24]
 	and	$i1,lr,$s2,lsr#8	@ i0
+	eor	$t2,$i2,$t2,lsl#8
 	and	$i2,lr,$s2,lsr#16	@ i1
+	eor	$t3,$i3,$t3,lsl#8
 	and	$i3,lr,$s2
-	mov	$s2,$s2,lsr#24
+	eor	$s1,$t1,$s1,lsl#24
 	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s2>>8]
+	mov	$s2,$s2,lsr#24
+
 	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
-	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
 	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s2>>0]
 	eor	$s0,$i1,$s0,lsl#8
-	eor	$s1,$s1,$i2,lsl#16
-	eor	$s2,$t2,$s2,lsl#24
-	eor	$t3,$i3,$t3,lsl#8
-
+	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
 	and	$i1,lr,$s3		@ i0
+	eor	$s1,$s1,$i2,lsl#16
 	and	$i2,lr,$s3,lsr#8	@ i1
+	eor	$t3,$i3,$t3,lsl#8
 	and	$i3,lr,$s3,lsr#16	@ i2
-	mov	$s3,$s3,lsr#24
+	eor	$s2,$t2,$s2,lsl#24
 	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s3>>0]
+	mov	$s3,$s3,lsr#24
+
 	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
 	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s3>>16]
-	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
 	eor	$s0,$i1,$s0,lsl#8
+	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
+	ldr	$i1,[$key,#0]
 	eor	$s1,$s1,$i2,lsl#8
+	ldr	$t1,[$key,#4]
 	eor	$s2,$s2,$i3,lsl#16
+	ldr	$t2,[$key,#8]
 	eor	$s3,$t3,$s3,lsl#24
+	ldr	$t3,[$key,#12]
 
-	ldr	lr,[sp],#4		@ pop lr
-	ldr	$t1,[$key,#0]
-	ldr	$t2,[$key,#4]
-	ldr	$t3,[$key,#8]
-	ldr	$i1,[$key,#12]
-	eor	$s0,$s0,$t1
-	eor	$s1,$s1,$t2
-	eor	$s2,$s2,$t3
-	eor	$s3,$s3,$i1
+	eor	$s0,$s0,$i1
+	eor	$s1,$s1,$t1
+	eor	$s2,$s2,$t2
+	eor	$s3,$s3,$t3
 
 	sub	$tbl,$tbl,#2
-	mov	pc,lr			@ return
+	ldr	pc,[sp],#4		@ pop and return
 .size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
 
 .global AES_set_encrypt_key
@@ -402,31 +403,31 @@ AES_set_encrypt_key:
 	ldrb	$t2,[$rounds,#1]
 	ldrb	$t3,[$rounds,#0]
 	orr	$s0,$s0,$t1,lsl#8
-	orr	$s0,$s0,$t2,lsl#16
-	orr	$s0,$s0,$t3,lsl#24
 	ldrb	$s1,[$rounds,#7]
+	orr	$s0,$s0,$t2,lsl#16
 	ldrb	$t1,[$rounds,#6]
+	orr	$s0,$s0,$t3,lsl#24
 	ldrb	$t2,[$rounds,#5]
 	ldrb	$t3,[$rounds,#4]
 	orr	$s1,$s1,$t1,lsl#8
-	orr	$s1,$s1,$t2,lsl#16
-	orr	$s1,$s1,$t3,lsl#24
 	ldrb	$s2,[$rounds,#11]
+	orr	$s1,$s1,$t2,lsl#16
 	ldrb	$t1,[$rounds,#10]
+	orr	$s1,$s1,$t3,lsl#24
 	ldrb	$t2,[$rounds,#9]
 	ldrb	$t3,[$rounds,#8]
 	orr	$s2,$s2,$t1,lsl#8
-	orr	$s2,$s2,$t2,lsl#16
-	orr	$s2,$s2,$t3,lsl#24
 	ldrb	$s3,[$rounds,#15]
+	orr	$s2,$s2,$t2,lsl#16
 	ldrb	$t1,[$rounds,#14]
+	orr	$s2,$s2,$t3,lsl#24
 	ldrb	$t2,[$rounds,#13]
 	ldrb	$t3,[$rounds,#12]
 	orr	$s3,$s3,$t1,lsl#8
-	orr	$s3,$s3,$t2,lsl#16
-	orr	$s3,$s3,$t3,lsl#24
 	str	$s0,[$key],#16
+	orr	$s3,$s3,$t2,lsl#16
 	str	$s1,[$key,#-12]
+	orr	$s3,$s3,$t3,lsl#24
 	str	$s2,[$key,#-8]
 	str	$s3,[$key,#-4]
 
@@ -440,27 +441,26 @@ AES_set_encrypt_key:
 .L128_loop:
 	and	$t2,lr,$s3,lsr#24
 	and	$i1,lr,$s3,lsr#16
-	and	$i2,lr,$s3,lsr#8
-	and	$i3,lr,$s3
 	ldrb	$t2,[$tbl,$t2]
+	and	$i2,lr,$s3,lsr#8
 	ldrb	$i1,[$tbl,$i1]
+	and	$i3,lr,$s3
 	ldrb	$i2,[$tbl,$i2]
-	ldrb	$i3,[$tbl,$i3]
-	ldr	$t1,[$t3],#4			@ rcon[i++]
 	orr	$t2,$t2,$i1,lsl#24
+	ldrb	$i3,[$tbl,$i3]
 	orr	$t2,$t2,$i2,lsl#16
+	ldr	$t1,[$t3],#4			@ rcon[i++]
 	orr	$t2,$t2,$i3,lsl#8
 	eor	$t2,$t2,$t1
 	eor	$s0,$s0,$t2			@ rk[4]=rk[0]^...
 	eor	$s1,$s1,$s0			@ rk[5]=rk[1]^rk[4]
-	eor	$s2,$s2,$s1			@ rk[6]=rk[2]^rk[5]
-	eor	$s3,$s3,$s2			@ rk[7]=rk[3]^rk[6]
 	str	$s0,[$key],#16
+	eor	$s2,$s2,$s1			@ rk[6]=rk[2]^rk[5]
 	str	$s1,[$key,#-12]
+	eor	$s3,$s3,$s2			@ rk[7]=rk[3]^rk[6]
 	str	$s2,[$key,#-8]
-	str	$s3,[$key,#-4]
-
 	subs	$rounds,$rounds,#1
+	str	$s3,[$key,#-4]
 	bne	.L128_loop
 	sub	r2,$key,#176
 	b	.Ldone
@@ -471,16 +471,16 @@ AES_set_encrypt_key:
 	ldrb	$t2,[$rounds,#17]
 	ldrb	$t3,[$rounds,#16]
 	orr	$i2,$i2,$t1,lsl#8
-	orr	$i2,$i2,$t2,lsl#16
-	orr	$i2,$i2,$t3,lsl#24
 	ldrb	$i3,[$rounds,#23]
+	orr	$i2,$i2,$t2,lsl#16
 	ldrb	$t1,[$rounds,#22]
+	orr	$i2,$i2,$t3,lsl#24
 	ldrb	$t2,[$rounds,#21]
 	ldrb	$t3,[$rounds,#20]
 	orr	$i3,$i3,$t1,lsl#8
 	orr	$i3,$i3,$t2,lsl#16
-	orr	$i3,$i3,$t3,lsl#24
 	str	$i2,[$key],#8
+	orr	$i3,$i3,$t3,lsl#24
 	str	$i3,[$key,#-4]
 
 	teq	lr,#192
@@ -494,27 +494,26 @@ AES_set_encrypt_key:
 .L192_loop:
 	and	$t2,lr,$i3,lsr#24
 	and	$i1,lr,$i3,lsr#16
-	and	$i2,lr,$i3,lsr#8
-	and	$i3,lr,$i3
 	ldrb	$t2,[$tbl,$t2]
+	and	$i2,lr,$i3,lsr#8
 	ldrb	$i1,[$tbl,$i1]
+	and	$i3,lr,$i3
 	ldrb	$i2,[$tbl,$i2]
-	ldrb	$i3,[$tbl,$i3]
-	ldr	$t1,[$t3],#4			@ rcon[i++]
 	orr	$t2,$t2,$i1,lsl#24
+	ldrb	$i3,[$tbl,$i3]
 	orr	$t2,$t2,$i2,lsl#16
+	ldr	$t1,[$t3],#4			@ rcon[i++]
 	orr	$t2,$t2,$i3,lsl#8
 	eor	$i3,$t2,$t1
 	eor	$s0,$s0,$i3			@ rk[6]=rk[0]^...
 	eor	$s1,$s1,$s0			@ rk[7]=rk[1]^rk[6]
-	eor	$s2,$s2,$s1			@ rk[8]=rk[2]^rk[7]
-	eor	$s3,$s3,$s2			@ rk[9]=rk[3]^rk[8]
 	str	$s0,[$key],#24
+	eor	$s2,$s2,$s1			@ rk[8]=rk[2]^rk[7]
 	str	$s1,[$key,#-20]
+	eor	$s3,$s3,$s2			@ rk[9]=rk[3]^rk[8]
 	str	$s2,[$key,#-16]
-	str	$s3,[$key,#-12]
-
 	subs	$rounds,$rounds,#1
+	str	$s3,[$key,#-12]
 	subeq	r2,$key,#216
 	beq	.Ldone
 
@@ -532,16 +531,16 @@ AES_set_encrypt_key:
 	ldrb	$t2,[$rounds,#25]
 	ldrb	$t3,[$rounds,#24]
 	orr	$i2,$i2,$t1,lsl#8
-	orr	$i2,$i2,$t2,lsl#16
-	orr	$i2,$i2,$t3,lsl#24
 	ldrb	$i3,[$rounds,#31]
+	orr	$i2,$i2,$t2,lsl#16
 	ldrb	$t1,[$rounds,#30]
+	orr	$i2,$i2,$t3,lsl#24
 	ldrb	$t2,[$rounds,#29]
 	ldrb	$t3,[$rounds,#28]
 	orr	$i3,$i3,$t1,lsl#8
 	orr	$i3,$i3,$t2,lsl#16
-	orr	$i3,$i3,$t3,lsl#24
 	str	$i2,[$key],#8
+	orr	$i3,$i3,$t3,lsl#24
 	str	$i3,[$key,#-4]
 
 	mov	$rounds,#14
@@ -553,52 +552,51 @@ AES_set_encrypt_key:
 .L256_loop:
 	and	$t2,lr,$i3,lsr#24
 	and	$i1,lr,$i3,lsr#16
-	and	$i2,lr,$i3,lsr#8
-	and	$i3,lr,$i3
 	ldrb	$t2,[$tbl,$t2]
+	and	$i2,lr,$i3,lsr#8
 	ldrb	$i1,[$tbl,$i1]
+	and	$i3,lr,$i3
 	ldrb	$i2,[$tbl,$i2]
-	ldrb	$i3,[$tbl,$i3]
-	ldr	$t1,[$t3],#4			@ rcon[i++]
 	orr	$t2,$t2,$i1,lsl#24
+	ldrb	$i3,[$tbl,$i3]
 	orr	$t2,$t2,$i2,lsl#16
+	ldr	$t1,[$t3],#4			@ rcon[i++]
 	orr	$t2,$t2,$i3,lsl#8
 	eor	$i3,$t2,$t1
 	eor	$s0,$s0,$i3			@ rk[8]=rk[0]^...
 	eor	$s1,$s1,$s0			@ rk[9]=rk[1]^rk[8]
-	eor	$s2,$s2,$s1			@ rk[10]=rk[2]^rk[9]
-	eor	$s3,$s3,$s2			@ rk[11]=rk[3]^rk[10]
 	str	$s0,[$key],#32
+	eor	$s2,$s2,$s1			@ rk[10]=rk[2]^rk[9]
 	str	$s1,[$key,#-28]
+	eor	$s3,$s3,$s2			@ rk[11]=rk[3]^rk[10]
 	str	$s2,[$key,#-24]
-	str	$s3,[$key,#-20]
-
 	subs	$rounds,$rounds,#1
+	str	$s3,[$key,#-20]
 	subeq	r2,$key,#256
 	beq	.Ldone
 
 	and	$t2,lr,$s3
 	and	$i1,lr,$s3,lsr#8
-	and	$i2,lr,$s3,lsr#16
-	and	$i3,lr,$s3,lsr#24
 	ldrb	$t2,[$tbl,$t2]
+	and	$i2,lr,$s3,lsr#16
 	ldrb	$i1,[$tbl,$i1]
+	and	$i3,lr,$s3,lsr#24
 	ldrb	$i2,[$tbl,$i2]
-	ldrb	$i3,[$tbl,$i3]
 	orr	$t2,$t2,$i1,lsl#8
+	ldrb	$i3,[$tbl,$i3]
 	orr	$t2,$t2,$i2,lsl#16
+	ldr	$t1,[$key,#-48]
 	orr	$t2,$t2,$i3,lsl#24
 
-	ldr	$t1,[$key,#-48]
 	ldr	$i1,[$key,#-44]
 	ldr	$i2,[$key,#-40]
-	ldr	$i3,[$key,#-36]
 	eor	$t1,$t1,$t2			@ rk[12]=rk[4]^...
+	ldr	$i3,[$key,#-36]
 	eor	$i1,$i1,$t1			@ rk[13]=rk[5]^rk[12]
-	eor	$i2,$i2,$i1			@ rk[14]=rk[6]^rk[13]
-	eor	$i3,$i3,$i2			@ rk[15]=rk[7]^rk[14]
 	str	$t1,[$key,#-16]
+	eor	$i2,$i2,$i1			@ rk[14]=rk[6]^rk[13]
 	str	$i1,[$key,#-12]
+	eor	$i3,$i3,$i2			@ rk[15]=rk[7]^rk[14]
 	str	$i2,[$key,#-8]
 	str	$i3,[$key,#-4]
 	b	.L256_loop
@@ -819,24 +817,24 @@ AES_decrypt:
 	ldrb	$t2,[$rounds,#1]
 	ldrb	$t3,[$rounds,#0]
 	orr	$s0,$s0,$t1,lsl#8
-	orr	$s0,$s0,$t2,lsl#16
-	orr	$s0,$s0,$t3,lsl#24
 	ldrb	$s1,[$rounds,#7]
+	orr	$s0,$s0,$t2,lsl#16
 	ldrb	$t1,[$rounds,#6]
+	orr	$s0,$s0,$t3,lsl#24
 	ldrb	$t2,[$rounds,#5]
 	ldrb	$t3,[$rounds,#4]
 	orr	$s1,$s1,$t1,lsl#8
-	orr	$s1,$s1,$t2,lsl#16
-	orr	$s1,$s1,$t3,lsl#24
 	ldrb	$s2,[$rounds,#11]
+	orr	$s1,$s1,$t2,lsl#16
 	ldrb	$t1,[$rounds,#10]
+	orr	$s1,$s1,$t3,lsl#24
 	ldrb	$t2,[$rounds,#9]
 	ldrb	$t3,[$rounds,#8]
 	orr	$s2,$s2,$t1,lsl#8
-	orr	$s2,$s2,$t2,lsl#16
-	orr	$s2,$s2,$t3,lsl#24
 	ldrb	$s3,[$rounds,#15]
+	orr	$s2,$s2,$t2,lsl#16
 	ldrb	$t1,[$rounds,#14]
+	orr	$s2,$s2,$t3,lsl#24
 	ldrb	$t2,[$rounds,#13]
 	ldrb	$t3,[$rounds,#12]
 	orr	$s3,$s3,$t1,lsl#8
@@ -851,24 +849,24 @@ AES_decrypt:
 	mov	$t3,$s0,lsr#8
 	strb	$t1,[$rounds,#0]
 	strb	$t2,[$rounds,#1]
-	strb	$t3,[$rounds,#2]
-	strb	$s0,[$rounds,#3]
 	mov	$t1,$s1,lsr#24
+	strb	$t3,[$rounds,#2]
 	mov	$t2,$s1,lsr#16
+	strb	$s0,[$rounds,#3]
 	mov	$t3,$s1,lsr#8
 	strb	$t1,[$rounds,#4]
 	strb	$t2,[$rounds,#5]
-	strb	$t3,[$rounds,#6]
-	strb	$s1,[$rounds,#7]
 	mov	$t1,$s2,lsr#24
+	strb	$t3,[$rounds,#6]
 	mov	$t2,$s2,lsr#16
+	strb	$s1,[$rounds,#7]
 	mov	$t3,$s2,lsr#8
 	strb	$t1,[$rounds,#8]
 	strb	$t2,[$rounds,#9]
-	strb	$t3,[$rounds,#10]
-	strb	$s2,[$rounds,#11]
 	mov	$t1,$s3,lsr#24
+	strb	$t3,[$rounds,#10]
 	mov	$t2,$s3,lsr#16
+	strb	$s2,[$rounds,#11]
 	mov	$t3,$s3,lsr#8
 	strb	$t1,[$rounds,#12]
 	strb	$t2,[$rounds,#13]
@@ -885,146 +883,143 @@ AES_decrypt:
 .align	2
 _armv4_AES_decrypt:
 	str	lr,[sp,#-4]!		@ push lr
-	ldr	$t1,[$key],#16
-	ldr	$t2,[$key,#-12]
-	ldr	$t3,[$key,#-8]
-	ldr	$i1,[$key,#-4]
-	ldr	$rounds,[$key,#240-16]
+	ldmia	$key!,{$t1-$i1}
 	eor	$s0,$s0,$t1
+	ldr	$rounds,[$key,#240-16]
 	eor	$s1,$s1,$t2
 	eor	$s2,$s2,$t3
 	eor	$s3,$s3,$i1
 	sub	$rounds,$rounds,#1
 	mov	lr,#255
 
-.Ldec_loop:
 	and	$i1,lr,$s0,lsr#16
 	and	$i2,lr,$s0,lsr#8
 	and	$i3,lr,$s0
 	mov	$s0,$s0,lsr#24
+.Ldec_loop:
 	ldr	$t1,[$tbl,$i1,lsl#2]	@ Td1[s0>>16]
-	ldr	$s0,[$tbl,$s0,lsl#2]	@ Td0[s0>>24]
-	ldr	$t2,[$tbl,$i2,lsl#2]	@ Td2[s0>>8]
-	ldr	$t3,[$tbl,$i3,lsl#2]	@ Td3[s0>>0]
-
 	and	$i1,lr,$s1		@ i0
+	ldr	$t2,[$tbl,$i2,lsl#2]	@ Td2[s0>>8]
 	and	$i2,lr,$s1,lsr#16
+	ldr	$t3,[$tbl,$i3,lsl#2]	@ Td3[s0>>0]
 	and	$i3,lr,$s1,lsr#8
+	ldr	$s0,[$tbl,$s0,lsl#2]	@ Td0[s0>>24]
 	mov	$s1,$s1,lsr#24
+
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td3[s1>>0]
-	ldr	$s1,[$tbl,$s1,lsl#2]	@ Td0[s1>>24]
 	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td1[s1>>16]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td2[s1>>8]
 	eor	$s0,$s0,$i1,ror#24
-	eor	$s1,$s1,$t1,ror#8
-	eor	$t2,$i2,$t2,ror#8
-	eor	$t3,$i3,$t3,ror#8
-
+	ldr	$s1,[$tbl,$s1,lsl#2]	@ Td0[s1>>24]
 	and	$i1,lr,$s2,lsr#8	@ i0
+	eor	$t2,$i2,$t2,ror#8
 	and	$i2,lr,$s2		@ i1
+	eor	$t3,$i3,$t3,ror#8
 	and	$i3,lr,$s2,lsr#16
-	mov	$s2,$s2,lsr#24
+	eor	$s1,$s1,$t1,ror#8
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td2[s2>>8]
+	mov	$s2,$s2,lsr#24
+
 	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
-	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td1[s2>>16]
 	eor	$s0,$s0,$i1,ror#16
-	eor	$s1,$s1,$i2,ror#24
-	eor	$s2,$s2,$t2,ror#8
-	eor	$t3,$i3,$t3,ror#8
-
+	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
 	and	$i1,lr,$s3,lsr#16	@ i0
+	eor	$s1,$s1,$i2,ror#24
 	and	$i2,lr,$s3,lsr#8	@ i1
+	eor	$t3,$i3,$t3,ror#8
 	and	$i3,lr,$s3		@ i2
-	mov	$s3,$s3,lsr#24
+	eor	$s2,$s2,$t2,ror#8
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td1[s3>>16]
+	mov	$s3,$s3,lsr#24
+
 	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td3[s3>>0]
-	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
 	eor	$s0,$s0,$i1,ror#8
+	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
 	eor	$s1,$s1,$i2,ror#16
 	eor	$s2,$s2,$i3,ror#24
+	ldr	$i1,[$key],#16
 	eor	$s3,$s3,$t3,ror#8
 
-	ldr	$t1,[$key],#16
-	ldr	$t2,[$key,#-12]
-	ldr	$t3,[$key,#-8]
-	ldr	$i1,[$key,#-4]
-	eor	$s0,$s0,$t1
-	eor	$s1,$s1,$t2
-	eor	$s2,$s2,$t3
-	eor	$s3,$s3,$i1
+	ldr	$t1,[$key,#-12]
+	ldr	$t2,[$key,#-8]
+	eor	$s0,$s0,$i1
+	ldr	$t3,[$key,#-4]
+	and	$i1,lr,$s0,lsr#16
+	eor	$s1,$s1,$t1
+	and	$i2,lr,$s0,lsr#8
+	eor	$s2,$s2,$t2
+	and	$i3,lr,$s0
+	eor	$s3,$s3,$t3
+	mov	$s0,$s0,lsr#24
 
 	subs	$rounds,$rounds,#1
 	bne	.Ldec_loop
 
 	add	$tbl,$tbl,#1024
 
-	ldr	$t1,[$tbl,#0]		@ prefetch Td4
-	ldr	$t2,[$tbl,#32]
-	ldr	$t3,[$tbl,#64]
-	ldr	$i1,[$tbl,#96]
-	ldr	$i2,[$tbl,#128]
-	ldr	$i3,[$tbl,#160]
-	ldr	$t1,[$tbl,#192]
-	ldr	$t2,[$tbl,#224]
+	ldr	$t2,[$tbl,#0]		@ prefetch Td4
+	ldr	$t3,[$tbl,#32]
+	ldr	$t1,[$tbl,#64]
+	ldr	$t2,[$tbl,#96]
+	ldr	$t3,[$tbl,#128]
+	ldr	$t1,[$tbl,#160]
+	ldr	$t2,[$tbl,#192]
+	ldr	$t3,[$tbl,#224]
 
-	and	$i1,lr,$s0,lsr#16
-	and	$i2,lr,$s0,lsr#8
-	and	$i3,lr,$s0
-	ldrb	$s0,[$tbl,$s0,lsr#24]	@ Td4[s0>>24]
+	ldrb	$s0,[$tbl,$s0]		@ Td4[s0>>24]
 	ldrb	$t1,[$tbl,$i1]		@ Td4[s0>>16]
-	ldrb	$t2,[$tbl,$i2]		@ Td4[s0>>8]
-	ldrb	$t3,[$tbl,$i3]		@ Td4[s0>>0]
-
 	and	$i1,lr,$s1		@ i0
+	ldrb	$t2,[$tbl,$i2]		@ Td4[s0>>8]
 	and	$i2,lr,$s1,lsr#16
+	ldrb	$t3,[$tbl,$i3]		@ Td4[s0>>0]
 	and	$i3,lr,$s1,lsr#8
+
 	ldrb	$i1,[$tbl,$i1]		@ Td4[s1>>0]
 	ldrb	$s1,[$tbl,$s1,lsr#24]	@ Td4[s1>>24]
 	ldrb	$i2,[$tbl,$i2]		@ Td4[s1>>16]
-	ldrb	$i3,[$tbl,$i3]		@ Td4[s1>>8]
 	eor	$s0,$i1,$s0,lsl#24
+	ldrb	$i3,[$tbl,$i3]		@ Td4[s1>>8]
 	eor	$s1,$t1,$s1,lsl#8
-	eor	$t2,$t2,$i2,lsl#8
-	eor	$t3,$t3,$i3,lsl#8
-
 	and	$i1,lr,$s2,lsr#8	@ i0
+	eor	$t2,$t2,$i2,lsl#8
 	and	$i2,lr,$s2		@ i1
-	and	$i3,lr,$s2,lsr#16
+	eor	$t3,$t3,$i3,lsl#8
 	ldrb	$i1,[$tbl,$i1]		@ Td4[s2>>8]
+	and	$i3,lr,$s2,lsr#16
+
 	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
 	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24]
-	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
 	eor	$s0,$s0,$i1,lsl#8
+	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
 	eor	$s1,$i2,$s1,lsl#16
-	eor	$s2,$t2,$s2,lsl#16
-	eor	$t3,$t3,$i3,lsl#16
-
 	and	$i1,lr,$s3,lsr#16	@ i0
+	eor	$s2,$t2,$s2,lsl#16
 	and	$i2,lr,$s3,lsr#8	@ i1
-	and	$i3,lr,$s3		@ i2
+	eor	$t3,$t3,$i3,lsl#16
 	ldrb	$i1,[$tbl,$i1]		@ Td4[s3>>16]
+	and	$i3,lr,$s3		@ i2
+
 	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
 	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
 	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24]
 	eor	$s0,$s0,$i1,lsl#16
+	ldr	$i1,[$key,#0]
 	eor	$s1,$s1,$i2,lsl#8
+	ldr	$t1,[$key,#4]
 	eor	$s2,$i3,$s2,lsl#8
+	ldr	$t2,[$key,#8]
 	eor	$s3,$t3,$s3,lsl#24
+	ldr	$t3,[$key,#12]
 
-	ldr	lr,[sp],#4		@ pop lr
-	ldr	$t1,[$key,#0]
-	ldr	$t2,[$key,#4]
-	ldr	$t3,[$key,#8]
-	ldr	$i1,[$key,#12]
-	eor	$s0,$s0,$t1
-	eor	$s1,$s1,$t2
-	eor	$s2,$s2,$t3
-	eor	$s3,$s3,$i1
+	eor	$s0,$s0,$i1
+	eor	$s1,$s1,$t1
+	eor	$s2,$s2,$t2
+	eor	$s3,$s3,$t3
 
 	sub	$tbl,$tbl,#1024
-	mov	pc,lr			@ return
+	ldr	pc,[sp],#4		@ pop and return
 .size	_armv4_AES_decrypt,.-_armv4_AES_decrypt
 .asciz	"AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 45d79b6000..2036f46f40 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -19,6 +19,12 @@
 # loop, this assembler loop body was found to be ~3x smaller than
 # compiler-generated one...
 #
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
 # Note about "528B" variant. In ARM case it makes lesser sense to
 # implement it for following reasons:
 #
@@ -123,12 +129,12 @@ gcm_ghash_4bit:
 
 	add	$Zhh,$Htbl,$nlo,lsl#4
 	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	add	$Thh,$Htbl,$nhi
 	ldrb	$nlo,[$inp,#14]
 
-	add	$Thh,$Htbl,$nhi
 	and	$nhi,$Zll,#0xf		@ rem
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	mov	$nhi,$nhi,lsl#1
+	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
@@ -139,15 +145,15 @@ gcm_ghash_4bit:
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
 	eor	$nlo,$nlo,$nhi
-	eor	$Zhh,$Zhh,$Tll,lsl#16
 	and	$nhi,$nlo,#0xf0
 	and	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16
 
 .Loop:
 	add	$Thh,$Htbl,$nlo,lsl#4
 	subs	$cnt,$cnt,#1
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	and	$nlo,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	add	$nlo,$nlo,$nlo
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
@@ -161,22 +167,22 @@ gcm_ghash_4bit:
 
 	add	$Thh,$Htbl,$nhi
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
-	ldrplb	$nhi,[$Xi,$cnt]
 	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrplb	$nhi,[$Xi,$cnt]
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eor	$Zhh,$Thh,$Zhh,lsr#4
 	eorpl	$nlo,$nlo,$nhi
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	eor	$Zhh,$Thh,$Zhh,lsr#4
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	bpl	.Loop
 
 	ldr	$len,[sp,#32]		@ re-load $len/end
@@ -212,7 +218,7 @@ gcm_gmult_4bit:
 	add	$Thh,$Htbl,$nhi
 	and	$nhi,$Zll,#0xf		@ rem
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	mov	$nhi,$nhi,lsl#1
+	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
 	eor	$Zll,$Zll,$Zlh,lsl#28
@@ -228,8 +234,8 @@ gcm_gmult_4bit:
 .Loop2:
 	add	$Thh,$Htbl,$nlo,lsl#4
 	subs	$cnt,$cnt,#1
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	and	$nlo,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
 	add	$nlo,$nlo,$nlo
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
@@ -243,8 +249,8 @@ gcm_gmult_4bit:
 
 	add	$Thh,$Htbl,$nhi
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	add	$nhi,$nhi,$nhi
 	eor	$Zll,$Tll,$Zll,lsr#4
 	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
@@ -255,8 +261,8 @@ gcm_gmult_4bit:
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
 	andpl	$nhi,$nlo,#0xf0
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
 	bpl	.Loop2
 ___
 	&Zsmash();
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index db87434f91..492cb62bc0 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -11,7 +11,12 @@
 
 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte.
+# byte [on single-issue XScale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
 ___
 $code.=<<___;
 	ldr	$t2,[$Ktbl],#4			@ *K256++
-	str	$T1,[sp,#`$i%16`*4]
 	mov	$t0,$e,ror#$Sigma1[0]
+	str	$T1,[sp,#`$i%16`*4]
 	eor	$t0,$t0,$e,ror#$Sigma1[1]
-	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
-	add	$T1,$T1,$t0
 	eor	$t1,$f,$g
+	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
 	and	$t1,$t1,$e
+	add	$T1,$T1,$t0
 	eor	$t1,$t1,$g			@ Ch(e,f,g)
-	add	$T1,$T1,$t1
 	add	$T1,$T1,$h
-	add	$T1,$T1,$t2
 	mov	$h,$a,ror#$Sigma0[0]
+	add	$T1,$T1,$t1
 	eor	$h,$h,$a,ror#$Sigma0[1]
+	add	$T1,$T1,$t2
 	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
 	orr	$t0,$a,$b
-	and	$t0,$t0,$c
 	and	$t1,$a,$b
+	and	$t0,$t0,$c
+	add	$h,$h,$T1
 	orr	$t0,$t0,$t1			@ Maj(a,b,c)
-	add	$h,$h,$t0
 	add	$d,$d,$T1
-	add	$h,$h,$T1
+	add	$h,$h,$t0
 ___
 }
 
@@ -80,19 +85,19 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	ldr	$t1,[sp,#`($i+1)%16`*4]	@ $i
+	ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
 	ldr	$t2,[sp,#`($i+14)%16`*4]
 	ldr	$T1,[sp,#`($i+0)%16`*4]
-	ldr	$inp,[sp,#`($i+9)%16`*4]
 	mov	$t0,$t1,ror#$sigma0[0]
+	ldr	$inp,[sp,#`($i+9)%16`*4]
 	eor	$t0,$t0,$t1,ror#$sigma0[1]
 	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
 	mov	$t1,$t2,ror#$sigma1[0]
+	add	$T1,$T1,$t0
 	eor	$t1,$t1,$t2,ror#$sigma1[1]
+	add	$T1,$T1,$inp
 	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
-	add	$T1,$T1,$t0
 	add	$T1,$T1,$t1
-	add	$T1,$T1,$inp
 ___
 	&BODY_00_15(@_);
 }
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 7d27f0b78d..3a35861ac6 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -10,7 +10,13 @@
 # SHA512 block procedure for ARMv4. September 2007.
 
 # This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte. 
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# XScale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
 
 # Byte order [in]dependence. =========================================
 #
@@ -73,33 +79,31 @@ $code.=<<___;
 	eor	$t0,$t0,$Elo,lsl#23
 	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
 	adds	$Tlo,$Tlo,$t0
-	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
-	adds	$Tlo,$Tlo,$t2
-	adc	$Thi,$Thi,$t3		@ T += h
-
 	ldr	$t0,[sp,#$Foff+0]	@ f.lo
+	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
 	ldr	$t1,[sp,#$Foff+4]	@ f.hi
+	adds	$Tlo,$Tlo,$t2
 	ldr	$t2,[sp,#$Goff+0]	@ g.lo
+	adc	$Thi,$Thi,$t3		@ T += h
 	ldr	$t3,[sp,#$Goff+4]	@ g.hi
-	str	$Elo,[sp,#$Eoff+0]
-	str	$Ehi,[sp,#$Eoff+4]
-	str	$Alo,[sp,#$Aoff+0]
-	str	$Ahi,[sp,#$Aoff+4]
 
 	eor	$t0,$t0,$t2
+	str	$Elo,[sp,#$Eoff+0]
 	eor	$t1,$t1,$t3
+	str	$Ehi,[sp,#$Eoff+4]
 	and	$t0,$t0,$Elo
+	str	$Alo,[sp,#$Aoff+0]
 	and	$t1,$t1,$Ehi
+	str	$Ahi,[sp,#$Aoff+4]
 	eor	$t0,$t0,$t2
-	eor	$t1,$t1,$t3		@ Ch(e,f,g)
-
 	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
+	eor	$t1,$t1,$t3		@ Ch(e,f,g)
 	ldr	$t3,[$Ktbl,#0]		@ K[i].hi
-	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
-	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 
 	adds	$Tlo,$Tlo,$t0
+	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
 	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
+	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 	adds	$Tlo,$Tlo,$t2
 	adc	$Thi,$Thi,$t3		@ T += K[i]
 	adds	$Elo,$Elo,$Tlo
author	Andy Polyakov <appro@openssl.org>	2010-07-13 14:03:31 +0000
committer	Andy Polyakov <appro@openssl.org>	2010-07-13 14:03:31 +0000
commit	2d22e08083d031eb63cada59aa95afe5279008d3 (patch)
tree	5971d660922f8ef8c58bc18ab4fc6f1a22346ed5
parent	0852f90c300405c79c2af5c549e74d0d4a8f664c (diff)