Bignum division tune-up. Idea is to move multiplications in front of

loop body and replace 'em with addition/subtraction.
author: Andy Polyakov <appro@openssl.org> 1999-07-30 11:43:43 +0000
committer: Andy Polyakov <appro@openssl.org> 1999-07-30 11:43:43 +0000
commit: 0dd25e3606f767381de8c8722b76f813c8e3b013 (patch)
tree: 7d1d8e5a80e41a637c3321907a9d890dcc006e2c /crypto/bn/asm/mips3.s
parent: a40f6dce871e8e020ca16713cc19798fcc141ed1 (diff)
1 files changed, 132 insertions, 56 deletions
diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s
index 2193c954b2..191345d920 100644
--- a/crypto/bn/asm/mips3.s
+++ b/crypto/bn/asm/mips3.s
@@ -1,5 +1,5 @@
 .rdata
-.asciiz "mips3.s, Version 1.0 (prerelease)"
+.asciiz "mips3.s, Version 1.0"
 .asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
 
 /*
@@ -19,19 +19,26 @@
  * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
  * module. For updates see http://fy.chalmers.se/~appro/hpe/.
  *
- * The module is designed to work with "new" IRIX ABI(5), namely
- * N32 and N64. But it was tested only with MIPSpro 7.2.x assembler,
- * i.e. depends on preprocessor options set up by MIPSspro 7.2.x
- * driver. Another neat gadget offered by MIPSpro 7.2.x assembler is
- * an peep-hole(?) optimization pass. This gave me the opportunity
- * to make the code looking more regular as all those architecture
- * dependent(!) instruction rescheduling details were left to the
- * assembler. Cool, huh? Do note that I have no idea if GNU assembler
- * does anything similar nor how GNU C will do with this module.
- * Feedback on the matter is therefore very much appreciated:-)
+ * The module is designed to work with either of the "new" MIPS ABI(5),
+ * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
+ * IRIX 5.x not only because it doesn't support new ABIs but also
+ * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
+ * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
+ * cause illegal instruction exception:-(
+ *
+ * In addition the code depends on preprocessor flags set up by MIPSpro
+ * compiler driver (either as or cc) and therefore (probably?) can't be
+ * compiled by the GNU assembler. GNU C driver manages fine though...
+ * I mean as long as -mmips-as is specified or is the default option,
+ * because then it simply invokes /usr/bin/as which in turn takes
+ * perfect care of the preprocessor definitions. Another neat feature
+ * offered by the MIPSpro assembler is an optimization pass. This gave
+ * me the opportunity to have the code looking more regular as all those
+ * architecture dependent instruction rescheduling details were left to
+ * the assembler. Cool, huh?
  *
  * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
- * exhibits 3-3.5-3.7 times improvement!
+ * goes way over 3 times faster!
  *
  *					<appro@fy.chalmers.se>
  */
@@ -56,8 +63,8 @@
 
 #define	MINUS4	v1
 
+.align	5
 LEAF(bn_mul_add_words)
-	.align	5
 	.set	noreorder
 	bgtzl	a2,.L_bn_mul_add_words_proceed
 	ld	t0,0(a1)
@@ -185,8 +192,8 @@ LEAF(bn_mul_add_words)
 	jr	ra
 END(bn_mul_add_words)
 
+.align	5
 LEAF(bn_mul_words)
-	.align	5
 	.set	noreorder
 	bgtzl	a2,.L_bn_mul_words_proceed
 	ld	t0,0(a1)
@@ -284,8 +291,8 @@ LEAF(bn_mul_words)
 	jr	ra
 END(bn_mul_words)
 
+.align	5
 LEAF(bn_sqr_words)
-	.align	5
 	.set	noreorder
 	bgtzl	a2,.L_bn_sqr_words_proceed
 	ld	t0,0(a1)
@@ -371,8 +378,8 @@ LEAF(bn_sqr_words)
 	jr	ra
 END(bn_sqr_words)
 
+.align	5
 LEAF(bn_add_words)
-	.align	5
 	.set	noreorder
 	bgtzl	a3,.L_bn_add_words_proceed
 	ld	t0,0(a1)
@@ -471,8 +478,8 @@ LEAF(bn_add_words)
 	jr	ra
 END(bn_add_words)
 
+.align	5
 LEAF(bn_sub_words)
-	.align	5
 	.set	noreorder
 	bgtzl	a3,.L_bn_sub_words_proceed
 	ld	t0,0(a1)
@@ -567,24 +574,24 @@ END(bn_sub_words)
 
 #undef	MINUS4
 
+.align	5
 LEAF(bn_div_words)
-	.align	5
 	.set	noreorder
 	bnezl	a2,.L_bn_div_words_proceed
-	move	t0,zero
+	move	v1,zero
 	jr	ra
 	li	v0,-1		/* I'd rather signal div-by-zero
 				 * which can be done with 'break 7' */
-	.set	reorder
 
 .L_bn_div_words_proceed:
 	bltz	a2,.L_bn_div_words_body
-	.set	noreorder
+	move	t9,v1
 	dsll	a2,1
 	bgtz	a2,.-4
-	addu	t0,1
+	addu	t9,1
+
 	.set	reorder
-	negu	t1,t0
+	negu	t1,t9
 	li	t2,-1
 	dsll	t2,t1
 	and	t2,a0
@@ -593,65 +600,135 @@ LEAF(bn_div_words)
 	bnezl	t2,.+8
 	break	6		/* signal overflow */
 	.set	reorder
-	dsll	a0,t0
-	dsll	a1,t0
+	dsll	a0,t9
+	dsll	a1,t9
 	or	a0,AT
 
 #define	QT	ta0
-#define	DH	ta1
-#define	HH	ta2
-#define	MINUS1	ta3
+#define	HH	ta1
+#define	DH	v1
 .L_bn_div_words_body:
 	dsrl	DH,a2,32
-	li	v1,2
 	sgeu	AT,a0,a2
-	li	MINUS1,-1
 	.set	noreorder
 	bnezl	AT,.+8
 	dsubu	a0,a2
 	.set	reorder
 
-.L_bn_div_words_outer_loop:
+	li	QT,-1
 	dsrl	HH,a0,32
-	subu	v1,1
-	dsrl	QT,MINUS1,32	/* q=0xffffffff */
-	beq	DH,HH,.L_bn_div_words_inner_loop
+	dsrl	QT,32	/* q=0xffffffff */
+	beq	DH,HH,.L_bn_div_words_skip_div1
 	ddivu	zero,a0,DH
 	mflo	QT
-.L_bn_div_words_inner_loop:
+.L_bn_div_words_skip_div1:
 	dmultu	a2,QT
 	dsll	t3,a0,32
 	dsrl	AT,a1,32
 	or	t3,AT
 	mflo	t0
 	mfhi	t1
+.L_bn_div_words_inner_loop1:
 	sltu	t2,t3,t0
 	seq	t8,HH,t1
 	sltu	AT,HH,t1
 	and	t2,t8
 	or	AT,t2
 	.set	noreorder
-	bnezl	AT,.L_bn_div_words_inner_loop
-	dsubu	QT,1
+	beqz	AT,.L_bn_div_words_inner_loop1_done
+	sltu	t2,t0,a2
 	.set	reorder
-	
-	dsubu	a0,t3,t0
-	beqz	v1,.L_bn_div_words_outer_loop_done
+	dsubu	QT,1
+	dsubu	t0,a2
+	dsubu	t1,t2
+	b	.L_bn_div_words_inner_loop1
+.L_bn_div_words_inner_loop1_done:	
 
 	dsll	a1,32
+	dsubu	a0,t3,t0
 	dsll	v0,QT,32
-	b	.L_bn_div_words_outer_loop
 
-.L_bn_div_words_outer_loop_done:
+	li	QT,-1
+	dsrl	HH,a0,32
+	dsrl	QT,32	/* q=0xffffffff */
+	beq	DH,HH,.L_bn_div_words_skip_div2
+	ddivu	zero,a0,DH
+	mflo	QT
+.L_bn_div_words_skip_div2:
+	dmultu	a2,QT
+	dsll	t3,a0,32
+	dsrl	AT,a1,32
+	or	t3,AT
+	mflo	t0
+	mfhi	t1
+.L_bn_div_words_inner_loop2:
+	sltu	t2,t3,t0
+	seq	t8,HH,t1
+	sltu	AT,HH,t1
+	and	t2,t8
+	or	AT,t2
+	.set	noreorder
+	beqz	AT,.L_bn_div_words_inner_loop2_done
+	sltu	t2,t0,a2
+	.set	reorder
+	dsubu	QT,1
+	dsubu	t0,a2
+	dsubu	t1,t2
+	b	.L_bn_div_words_inner_loop2
+.L_bn_div_words_inner_loop2_done:	
+
+	dsubu	a0,t3,t0
 	or	v0,QT
-	move	v1,a0	/* v1 contains remainder if one wants it */
+	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
+	dsrl	a2,t9		/* restore a2 */
 	jr	ra
-#undef	MINUS1
 #undef	HH
 #undef	DH
 #undef	QT
 END(bn_div_words)
 
+.align 5
+LEAF(bn_div_3_words)
+	.set	reorder
+	move	a3,a0		/* we know that bn_div_words doesn't
+				 * touch a3, ta2, ta3 and preserves a2
+				 * so that we can save two arguments
+				 * and return address in registers
+				 * instead of stack:-)
+				 */
+	ld	a0,(a3)
+	move	ta2,a2
+	move	a2,a1
+	ld	a1,-8(a3)
+	move	ta3,ra
+	move	v1,zero
+	li	v0,-1
+	beq	a0,a2,.L_bn_div_3_words_skip_div
+	jal	bn_div_words
+	move	ra,ta3
+.L_bn_div_3_words_skip_div:
+	dmultu	ta2,v0
+	ld	t2,-16(a3)
+	mflo	t0
+	mfhi	t1
+.L_bn_div_3_words_inner_loop:
+	sgeu	AT,t2,t0
+	seq	t9,t1,v1
+	sltu	t8,t1,v1
+	and	AT,t9
+	or	AT,t8
+	bnez	AT,.L_bn_div_3_words_inner_loop_done
+	daddu	v1,a2
+	sltu	t3,t0,ta2
+	sltu	AT,v1,a2
+	dsubu	v0,1
+	dsubu	t0,ta2
+	dsubu	t1,t3
+	beqz	AT,.L_bn_div_3_words_inner_loop
+.L_bn_div_3_words_inner_loop_done:
+	jr	ra
+END(bn_div_3_words)
+
 #define	a_0	t0
 #define	a_1	t1
 #define	a_2	t2
@@ -679,20 +756,19 @@ END(bn_div_words)
 
 #define	FRAME_SIZE	48
 
+.align	5
 LEAF(bn_mul_comba8)
-	.align	5
 	.set	noreorder
 	PTR_SUB	sp,FRAME_SIZE
 	.frame	sp,64,ra
 	.set	reorder
-	ld	a_0,0(a1)	/* If compiled with -mips3 options
-				 * assembler barks on this line with
-				 * "shouldn't have mult/div as last
-				 * instruction in bb (R10K bug)"
-				 * warning. If anybody out there has
-				 * a clue on what does "bb" mean and
-				 * how to circumvent this do send me
-				 * a note.
+	ld	a_0,0(a1)	/* If compiled with -mips3 option on
+				 * R5000 box assembler barks on this
+				 * line with "shouldn't have mult/div
+				 * as last instruction in bb (R10K
+				 * bug)" warning. If anybody out there
+				 * has a clue about how to circumvent
+				 * this do send me a note.
 				 *		<appro@fy.chalmers.se>
 				 */
 	ld	b_0,0(a2)
@@ -1286,8 +1362,8 @@ LEAF(bn_mul_comba8)
 	jr	ra
 END(bn_mul_comba8)
 
+.align	5
 LEAF(bn_mul_comba4)
-	.align	5
 	.set	reorder
 	ld	a_0,0(a1)
 	ld	b_0,0(a2)
@@ -1444,8 +1520,8 @@ END(bn_mul_comba4)
 #define	a_6	b_2
 #define	a_7	b_3
 
+.align	5
 LEAF(bn_sqr_comba8)
-	.align	5
 	.set	reorder
 	ld	a_0,0(a1)
 	ld	a_1,8(a1)
@@ -1934,8 +2010,8 @@ LEAF(bn_sqr_comba8)
 	jr	ra
 END(bn_sqr_comba8)
 
+.align	5
 LEAF(bn_sqr_comba4)
-	.align	5
 	.set	reorder
 	ld	a_0,0(a1)
 	ld	a_1,8(a1)
author	Andy Polyakov <appro@openssl.org>	1999-07-30 11:43:43 +0000
committer	Andy Polyakov <appro@openssl.org>	1999-07-30 11:43:43 +0000
commit	0dd25e3606f767381de8c8722b76f813c8e3b013 (patch)
tree	7d1d8e5a80e41a637c3321907a9d890dcc006e2c /crypto/bn/asm/mips3.s
parent	a40f6dce871e8e020ca16713cc19798fcc141ed1 (diff)