author	Andy Polyakov <appro@openssl.org>	2012-11-28 13:19:10 +0000
committer	Andy Polyakov <appro@openssl.org>	2012-11-28 13:19:10 +0000
commit	904732f68bcc6ebd3f8961a9272bc811dc26bcbd (patch)
tree	7ab988b23467d2545e4aa2e50bd176bf6923fc2a /crypto/bn/asm/bn-c64xplus.asm
parent	cf5ecc3e1fd112dd8a544bfb26bfb96c96b604c7 (diff)
C64x+ assembly pack: improve EABI support.
Diffstat (limited to 'crypto/bn/asm/bn-c64xplus.asm')
-rw-r--r--	crypto/bn/asm/bn-c64xplus.asm	51
1 file changed, 44 insertions(+), 7 deletions(-)
diff --git a/crypto/bn/asm/bn-c64xplus.asm b/crypto/bn/asm/bn-c64xplus.asm
index 161547c3b0..f07b09e439 100644
--- a/crypto/bn/asm/bn-c64xplus.asm
+++ b/crypto/bn/asm/bn-c64xplus.asm
@@ -12,6 +12,18 @@
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;====================================================================
.text
+ .if __TI_EABI__
+ .asg bn_mul_add_words,_bn_mul_add_words
+ .asg bn_mul_words,_bn_mul_words
+ .asg bn_sqr_words,_bn_sqr_words
+ .asg bn_add_words,_bn_add_words
+ .asg bn_sub_words,_bn_sub_words
+ .asg bn_div_words,_bn_div_words
+ .asg bn_sqr_comba8,_bn_sqr_comba8
+ .asg bn_mul_comba8,_bn_mul_comba8
+ .asg bn_sqr_comba4,_bn_sqr_comba4
+ .asg bn_mul_comba4,_bn_mul_comba4
+ .endif
.asg B3,RA
.asg A4,ARG0
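
Under TI's older COFF ABI, C symbols are assembled with a leading underscore, which is why the labels in this file are spelled _bn_mul_add_words and so on; under EABI the underscore is gone. The .asg block above uses the assembler's substitution symbols to map each underscored spelling to the bare EABI name, so every occurrence of _bn_div_words in the source assembles as bn_div_words and the one source serves both ABIs. For orientation, here is a sketch of the C-level prototypes these entry points correspond to, as declared in OpenSSL's internal BN headers; treat the exact BN_ULONG typedef (a 32-bit unsigned word on C64x+) as an assumption:

    /* Sketch: C signatures behind the EABI symbol aliases above.
     * BN_ULONG is assumed to be a 32-bit unsigned type on C64x+. */
    BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
    BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
    void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
    BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);  /* (h:l)/d */
    BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int num);
    BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int num);
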
@@ -158,14 +170,39 @@ _bn_sub_words:
.endasmfunc
.global _bn_div_words
- .global __divull
_bn_div_words:
.asmfunc
- CALLP __divull,A3 ; jump to rts64plus.lib
-|| MV ARG0,A5
-|| MV ARG1,ARG0
-|| MV ARG2,ARG1
-|| ZERO B5
+ LMBD 1,A6,A0 ; leading zero bits in dv
+ LMBD 1,A4,A1 ; leading zero bits in hi
+|| MVK 32,B0
+ CMPLTU A1,A0,A2
+|| ADD A0,B0,B0
+ [ A2] BNOP RA
+||[ A2] MVK -1,A4 ; return overflow
+||[!A2] MV A4,A3 ; reassign hi
+ [!A2] MV B4,A4 ; reassign lo, will be quotient
+||[!A2] MVC B0,ILC
+ [!A2] SHL A6,A0,A6 ; normalize dv
+|| MVK 1,A1
+
+ [!A2] CMPLTU A3,A6,A1 ; hi<dv?
+||[!A2] SHL A4,1,A5:A4 ; lo<<1
+ [!A1] SUB A3,A6,A3 ; hi-=dv
+||[!A1] OR 1,A4,A4
+ [!A2] SHRU A3,31,A1 ; upper bit
+||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
+
+ SPLOOP 3
+ [!A1] CMPLTU A3,A6,A1 ; hi<dv?
+||[ A1] ZERO A1
+|| SHL A4,1,A5:A4 ; lo<<1
+ [!A1] SUB A3,A6,A3 ; hi-=dv
+||[!A1] OR 1,A4,A4 ; quotient
+ SHRU A3,31,A1 ; upper bit
+|| ADDAH A5,A3,A3 ; hi<<1|lo>>31
+ SPKERNEL
+
+ BNOP RA,5
.endasmfunc
;;====================================================================
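
The replacement above drops the call into rts64plus.lib's __divull and performs the 64/32-bit division inline: LMBD (leftmost-bit detect) counts leading zeros to normalize the divisor and to reject quotients that cannot fit in 32 bits (returning -1), and a SPLOOP'd shift-and-subtract loop then develops one quotient bit per step, shifting each bit into the low word. The following minimal C model illustrates the algorithm only; it is not the file's code. It uses a 64-bit remainder where the assembly instead tracks the carried-out bit in A1 and pre-normalizes dv so its compares stay within 32 bits:

    #include <stdint.h>

    /* Model of bn_div_words(h, l, d): 32-bit quotient of the 64-bit
     * value (h:l) divided by d.  The caller normally guarantees h < d;
     * like the assembly (which tests clz(h) < clz(d) via LMBD), an
     * overflowing quotient is flagged with all-ones. */
    static uint32_t div_words_model(uint32_t h, uint32_t l, uint32_t d)
    {
        if (h >= d)
            return (uint32_t)-1;        /* quotient would need >32 bits */

        uint64_t rem = h;               /* remainder, invariant rem < d */
        uint32_t q   = 0;
        for (int i = 31; i >= 0; i--) {
            rem = (rem << 1) | ((l >> i) & 1);  /* next dividend bit    */
            q <<= 1;
            if (rem >= d) {             /* trial subtract, one per step */
                rem -= d;
                q |= 1;                 /* quotient bit, as OR 1,A4,A4  */
            }
        }
        return q;
    }
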
@@ -256,7 +293,7 @@ _bn_mul_comba4:
|| LDW *A5++,B6 ; ap[0]
|| MV A0,A3 ; const A3=M
.else
- ;; This alternative is exercise in fully unrolled Comba
+ ;; This alternative is an exercise in fully unrolled Comba
;; algorithm implementation that operates at n*(n+1)+12, or
;; as little as 32 cycles...
LDW *ARG1[0],B16 ; a[0]
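
The comment refers to Comba's column-oriented scheduling: rather than row-by-row multiply-accumulate passes, all partial products contributing to one output word are summed before moving to the next column, which is what makes complete unrolling (and the n*(n+1)+12 cycle count) possible. A hypothetical C sketch of the 4x4 case follows, purely to show the column order; all names are my own, not the file's:

    #include <stdint.h>

    /* Illustrative 4x4 Comba multiply: r[0..7] = a[0..3] * b[0..3],
     * 32-bit words.  Each output word is one "column" of partial
     * products; the assembly unrolls these columns completely. */
    static void comba4_sketch(uint32_t r[8], const uint32_t a[4], const uint32_t b[4])
    {
        uint64_t acc = 0;       /* running column sum (low 64 bits)      */
        uint32_t ovf = 0;       /* carries out of the 64-bit accumulator */

        for (int k = 0; k < 7; k++) {          /* output column k        */
            int lo = k < 4 ? 0 : k - 3;
            int hi = k < 4 ? k : 3;
            for (int i = lo; i <= hi; i++) {   /* all a[i]*b[k-i] terms  */
                uint64_t p = (uint64_t)a[i] * b[k - i];
                acc += p;
                if (acc < p)                   /* 64-bit add wrapped     */
                    ovf++;
            }
            r[k] = (uint32_t)acc;              /* emit column's low word */
            acc  = (acc >> 32) | ((uint64_t)ovf << 32);
            ovf  = 0;
        }
        r[7] = (uint32_t)acc;                  /* final carry word       */
    }
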