Diffstat (limited to 'crypto/bn/asm/ppc.pl')
-rw-r--r--	crypto/bn/asm/ppc.pl	2081
 1 file changed, 2081 insertions, 0 deletions
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
new file mode 100644
index 0000000000..307c7ccb35
--- /dev/null
+++ b/crypto/bn/asm/ppc.pl
@@ -0,0 +1,2081 @@
+#!/usr/bin/env perl
+#
+# Implemented as a Perl wrapper as we want to support several different
+# architectures with a single file. We pick the target based on the
+# name of the file we are asked to generate.
+#
+# It should be noted though that this perl code is nothing like
+# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
+# as a pre-processor to cover for platform differences in name decoration,
+# linker tables, 32-/64-bit instruction sets...
+#
+# As you might know, there are several PowerPC ABIs in use. Most notably,
+# Linux and AIX use different 32-bit ABIs. The good news is that these
+# ABIs are similar enough that one can implement leaf(!) functions which
+# are ABI neutral. And that's what you find here: ABI-neutral leaf functions.
+# In case you wonder what that is...
+#
+# AIX performance
+#
+# MEASUREMENTS WITH cc ON A 200 MHz PowerPC 604e.
+#
+# The following is the performance of 32-bit compiler-generated code:
+#
+# OpenSSL 0.9.6c 21 dec 2001
+# built on: Tue Jun 11 11:06:51 EDT 2002
+# options:bn(64,32) ...
+#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
+# sign verify sign/s verify/s
+#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
+#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
+#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
+#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
+#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
+#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
+#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
+#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
+#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
+#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
+#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
+#
+# Number of operations per second increases by almost 75%
+#
+# Here are the performance numbers for 64-bit compiler-generated code:
+#
+# OpenSSL 0.9.6g [engine] 9 Aug 2002
+# built on: Fri Apr 18 16:59:20 EDT 2003
+# options:bn(64,64) ...
+# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
+# sign verify sign/s verify/s
+#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
+#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
+#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
+#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
+#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
+#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
+#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
+#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
+#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
+#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
+#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
+#
+# Again, performance increases by about 75%
+#
+# Mac OS X, Apple G5 1.8 GHz (note: this is 32-bit code)
+# OpenSSL 0.9.7c 30 Sep 2003
+#
+# Original code.
+#
+#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
+#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
+#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
+#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
+#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
+#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
+#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
+#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
+#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
+#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
+#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
+#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
+#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
+#
+# Performance increase of ~60%
+#
+# If you have comments or suggestions to improve the code, send
+# me a note at schari@us.ibm.com
+#
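+# The word size and the OS flavour are both inferred from the name of the
+# output file passed as the first argument. For example (hypothetical
+# names; anything matching the patterns tested below will do):
+#
+#	perl ppc.pl linux_ppc_32.s	# 32-bit mnemonics, Linux symbol decoration
+#	perl ppc.pl aix_ppc_64.s	# 64-bit mnemonics, AIX handling
+#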
+
+$opf = shift;
+
+if ($opf =~ /32\.s/) {
+ $BITS= 32;
+ $BNSZ= $BITS/8;
+ $ISA= "\"ppc\"";
+
+ $LD= "lwz"; # load
+ $LDU= "lwzu"; # load and update
+ $ST= "stw"; # store
+ $STU= "stwu"; # store and update
+ $UMULL= "mullw"; # unsigned multiply low
+ $UMULH= "mulhwu"; # unsigned multiply high
+ $UDIV= "divwu"; # unsigned divide
+ $UCMPI= "cmplwi"; # unsigned compare with immediate
+ $UCMP= "cmplw"; # unsigned compare
+ $COUNTZ="cntlzw"; # count leading zeros
+ $SHL= "slw"; # shift left
+ $SHR= "srw"; # unsigned shift right
+ $SHRI= "srwi"; # unsigned shift right by immediate
+ $SHLI= "slwi"; # shift left by immediate
+ $CLRU= "clrlwi"; # clear upper bits
+ $INSR= "insrwi"; # insert right
+ $ROTL= "rotlwi"; # rotate left by immediate
+} elsif ($opf =~ /64\.s/) {
+ $BITS= 64;
+ $BNSZ= $BITS/8;
+ $ISA= "\"ppc64\"";
+
+ # same as above, but 64-bit mnemonics...
+ $LD= "ld"; # load
+ $LDU= "ldu"; # load and update
+ $ST= "std"; # store
+ $STU= "stdu"; # store and update
+ $UMULL= "mulld"; # unsigned multiply low
+ $UMULH= "mulhdu"; # unsigned multiply high
+ $UDIV= "divdu"; # unsigned divide
+ $UCMPI= "cmpldi"; # unsigned compare with immediate
+ $UCMP= "cmpld"; # unsigned compare
+ $COUNTZ="cntlzd"; # count leading zeros
+ $SHL= "sld"; # shift left
+ $SHR= "srd"; # unsigned shift right
+ $SHRI= "srdi"; # unsigned shift right by immediate
+ $SHLI= "sldi"; # shift left by immediate
+ $CLRU= "clrldi"; # clear upper bits
+ $INSR= "insrdi"; # insert right
+ $ROTL= "rotldi"; # rotate left by immediate
+} else { die "nonsense $opf"; }
+
+( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
+
+# function entry points from the AIX code
+#
+# There are other, more elegant, ways to handle this. We (IBM) chose
+# this approach as it plays well with scripts we run to 'namespace'
+# OpenSSL, i.e. we add a prefix to all the public symbols so we can
+# co-exist in the same process with other implementations of OpenSSL.
+# 'Cleverer' ways of doing these substitutions tend to hide data that
+# we need to keep obvious.
+#
+my @items = ("bn_sqr_comba4",
+ "bn_sqr_comba8",
+ "bn_mul_comba4",
+ "bn_mul_comba8",
+ "bn_sub_words",
+ "bn_add_words",
+ "bn_div_words",
+ "bn_sqr_words",
+ "bn_mul_words",
+ "bn_mul_add_words");
+
+if ($opf =~ /linux/) { do_linux(); }
+elsif ($opf =~ /aix/) { do_aix(); }
+elsif ($opf =~ /osx/) { do_osx(); }
+else { do_bsd(); }
+
+sub do_linux {
+ $d=&data();
+
+ if ($BITS==64) {
+ foreach $t (@items) {
+ $d =~ s/\.$t:/\
+\t.section\t".opd","aw"\
+\t.align\t3\
+\t.globl\t$t\
+$t:\
+\t.quad\t.$t,.TOC.\@tocbase,0\
+\t.size\t$t,24\
+\t.previous\n\
+\t.type\t.$t,\@function\
+\t.globl\t.$t\
+.$t:/g;
+ }
+ }
+ else {
+ foreach $t (@items) {
+ $d=~s/\.$t/$t/g;
+ }
+ }
+ # hide internal labels to avoid pollution of name table...
+ $d=~s/Lppcasm_/.Lppcasm_/gm;
+ print $d;
+}
+
+sub do_aix {
+ # AIX assembler is smart enough to please the linker without
+ # making us do something special...
+ print &data();
+}
+
+# MacOSX 32 bit
+sub do_osx {
+ $d=&data();
+ # Change the bn symbol prefix from '.' to '_'
+ foreach $t (@items) {
+ $d=~s/\.$t/_$t/g;
+ }
+ # Change .machine to something OS X asm will accept
+ $d=~s/\.machine.*/.text/g;
+ $d=~s/\#/;/g; # change comment from '#' to ';'
+ print $d;
+}
+
+# BSD (Untested)
+sub do_bsd {
+ $d=&data();
+ foreach $t (@items) {
+ $d=~s/\.$t/_$t/g;
+ }
+ print $d;
+}
+
+sub data {
+ local($data)=<<EOF;
+#--------------------------------------------------------------------
+#
+#
+#
+#
+# File: ppc32.s
+#
+# Created by: Suresh Chari
+# IBM Thomas J. Watson Research Library
+# Hawthorne, NY
+#
+#
+#	Description: Optimized assembly routines for OpenSSL crypto
+#		on the 32-bit PowerPC platform.
+#
+#
+# Version History
+#
+#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
+# cleaned up code. Also made a single version which can
+# be used for both the AIX and Linux compilers. See NOTE
+# below.
+# 12/05/03 Suresh Chari
+# (with lots of help from) Andy Polyakov
+##
+# 1. Initial version 10/20/02 Suresh Chari
+#
+#
+#	The following file works with the xlc, cc
+#	and gcc compilers.
+#
+# NOTE: To get the file to link correctly with the gcc compiler
+# you have to change the names of the routines and remove
+# the first .(dot) character. This should automatically
+# be done in the build process.
+#
+# Hand optimized assembly code for the following routines
+#
+# bn_sqr_comba4
+# bn_sqr_comba8
+# bn_mul_comba4
+# bn_mul_comba8
+# bn_sub_words
+# bn_add_words
+# bn_div_words
+# bn_sqr_words
+# bn_mul_words
+# bn_mul_add_words
+#
+# NOTE: It is possible to optimize this code more for
+# specific PowerPC or Power architectures. On the Northstar
+# architecture the optimizations in this file do
+# NOT provide much improvement.
+#
+#	If you have comments or suggestions to improve the code, send
+#	me a note at schari\@us.ibm.com
+#
+#--------------------------------------------------------------------------
+#
+# Defines to be used in the assembly code.
+#
+.set r0,0 # we use it as storage for value of 0
+.set SP,1 # preserved
+.set RTOC,2 # preserved
+.set r3,3 # 1st argument/return value
+.set r4,4 # 2nd argument/volatile register
+.set r5,5 # 3rd argument/volatile register
+.set r6,6 # ...
+.set r7,7
+.set r8,8
+.set r9,9
+.set r10,10
+.set r11,11
+.set r12,12
+.set r13,13 # not used, nor any other "below" it...
+
+.set BO_IF_NOT,4
+.set BO_IF,12
+.set BO_dCTR_NZERO,16
+.set BO_dCTR_ZERO,18
+.set BO_ALWAYS,20
+.set CR0_LT,0;
+.set CR0_GT,1;
+.set CR0_EQ,2
+.set CR1_FX,4;
+.set CR1_FEX,5;
+.set CR1_VX,6
+.set LR,8
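+
+# Note on the return sequence used throughout: "bclr BO_ALWAYS,CR0_LT"
+# encodes BO=20 ("branch always"), so the condition-register bit operand
+# is ignored and the instruction simply branches to the link register,
+# i.e. it acts as a plain blr/function return.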
+
+# Declare function names to be global
+# NOTE:	For gcc these names MUST be changed to remove
+#	the leading dot, e.g. change ".bn_sqr_comba4"
+#	to "bn_sqr_comba4". This should be automatically done
+#	in the build.
+
+ .globl .bn_sqr_comba4
+ .globl .bn_sqr_comba8
+ .globl .bn_mul_comba4
+ .globl .bn_mul_comba8
+ .globl .bn_sub_words
+ .globl .bn_add_words
+ .globl .bn_div_words
+ .globl .bn_sqr_words
+ .globl .bn_mul_words
+ .globl .bn_mul_add_words
+
+# .text section
+
+ .machine $ISA
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_comba4" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_sqr_comba4:
+#
+# Optimized version of bn_sqr_comba4.
+#
+# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+# r3 contains r
+# r4 contains a
+#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
+# r5,r6 are the two BN_ULONGs being multiplied.
+# r7,r8 are the results of the 32x32 giving 64 bit multiply.
+# r9,r10, r11 are the equivalents of c1,c2, c3.
+# Here's the assembly
+#
+#
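+# For reference, the comba building blocks hand-coded below accumulate into
+# a three-word carry chain. Roughly, as a sketch of the C macros this
+# mirrors (following the usual <openssl>/crypto/bn conventions, with c1
+# holding the lowest word):
+#
+#	sqr_add_c(a,i,c1,c2,c3):	(c3,c2,c1) += a[i]*a[i]
+#	sqr_add_c2(a,i,j,c1,c2,c3):	(c3,c2,c1) += 2*a[i]*a[j]
+#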
+ xor r0,r0,r0 # set r0 = 0. Used in the addze
+ # instructions below
+
+ #sqr_add_c(a,0,c1,c2,c3)
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5
+ $UMULH r10,r5,r5 #in first iteration. No need
+ #to add since c1=c2=c3=0.
+ # Note c3(r11) is NOT set to 0
+ # but will be.
+
+ $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
+ # sqr_add_c2(a,1,0,c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
+ adde r8,r8,r8
+ addze r9,r0 # catch carry if any.
+ # r9= r0(=0) and carry
+
+ addc r10,r7,r10 # now add to temp result.
+ addze r11,r8 # r8 added to r11 which is 0
+ addze r9,r9
+
+ $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
+ #sqr_add_c(a,1,c3,c1,c2)
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,2,0,c3,c1,c2)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ #sqr_add_c2(a,3,0,c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r11,r0
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,2,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`3*$BNSZ`(r3) #r[3]=c1
+ #sqr_add_c(a,2,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,3,1,c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`4*$BNSZ`(r3) #r[4]=c2
+ #sqr_add_c2(a,3,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r10,r0
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`5*$BNSZ`(r3) #r[5] = c3
+ #sqr_add_c(a,3,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+
+ $ST r9,`6*$BNSZ`(r3) #r[6]=c1
+ $ST r10,`7*$BNSZ`(r3) #r[7]=c2
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
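+	# The zero word above follows the routine's last instruction; on AIX
+	# that is where the traceback table conventionally begins, and it is
+	# required to start with a word of zeros.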
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_comba8" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_sqr_comba8:
+#
+# This is an optimized version of the bn_sqr_comba8 routine.
+# It makes tight use of the adde instruction.
+#
+#
+# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+# r3 contains r
+# r4 contains a
+#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
+# r5,r6 are the two BN_ULONGs being multiplied.
+# r7,r8 are the results of the 32x32 giving 64 bit multiply.
+# r9,r10, r11 are the equivalents of c1,c2, c3.
+#
+# A possible optimization of loading all 8 longs of a into registers
+# doesn't provide any speedup.
+#
+
+	xor		r0,r0,r0		#set r0 = 0. Used in addze
+						#instructions below.
+
+ #sqr_add_c(a,0,c1,c2,c3);
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5 #1st iteration: no carries.
+ $UMULH r10,r5,r5
+ $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
+ #sqr_add_c2(a,1,0,c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+	addc	r10,r7,r10		#add the two-register number
+	adde	r11,r8,r0		# (r8,r7) to the three-register
+	addze	r9,r0			# number (r9,r11,r10). NOTE: r0=0
+
+ addc r10,r7,r10 #add the two register number
+ adde r11,r8,r11 # (r8,r7) to the three register
+ addze r9,r9 # number (r9,r11,r10).
+
+ $ST r10,`1*$BNSZ`(r3) # r[1]=c2
+
+ #sqr_add_c(a,1,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,2,0,c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ #sqr_add_c2(a,3,0,c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,2,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
+ #sqr_add_c(a,2,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,3,1,c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,4,0,c2,c3,c1);
+ $LD r5,`0*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
+ #sqr_add_c2(a,5,0,c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,4,1,c3,c1,c2);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,3,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
+ #sqr_add_c(a,3,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ #sqr_add_c2(a,4,2,c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,5,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,6,0,c1,c2,c3);
+ $LD r5,`0*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
+ #sqr_add_c2(a,7,0,c2,c3,c1);
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,6,1,c2,c3,c1);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,5,2,c2,c3,c1);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,4,3,c2,c3,c1);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
+ #sqr_add_c(a,4,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,5,3,c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,6,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,7,1,c3,c1,c2);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
+ #sqr_add_c2(a,7,2,c1,c2,c3);
+ $LD r5,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,6,3,c1,c2,c3);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,5,4,c1,c2,c3);
+ $LD r5,`4*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
+ #sqr_add_c(a,5,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,6,4,c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,7,3,c2,c3,c1);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
+ #sqr_add_c2(a,7,4,c3,c1,c2);
+ $LD r5,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,6,5,c3,c1,c2);
+ $LD r5,`5*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
+ #sqr_add_c(a,6,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ #sqr_add_c2(a,7,5,c1,c2,c3)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
+
+ #sqr_add_c2(a,7,6,c2,c3,c1)
+ $LD r5,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
+ #sqr_add_c(a,7,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
+ $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
+
+
+ bclr BO_ALWAYS,CR0_LT
+
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_comba4" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_comba4:
+#
+# This is an optimized version of the bn_mul_comba4 routine.
+#
+# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+# r3 contains r
+# r4 contains a
+# r5 contains b
+# r6, r7 are the 2 BN_ULONGs being multiplied.
+# r8, r9 are the results of the 32x32 giving 64 bit multiply.
+# r10, r11, r12 are the equivalents of c1, c2, and c3.
+#
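+# For reference, the building block here is the mul_add_c macro from the C
+# comba code. Roughly (with c1 holding the lowest word of the running
+# three-word accumulator):
+#
+#	mul_add_c(a[i],b[j],c1,c2,c3):	(c3,c2,c1) += a[i]*b[j]
+#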
+ xor r0,r0,r0 #r0=0. Used in addze below.
+ #mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r10,r6,r7
+ $UMULH r11,r6,r7
+ $ST r10,`0*$BNSZ`(r3) #r[0]=c1
+ #mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r0
+ addze r10,r0
+ #mul_add_c(a[1],b[0],c2,c3,c1);
+ $LD r6, `1*$BNSZ`(r4)
+ $LD r7, `0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ $ST r11,`1*$BNSZ`(r3) #r[1]=c2
+ #mul_add_c(a[2],b[0],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r0
+ #mul_add_c(a[1],b[1],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ #mul_add_c(a[0],b[2],c3,c1,c2);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ $ST r12,`2*$BNSZ`(r3) #r[2]=c3
+ #mul_add_c(a[0],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r0
+ #mul_add_c(a[1],b[2],c1,c2,c3);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ #mul_add_c(a[2],b[1],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ #mul_add_c(a[3],b[0],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ $ST r10,`3*$BNSZ`(r3) #r[3]=c1
+ #mul_add_c(a[3],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r0
+ #mul_add_c(a[2],b[2],c2,c3,c1);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ #mul_add_c(a[1],b[3],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ $ST r11,`4*$BNSZ`(r3) #r[4]=c2
+ #mul_add_c(a[2],b[3],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r0
+ #mul_add_c(a[3],b[2],c3,c1,c2);
+ $LD r6,`3*$BNSZ`(r4)
+	$LD	r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ $ST r12,`5*$BNSZ`(r3) #r[5]=c3
+ #mul_add_c(a[3],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+
+ $ST r10,`6*$BNSZ`(r3) #r[6]=c1
+ $ST r11,`7*$BNSZ`(r3) #r[7]=c2
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_comba8" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_comba8:
+#
+# Optimized version of the bn_mul_comba8 routine.
+#
+# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+# r3 contains r
+# r4 contains a
+# r5 contains b
+# r6, r7 are the 2 BN_ULONGs being multiplied.
+# r8, r9 are the results of the 32x32 giving 64 bit multiply.
+# r10, r11, r12 are the equivalents of c1, c2, and c3.
+#
+ xor r0,r0,r0 #r0=0. Used in addze below.
+
+ #mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD r6,`0*$BNSZ`(r4) #a[0]
+ $LD r7,`0*$BNSZ`(r5) #b[0]
+ $UMULL r10,r6,r7
+ $UMULH r11,r6,r7
+ $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
+ #mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+	addze	r12,r9	# since we didn't set r12 to zero before.
+ addze r10,r0
+ #mul_add_c(a[1],b[0],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
+ #mul_add_c(a[2],b[0],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[1],b[1],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[0],b[2],c3,c1,c2);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
+ #mul_add_c(a[0],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[1],b[2],c1,c2,c3);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+
+ #mul_add_c(a[2],b[1],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[3],b[0],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
+ #mul_add_c(a[4],b[0],c2,c3,c1);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[3],b[1],c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[2],b[2],c2,c3,c1);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[1],b[3],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[0],b[4],c2,c3,c1);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
+ #mul_add_c(a[0],b[5],c3,c1,c2);
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[1],b[4],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[2],b[3],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[3],b[2],c3,c1,c2);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[4],b[1],c3,c1,c2);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[5],b[0],c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
+ #mul_add_c(a[6],b[0],c1,c2,c3);
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[5],b[1],c1,c2,c3);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[4],b[2],c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMUL