summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2004-04-27 22:05:50 +0000
committerAndy Polyakov <appro@openssl.org>2004-04-27 22:05:50 +0000
commitdd55880644b1f60784f7a7100b94c0e5f9a1394e (patch)
treee481aaa2cd75ac9a502e068d70ea95e6a8a445ea
parentbd1640bb01cbd336da830fa1f3e0042c63acab7d (diff)
Improved PowerPC support. Proper ./config support for ppc targets,
especially for AIX. But most important BIGNUM assembler implementation submitted by IBM. Submitted by: Peter Waltenberg <pwalten@au1.ibm.com> Reviewed by: appro
-rwxr-xr-xConfigure16
-rw-r--r--TABLE98
-rwxr-xr-xconfig42
-rw-r--r--crypto/bn/Makefile.ssl7
-rw-r--r--crypto/bn/asm/ppc.pl2081
5 files changed, 2178 insertions, 66 deletions
diff --git a/Configure b/Configure
index 84b03a6626..72b76bbf38 100755
--- a/Configure
+++ b/Configure
@@ -390,7 +390,8 @@ my %table=(
"linux-aout", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -m486 -Wall::(unknown):::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_out_asm}",
"linux-mipsel", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL DES_RISC2::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-mips", "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL DES_RISC2::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"linux-ppc", "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-ppc", "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:asm/linux_ppc32.o:::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-ppc64", "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:asm/linux_ppc64.o:::::::::dlfcn:linux-shared:-fPIC:-m64:.so..\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-m68k", "gcc:-DB_ENDIAN -DTERMIO -O2 -fomit-frame-pointer -Wall::-D_REENTRANT:::BN_LLONG::",
"linux-s390", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -446,11 +447,12 @@ my %table=(
# IBM's AIX.
-"aix-cc", "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::",
-"aix-gcc", "gcc:-O3 -DB_ENDIAN::(unknown):AIX::BN_LLONG RC4_CHAR:::",
-"aix43-cc", "cc:-O -DAIX -DB_ENDIAN -qmaxmem=16384::(unknown):::BN_LLONG RC4_CHAR::::::::::dlfcn:aix-shared:::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::",
-"aix43-gcc", "gcc:-O1 -DAIX -DB_ENDIAN::(unknown):::BN_LLONG RC4_CHAR::::::::::dlfcn:",
-"aix64-cc", "cc:-O -DAIX -DB_ENDIAN -qmaxmem=16384 -q64::(unknown):::SIXTY_FOUR_BIT_LONG RC4_CHAR::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
+"aix-gcc", "gcc:-O -DB_ENDIAN::(unknown):AIX::BN_LLONG RC4_CHAR:asm/aix_ppc32.o:::::::::dlfcn:",
+"aix43-cc", "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):::BN_LLONG RC4_CHAR:asm/aix_ppc32.o:::::::::dlfcn:aix-shared:::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::",
+# Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE
+# at build time. $OBJECT_MODE is respected at ./config stage!
+"aix-cc", "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:asm/aix_ppc32.o:::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
+"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:asm/aix_ppc64.o:::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
#
# Cray T90 and similar (SDSC)
@@ -553,7 +555,7 @@ my %table=(
##### MacOS X (a.k.a. Rhapsody or Darwin) setup
"rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::",
-"darwin-ppc-cc","cc:-O3 -fomit-frame-pointer -fno-common -DB_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::::::::::darwin-shared:-fPIC::.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"darwin-ppc-cc","cc:-O3 -fno-common -DB_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:asm/osx_ppc32.o::::::::::darwin-shared:-fPIC::.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"darwin-i386-cc","cc:-O3 -fomit-frame-pointer -fno-common -DB_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::::::::::darwin-shared:-fPIC::.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
##### A/UX
diff --git a/TABLE b/TABLE
index 1500669dec..e9895cb181 100644
--- a/TABLE
+++ b/TABLE
@@ -1027,13 +1027,13 @@ $arflags =
*** aix-cc
$cc = cc
-$cflags = -O -DB_ENDIAN -qmaxmem=16384
+$cflags = -q32 -O -DB_ENDIAN -qmaxmem=16384
$unistd =
$thread_cflag = (unknown)
$sys_id = AIX
$lflags =
$bn_ops = BN_LLONG RC4_CHAR
-$bn_obj =
+$bn_obj = asm/aix_ppc32.o
$des_obj =
$bf_obj =
$md5_obj =
@@ -1042,23 +1042,23 @@ $cast_obj =
$rc4_obj =
$rmd160_obj =
$rc5_obj =
-$dso_scheme =
-$shared_target=
+$dso_scheme = dlfcn
+$shared_target= aix-shared
$shared_cflag =
-$shared_ldflag =
-$shared_extension =
+$shared_ldflag = -q32
+$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
$ranlib =
-$arflags =
+$arflags = -X 32
*** aix-gcc
$cc = gcc
-$cflags = -O3 -DB_ENDIAN
+$cflags = -O -DB_ENDIAN
$unistd =
$thread_cflag = (unknown)
$sys_id = AIX
$lflags =
$bn_ops = BN_LLONG RC4_CHAR
-$bn_obj =
+$bn_obj = asm/aix_ppc32.o
$des_obj =
$bf_obj =
$md5_obj =
@@ -1067,7 +1067,7 @@ $cast_obj =
$rc4_obj =
$rmd160_obj =
$rc5_obj =
-$dso_scheme =
+$dso_scheme = dlfcn
$shared_target=
$shared_cflag =
$shared_ldflag =
@@ -1077,13 +1077,13 @@ $arflags =
*** aix43-cc
$cc = cc
-$cflags = -O -DAIX -DB_ENDIAN -qmaxmem=16384
+$cflags = -O -DB_ENDIAN -qmaxmem=16384
$unistd =
$thread_cflag = (unknown)
$sys_id =
$lflags =
$bn_ops = BN_LLONG RC4_CHAR
-$bn_obj =
+$bn_obj = asm/aix_ppc32.o
$des_obj =
$bf_obj =
$md5_obj =
@@ -1100,40 +1100,15 @@ $shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
$ranlib =
$arflags =
-*** aix43-gcc
-$cc = gcc
-$cflags = -O1 -DAIX -DB_ENDIAN
-$unistd =
-$thread_cflag = (unknown)
-$sys_id =
-$lflags =
-$bn_ops = BN_LLONG RC4_CHAR
-$bn_obj =
-$des_obj =
-$bf_obj =
-$md5_obj =
-$sha1_obj =
-$cast_obj =
-$rc4_obj =
-$rmd160_obj =
-$rc5_obj =
-$dso_scheme = dlfcn
-$shared_target=
-$shared_cflag =
-$shared_ldflag =
-$shared_extension =
-$ranlib =
-$arflags =
-
*** aix64-cc
$cc = cc
-$cflags = -O -DAIX -DB_ENDIAN -qmaxmem=16384 -q64
+$cflags = -q64 -O -DB_ENDIAN -qmaxmem=16384
$unistd =
$thread_cflag = (unknown)
-$sys_id =
+$sys_id = AIX
$lflags =
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR
-$bn_obj =
+$bn_obj = asm/aix_ppc64.o
$des_obj =
$bf_obj =
$md5_obj =
@@ -1452,13 +1427,13 @@ $arflags =
*** darwin-ppc-cc
$cc = cc
-$cflags = -O3 -fomit-frame-pointer -fno-common -DB_ENDIAN
+$cflags = -O3 -fno-common -DB_ENDIAN
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id = MACOSX
$lflags =
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
-$bn_obj =
+$bn_obj = asm/osx_ppc32.o
$des_obj =
$bf_obj =
$md5_obj =
@@ -1702,7 +1677,7 @@ $arflags =
*** debug-levitte-linux-elf
$cc = gcc
-$cflags = -DLEVITTE_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DBN_DEBUG -DBN_DEBUG_RAND -DCRYPTO_MDEBUG -DENGINE_CONF_DEBUG -DL_ENDIAN -DTERMIO -D_POSIX_SOURCE -DPEDANTIC -ggdb -g3 -mcpu=i486 -pedantic -ansi -Wall -Wshadow -Wcast-align -Wstrict-prototypes -Wmissing-prototypes -Wno-long-long -Wundef -Wconversion -pipe
+$cflags = -DLEVITTE_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_DEBUG -DBN_DEBUG_RAND -DCRYPTO_MDEBUG -DENGINE_CONF_DEBUG -DL_ENDIAN -DTERMIO -D_POSIX_SOURCE -DPEDANTIC -ggdb -g3 -mcpu=i486 -pedantic -ansi -Wall -Wshadow -Wcast-align -Wstrict-prototypes -Wmissing-prototypes -Wno-long-long -Wundef -Wconversion -pipe
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
@@ -1727,7 +1702,7 @@ $arflags =
*** debug-levitte-linux-elf-extreme
$cc = gcc
-$cflags = -DLEVITTE_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DBN_DEBUG -DBN_DEBUG_RAND -DCRYPTO_MDEBUG -DENGINE_CONF_DEBUG -DL_ENDIAN -DTERMIO -D_POSIX_SOURCE -DPEDANTIC -ggdb -g3 -mcpu=i486 -pedantic -ansi -Wall -W -Wundef -Wshadow -Wcast-align -Wstrict-prototypes -Wmissing-prototypes -Wno-long-long -Wundef -Wconversion -pipe
+$cflags = -DLEVITTE_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_DEBUG -DBN_DEBUG_RAND -DCRYPTO_MDEBUG -DENGINE_CONF_DEBUG -DL_ENDIAN -DTERMIO -D_POSIX_SOURCE -DPEDANTIC -ggdb -g3 -mcpu=i486 -pedantic -ansi -Wall -W -Wundef -Wshadow -Wcast-align -Wstrict-prototypes -Wmissing-prototypes -Wno-long-long -Wundef -Wconversion -pipe
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
@@ -1752,7 +1727,7 @@ $arflags =
*** debug-levitte-linux-noasm
$cc = gcc
-$cflags = -DLEVITTE_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DBN_DEBUG -DBN_DEBUG_RAND -DCRYPTO_MDEBUG -DENGINE_CONF_DEBUG -DOPENSSL_NO_ASM -DL_ENDIAN -DTERMIO -D_POSIX_SOURCE -DPEDANTIC -ggdb -g3 -mcpu=i486 -pedantic -ansi -Wall -Wshadow -Wcast-align -Wstrict-prototypes -Wmissing-prototypes -Wno-long-long -Wundef -Wconversion -pipe
+$cflags = -DLEVITTE_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_DEBUG -DBN_DEBUG_RAND -DCRYPTO_MDEBUG -DENGINE_CONF_DEBUG -DOPENSSL_NO_ASM -DL_ENDIAN -DTERMIO -D_POSIX_SOURCE -DPEDANTIC -ggdb -g3 -mcpu=i486 -pedantic -ansi -Wall -Wshadow -Wcast-align -Wstrict-prototypes -Wmissing-prototypes -Wno-long-long -Wundef -Wconversion -pipe
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
@@ -1777,7 +1752,7 @@ $arflags =
*** debug-levitte-linux-noasm-extreme
$cc = gcc
-$cflags = -DLEVITTE_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DBN_DEBUG -DBN_DEBUG_RAND -DCRYPTO_MDEBUG -DENGINE_CONF_DEBUG -DOPENSSL_NO_ASM -DL_ENDIAN -DTERMIO -D_POSIX_SOURCE -DPEDANTIC -ggdb -g3 -mcpu=i486 -pedantic -ansi -Wall -W -Wundef -Wshadow -Wcast-align -Wstrict-prototypes -Wmissing-prototypes -Wno-long-long -Wundef -Wconversion -pipe
+$cflags = -DLEVITTE_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_DEBUG -DBN_DEBUG_RAND -DCRYPTO_MDEBUG -DENGINE_CONF_DEBUG -DOPENSSL_NO_ASM -DL_ENDIAN -DTERMIO -D_POSIX_SOURCE -DPEDANTIC -ggdb -g3 -mcpu=i486 -pedantic -ansi -Wall -W -Wundef -Wshadow -Wcast-align -Wstrict-prototypes -Wmissing-prototypes -Wno-long-long -Wundef -Wconversion -pipe
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
@@ -3127,7 +3102,7 @@ $arflags =
*** linux-ia32-icc
$cc = icc
-$cflags = -DL_ENDIAN -DTERMIO -O2
+$cflags = -DL_ENDIAN -DTERMIO -O2 -no_cpprt
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
@@ -3352,13 +3327,13 @@ $arflags =
*** linux-ppc
$cc = gcc
-$cflags = -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall
+$cflags = -DB_ENDIAN -DTERMIO -O3 -Wall
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
$lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL
-$bn_obj =
+$bn_obj = asm/linux_ppc32.o
$des_obj =
$bf_obj =
$md5_obj =
@@ -3375,6 +3350,31 @@ $shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
$ranlib =
$arflags =
+*** linux-ppc64
+$cc = gcc
+$cflags = -m64 -DB_ENDIAN -DTERMIO -O3 -Wall
+$unistd =
+$thread_cflag = -D_REENTRANT
+$sys_id =
+$lflags = -ldl
+$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL
+$bn_obj = asm/linux_ppc64.o
+$des_obj =
+$bf_obj =
+$md5_obj =
+$sha1_obj =
+$cast_obj =
+$rc4_obj =
+$rmd160_obj =
+$rc5_obj =
+$dso_scheme = dlfcn
+$shared_target= linux-shared
+$shared_cflag = -fPIC
+$shared_ldflag = -m64
+$shared_extension = .so..$(SHLIB_MAJOR).$(SHLIB_MINOR)
+$ranlib =
+$arflags =
+
*** linux-ppro
$cc = gcc
$cflags = -DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -mcpu=pentiumpro -Wall
diff --git a/config b/config
index a0dc457c55..46102cebde 100755
--- a/config
+++ b/config
@@ -110,12 +110,8 @@ case "${SYSTEM}:${RELEASE}:${VERSION}:${MACHINE}" in
echo "m68k-apple-aux3"; exit 0
;;
- AIX:[3456789]:4:*)
- echo "${MACHINE}-ibm-aix43"; exit 0
- ;;
-
- AIX:*:[56789]:*)
- echo "${MACHINE}-ibm-aix43"; exit 0
+ AIX:*:[5-9]:*)
+ echo "${MACHINE}-ibm-aix5"; exit 0
;;
AIX:*)
@@ -537,10 +533,7 @@ EOF
${CC} -o dummy dummy.c && OUT=`./dummy ${MACHINE}`
rm dummy dummy.c
;;
- ppc64-*-linux2)
- #Use the standard target for PPC architecture until we create a
- #special one for the 64bit architecture.
- OUT="linux-ppc" ;;
+ ppc64-*-linux2) OUT="linux-ppc64" ;;
ppc-*-linux2) OUT="linux-ppc" ;;
m68k-*-linux*) OUT="linux-m68k" ;;
ia64-*-linux?) OUT="linux-ia64" ;;
@@ -719,6 +712,35 @@ EOF
fi
options="$options -D_REENTRANT" ;;
*-hpux) OUT="hpux-parisc-$CC" ;;
+ *-aix5)
+ KERNEL_BITS=`(getconf KERNEL_BITMODE) 2>/dev/null`
+ KERNEL_BITS=${KERNEL_BITS:-32}
+ if [ $KERNEL_BITS -eq 64 ]; then
+ # we default to 64-bit because PKI performance is >3x better...
+ OBJECT_MODE=${OBJECT_MODE:-$KERNEL_BITS}
+ else
+ OBJECT_MODE=32
+ fi
+ OUT="aix-cc"
+ if [ "$CC" = "cc" -a $OBJECT_MODE -eq 64 ]; then
+ OUT="aix64-cc"
+ echo "WARNING! If you wish to build 32-bit kit, then you have to"
+ echo " invoke './Configure aix-cc' *manually*."
+ if [ "$TEST" = "false" ]; then
+ echo " You have ~5 seconds to press Ctrl-C to abort."
+ (stty -icanon min 0 time 50; read waste) < /dev/tty
+ fi
+ elif [ "$CC" = "gcc ]; then
+ OUT="aix-gcc"
+ fi
+ ;;
+ *-aix)
+ if [ "$CC" = "gcc" ]; then
+ OUT="aix-gcc"
+ else
+ OUT="aix43-cc"
+ fi
+ ;;
# these are all covered by the catchall below
# *-aix) OUT="aix-$CC" ;;
# *-dgux) OUT="dgux" ;;
diff --git a/crypto/bn/Makefile.ssl b/crypto/bn/Makefile.ssl
index 70fec63620..490d6d7dae 100644
--- a/crypto/bn/Makefile.ssl
+++ b/crypto/bn/Makefile.ssl
@@ -129,6 +129,13 @@ asm/pa-risc2W.o: asm/pa-risc2W.s
asm/pa-risc2.o: asm/pa-risc2.s
/usr/ccs/bin/as -o asm/pa-risc2.o asm/pa-risc2.s
+# ppc - AIX, Linux, MacOS X...
+asm/linux_ppc32.s: asm/ppc.pl; $(PERL) $< $@
+asm/linux_ppc64.s: asm/ppc.pl; $(PERL) $< $@
+asm/aix_ppc32.s: asm/ppc.pl; $(PERL) $< $@
+asm/aix_ppc64.s: asm/ppc.pl; $(PERL) $< $@
+asm/osx_ppc32.s: asm/ppc.pl; $(PERL) $< $@
+
files:
$(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
new file mode 100644
index 0000000000..307c7ccb35
--- /dev/null
+++ b/crypto/bn/asm/ppc.pl
@@ -0,0 +1,2081 @@
+#!/usr/bin/env perl
+#
+# Implemented as a Perl wrapper as we want to support several different
+# architectures with single file. We pick up the target based on the
+# file name we are asked to generate.
+#
+# It should be noted though that this perl code is nothing like
+# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
+# as pre-processor to cover for platform differences in name decoration,
+# linker tables, 32-/64-bit instruction sets...
+#
+# As you might know there're several PowerPC ABI in use. Most notably
+# Linux and AIX use different 32-bit ABIs. Good news are that these ABIs
+# are similar enough to implement leaf(!) functions, which would be ABI
+# neutral. And that's what you find here: ABI neutral leaf functions.
+# In case you wonder what that is...
+#
+# AIX performance
+#
+# MEASUREMENTS WITH cc ON a 200 MhZ PowerPC 604e.
+#
+# The following is the performance of 32-bit compiler
+# generated code:
+#
+# OpenSSL 0.9.6c 21 dec 2001
+# built on: Tue Jun 11 11:06:51 EDT 2002
+# options:bn(64,32) ...
+#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
+# sign verify sign/s verify/s
+#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
+#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
+#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
+#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
+#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
+#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
+#
+# Same bechmark with this assembler code:
+#
+#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
+#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
+#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
+#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
+#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
+#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
+#
+# Number of operations increases by at almost 75%
+#
+# Here are performance numbers for 64-bit compiler
+# generated code:
+#
+# OpenSSL 0.9.6g [engine] 9 Aug 2002
+# built on: Fri Apr 18 16:59:20 EDT 2003
+# options:bn(64,64) ...
+# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
+# sign verify sign/s verify/s
+#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
+#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
+#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
+#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
+#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
+#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
+#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
+#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
+#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
+#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
+#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
+#
+# Again, performance increases by at about 75%
+#
+# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
+# OpenSSL 0.9.7c 30 Sep 2003
+#
+# Original code.
+#
+#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
+#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
+#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
+#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
+#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
+#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
+#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
+#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
+#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
+#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
+#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
+#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
+#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
+#
+# Performance increase of ~60%
+#
+# If you have comments or suggestions to improve code send
+# me a note at schari@us.ibm.com
+#
+
+$opf = shift;
+
+if ($opf =~ /32\.s/) {
+ $BITS= 32;
+ $BNSZ= $BITS/8;
+ $ISA= "\"ppc\"";
+
+ $LD= "lwz"; # load
+ $LDU= "lwzu"; # load and update
+ $ST= "stw"; # store
+ $STU= "stwu"; # store and update
+ $UMULL= "mullw"; # unsigned multiply low
+ $UMULH= "mulhwu"; # unsigned multiply high
+ $UDIV= "divwu"; # unsigned divide
+ $UCMPI= "cmplwi"; # unsigned compare with immediate
+ $UCMP= "cmplw"; # unsigned compare
+ $COUNTZ="cntlzw"; # count leading zeros
+ $SHL= "slw"; # shift left
+ $SHR= "srw"; # unsigned shift right
+ $SHRI= "srwi"; # unsigned shift right by immediate
+ $SHLI= "slwi"; # shift left by immediate
+ $CLRU= "clrlwi"; # clear upper bits
+ $INSR= "insrwi"; # insert right
+ $ROTL= "rotlwi"; # rotate left by immediate
+} elsif ($opf =~ /64\.s/) {
+ $BITS= 64;
+ $BNSZ= $BITS/8;
+ $ISA= "\"ppc64\"";
+
+ # same as above, but 64-bit mnemonics...
+ $LD= "ld"; # load
+ $LDU= "ldu"; # load and update
+ $ST= "std"; # store
+ $STU= "stdu"; # store and update
+ $UMULL= "mulld"; # unsigned multiply low
+ $UMULH= "mulhdu"; # unsigned multiply high
+ $UDIV= "divdu"; # unsigned divide
+ $UCMPI= "cmpldi"; # unsigned compare with immediate
+ $UCMP= "cmpld"; # unsigned compare
+ $COUNTZ="cntlzd"; # count leading zeros
+ $SHL= "sld"; # shift left
+ $SHR= "srd"; # unsigned shift right
+ $SHRI= "srdi"; # unsigned shift right by immediate
+ $SHLI= "sldi"; # shift left by immediate
+ $CLRU= "clrldi"; # clear upper bits
+ $INSR= "insrdi"; # insert right
+ $ROTL= "rotldi"; # rotate left by immediate
+} else { die "nonsense $opf"; }
+
+( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
+
+# function entry points from the AIX code
+#
+# There are other, more elegant, ways to handle this. We (IBM) chose
+# this approach as it plays well with scripts we run to 'namespace'
+# OpenSSL .i.e. we add a prefix to all the public symbols so we can
+# co-exist in the same process with other implementations of OpenSSL.
+# 'cleverer' ways of doing these substitutions tend to hide data we
+# need to be obvious.
+#
+my @items = ("bn_sqr_comba4",
+ "bn_sqr_comba8",
+ "bn_mul_comba4",
+ "bn_mul_comba8",
+ "bn_sub_words",
+ "bn_add_words",
+ "bn_div_words",
+ "bn_sqr_words",
+ "bn_mul_words",
+ "bn_mul_add_words");
+
+if ($opf =~ /linux/) { do_linux(); }
+elsif ($opf =~ /aix/) { do_aix(); }
+elsif ($opf =~ /osx/) { do_osx(); }
+else { do_bsd(); }
+
+sub do_linux {
+ $d=&data();
+
+ if ($BITS==64) {
+ foreach $t (@items) {
+ $d =~ s/\.$t:/\
+\t.section\t".opd","aw"\
+\t.align\t3\
+\t.globl\t$t\
+$t:\
+\t.quad\t.$t,.TOC.\@tocbase,0\
+\t.size\t$t,24\
+\t.previous\n\
+\t.type\t.$t,\@function\
+\t.globl\t.$t\
+.$t:/g;
+ }
+ }
+ else {
+ foreach $t (@items) {
+ $d=~s/\.$t/$t/g;
+ }
+ }
+ # hide internal labels to avoid pollution of name table...
+ $d=~s/Lppcasm_/.Lppcasm_/gm;
+ print $d;
+}
+
+sub do_aix {
+ # AIX assembler is smart enough to please the linker without
+ # making us do something special...
+ print &data();
+}
+
+# MacOSX 32 bit
+sub do_osx {
+ $d=&data();
+ # Change the bn symbol prefix from '.' to '_'
+ foreach $t (@items) {
+ $d=~s/\.$t/_$t/g;
+ }
+ # Change .machine to something OS X asm will accept
+ $d=~s/\.machine.*/.text/g;
+ $d=~s/\#/;/g; # change comment from '#' to ';'
+ print $d;
+}
+
+# BSD (Untested)
+sub do_bsd {
+ $d=&data();
+ foreach $t (@items) {
+ $d=~s/\.$t/_$t/g;
+ }
+ print $d;
+}
+
+sub data {
+ local($data)=<<EOF;
+#--------------------------------------------------------------------
+#
+#
+#
+#
+# File: ppc32.s
+#
+# Created by: Suresh Chari
+# IBM Thomas J. Watson Research Library
+# Hawthorne, NY
+#
+#
+# Description: Optimized assembly routines for OpenSSL crypto
+# on the 32 bitPowerPC platform.
+#
+#
+# Version History
+#
+# 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
+# cleaned up code. Also made a single version which can
+# be used for both the AIX and Linux compilers. See NOTE
+# below.
+# 12/05/03 Suresh Chari
+# (with lots of help from) Andy Polyakov
+##
+# 1. Initial version 10/20/02 Suresh Chari
+#
+#
+# The following file works for the xlc,cc
+# and gcc compilers.
+#
+# NOTE: To get the file to link correctly with the gcc compiler
+# you have to change the names of the routines and remove
+# the first .(dot) character. This should automatically
+# be done in the build process.
+#
+# Hand optimized assembly code for the following routines
+#
+# bn_sqr_comba4
+# bn_sqr_comba8
+# bn_mul_comba4
+# bn_mul_comba8
+# bn_sub_words
+# bn_add_words
+# bn_div_words
+# bn_sqr_words
+# bn_mul_words
+# bn_mul_add_words
+#
+# NOTE: It is possible to optimize this code more for
+# specific PowerPC or Power architectures. On the Northstar
+# architecture the optimizations in this file do
+# NOT provide much improvement.
+#
+# If you have comments or suggestions to improve code send
+# me a note at schari\@us.ibm.com
+#
+#--------------------------------------------------------------------------
+#
+# Defines to be used in the assembly code.
+#
+.set r0,0 # we use it as storage for value of 0
+.set SP,1 # preserved
+.set RTOC,2 # preserved
+.set r3,3 # 1st argument/return value
+.set r4,4 # 2nd argument/volatile register
+.set r5,5 # 3rd argument/volatile register
+.set r6,6 # ...
+.set r7,7
+.set r8,8
+.set r9,9
+.set r10,10
+.set r11,11
+.set r12,12
+.set r13,13 # not used, nor any other "below" it...
+
+.set BO_IF_NOT,4
+.set BO_IF,12
+.set BO_dCTR_NZERO,16
+.set BO_dCTR_ZERO,18
+.set BO_ALWAYS,20
+.set CR0_LT,0;
+.set CR0_GT,1;
+.set CR0_EQ,2
+.set CR1_FX,4;
+.set CR1_FEX,5;
+.set CR1_VX,6
+.set LR,8
+
+# Declare function names to be global
+# NOTE: For gcc these names MUST be changed to remove
+# the first . i.e. for example change ".bn_sqr_comba4"
+# to "bn_sqr_comba4". This should be automatically done
+# in the build.
+
+ .globl .bn_sqr_comba4
+ .globl .bn_sqr_comba8
+ .globl .bn_mul_comba4
+ .globl .bn_mul_comba8
+ .globl .bn_sub_words
+ .globl .bn_add_words
+ .globl .bn_div_words
+ .globl .bn_sqr_words
+ .globl .bn_mul_words
+ .globl .bn_mul_add_words
+
+# .text section
+
+ .machine $ISA
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_comba4" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_sqr_comba4:
+#
+# Optimized version of bn_sqr_comba4.
+#
+# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+# r3 contains r
+# r4 contains a
+#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
+# r5,r6 are the two BN_ULONGs being multiplied.
+# r7,r8 are the results of the 32x32 giving 64 bit multiply.
+# r9,r10, r11 are the equivalents of c1,c2, c3.
+# Here's the assembly
+#
+#
+ xor r0,r0,r0 # set r0 = 0. Used in the addze
+ # instructions below
+
+ #sqr_add_c(a,0,c1,c2,c3)
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5
+ $UMULH r10,r5,r5 #in first iteration. No need
+ #to add since c1=c2=c3=0.
+ # Note c3(r11) is NOT set to 0
+ # but will be.
+
+ $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
+ # sqr_add_c2(a,1,0,c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
+ adde r8,r8,r8
+ addze r9,r0 # catch carry if any.
+ # r9= r0(=0) and carry
+
+ addc r10,r7,r10 # now add to temp result.
+ addze r11,r8 # r8 added to r11 which is 0
+ addze r9,r9
+
+ $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
+ #sqr_add_c(a,1,c3,c1,c2)
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,2,0,c3,c1,c2)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ #sqr_add_c2(a,3,0,c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r11,r0
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,2,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`3*$BNSZ`(r3) #r[3]=c1
+ #sqr_add_c(a,2,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,3,1,c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`4*$BNSZ`(r3) #r[4]=c2
+ #sqr_add_c2(a,3,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r10,r0
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`5*$BNSZ`(r3) #r[5] = c3
+ #sqr_add_c(a,3,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+
+ $ST r9,`6*$BNSZ`(r3) #r[6]=c1
+ $ST r10,`7*$BNSZ`(r3) #r[7]=c2
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_comba8" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_sqr_comba8:
+#
+# This is an optimized version of the bn_sqr_comba8 routine.
+# Tightly uses the adde instruction
+#
+#
+# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+# r3 contains r
+# r4 contains a
+#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
+# r5,r6 are the two BN_ULONGs being multiplied.
+# r7,r8 are the results of the 32x32 giving 64 bit multiply.
+# r9,r10, r11 are the equivalents of c1,c2, c3.
+#
+# Possible optimization of loading all 8 longs of a into registers
+# doesnt provide any speedup
+#
+
+ xor r0,r0,r0 #set r0 = 0.Used in addze
+ #instructions below.
+
+ #sqr_add_c(a,0,c1,c2,c3);
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5 #1st iteration: no carries.
+ $UMULH r10,r5,r5
+ $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
+ #sqr_add_c2(a,1,0,c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10 #add the two register number
+ adde r11,r8,r0 # (r8,r7) to the three register
+ addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
+
+ addc r10,r7,r10 #add the two register number
+ adde r11,r8,r11 # (r8,r7) to the three register
+ addze r9,r9 # number (r9,r11,r10).
+
+ $ST r10,`1*$BNSZ`(r3) # r[1]=c2
+
+ #sqr_add_c(a,1,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,2,0,c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ #sqr_add_c2(a,3,0,c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,2,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
+ #sqr_add_c(a,2,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,3,1,c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,4,0,c2,c3,c1);
+ $LD r5,`0*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
+ #sqr_add_c2(a,5,0,c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,4,1,c3,c1,c2);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,3,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
+ #sqr_add_c(a,3,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ #sqr_add_c2(a,4,2,c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,5,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,6,0,c1,c2,c3);
+ $LD r5,`0*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11<