diff options
author | Andy Polyakov <appro@openssl.org> | 2012-11-17 10:34:11 +0000 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2012-11-17 10:34:11 +0000 |
commit | 68c06bf6b2627021cb7f26d0ab0baee1cd2fad65 (patch) | |
tree | 8b8b55217405694074359d34b151cc8976345582 | |
parent | c7b7984ac914d33590dfe9e46e35336f5e4f723f (diff) |
Support for SPARC T4 MONT[MUL|SQR] instructions.
Submitted by: David Miller, Andy Polyakov
-rwxr-xr-x | Configure | 2 | ||||
-rw-r--r-- | TABLE | 51 | ||||
-rw-r--r-- | crypto/bn/Makefile | 2 | ||||
-rwxr-xr-x | crypto/bn/asm/sparct4-mont.pl | 1201 | ||||
-rw-r--r-- | crypto/bn/bn_exp.c | 105 | ||||
-rw-r--r-- | crypto/sparc_arch.h | 3 | ||||
-rw-r--r-- | crypto/sparcv9cap.c | 32 |
7 files changed, 1386 insertions, 10 deletions
@@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; +my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; @@ -174,7 +174,7 @@ $sys_id = $lflags = $bn_ops = BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -1782,6 +1782,39 @@ $ranlib = $arflags = $multilib = +*** debug-ben-debug-64 +$cc = gcc +$cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -DL_ENDIAN -DTERMIOS -g3 -O3 +$unistd = +$thread_cflag = -pthread -D_THREAD_SAFE -D_REENTRANT +$sys_id = +$lflags = +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL +$cpuid_obj = x86_64cpuid.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o +$des_obj = +$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o +$bf_obj = +$md5_obj = md5-x86_64.o +$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o +$cast_obj = +$rc4_obj = rc4-x86_64.o rc4-md5-x86_64.o +$rmd160_obj = +$rc5_obj = +$wp_obj = wp-x86_64.o +$cmll_obj = cmll-x86_64.o cmll_misc.o +$modes_obj = ghash-x86_64.o +$engines_obj = e_padlock-x86_64.o +$perlasm_scheme = elf +$dso_scheme = dlfcn +$shared_target= bsd-gcc-shared +$shared_cflag = -fPIC +$shared_ldflag = +$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR) +$ranlib = +$arflags = +$multilib = + *** debug-ben-macos $cc = cc $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -DOPENSSL_NO_ASM -DBN_DEBUG -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN -DHAVE_DLFCN_H -arch i386 -O3 -DL_ENDIAN -g3 -pipe @@ -2616,7 +2649,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -2649,7 +2682,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -4398,7 +4431,7 @@ $sys_id = ULTRASPARC $lflags = -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -4596,7 +4629,7 @@ $sys_id = ULTRASPARC $lflags = -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -5454,7 +5487,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -5487,7 +5520,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -5586,7 +5619,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -5619,7 +5652,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index 2f6446ec14..6267ef4bb9 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -79,6 +79,8 @@ sparcv9-mont.s: asm/sparcv9-mont.pl $(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@ vis3-mont.s: asm/vis3-mont.pl $(PERL) asm/vis3-mont.pl $(CFLAGS) > $@ +sparct4-mont.S: asm/sparct4-mont.pl + $(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@ sparcv9-gf2m.S: asm/sparcv9-gf2m.pl $(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@ diff --git a/crypto/bn/asm/sparct4-mont.pl b/crypto/bn/asm/sparct4-mont.pl new file mode 100755 index 0000000000..d8e544fd56 --- /dev/null +++ b/crypto/bn/asm/sparct4-mont.pl @@ -0,0 +1,1201 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov +# <appro@openssl.org>. The module is licensed under 2-clause BSD +# license. November 2012. All rights reserved. +# ==================================================================== + +###################################################################### +# Montgomery squaring-n-multiplication module for SPARC T4. +# +# The module consists of three parts: +# +# 1) collection of "single-op" subroutines that perform single +# operation, Montgomery squaring or multiplication, on 512-, +# 1024-, 1536- and 2048-bit operands; +# 2) collection of "multi-op" subroutines that perform 5 squaring and +# 1 multiplication operations on operands of above lengths; +# 3) fall-back and helper VIS3 subroutines. +# +# RSA sign is dominated by multi-op subroutine, while RSA verify and +# DSA - by single-op. Special note about 4096-bit RSA verify result. +# Operands are too long for dedicated hardware and it's handled by +# VIS3 code, which is why you don't see any improvement. It's surely +# possible to improve it [by deploying 'mpmul' instruction], maybe in +# the future... +# +# Performance improvement. +# +# 64-bit process, VIS3: +# sign verify sign/s verify/s +# rsa 1024 bits 0.000633s 0.000033s 1578.9 30513.3 +# rsa 2048 bits 0.003297s 0.000116s 303.3 8585.8 +# rsa 4096 bits 0.026000s 0.000387s 38.5 2587.0 +# dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9 +# dsa 2048 bits 0.001056s 0.001233s 946.9 810.8 +# +# 64-bit process, this module: +# sign verify sign/s verify/s +# rsa 1024 bits 0.000341s 0.000021s 2931.5 46873.8 +# rsa 2048 bits 0.001244s 0.000044s 803.9 22569.1 +# rsa 4096 bits 0.006203s 0.000387s 161.2 2586.3 +# dsa 1024 bits 0.000179s 0.000195s 5573.9 5115.6 +# dsa 2048 bits 0.000311s 0.000350s 3212.3 2856.6 +# +###################################################################### +# 32-bit process, VIS3: +# sign verify sign/s verify/s +# rsa 1024 bits 0.000675s 0.000033s 1480.9 30159.0 +# rsa 2048 bits 0.003383s 0.000118s 295.6 8499.9 +# rsa 4096 bits 0.026178s 0.000394s 38.2 2541.3 +# dsa 1024 bits 0.000326s 0.000343s 3070.0 2918.8 +# dsa 2048 bits 0.001121s 0.001291s 891.9 774.4 +# +# 32-bit process, this module: +# sign verify sign/s verify/s +# rsa 1024 bits 0.000386s 0.000022s 2589.6 45704.9 +# rsa 2048 bits 0.001335s 0.000046s 749.3 21766.8 +# rsa 4096 bits 0.006390s 0.000393s 156.5 2544.8 +# dsa 1024 bits 0.000208s 0.000204s 4817.6 4896.6 +# dsa 2048 bits 0.000345s 0.000364s 2898.8 2747.3 +# +# 32-bit code is prone to performance degradation as interrupt rate +# dispatched to CPU executing the code grows. This is because in +# standard process of handling interrupt in 32-bit process context +# upper halves of most integer registers used as input or output are +# zeroed. This renders result invalid, and operation has to be re-run. +# If CPU is "bothered" with timer interrupts only, the penalty is +# hardly measurable. But in order to mitigate this problem for higher +# interrupt rates contemporary Linux kernel recognizes biased stack +# even in 32-bit process context and preserves full register contents. +# See http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;h=517ffce4e1a03aea979fe3a18a3dd1761a24fafb +# for details. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "sparcv9_modes.pl"; + +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +#endif + +.section ".text",#alloc,#execinstr + +#ifdef __PIC__ +SPARC_PIC_THUNK(%g1) +#endif +___ + +######################################################################## +# Register layout for mont[mul|sqr] instructions. +# For details see "Oracle SPARC Architecture 2011" manual at +# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/. +# +my @R=map("%f".2*$_,(0..11,30,31,12..29)); +my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]); +my @B=(map("%o$_",(0..5)),@N[0..13],@N[0..11]); +my @A=(@N[0..13],@R[14..31]); + +######################################################################## +# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp, +# const u64 *np,const BN_ULONG *n0); +# +sub generate_bn_mul_mont_t4() { +my $NUM=shift; +my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5)); + +$code.=<<___; +.globl bn_mul_mont_t4_$NUM +.align 32 +bn_mul_mont_t4_$NUM: +#ifdef __arch64__ + mov 0,$sentinel + mov -128,%g4 +#elif defined(SPARCV9_64BIT_STACK) + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0] + mov -2047,%g4 + and %g1,SPARCV9_64BIT_STACK,%g1 + movrz %g1,0,%g4 + mov -1,$sentinel + add %g4,-128,%g4 +#else + mov -1,$sentinel + mov -128,%g4 +#endif + sllx $sentinel,32,$sentinel + save %sp,%g4,%sp +#ifndef __arch64__ + save %sp,-128,%sp ! warm it up + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + restore + restore + restore + restore + restore + restore +#endif + and %sp,1,%g4 + or $sentinel,%fp,%fp + or %g4,$sentinel,$sentinel + + ! copy arguments to global registers + mov %i0,$rp + mov %i1,$ap + mov %i2,$bp + mov %i3,$np + ld [%i4+0],%f1 ! load *n0 + ld [%i4+4],%f0 + fsrc2 %f0,%f60 +___ + +# load ap[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +my $lo=$i<13?@A[$i+1]:"%o7"; +$code.=<<___; + ld [$ap+$i*8+0],$lo + ld [$ap+$i*8+4],@A[$i] + sllx @A[$i],32,@A[$i] + or $lo,@A[$i],@A[$i] +___ +} +for(; $i<$NUM; $i++) { +my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1)); +$code.=<<___; + ld [$ap+$i*8+0],$lo + ld [$ap+$i*8+4],$hi + fsrc2 $hi,@A[$i] +___ +} +# load np[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +my $lo=$i<13?@N[$i+1]:"%o7"; +$code.=<<___; + ld [$np+$i*8+0],$lo + ld [$np+$i*8+4],@N[$i] + sllx @N[$i],32,@N[$i] + or $lo,@N[$i],@N[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<28 && $i<$NUM; $i++) { +my $lo=$i<27?@N[$i+1]:"%o7"; +$code.=<<___; + ld [$np+$i*8+0],$lo + ld [$np+$i*8+4],@N[$i] + sllx @N[$i],32,@N[$i] + or $lo,@N[$i],@N[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<$NUM; $i++) { +my $lo=($i<$NUM-1)?@N[$i+1]:"%o7"; +$code.=<<___; + ld [$np+$i*8+0],$lo + ld [$np+$i*8+4],@N[$i] + sllx @N[$i],32,@N[$i] + or $lo,@N[$i],@N[$i] +___ +} +$code.=<<___; + cmp $ap,$bp + be SIZE_T_CC,.Lmsquare_$NUM + nop +___ + +# load bp[$NUM] ######################################################## +for($i=0; $i<6 && $i<$NUM; $i++) { +my $lo=$i<5?@B[$i+1]:"%o7"; +$code.=<<___; + ld [$bp+$i*8+0],$lo + ld [$bp+$i*8+4],@B[$i] + sllx @B[$i],32,@B[$i] + or $lo,@B[$i],@B[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<20 && $i<$NUM; $i++) { +my $lo=$i<19?@B[$i+1]:"%o7"; +$code.=<<___; + ld [$bp+$i*8+0],$lo + ld [$bp+$i*8+4],@B[$i] + sllx @B[$i],32,@B[$i] + or $lo,@B[$i],@B[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<$NUM; $i++) { +my $lo=($i<$NUM-1)?@B[$i+1]:"%o7"; +$code.=<<___; + ld [$bp+$i*8+0],$lo + ld [$bp+$i*8+4],@B[$i] + sllx @B[$i],32,@B[$i] + or $lo,@B[$i],@B[$i] +___ +} +# magic ################################################################ +$code.=<<___; + .word 0x81b02920+$NUM-1 ! montmul $NUM-1 +.Lmresume_$NUM: + fbu,pn %fcc3,.Lmabort_$NUM +#ifndef __arch64__ + and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Lmabort_$NUM +#endif + nop +#ifdef __arch64__ + restore + restore + restore + restore + restore +#else + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Lmabort1_$NUM + restore +#endif +___ + +# save tp[$NUM] ######################################################## +for($i=0; $i<14 && $i<$NUM; $i++) { +$code.=<<___; + movxtod @A[$i],@R[$i] +___ +} +$code.=<<___; +#ifdef __arch64__ + restore +#else + and %fp,$sentinel,$sentinel + restore + and $sentinel,1,%o7 + and %fp,$sentinel,$sentinel + srl %fp,0,%fp ! just in case? + or %o7,$sentinel,$sentinel + brz,a,pn $sentinel,.Lmdone_$NUM + mov 0,%i0 ! return failure +#endif +___ +for($i=0; $i<12 && $i<$NUM; $i++) { +@R[$i] =~ /%f([0-9]+)/; +my $lo = "%f".($1+1); +$code.=<<___; + st $lo,[$rp+$i*8+0] + st @R[$i],[$rp+$i*8+4] +___ +} +for(; $i<$NUM; $i++) { +my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1)); +$code.=<<___; + fsrc2 @R[$i],$hi + st $lo,[$rp+$i*8+0] + st $hi,[$rp+$i*8+4] +___ +} +$code.=<<___; + mov 1,%i0 ! return success +.Lmdone_$NUM: + ret + restore + +.Lmabort_$NUM: + restore + restore + restore + restore + restore +.Lmabort1_$NUM: + restore + + mov 0,%i0 ! return failure + ret + restore + +.align 32 +.Lmsquare_$NUM: + save %sp,-128,%sp; or $sentinel,%fp,%fp + save %sp,-128,%sp; or $sentinel,%fp,%fp + .word 0x81b02940+$NUM-1 ! montsqr $NUM-1 + ba .Lmresume_$NUM + nop +.type bn_mul_mont_t4_$NUM, #function +.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM +___ +} + +for ($i=8;$i<=32;$i+=8) { + &generate_bn_mul_mont_t4($i); +} + +######################################################################## +# +sub load_fcc() { +my ($ptbl,$pwr,$tmp)=@_; +$code.=<<___; + sethi %hi(.Lmagic-1f),$tmp +1: call .+8 + add %o7, $tmp, %o7 + inc %lo(.Lmagic-1b),%o7 + and $pwr, 7<<2, $tmp ! offset within "magic table" + add $tmp, %o7, %o7 + and $pwr, 3, $tmp + sll $tmp, 3, $tmp ! offset within first cache line + add $tmp, $ptbl, $ptbl ! of the pwrtbl + + ! "magic table" is organized in such way that below comparisons + ! make %fcc3:%fcc2:%fcc1:%fcc0 form a byte of 1s with one 0, + ! e.g. 0b11011111, with 0 denoting relevant cache line. + ld [%o7+0], %f0 ! load column + ld [%o7+32], %f1 + ld [%o7+64], %f2 + fcmps %fcc0, %f0, %f1 + ld [%o7+96], %f3 + fcmps %fcc1, %f1, %f2 + fcmps %fcc2, %f2, %f3 + fcmps %fcc3, %f3, %f0 +___ +} +sub load_f16() { +my $ptbl=shift; +$code.=<<___; + ldd [$ptbl+0*32],%f0 ! load all cache lines + ldd [$ptbl+1*32],%f2 + ldd [$ptbl+2*32],%f4 + fmovdg %fcc0,%f0,%f16 ! pick one value + ldd [$ptbl+3*32],%f6 + fmovdl %fcc0,%f2,%f16 + ldd [$ptbl+4*32],%f8 + fmovdg %fcc1,%f4,%f16 + ldd [$ptbl+5*32],%f10 + fmovdl %fcc1,%f6,%f16 + ldd [$ptbl+6*32],%f12 + fmovdg %fcc2,%f8,%f16 + ldd [$ptbl+7*32],%f14 + fmovdl %fcc2,%f10,%f16 + fmovdg %fcc3,%f12,%f16 + fmovdl %fcc3,%f14,%f16 + add $ptbl,8*32,$ptbl +___ +} + +######################################################################## +# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0, +# const u64 *pwrtbl,int pwr); +# +sub generate_bn_pwr5_mont_t4() { +my $NUM=shift; +my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5)); + +$code.=<<___; +.globl bn_pwr5_mont_t4_$NUM +.align 32 +bn_pwr5_mont_t4_$NUM: +#ifdef __arch64__ + mov 0,$sentinel + mov -128,%g4 +#elif defined(SPARCV9_64BIT_STACK) + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0] + mov -2047,%g4 + and %g1,SPARCV9_64BIT_STACK,%g1 + movrz %g1,0,%g4 + mov -1,$sentinel + add %g4,-128,%g4 +#else + mov -1,$sentinel + mov -128,%g4 +#endif + sllx $sentinel,32,$sentinel + save %sp,%g4,%sp +#ifndef __arch64__ + save %sp,-128,%sp ! warm it up + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + restore + restore + restore + restore + restore + restore +#endif + and %sp,1,%g4 + or $sentinel,%fp,%fp + or %g4,$sentinel,$sentinel + + ! copy arguments to global registers + mov %i0,$tp + mov %i1,$np + ld [%i2+0],%f1 ! load *n0 + ld [%i2+4],%f0 + mov %i3,$pwrtbl + mov %i4,$pwr + fsrc2 %f0,%f60 +___ + +# load tp[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +$code.=<<___; + ldx [$tp+$i*8],@A[$i] +___ +} +for(; $i<$NUM; $i++) { +$code.=<<___; + ldd [$tp+$i*8],@A[$i] +___ +} +# load np[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +$code.=<<___; + ldx [$np+$i*8],@N[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<28 && $i<$NUM; $i++) { +$code.=<<___; + ldx [$np+$i*8],@N[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<$NUM; $i++) { +$code.=<<___; + ldx [$np+$i*8],@N[$i] +___ +} +# load pwrtbl[pwr] ######################################################## + &load_fcc($pwrtbl,$pwr,@B[0]); +for($i=0; $i<6 && $i<$NUM; $i++) { + &load_f16($pwrtbl); +$code.=<<___; + movdtox %f16,@B[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<20 && $i<$NUM; $i++) { + &load_f16($pwrtbl); +$code.=<<___; + movdtox %f16,@B[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<$NUM; $i++) { + &load_f16($pwrtbl); +$code.=<<___; + movdtox %f16,@B[$i] +___ +} + +# magic ################################################################ +for($i=0; $i<5; $i++) { +$code.=<<___; + .word 0x81b02940+$NUM-1 ! montsqr $NUM-1 + fbu,pn %fcc3,.Labort_$NUM +#ifndef __arch64__ + and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Labort_$NUM +#endif + nop +___ +} +$code.=<<___; + .word 0x81b02920+$NUM-1 ! montmul $NUM-1 + fbu,pn %fcc3,.Labort_$NUM +#ifndef __arch64__ + and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Labort_$NUM +#endif + nop + +#ifdef __arch64__ + restore + restore + restore + restore + restore +#else + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Labort1_$NUM + restore +#endif +___ + +# save tp[$NUM] ######################################################## +for($i=0; $i<14 && $i<$NUM; $i++) { +$code.=<<___; + movxtod @A[$i],@R[$i] +___ +} +$code.=<<___; +#ifdef __arch64__ + restore +#else + and %fp,$sentinel,$sentinel + restore + and $sentinel,1,%o7 + and %fp,$sentinel,$sentinel + srl %fp,0,%fp ! just in case? + or %o7,$sentinel,$sentinel + brz,a,pn $sentinel,.Ldone_$NUM + mov 0,%i0 ! return failure +#endif +___ +for($i=0; $i<$NUM; $i++) { +$code.=<<___; + std @R[$i],[$tp+$i*8] +___ +} +$code.=<<___; + mov 1,%i0 ! return success +.Ldone_$NUM: + ret + restore + +.Labort_$NUM: + restore + restore + restore + restore + restore +.Labort1_$NUM: + restore + + mov 0,%i0 ! return failure + ret + restore +.type bn_pwr5_mont_t4_$NUM, #function +.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM +___ +} + +for ($i=8;$i<=32;$i+=8) { + &generate_bn_pwr5_mont_t4($i); +} + +{ +######################################################################## +# Fall-back subroutines +# +# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values +# +($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)= + (map("%g$_",(1..5)),map("%o$_",(0..5,7))); + +# int bn_mul_mont( +$rp="%o0"; # u64 *rp, +$ap="%o1"; # const u64 *ap, +$bp="%o2"; # const u64 *bp, +$np="%o3"; # const u64 *np, +$n0p="%o4"; # const BN_ULONG *n0, +$num="%o5"; # int num); # caller ensures that num is >=3 +$code.=<<___; +.globl bn_mul_mont_t4 +.align 32 +bn_mul_mont_t4: + add %sp, STACK_BIAS, %g4 ! real top of stack + sll $num, 3, $num ! size in bytes + add $num, 63, %g1 + andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes + sub %g4, %g1, %g1 + andn %g1, 63, %g1 ! align at 64 byte + sub %g1, STACK_FRAME, %g1 ! new top of stack + sub %g1, %g4, %g1 + + save %sp, %g1, %sp +___ +# +-------------------------------+<----- %sp +# . . +# +-------------------------------+<----- aligned at 64 bytes +# | __int64 tmp[0] | +# +-------------------------------+ +# . . +# . . +# +-------------------------------+<----- aligned at 64 bytes +# . . +($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); +($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7)); +($ovf,$i)=($t0,$t1); +$code.=<<___; + ld [$n0p+0], $t0 ! pull n0[0..1] value + ld [$n0p+4], $t1 + add %sp, STACK_BIAS+STACK_FRAME, $tp + ldx [$bp+0], $m0 ! m0=bp[0] + sllx $t1, 32, $n0 + add $bp, 8, $bp + or $t0, $n0, $n0 + + ldx [$ap+0], $aj ! ap[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[0] + umulxhi $aj, $m0, $hi0 + + ldx [$ap+8], $aj ! ap[1] + add $ap, 16, $ap + ldx [$np+0], $nj ! np[0] + + mulx $lo0, $n0, $m1 ! "tp[0]"*n0 + + mulx $aj, $m0, $alo ! ap[1]*bp[0] + umulxhi $aj, $m0, $aj ! ahi=aj + + mulx $nj, $m1, $lo1 ! np[0]*m1 + umulxhi $nj, $m1, $hi1 + + ldx [$np+8], $nj ! np[1] + + addcc $lo0, $lo1, $lo1 + add $np, 16, $np + addxc %g0, $hi1, $hi1 + + mulx $nj, $m1, $nlo ! np[1]*m1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .L1st + sub $num, 24, $cnt ! cnt=num-3 + +.align 16 +.L1st: + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 + + ldx [$ap+0], $aj ! ap[j] + addcc $nlo, $hi1, $lo1 + add $ap, 8, $ap + addxc $nj, %g0, $hi1 ! nhi=nj + + ldx [$np+0], $nj ! np[j] + mulx $aj, $m0, $alo ! ap[j]*bp[0] + add $np, 8, $np + umulxhi $aj, $m0, $aj ! ahi=aj + + mulx $nj, $m1, $nlo ! np[j]*m1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + umulxhi $nj, $m1, $nj ! nhi=nj + addxc %g0, $hi1, $hi1 + stxa $lo1, [$tp]0xe2 ! tp[j-1] + add $tp, 8, $tp ! tp++ + + brnz,pt $cnt, .L1st + sub $cnt, 8, $cnt ! j-- +!.L1st + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + addxc %g0, $hi1, $hi1 + stxa $lo1, [$tp]0xe2 ! tp[j-1] + add $tp, 8, $tp + + addcc $hi0, $hi1, $hi1 + addxc %g0, %g0, $ovf ! upmost overflow bit + stxa $hi1, [$tp]0xe2 + add $tp, 8, $tp + + ba .Louter + sub $num, 16, $i ! i=num-2 + +.align 16 +.Louter: + ldx [$bp+0], $m0 ! m0=bp[i] + add $bp, 8, $bp + + sub $ap, $num, $ap ! rewind + sub $np, $num, $np + sub $tp, $num, $tp + + ldx [$ap+0], $aj ! ap[0] + ldx [$np+0], $nj ! np[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[i] + ldx [$tp], $tj ! tp[0] + umulxhi $aj, $m0, $hi0 + ldx [$ap+8], $aj ! ap[1] + addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] + mulx $aj, $m0, $alo ! ap[1]*bp[i] + addxc %g0, $hi0, $hi0 + mulx $lo0, $n0, $m1 ! tp[0]*n0 + umulxhi $aj, $m0, $aj ! ahi=aj + mulx $nj, $m1, $lo1 ! np[0]*m1 + add $ap, 16, $ap + umulxhi $nj, $m1, $hi1 + ldx [$np+8], $nj ! np[1] + add $np, 16, $np + addcc $lo1, $lo0, $lo1 + mulx $nj, $m1, $nlo ! np[1]*m1 + addxc %g0, $hi1, $hi1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .Linner + sub $num, 24, $cnt ! cnt=num-3 +.align 16 +.Linner: + addcc $alo, $hi0, $lo0 + ldx [$tp+8], $tj ! tp[j] + addxc $aj, %g0, $hi0 ! ahi=aj + ldx [$ap+0], $aj ! ap[j] + add $ap, 8, $ap + addcc $nlo, $hi1, $lo1 + mulx $aj, $m0, $alo ! ap[j]*bp[i] + addxc $nj, %g0, $hi1 ! nhi=nj + ldx [$np+0], $nj ! np[j] + add $np, 8, $np + umulxhi $aj, $m0, $aj ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + mulx $nj, $m1, $nlo ! np[j]*m1 + addxc %g0, $hi0, $hi0 + umulxhi $nj, $m1, $nj ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + add $tp, 8, $tp + brnz,pt $cnt, .Linner + sub $cnt, 8, $cnt +!.Linner + ldx [$tp+8], $tj ! tp[j] + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + addxc %g0, $hi0, $hi0 + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + + subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc + addxccc $hi1, $hi0, $hi1 + addxc %g0, %g0, $ovf + stx $hi1, [$tp+8] + add $tp, 16, $tp + + brnz,pt $i, .Louter + sub $i, 8, $i + + sub $ap, $num, $ap ! rewind + sub $np, $num, $np + sub $tp, $num, $tp + ba .Lsub + subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc + +.align 16 +.Lsub: + ldx [$tp], $tj + add $tp, 8, $tp + ldx [$np+0], $nj + add $np, 8, $np + subccc $tj, $nj, $t2 ! tp[j]-np[j] + srlx $tj, 32, $tj + srlx $nj, 32, $nj + subccc $tj, $nj, $t3 + add $rp, 8, $rp + st $t2, [$rp-4] ! reverse order + st $t3, [$rp-8] + brnz,pt $cnt, .Lsub + sub $cnt, 8, $cnt + + sub $np, $num, $np ! rewind + sub $tp, $num, $tp + sub $rp, $num, $rp + + subc $ovf, %g0, $ovf ! handle upmost overflow bit + and $tp, $ovf, $ap + andn $rp, $ovf, $np + or $np, $ap, $ap ! ap=borrow?tp:rp + ba .Lcopy + sub $num, 8, $cnt + +.align 16 +.Lcopy: ! copy or in-place refresh + ldx [$ap+0], $t2 + add $ap, 8, $ap + stx %g0, [$tp] ! zap + add $tp, 8, $tp + stx $t2, [$rp+0] + add $rp, 8, $rp + brnz $cnt, .Lcopy + sub $cnt, 8, $cnt + + mov 1, %o0 + ret + restore +.type bn_mul_mont_t4, #function +.size bn_mul_mont_t4, .-bn_mul_mont_t4 +___ + +# int bn_mul_mont_gather5( +$rp="%o0"; # u64 *rp, +$ap="%o1"; # const u64 *ap, +$bp="%o2"; # const u64 *pwrtbl, +$np="%o3"; # const u64 *np, +$n0p="%o4"; # const BN_ULONG *n0, +$num="%o5"; # int num, # caller ensures that num is >=3 + # int power); +$code.=<<___; +.globl bn_mul_mont_gather5_t4 +.align 32 +bn_mul_mont_gather5_t4: + add %sp, STACK_BIAS, %g4 ! real top of stack + sll $num, 3, $num ! size in bytes + add $num, 63, %g1 + andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes + sub %g4, %g1, %g1 + andn %g1, 63, %g1 ! align at 64 byte + sub %g1, STACK_FRAME, %g1 ! new top of stack + sub %g1, %g4, %g1 + LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument + + save %sp, %g1, %sp +___ +# +-------------------------------+<----- %sp +# . . +# +-------------------------------+<----- aligned at 64 bytes +# | __int64 tmp[0] | +# +-------------------------------+ +# . . |