diff options
-rwxr-xr-x | crypto/bn/asm/ppc64-mont-fixed.pl | 585 | ||||
-rw-r--r-- | crypto/bn/build.info | 3 | ||||
-rw-r--r-- | crypto/ppccap.c | 12 | ||||
-rw-r--r-- | providers/fips-sources.checksums | 1 | ||||
-rw-r--r-- | providers/fips.checksum | 2 | ||||
-rw-r--r-- | providers/fips.module.sources | 1 |
6 files changed, 602 insertions, 2 deletions
diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl new file mode 100755 index 0000000000..62d2db0006 --- /dev/null +++ b/crypto/bn/asm/ppc64-mont-fixed.pl @@ -0,0 +1,585 @@ +#! /usr/bin/env perl +# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke +# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for +# the OpenSSL project. +# ==================================================================== + +# +# Fixed length (n=6), unrolled PPC Montgomery Multiplication +# + +# 2021 +# +# Although this is a generic implementation for unrolling Montgomery +# Multiplication for arbitrary values of n, this is currently only +# used for n = 6 to improve the performance of ECC p384. +# +# Unrolling allows intermediate results to be stored in registers, +# rather than on the stack, improving performance by ~7% compared to +# the existing PPC assembly code. +# +# The ISA 3.0 implementation uses combination multiply/add +# instructions (maddld, maddhdu) to improve performance by an +# additional ~10% on Power 9. +# +# Finally, saving non-volatile registers into volatile vector +# registers instead of onto the stack saves a little more. +# +# On a Power 9 machine we see an overall improvement of ~18%. +# + +use strict; +use warnings; + +my ($flavour, $output, $dir, $xlate); + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; + +if ($flavour !~ /64/) { + die "bad flavour ($flavour) - only ppc64 permitted"; +} + +my $SIZE_T= 8; + +# Registers are global so the code is remotely readable + +# Parameters for Montgomery multiplication +my $sp = "r1"; +my $toc = "r2"; +my $rp = "r3"; +my $ap = "r4"; +my $bp = "r5"; +my $np = "r6"; +my $n0 = "r7"; +my $num = "r8"; + +$rp = "r9"; # $rp is reassigned + +my $c0 = "r10"; +my $bp0 = "r11"; +my $bpi = "r11"; +my $bpj = "r11"; +my $tj = "r12"; +my $apj = "r12"; +my $npj = "r12"; +my $lo = "r14"; +my $c1 = "r14"; +my $i = "r15"; + +# Non-volatile registers used for tp[i] +# +# 12 registers are available but the limit on unrolling is 10, +# since registers from $tp[0] to $tp[$n+1] are used. +my @tp = ("r20" .. "r31"); + +# volatile VSRs for saving non-volatile GPRs - faster than stack +my @vsrs = ("v32" .. "v46"); + +package Mont; + +sub new($$) +{ + my ($class, $n) = @_; + + if ($n > 10) { + die "Can't unroll for BN length ${n} (maximum 10)" + } + + my $self = { + code => "", + n => $n, + }; + bless $self, $class; + + return $self; +} + +sub add_code($$) +{ + my ($self, $c) = @_; + + $self->{code} .= $c; +} + +sub get_code($) +{ + my ($self) = @_; + + return $self->{code}; +} + +sub get_function_name($) +{ + my ($self) = @_; + + return "bn_mul_mont_fixed_n" . $self->{n}; +} + +sub get_label($$) +{ + my ($self, $l) = @_; + + return "L" . $l . "_" . $self->{n}; +} + +sub get_labels($@) +{ + my ($self, @labels) = @_; + + my %out = (); + + foreach my $l (@labels) { + $out{"$l"} = $self->get_label("$l"); + } + + return \%out; +} + +sub nl($) +{ + my ($self) = @_; + + $self->add_code("\n"); +} + +sub copy_result($) +{ + my ($self) = @_; + + my ($n) = $self->{n}; + + for (my $j = 0; $j < $n; $j++) { + $self->add_code(<<___); + std $tp[$j],`$j*$SIZE_T`($rp) +___ + } + +} + +sub mul_mont_fixed($) +{ + my ($self) = @_; + + my ($n) = $self->{n}; + my $fname = $self->get_function_name(); + my $label = $self->get_labels("outer", "enter", "sub", "copy", "end"); + + $self->add_code(<<___); + +.globl .${fname} +.${fname}: + mr $rp,r3 + +___ + + $self->save_registers(); + + $self->add_code(<<___); + ld $n0,0($n0) + + ld $bp0,0($bp) + + ld $apj,0($ap) +___ + + $self->mul_c_0($tp[0], $apj, $bp0, $c0); + + for (my $j = 1; $j < $n - 1; $j++) { + $self->add_code(<<___); + ld $apj,`$j*$SIZE_T`($ap) +___ + $self->mul($tp[$j], $apj, $bp0, $c0); + } + + $self->add_code(<<___); + ld $apj,`($n-1)*$SIZE_T`($ap) +___ + + $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0); + + $self->add_code(<<___); + li $tp[$n+1],0 + +___ + + $self->add_code(<<___); + li $i,0 + mtctr $num + b $label->{"enter"} + +$label->{"outer"}: + ldx $bpi,$bp,$i + + ld $apj,0($ap) +___ + + $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0); + + for (my $j = 1; $j < $n; $j++) { + $self->add_code(<<___); + ld $apj,`$j*$SIZE_T`($ap) +___ + $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0); + } + + $self->add_code(<<___); + addc $tp[$n],$tp[$n],$c0 + addze $tp[$n+1],$tp[$n+1] +___ + + $self->add_code(<<___); +$label->{"enter"}: + mulld $bpi,$tp[0],$n0 + + ld $npj,0($np) +___ + + $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0); + + for (my $j = 1; $j < $n; $j++) { + $self->add_code(<<___); + ld $npj,`$j*$SIZE_T`($np) +___ + $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0); + } + + $self->add_code(<<___); + addc $tp[$n-1],$tp[$n],$c0 + addze $tp[$n],$tp[$n+1] + + addi $i,$i,$SIZE_T + bc 25,0,$label->{"outer"} + + and. $tp[$n],$tp[$n],$tp[$n] + bne $label->{"sub"} + + cmpld $tp[$n-1],$npj + blt $label->{"copy"} + +$label->{"sub"}: +___ + + # + # Reduction + # + + $self->add_code(<<___); + ld $bpj,`0*$SIZE_T`($np) + subfc $c1,$bpj,$tp[0] + std $c1,`0*$SIZE_T`($rp) + +___ + for (my $j = 1; $j < $n - 1; $j++) { + $self->add_code(<<___); + ld $bpj,`$j*$SIZE_T`($np) + subfe $c1,$bpj,$tp[$j] + std $c1,`$j*$SIZE_T`($rp) + +___ + } + + $self->add_code(<<___); + subfe $c1,$npj,$tp[$n-1] + std $c1,`($n-1)*$SIZE_T`($rp) + +___ + + $self->add_code(<<___); + addme. $tp[$n],$tp[$n] + beq $label->{"end"} + +$label->{"copy"}: +___ + + $self->copy_result(); + + $self->add_code(<<___); + +$label->{"end"}: +___ + + $self->restore_registers(); + + $self->add_code(<<___); + li r3,1 + blr +.size ${fname},.-${fname} +___ + +} + +package Mont::GPR; + +our @ISA = ('Mont'); + +sub new($$) +{ + my ($class, $n) = @_; + + return $class->SUPER::new($n); +} + +sub save_registers($) +{ + my ($self) = @_; + + my $n = $self->{n}; + + $self->add_code(<<___); + mtvsrd $vsrs[0],$lo + mtvsrd $vsrs[1],$i +___ + + for (my $j = 0; $j <= $n+1; $j++) { + $self->{code}.=<<___; + mtvsrd $vsrs[$j+2],$tp[$j] +___ + } + + $self->add_code(<<___); + +___ +} + +sub restore_registers($) +{ + my ($self) = @_; + + my $n = $self->{n}; + + $self->add_code(<<___); + mfvsrd $lo,$vsrs[0] + mfvsrd $i,$vsrs[1] +___ + + for (my $j = 0; $j <= $n+1; $j++) { + $self->{code}.=<<___; + mfvsrd $tp[$j],$vsrs[$j+2] +___ + } + + $self->{code} .=<<___; + +___ +} + +# Direct translation of C mul() +sub mul($$$$$) +{ + my ($self, $r, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $lo,$a,$w + addc $r,$lo,$c + mulhdu $c,$a,$w + addze $c,$c + +___ +} + +# Like mul() but $c is ignored as an input - an optimisation to save a +# preliminary instruction that would set input $c to 0 +sub mul_c_0($$$$$) +{ + my ($self, $r, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $r,$a,$w + mulhdu $c,$a,$w + +___ +} + +# Like mul() but does not to the final addition of CA into $c - an +# optimisation to save an instruction +sub mul_last($$$$$$) +{ + my ($self, $r1, $r2, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $lo,$a,$w + addc $r1,$lo,$c + mulhdu $c,$a,$w + + addze $r2,$c +___ +} + +# Like C mul_add() but allow $r_out and $r_in to be different +sub mul_add($$$$$$) +{ + my ($self, $r_out, $r_in, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $lo,$a,$w + addc $lo,$lo,$c + mulhdu $c,$a,$w + addze $c,$c + addc $r_out,$r_in,$lo + addze $c,$c + +___ +} + +# Like mul_add() but $c is ignored as an input - an optimisation to save a +# preliminary instruction that would set input $c to 0 +sub mul_add_c_0($$$$$$) +{ + my ($self, $r_out, $r_in, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $lo,$a,$w + addc $r_out,$r_in,$lo + mulhdu $c,$a,$w + addze $c,$c + +___ +} + +package Mont::GPR_300; + +our @ISA = ('Mont::GPR'); + +sub new($$) +{ + my ($class, $n) = @_; + + my $mont = $class->SUPER::new($n); + + return $mont; +} + +sub get_function_name($) +{ + my ($self) = @_; + + return "bn_mul_mont_300_fixed_n" . $self->{n}; +} + +sub get_label($$) +{ + my ($self, $l) = @_; + + return "L" . $l . "_300_" . $self->{n}; +} + +# Direct translation of C mul() +sub mul($$$$$) +{ + my ($self, $r, $a, $w, $c, $last) = @_; + + $self->add_code(<<___); + maddld $r,$a,$w,$c + maddhdu $c,$a,$w,$c + +___ +} + +# Save the last carry as the final entry +sub mul_last($$$$$) +{ + my ($self, $r1, $r2, $a, $w, $c) = @_; + + $self->add_code(<<___); + maddld $r1,$a,$w,$c + maddhdu $r2,$a,$w,$c + +___ +} + +# Like mul() but $c is ignored as an input - an optimisation to save a +# preliminary instruction that would set input $c to 0 +sub mul_c_0($$$$$) +{ + my ($self, $r, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $r,$a,$w + mulhdu $c,$a,$w + +___ +} + +# Like C mul_add() but allow $r_out and $r_in to be different +sub mul_add($$$$$$) +{ + my ($self, $r_out, $r_in, $a, $w, $c) = @_; + + $self->add_code(<<___); + maddld $lo,$a,$w,$c + maddhdu $c,$a,$w,$c + addc $r_out,$r_in,$lo + addze $c,$c + +___ +} + +# Like mul_add() but $c is ignored as an input - an optimisation to save a +# preliminary instruction that would set input $c to 0 +sub mul_add_c_0($$$$$$) +{ + my ($self, $r_out, $r_in, $a, $w, $c) = @_; + + $self->add_code(<<___); + maddld $lo,$a,$w,$r_in + maddhdu $c,$a,$w,$r_in +___ + + if ($r_out ne $lo) { + $self->add_code(<<___); + mr $r_out,$lo +___ + } + + $self->nl(); +} + + +package main; + +my $code; + +$code.=<<___; +.machine "any" +.text +.align 5 +.p2align 5,,31 +___ + +my $mont; + +$mont = new Mont::GPR(6); +$mont->mul_mont_fixed(); +$code .= $mont->get_code(); + +$mont = new Mont::GPR_300(6); +$mont->mul_mont_fixed(); +$code .= $mont->get_code(); + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +$code.=<<___; +.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>" +___ + +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/bn/build.info b/crypto/bn/build.info index 5e948b8433..3c32e83067 100644 --- a/crypto/bn/build.info +++ b/crypto/bn/build.info @@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}] $BNASM_ppc32=bn-ppc.s ppc-mont.s $BNDEF_ppc32=OPENSSL_BN_ASM_MONT - $BNASM_ppc64=$BNASM_ppc32 + $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s $BNDEF_ppc64=$BNDEF_ppc32 $BNASM_c64xplus=asm/bn-c64xplus.asm @@ -168,6 +168,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl GENERATE[bn-ppc.s]=asm/ppc.pl GENERATE[ppc-mont.s]=asm/ppc-mont.pl GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl +GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl GENERATE[alpha-mont.S]=asm/alpha-mont.pl diff --git a/crypto/ppccap.c b/crypto/ppccap.c index 9ed1d80db5..a504bc59b0 100644 --- a/crypto/ppccap.c +++ b/crypto/ppccap.c @@ -47,6 +47,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); + int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG *n0, int num); + int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG *n0, int num); if (num < 4) return 0; @@ -62,6 +68,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, * no opportunity to figure it out... */ + if (num == 6) + if (OPENSSL_ppccap_P & PPC_MADD300) + return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num); + else + return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num); + return bn_mul_mont_int(rp, ap, bp, np, n0, num); } #endif diff --git a/providers/fips-sources.checksums b/providers/fips-sources.checksums index 01968b7e6f..b1ec8f2339 100644 --- a/providers/fips-sources.checksums +++ b/providers/fips-sources.checksums @@ -42,6 +42,7 @@ eb240c1f72063048abe026ab7fab340361a329d5cd355276a25950be446cc091 crypto/bn/asm/ b27ec5181e387e812925bb26823b830f49d7a6e4971b6d11ea583f5632a1504b crypto/bn/asm/parisc-mont.pl 9973523b361db963eea4938a7a8a3adc692e1a4e1aec4fa1f1e57dc93da37921 crypto/bn/asm/ppc-mont.pl 59cd27e1e10c4984b7fb684b27f491e7634473b1bcff197a07e0ca653124aa9a crypto/bn/asm/ppc.pl +13ba6625cc6c673dc6f7ef69a7bbe40487c5553b3873a996af4904de5b1cd82b crypto/bn/asm/ppc64-mont-fixed.pl a25be64867ab837d93855af232e2bfa71b85b2c6f00e35e620fdc5618187fb6f crypto/bn/asm/ppc64-mont.pl 231579e532443665020d4d522d9f11713d9c5d5c814b95b434b0f65452e16de4 crypto/bn/asm/rsaz-avx2.pl c9bd8679a5104affd9f3f0bcda726f823a1a53cac872e4a21a6f2370489dae08 crypto/bn/asm/rsaz-avx512.pl diff --git a/providers/fips.checksum b/providers/fips.checksum index e5ff9a8040..e9adf327b3 100644 --- a/providers/fips.checksum +++ b/providers/fips.checksum @@ -1 +1 @@ -2e67c3ed3222fedf2d26e91f47b2b7708a95f39a74bd1489412f324f84daa57d providers/fips-sources.checksums +4fcfc6375eef7bed6219191cce24513be04a6ebb8b2d5da8e404150a2ecc0eba providers/fips-sources.checksums diff --git a/providers/fips.module.sources b/providers/fips.module.sources index 7e17658602..416a2b97f7 100644 --- a/providers/fips.module.sources +++ b/providers/fips.module.sources @@ -42,6 +42,7 @@ crypto/bn/asm/mips.pl crypto/bn/asm/parisc-mont.pl crypto/bn/asm/ppc-mont.pl crypto/bn/asm/ppc.pl +crypto/bn/asm/ppc64-mont-fixed.pl crypto/bn/asm/ppc64-mont.pl crypto/bn/asm/rsaz-avx2.pl crypto/bn/asm/rsaz-avx512.pl |