From 30691da1ba465f3cff5d865187fbf5c5244448eb Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Tue, 13 Oct 2020 05:11:40 -0400 Subject: ec: Add PPC64 vector assembly version of p521 field operations Only field multiplication and squaring (but not reduction) show a significant improvement. This is enabled on Power ISA >= 3.0. On a Power 9 CPU an average 10% performance improvement is seen (ECDHE: 14%, ECDSA sign: 6%, ECDSA verify 10%), compared to existing code. On an upcoming Power 10 CPU we see an average performance improvement of 26% (ECDHE: 38%, ECDSA sign: 16%, ECDSA verify 25%), compared to existing code. Signed-off-by: Amitay Isaacs Signed-off-by: Martin Schwenke Reviewed-by: Tomas Mraz Reviewed-by: Paul Dale (Merged from https://github.com/openssl/openssl/pull/15401) --- crypto/ec/asm/ecp_nistp521-ppc64.pl | 436 ++++++++++++++++++++++++++++++++++++ crypto/ec/build.info | 5 +- crypto/ec/ecp_nistp521.c | 16 ++ 3 files changed, 456 insertions(+), 1 deletion(-) create mode 100755 crypto/ec/asm/ecp_nistp521-ppc64.pl (limited to 'crypto/ec') diff --git a/crypto/ec/asm/ecp_nistp521-ppc64.pl b/crypto/ec/asm/ecp_nistp521-ppc64.pl new file mode 100755 index 0000000000..7e71e924ba --- /dev/null +++ b/crypto/ec/asm/ecp_nistp521-ppc64.pl @@ -0,0 +1,436 @@ +#! /usr/bin/env perl +# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Amitay Isaacs and Martin Schwenke +# for the OpenSSL project. +# ==================================================================== +# +# p521 lower-level primitives for PPC64 using vector instructions.
+# + +use strict; +use warnings; + +my $flavour = shift; +my $output = ""; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +if (!$output) { + $output = "-"; +} + +my ($xlate, $dir); +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my $code = ""; + +my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12"); + +my $vzero = "v32"; + +sub startproc($) +{ + my ($name) = @_; + + $code.=<<___; + .globl ${name} +${name}: + .cfi_startproc + +___ +} + +sub endproc($) +{ + my ($name) = @_; + + $code.=<<___; + blr + .cfi_endproc + .size ${name},.-${name} + +___ +} + + +sub push_vrs($$) +{ + my ($min, $max) = @_; + + my $count = $max - $min + 1; + + $code.=<<___; + mr $savesp,$sp + stdu $sp,-16*`$count+1`($sp) + +___ + for (my $i = $min; $i <= $max; $i++) { + my $mult = $max - $i + 1; + $code.=<<___; + stxv $i,-16*$mult($savesp) +___ + + } + + $code.=<<___; + +___ +} + +sub pop_vrs($$) +{ + my ($min, $max) = @_; + + $code.=<<___; + ld $savesp,0($sp) +___ + for (my $i = $min; $i <= $max; $i++) { + my $mult = $max - $i + 1; + $code.=<<___; + lxv $i,-16*$mult($savesp) +___ + } + + $code.=<<___; + mr $sp,$savesp + +___ +} + +sub load_vrs($$) +{ + my ($pointer, $reg_list) = @_; + + for (my $i = 0; $i <= 8; $i++) { + my $offset = $i * 8; + $code.=<<___; + lxsd $reg_list->[$i],$offset($pointer) +___ + } + + $code.=<<___; + +___ +} + +sub store_vrs($$) +{ + my ($pointer, $reg_list) = @_; + + for (my $i = 0; $i <= 8; $i++) { + my $offset = $i * 16; + $code.=<<___; + stxv $reg_list->[$i],$offset($pointer) +___ + } + + $code.=<<___; + +___ +} + +$code.=<<___; +.text + +___ + +{ + # mul/square common + my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54"); + my ($zero, $one) = ("r8", "r9"); + my @out = map("v$_",(55..63)); + + { + # + # p521_felem_mul + # + + my ($in1p, 
$in2p) = ("r4", "r5"); + my @in1 = map("v$_",(45..53)); + my @in2 = map("v$_",(35..43)); + + startproc("p521_felem_mul"); + + push_vrs(52, 63); + + $code.=<<___; + vspltisw $vzero,0 + +___ + + load_vrs($in1p, \@in1); + load_vrs($in2p, \@in2); + + $code.=<<___; + vmsumudm $out[0],$in1[0],$in2[0],$vzero + + xxpermdi $t1,$in1[0],$in1[1],0b00 + xxpermdi $t2,$in2[1],$in2[0],0b00 + vmsumudm $out[1],$t1,$t2,$vzero + + xxpermdi $t2,$in2[2],$in2[1],0b00 + vmsumudm $out[2],$t1,$t2,$vzero + vmsumudm $out[2],$in1[2],$in2[0],$out[2] + + xxpermdi $t2,$in2[3],$in2[2],0b00 + vmsumudm $out[3],$t1,$t2,$vzero + xxpermdi $t3,$in1[2],$in1[3],0b00 + xxpermdi $t4,$in2[1],$in2[0],0b00 + vmsumudm $out[3],$t3,$t4,$out[3] + + xxpermdi $t2,$in2[4],$in2[3],0b00 + vmsumudm $out[4],$t1,$t2,$vzero + xxpermdi $t4,$in2[2],$in2[1],0b00 + vmsumudm $out[4],$t3,$t4,$out[4] + vmsumudm $out[4],$in1[4],$in2[0],$out[4] + + xxpermdi $t2,$in2[5],$in2[4],0b00 + vmsumudm $out[5],$t1,$t2,$vzero + xxpermdi $t4,$in2[3],$in2[2],0b00 + vmsumudm $out[5],$t3,$t4,$out[5] + + xxpermdi $t2,$in2[6],$in2[5],0b00 + vmsumudm $out[6],$t1,$t2,$vzero + xxpermdi $t4,$in2[4],$in2[3],0b00 + vmsumudm $out[6],$t3,$t4,$out[6] + + xxpermdi $t2,$in2[7],$in2[6],0b00 + vmsumudm $out[7],$t1,$t2,$vzero + xxpermdi $t4,$in2[5],$in2[4],0b00 + vmsumudm $out[7],$t3,$t4,$out[7] + + xxpermdi $t2,$in2[8],$in2[7],0b00 + vmsumudm $out[8],$t1,$t2,$vzero + xxpermdi $t4,$in2[6],$in2[5],0b00 + vmsumudm $out[8],$t3,$t4,$out[8] + + xxpermdi $t1,$in1[4],$in1[5],0b00 + xxpermdi $t2,$in2[1],$in2[0],0b00 + vmsumudm $out[5],$t1,$t2,$out[5] + + xxpermdi $t2,$in2[2],$in2[1],0b00 + vmsumudm $out[6],$t1,$t2,$out[6] + vmsumudm $out[6],$in1[6],$in2[0],$out[6] + + xxpermdi $t2,$in2[3],$in2[2],0b00 + vmsumudm $out[7],$t1,$t2,$out[7] + xxpermdi $t3,$in1[6],$in1[7],0b00 + xxpermdi $t4,$in2[1],$in2[0],0b00 + vmsumudm $out[7],$t3,$t4,$out[7] + + xxpermdi $t2,$in2[4],$in2[3],0b00 + vmsumudm $out[8],$t1,$t2,$out[8] + xxpermdi $t4,$in2[2],$in2[1],0b00 + vmsumudm 
$out[8],$t3,$t4,$out[8] + vmsumudm $out[8],$in1[8],$in2[0],$out[8] + + li $zero,0 + li $one,1 + mtvsrdd $t1,$one,$zero +___ + + for (my $i = 0; $i <= 8; $i++) { + $code.=<<___; + vsld $in2[$i],$in2[$i],$t1 +___ + } + + $code.=<<___; + + vmsumudm $out[7],$in1[8],$in2[8],$out[7] + + xxpermdi $t2,$in2[8],$in2[7],0b00 + xxpermdi $t1,$in1[7],$in1[8],0b00 + vmsumudm $out[6],$t1,$t2,$out[6] + + xxpermdi $t1,$in1[6],$in1[7],0b00 + vmsumudm $out[5],$t1,$t2,$out[5] + vmsumudm $out[5],$in1[8],$in2[6],$out[5] + + xxpermdi $t1,$in1[5],$in1[6],0b00 + vmsumudm $out[4],$t1,$t2,$out[4] + xxpermdi $t4,$in2[6],$in2[5],0b00 + xxpermdi $t3,$in1[7],$in1[8],0b00 + vmsumudm $out[4],$t3,$t4,$out[4] + + xxpermdi $t1,$in1[4],$in1[5],0b00 + vmsumudm $out[3],$t1,$t2,$out[3] + xxpermdi $t3,$in1[6],$in1[7],0b00 + vmsumudm $out[3],$t3,$t4,$out[3] + vmsumudm $out[3],$in1[8],$in2[4],$out[3] + + xxpermdi $t1,$in1[3],$in1[4],0b00 + vmsumudm $out[2],$t1,$t2,$out[2] + xxpermdi $t3,$in1[5],$in1[6],0b00 + vmsumudm $out[2],$t3,$t4,$out[2] + + xxpermdi $t1,$in1[2],$in1[3],0b00 + vmsumudm $out[1],$t1,$t2,$out[1] + xxpermdi $t3,$in1[4],$in1[5],0b00 + vmsumudm $out[1],$t3,$t4,$out[1] + + xxpermdi $t1,$in1[1],$in1[2],0b00 + vmsumudm $out[0],$t1,$t2,$out[0] + xxpermdi $t3,$in1[3],$in1[4],0b00 + vmsumudm $out[0],$t3,$t4,$out[0] + + xxpermdi $t2,$in2[4],$in2[3],0b00 + xxpermdi $t1,$in1[7],$in1[8],0b00 + vmsumudm $out[2],$t1,$t2,$out[2] + + xxpermdi $t1,$in1[6],$in1[7],0b00 + vmsumudm $out[1],$t1,$t2,$out[1] + vmsumudm $out[1],$in1[8],$in2[2],$out[1] + + xxpermdi $t1,$in1[5],$in1[6],0b00 + vmsumudm $out[0],$t1,$t2,$out[0] + xxpermdi $t4,$in2[2],$in2[1],0b00 + xxpermdi $t3,$in1[7],$in1[8],0b00 + vmsumudm $out[0],$t3,$t4,$out[0] + +___ + + store_vrs($outp, \@out); + + pop_vrs(52, 63); + + endproc("p521_felem_mul"); + } + + { + # + # p521_felem_square + # + + my ($inp) = ("r4"); + my @in = map("v$_",(45..53)); + my @inx2 = map("v$_",(35..43)); + + startproc("p521_felem_square"); + + push_vrs(52, 63); + + $code.=<<___;
+ vspltisw $vzero,0 + +___ + + load_vrs($inp, \@in); + + $code.=<<___; + li $zero,0 + li $one,1 + mtvsrdd $t1,$one,$zero +___ + + for (my $i = 0; $i <= 8; $i++) { + $code.=<<___; + vsld $inx2[$i],$in[$i],$t1 +___ + } + + $code.=<<___; + vmsumudm $out[0],$in[0],$in[0],$vzero + + vmsumudm $out[1],$in[0],$inx2[1],$vzero + + xxpermdi $t1,$in[0],$in[1],0b00 + xxpermdi $t2,$inx2[2],$in[1],0b00 + vmsumudm $out[2],$t1,$t2,$vzero + + xxpermdi $t2,$inx2[3],$inx2[2],0b00 + vmsumudm $out[3],$t1,$t2,$vzero + + xxpermdi $t2,$inx2[4],$inx2[3],0b00 + vmsumudm $out[4],$t1,$t2,$vzero + vmsumudm $out[4],$in[2],$in[2],$out[4] + + xxpermdi $t2,$inx2[5],$inx2[4],0b00 + vmsumudm $out[5],$t1,$t2,$vzero + vmsumudm $out[5],$in[2],$inx2[3],$out[5] + + xxpermdi $t2,$inx2[6],$inx2[5],0b00 + vmsumudm $out[6],$t1,$t2,$vzero + xxpermdi $t3,$in[2],$in[3],0b00 + xxpermdi $t4,$inx2[4],$in[3],0b00 + vmsumudm $out[6],$t3,$t4,$out[6] + + xxpermdi $t2,$inx2[7],$inx2[6],0b00 + vmsumudm $out[7],$t1,$t2,$vzero + xxpermdi $t4,$inx2[5],$inx2[4],0b00 + vmsumudm $out[7],$t3,$t4,$out[7] + + xxpermdi $t2,$inx2[8],$inx2[7],0b00 + vmsumudm $out[8],$t1,$t2,$vzero + xxpermdi $t4,$inx2[6],$inx2[5],0b00 + vmsumudm $out[8],$t3,$t4,$out[8] + vmsumudm $out[8],$in[4],$in[4],$out[8] + + vmsumudm $out[1],$in[5],$inx2[5],$out[1] + + vmsumudm $out[3],$in[6],$inx2[6],$out[3] + + vmsumudm $out[5],$in[7],$inx2[7],$out[5] + + vmsumudm $out[7],$in[8],$inx2[8],$out[7] + + mtvsrdd $t1,$one,$zero +___ + + for (my $i = 5; $i <= 8; $i++) { + $code.=<<___; + vsld $inx2[$i],$inx2[$i],$t1 +___ + } + + $code.=<<___; + + vmsumudm $out[6],$in[7],$inx2[8],$out[6] + + vmsumudm $out[5],$in[6],$inx2[8],$out[5] + + xxpermdi $t2,$inx2[8],$inx2[7],0b00 + xxpermdi $t1,$in[5],$in[6],0b00 + vmsumudm $out[4],$t1,$t2,$out[4] + + xxpermdi $t1,$in[4],$in[5],0b00 + vmsumudm $out[3],$t1,$t2,$out[3] + + xxpermdi $t1,$in[3],$in[4],0b00 + vmsumudm $out[2],$t1,$t2,$out[2] + vmsumudm $out[2],$in[5],$inx2[6],$out[2] + + xxpermdi $t1,$in[2],$in[3],0b00 + vmsumudm 
$out[1],$t1,$t2,$out[1] + vmsumudm $out[1],$in[4],$inx2[6],$out[1] + + xxpermdi $t1,$in[1],$in[2],0b00 + vmsumudm $out[0],$t1,$t2,$out[0] + xxpermdi $t2,$inx2[6],$inx2[5],0b00 + xxpermdi $t1,$in[3],$in[4],0b00 + vmsumudm $out[0],$t1,$t2,$out[0] + +___ + + store_vrs($outp, \@out); + + pop_vrs(52, 63); + + endproc("p521_felem_square"); + } +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/ec/build.info b/crypto/ec/build.info index bfd16b326f..f4314dd896 100644 --- a/crypto/ec/build.info +++ b/crypto/ec/build.info @@ -30,8 +30,9 @@ IF[{- !$disabled{asm} -}] $ECASM_parisc20_64= $ECASM_ppc32= - $ECASM_ppc64=ecp_nistz256.c ecp_nistz256-ppc64.s x25519-ppc64.s + $ECASM_ppc64=ecp_nistz256.c ecp_nistz256-ppc64.s ecp_nistp521-ppc64.s x25519-ppc64.s $ECDEF_ppc64=ECP_NISTZ256_ASM ECP_NISTP521_ASM X25519_ASM + INCLUDE[ecp_nistp521.o]=.. $ECASM_c64xplus= @@ -86,5 +87,7 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_nistz256-armv8.pl INCLUDE[ecp_nistz256-armv8.o]=.. 
GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl +GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl + GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c index 02bded2b6f..338618ebca 100644 --- a/crypto/ec/ecp_nistp521.c +++ b/crypto/ec/ecp_nistp521.c @@ -684,8 +684,24 @@ static void (*felem_square_p)(largefelem out, const felem in) = static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) = felem_mul_wrapper; +void p521_felem_square(largefelem out, const felem in); +void p521_felem_mul(largefelem out, const felem in1, const felem in2); + +# if defined(_ARCH_PPC64) +# include "ppc_arch.h" +# endif + void felem_select(void) { +# if defined(_ARCH_PPC64) + if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { + felem_square_p = p521_felem_square; + felem_mul_p = p521_felem_mul; + + return; + } +# endif + /* Default */ felem_square_p = felem_square_ref; felem_mul_p = felem_mul_ref; -- cgit v1.2.3