From 30691da1ba465f3cff5d865187fbf5c5244448eb Mon Sep 17 00:00:00 2001
From: Amitay Isaacs <amitay@ozlabs.org>
Date: Tue, 13 Oct 2020 05:11:40 -0400
Subject: ec: Add PPC64 vector assembly version of p521 field operations

Only field multiplication and squaring (but not reduction) show a
significant improvement.  This is enabled on Power ISA >= 3.0.

On a Power 9 CPU an average 10% performance improvement is seen (ECDHE:
14%, ECDSA sign: 6%, ECDSA verify: 10%), compared to existing code.

On an upcoming Power 10 CPU we see an average performance improvement
of 26% (ECDHE: 38%, ECDSA sign: 16%, ECDSA verify: 25%), compared to
existing code.

Signed-off-by: Amitay Isaacs <amitay@ozlabs.org>
Signed-off-by: Martin Schwenke <martin@meltin.net>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/15401)
---
 crypto/ec/asm/ecp_nistp521-ppc64.pl | 436 ++++++++++++++++++++++++++++++++++++
 crypto/ec/build.info                |   5 +-
 crypto/ec/ecp_nistp521.c            |  16 ++
 3 files changed, 456 insertions(+), 1 deletion(-)
 create mode 100755 crypto/ec/asm/ecp_nistp521-ppc64.pl

(limited to 'crypto/ec')

diff --git a/crypto/ec/asm/ecp_nistp521-ppc64.pl b/crypto/ec/asm/ecp_nistp521-ppc64.pl
new file mode 100755
index 0000000000..7e71e924ba
--- /dev/null
+++ b/crypto/ec/asm/ecp_nistp521-ppc64.pl
@@ -0,0 +1,436 @@
+#! /usr/bin/env perl
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Amitay Isaacs <amitay@ozlabs.org> and Martin Schwenke
+# <martin@meltin.net> for the OpenSSL project.
+# ====================================================================
+#
+# p521 lower-level primitives for PPC64 using vector instructions.
+#
+
+use strict;
+use warnings;
+
+my $flavour = shift;	# first argument, handed on to ppc-xlate.pl
+my $output = "";
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# skip to the first argument that looks like "name.ext"
+if (!$output) {
+	$output = "-";
+}
+# Look for ppc-xlate.pl beside this script, then in ../../perlasm.
+my ($xlate, $dir);
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+# Pipe STDOUT through the translator; quote the output name and stop if the pipe cannot be started.
+open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+my $code = "";	# generated assembly accumulates here
+
+my ($sp, $outp) = ("r1", "r3");	# stack pointer; output pointer handed to store_vrs
+my ($savelr, $savesp) = ("r10", "r12");	# $savesp keeps the old stack pointer; $savelr is not referenced
+my $vzero = "v32";	# zeroed with vspltisw and passed as the vmsumudm addend
+
+# Emit the opening of an exported routine: .globl, label, CFI start marker.
+sub startproc($)
+{
+    my $symbol = shift;
+
+    $code .= join("\n",
+        "    .globl ${symbol}",
+        "${symbol}:",
+        "\t.cfi_startproc",
+        "", "");
+}
+
+# Emit the close of a routine: blr, CFI end marker, .size for the symbol.
+sub endproc($)
+{
+    my $symbol = shift;
+
+    $code .= join("\n",
+        "\tblr",
+        "\t.cfi_endproc",
+        "\t    .size\t${symbol},.-${symbol}",
+        "", "");
+}
+
+
+# Emit code that opens a stack frame and stores registers $first..$last in it.
+sub push_vrs($$)
+{
+	my ($first, $last) = @_;
+
+	# One 16-byte slot per register; stdu reserves one more below them.
+	my $nregs = $last - $first + 1;
+
+	# The old stack pointer is kept in $savesp.  The backquoted expression
+	# is left in $code and evaluated when the script post-processes it.
+	$code .= "\tmr\t\t$savesp,$sp\n";
+	$code .= "\tstdu\t\t$sp,-16*`$nregs+1`($sp)\n";
+	$code .= "\n";
+
+	# Slots are addressed downwards from the old stack pointer, with
+	# register $last in the slot nearest to it.
+	foreach my $vsr ($first .. $last) {
+		my $slot = $last - $vsr + 1;
+		$code .= "\tstxv\t\t$vsr,-16*$slot($savesp)\n";
+	}
+
+	$code .= "\n";
+}
+
+# Emit code that reloads registers $first..$last and releases the frame
+# opened by push_vrs; both calls must be given the same register range.
+sub pop_vrs($$)
+{
+	my ($first, $last) = @_;
+
+	# The doubleword at 0($sp) is the stack pointer stored by stdu in push_vrs.
+	$code .= "\tld\t\t$savesp,0($sp)\n";
+
+	# Same slot numbering as the stores emitted by push_vrs.
+	foreach my $vsr ($first .. $last) {
+		my $slot = $last - $vsr + 1;
+		$code .= "\tlxv\t\t$vsr,-16*$slot($savesp)\n";
+	}
+
+	# Putting the old value back into $sp drops the whole frame.
+	$code .= "\tmr\t\t$sp,$savesp\n";
+	$code .= "\n";
+}
+
+# Emit nine lxsd loads: the doubleword at $base + 8*k goes into register
+# $regs->[k], for k = 0..8.
+sub load_vrs($$)
+{
+	my ($base, $regs) = @_;
+
+	# Consecutive doublewords, one scalar load each.
+	foreach my $limb (0 .. 8) {
+		my $disp = 8 * $limb;
+		$code .= "\tlxsd\t\t$regs->[$limb],$disp($base)\n";
+	}
+
+	# Blank line after the group of loads.
+	$code .= "\n";
+}
+
+# Emit nine stxv stores: register $regs->[k] is written as a whole vector
+# at $base + 16*k, for k = 0..8.
+sub store_vrs($$)
+{
+	my ($base, $regs) = @_;
+
+	# Each stored element occupies 16 bytes.
+	foreach my $limb (0 .. 8) {
+		my $disp = 16 * $limb;
+		$code .= "\tstxv\t\t$regs->[$limb],$disp($base)\n";
+	}
+
+	# Blank line after the group of stores.
+	$code .= "\n";
+}
+
+$code.=<<___;
+.text
+
+___
+
+{
+	# mul/square common
+	my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54");
+	my ($zero, $one) = ("r8", "r9");
+	my @out = map("v$_",(55..63));
+
+	{
+		#
+		# p521_felem_mul
+		#
+
+		my ($in1p, $in2p) = ("r4", "r5");
+		my @in1 = map("v$_",(45..53));
+		my @in2 = map("v$_",(35..43));
+
+		startproc("p521_felem_mul");
+
+		push_vrs(52, 63);
+
+		$code.=<<___;
+	vspltisw	$vzero,0
+
+___
+
+		load_vrs($in1p, \@in1);
+		load_vrs($in2p, \@in2);
+
+		$code.=<<___;
+	vmsumudm	$out[0],$in1[0],$in2[0],$vzero
+
+	xxpermdi	$t1,$in1[0],$in1[1],0b00
+	xxpermdi	$t2,$in2[1],$in2[0],0b00
+	vmsumudm	$out[1],$t1,$t2,$vzero
+
+	xxpermdi	$t2,$in2[2],$in2[1],0b00
+	vmsumudm	$out[2],$t1,$t2,$vzero
+	vmsumudm	$out[2],$in1[2],$in2[0],$out[2]
+
+	xxpermdi	$t2,$in2[3],$in2[2],0b00
+	vmsumudm	$out[3],$t1,$t2,$vzero
+	xxpermdi	$t3,$in1[2],$in1[3],0b00
+	xxpermdi	$t4,$in2[1],$in2[0],0b00
+	vmsumudm	$out[3],$t3,$t4,$out[3]
+
+	xxpermdi	$t2,$in2[4],$in2[3],0b00
+	vmsumudm	$out[4],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[2],$in2[1],0b00
+	vmsumudm	$out[4],$t3,$t4,$out[4]
+	vmsumudm	$out[4],$in1[4],$in2[0],$out[4]
+
+	xxpermdi	$t2,$in2[5],$in2[4],0b00
+	vmsumudm	$out[5],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[3],$in2[2],0b00
+	vmsumudm	$out[5],$t3,$t4,$out[5]
+
+	xxpermdi	$t2,$in2[6],$in2[5],0b00
+	vmsumudm	$out[6],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[4],$in2[3],0b00
+	vmsumudm	$out[6],$t3,$t4,$out[6]
+
+	xxpermdi	$t2,$in2[7],$in2[6],0b00
+	vmsumudm	$out[7],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[5],$in2[4],0b00
+	vmsumudm	$out[7],$t3,$t4,$out[7]
+
+	xxpermdi	$t2,$in2[8],$in2[7],0b00
+	vmsumudm	$out[8],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[6],$in2[5],0b00
+	vmsumudm	$out[8],$t3,$t4,$out[8]
+
+	xxpermdi	$t1,$in1[4],$in1[5],0b00
+	xxpermdi	$t2,$in2[1],$in2[0],0b00
+	vmsumudm	$out[5],$t1,$t2,$out[5]
+
+	xxpermdi	$t2,$in2[2],$in2[1],0b00
+	vmsumudm	$out[6],$t1,$t2,$out[6]
+	vmsumudm	$out[6],$in1[6],$in2[0],$out[6]
+
+	xxpermdi	$t2,$in2[3],$in2[2],0b00
+	vmsumudm	$out[7],$t1,$t2,$out[7]
+	xxpermdi	$t3,$in1[6],$in1[7],0b00
+	xxpermdi	$t4,$in2[1],$in2[0],0b00
+	vmsumudm	$out[7],$t3,$t4,$out[7]
+
+	xxpermdi	$t2,$in2[4],$in2[3],0b00
+	vmsumudm	$out[8],$t1,$t2,$out[8]
+	xxpermdi	$t4,$in2[2],$in2[1],0b00
+	vmsumudm	$out[8],$t3,$t4,$out[8]
+	vmsumudm	$out[8],$in1[8],$in2[0],$out[8]
+
+	li		$zero,0
+	li		$one,1
+	mtvsrdd		$t1,$one,$zero
+___
+
+		for (my $i = 0; $i <= 8; $i++) {
+			$code.=<<___;
+	vsld		$in2[$i],$in2[$i],$t1
+___
+		}
+
+		$code.=<<___;
+
+	vmsumudm	$out[7],$in1[8],$in2[8],$out[7]
+
+	xxpermdi	$t2,$in2[8],$in2[7],0b00
+	xxpermdi	$t1,$in1[7],$in1[8],0b00
+	vmsumudm	$out[6],$t1,$t2,$out[6]
+
+	xxpermdi	$t1,$in1[6],$in1[7],0b00
+	vmsumudm	$out[5],$t1,$t2,$out[5]
+	vmsumudm	$out[5],$in1[8],$in2[6],$out[5]
+
+	xxpermdi	$t1,$in1[5],$in1[6],0b00
+	vmsumudm	$out[4],$t1,$t2,$out[4]
+	xxpermdi	$t4,$in2[6],$in2[5],0b00
+	xxpermdi	$t3,$in1[7],$in1[8],0b00
+	vmsumudm	$out[4],$t3,$t4,$out[4]
+
+	xxpermdi	$t1,$in1[4],$in1[5],0b00
+	vmsumudm	$out[3],$t1,$t2,$out[3]
+	xxpermdi	$t3,$in1[6],$in1[7],0b00
+	vmsumudm	$out[3],$t3,$t4,$out[3]
+	vmsumudm	$out[3],$in1[8],$in2[4],$out[3]
+
+	xxpermdi	$t1,$in1[3],$in1[4],0b00
+	vmsumudm	$out[2],$t1,$t2,$out[2]
+	xxpermdi	$t3,$in1[5],$in1[6],0b00
+	vmsumudm	$out[2],$t3,$t4,$out[2]
+
+	xxpermdi	$t1,$in1[2],$in1[3],0b00
+	vmsumudm	$out[1],$t1,$t2,$out[1]
+	xxpermdi	$t3,$in1[4],$in1[5],0b00
+	vmsumudm	$out[1],$t3,$t4,$out[1]
+
+	xxpermdi	$t1,$in1[1],$in1[2],0b00
+	vmsumudm	$out[0],$t1,$t2,$out[0]
+	xxpermdi	$t3,$in1[3],$in1[4],0b00
+	vmsumudm	$out[0],$t3,$t4,$out[0]
+
+	xxpermdi	$t2,$in2[4],$in2[3],0b00
+	xxpermdi	$t1,$in1[7],$in1[8],0b00
+	vmsumudm	$out[2],$t1,$t2,$out[2]
+
+	xxpermdi	$t1,$in1[6],$in1[7],0b00
+	vmsumudm	$out[1],$t1,$t2,$out[1]
+	vmsumudm	$out[1],$in1[8],$in2[2],$out[1]
+
+	xxpermdi	$t1,$in1[5],$in1[6],0b00
+	vmsumudm	$out[0],$t1,$t2,$out[0]
+	xxpermdi	$t4,$in2[2],$in2[1],0b00
+	xxpermdi	$t3,$in1[7],$in1[8],0b00
+	vmsumudm	$out[0],$t3,$t4,$out[0]
+
+___
+
+		store_vrs($outp, \@out);
+
+		pop_vrs(52, 63);
+
+		endproc("p521_felem_mul");
+	}
+
+	{
+		#
+		# p521_felem_square
+		#
+
+		my ($inp) = ("r4");
+		my @in = map("v$_",(45..53));
+		my @inx2 = map("v$_",(35..43));
+
+		startproc("p521_felem_square");
+
+		push_vrs(52, 63);
+
+		$code.=<<___;
+	vspltisw	$vzero,0
+
+___
+
+		load_vrs($inp, \@in);
+
+		$code.=<<___;
+	li		$zero,0
+	li		$one,1
+	mtvsrdd		$t1,$one,$zero
+___
+
+		for (my $i = 0; $i <= 8; $i++) {
+			$code.=<<___;
+	vsld		$inx2[$i],$in[$i],$t1
+___
+		}
+
+		$code.=<<___;
+	vmsumudm	$out[0],$in[0],$in[0],$vzero
+
+	vmsumudm	$out[1],$in[0],$inx2[1],$vzero
+
+	xxpermdi	$t1,$in[0],$in[1],0b00
+	xxpermdi	$t2,$inx2[2],$in[1],0b00
+	vmsumudm	$out[2],$t1,$t2,$vzero
+
+	xxpermdi	$t2,$inx2[3],$inx2[2],0b00
+	vmsumudm	$out[3],$t1,$t2,$vzero
+
+	xxpermdi	$t2,$inx2[4],$inx2[3],0b00
+	vmsumudm	$out[4],$t1,$t2,$vzero
+	vmsumudm	$out[4],$in[2],$in[2],$out[4]
+
+	xxpermdi	$t2,$inx2[5],$inx2[4],0b00
+	vmsumudm	$out[5],$t1,$t2,$vzero
+	vmsumudm	$out[5],$in[2],$inx2[3],$out[5]
+
+	xxpermdi	$t2,$inx2[6],$inx2[5],0b00
+	vmsumudm	$out[6],$t1,$t2,$vzero
+	xxpermdi	$t3,$in[2],$in[3],0b00
+	xxpermdi	$t4,$inx2[4],$in[3],0b00
+	vmsumudm	$out[6],$t3,$t4,$out[6]
+
+	xxpermdi	$t2,$inx2[7],$inx2[6],0b00
+	vmsumudm	$out[7],$t1,$t2,$vzero
+	xxpermdi	$t4,$inx2[5],$inx2[4],0b00
+	vmsumudm	$out[7],$t3,$t4,$out[7]
+
+	xxpermdi	$t2,$inx2[8],$inx2[7],0b00
+	vmsumudm	$out[8],$t1,$t2,$vzero
+	xxpermdi	$t4,$inx2[6],$inx2[5],0b00
+	vmsumudm	$out[8],$t3,$t4,$out[8]
+	vmsumudm	$out[8],$in[4],$in[4],$out[8]
+
+	vmsumudm	$out[1],$in[5],$inx2[5],$out[1]
+
+	vmsumudm	$out[3],$in[6],$inx2[6],$out[3]
+
+	vmsumudm	$out[5],$in[7],$inx2[7],$out[5]
+
+	vmsumudm	$out[7],$in[8],$inx2[8],$out[7]
+
+	mtvsrdd		$t1,$one,$zero
+___
+
+		for (my $i = 5; $i <= 8; $i++) {
+			$code.=<<___;
+	vsld		$inx2[$i],$inx2[$i],$t1
+___
+		}
+
+		$code.=<<___;
+
+	vmsumudm	$out[6],$in[7],$inx2[8],$out[6]
+
+	vmsumudm	$out[5],$in[6],$inx2[8],$out[5]
+
+	xxpermdi	$t2,$inx2[8],$inx2[7],0b00
+	xxpermdi	$t1,$in[5],$in[6],0b00
+	vmsumudm	$out[4],$t1,$t2,$out[4]
+
+	xxpermdi	$t1,$in[4],$in[5],0b00
+	vmsumudm	$out[3],$t1,$t2,$out[3]
+
+	xxpermdi	$t1,$in[3],$in[4],0b00
+	vmsumudm	$out[2],$t1,$t2,$out[2]
+	vmsumudm	$out[2],$in[5],$inx2[6],$out[2]
+
+	xxpermdi	$t1,$in[2],$in[3],0b00
+	vmsumudm	$out[1],$t1,$t2,$out[1]
+	vmsumudm	$out[1],$in[4],$inx2[6],$out[1]
+
+	xxpermdi	$t1,$in[1],$in[2],0b00
+	vmsumudm	$out[0],$t1,$t2,$out[0]
+	xxpermdi	$t2,$inx2[6],$inx2[5],0b00
+	xxpermdi	$t1,$in[3],$in[4],0b00
+	vmsumudm	$out[0],$t1,$t2,$out[0]
+
+___
+
+		store_vrs($outp, \@out);
+
+		pop_vrs(52, 63);
+
+		endproc("p521_felem_square");
+	}
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;	# replace each `expr` in the text with the result of evaluating it as Perl
+print $code;	# STDOUT was aliased to the ppc-xlate.pl pipe earlier in the script
+close STDOUT or die "error closing STDOUT: $!";	# closing the pipe is the last chance to report an output failure
diff --git a/crypto/ec/build.info b/crypto/ec/build.info
index bfd16b326f..f4314dd896 100644
--- a/crypto/ec/build.info
+++ b/crypto/ec/build.info
@@ -30,8 +30,9 @@ IF[{- !$disabled{asm} -}]
   $ECASM_parisc20_64=
 
   $ECASM_ppc32=
-  $ECASM_ppc64=ecp_nistz256.c ecp_nistz256-ppc64.s x25519-ppc64.s
+  $ECASM_ppc64=ecp_nistz256.c ecp_nistz256-ppc64.s ecp_nistp521-ppc64.s x25519-ppc64.s
   $ECDEF_ppc64=ECP_NISTZ256_ASM ECP_NISTP521_ASM X25519_ASM
+  INCLUDE[ecp_nistp521.o]=..
 
   $ECASM_c64xplus=
 
@@ -86,5 +87,7 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_nistz256-armv8.pl
 INCLUDE[ecp_nistz256-armv8.o]=..
 GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl
 
+GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl
+
 GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl
 GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl
diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c
index 02bded2b6f..338618ebca 100644
--- a/crypto/ec/ecp_nistp521.c
+++ b/crypto/ec/ecp_nistp521.c
@@ -684,8 +684,24 @@ static void (*felem_square_p)(largefelem out, const felem in) =
 static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) =
     felem_mul_wrapper;
 
+void p521_felem_square(largefelem out, const felem in);
+void p521_felem_mul(largefelem out, const felem in1, const felem in2);
+
+# if defined(_ARCH_PPC64)
+#  include "ppc_arch.h"
+# endif
+
 void felem_select(void)
 {
+# if defined(_ARCH_PPC64)
+    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
+        felem_square_p = p521_felem_square;
+        felem_mul_p = p521_felem_mul;
+
+        return;
+    }
+# endif
+
     /* Default */
     felem_square_p = felem_square_ref;
     felem_mul_p = felem_mul_ref;
-- 
cgit v1.2.3