#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for s390x.
#
# June 2015
#
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compilers the improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
# On a side note, z13 enables a vector base 2^26 implementation...
#
# January 2019
#
# Add vx code path (base 2^26).
#
# Copyright IBM Corp. 2019
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
#
# January 2019
#
# Add vector base 2^26 implementation. It's problematic to accurately
# measure performance, because reference system is hardly idle. But
# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's
# >=20% faster than IBM's submission on long inputs, and much faster on
# short ones, because calculation of key powers is postponed till we
# know that input is long enough to justify the additional overhead.
use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :LD :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE);
# Command line: [flavour] [output]
#   - the trailing argument is taken as $output when it ends in ".<ext>";
#   - the leading argument is taken as $flavour when it contains no dot.
# Either one is left undefined when it is absent.
my $output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
my $flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# A flavour containing "31" or "32" selects the S/390 ABI ($z=0, 4-byte
# size_t); anything else selects the zSeries ABI ($z=1, 8-byte size_t).
my ($z, $SIZE_T) = ($flavour =~ /3[12]/) ? (0, 4) : (1, 8);

# Frame size and register names shared by the code generators below.
my $stdframe = 16 * $SIZE_T + 4 * 8;
my $sp       = "%r15";
my ($ctx, $inp, $len, $padbit) = map { "%r$_" } 2 .. 5;

# Start assembly output: open the output, pull in the arch header and
# switch to the text section.
PERLASM_BEGIN($output);
INCLUDE("s390x_arch.h");
TEXT();
################
# static void poly1305_init(void *ctx, const unsigned char key[16])
#
# Emits the poly1305_init routine. As generated, the routine:
#   - zeroes ctx bytes 0..23 (hash value) and the 32-bit word at ctx+24
#     (is_base2_26);
#   - if the key pointer is zero, returns with %r2 = 0;
#   - otherwise loads the 16 key bytes as two byte-reversed 64-bit words,
#     masks them with 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and stores the
#     result at ctx+32..47;
#   - stores two code addresses through %r4: either .Lpoly1305_blocks or
#     .Lpoly1305_blocks_vx (chosen from a bit of OPENSSL_s390xcap_P),
#     followed by .Lpoly1305_emit;
#   - returns with %r2 = 1.
#
# NOTE(review): the prototype line above looks stale relative to the code:
# the symbol is exported via GLOBL, %r2 is set to 0 or 1 right before the
# return, and %r4 is dereferenced (it would be a third argument under the
# %r2..%r5 argument-register mapping used in this file) - confirm against
# the C caller.
{
GLOBL ("poly1305_init");
TYPE ("poly1305_init","\@function");
ALIGN (16);
LABEL ("poly1305_init");
lghi ("%r0",0);
lghi ("%r1",-1); # all-ones seed for the key masks built below
stg ("%r0","0($ctx)"); # zero hash value
stg ("%r0","8($ctx)");
stg ("%r0","16($ctx)");
st ("%r0","24($ctx)"); # clear is_base2_26
lgr ("%r5",$ctx); # reassign $ctx
# %r2 (which held ctx) is reused from here on; ctx is now addressed via %r5.
lghi ("%r2",0); # value left in %r2 if we branch to .Lno_key
# Compare the key pointer against zero (%r0); 64-bit or 32-bit compare
# depending on the ABI selected by $z.
&{$z? \&clgr:\&clr} ($inp,"%r0");
je (".Lno_key");
lrvg ("%r2","0($inp)"); # load little-endian key
lrvg ("%r3","8($inp)");
# Build the two 64-bit masks from the all-ones value in %r1.
nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
srlg ("%r1","%r1",4);
nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
# Apply the masks to the two key words (Poly1305 key clamping).
ngr ("%r2","%r0");
ngr ("%r3","%r1");
stmg ("%r2","%r3","32(%r5)"); # store masked key words at ctx+32 and ctx+40
# Turn one capability bit into an all-zeros / all-ones mask in %r0.
larl ("%r1","OPENSSL_s390xcap_P");
lg ("%r0","16(%r1)");
srlg ("%r0","%r0",62);
nill ("%r0",1); # extract vx bit
lcgr ("%r0","%r0"); # negate: 0 stays 0, 1 becomes -1 (all ones)
larl ("%r1",".Lpoly1305_blocks");
larl ("%r2",".Lpoly1305_blocks_vx");
larl ("%r3",".Lpoly1305_emit");
# Branch-free select: %r2 = ((%r2 ^ %r1) & %r0) ^ %r1, i.e. %r2 keeps
# .Lpoly1305_blocks_vx when the mask is all ones and becomes
# .Lpoly1305_blocks when the mask is zero.
&{$z? \&xgr:\&xr} ("%r2","%r1"); # select between scalar and vector
&{$z? \&ngr:\&nr} ("%r2","%r0");
&{$z? \&xgr:\&xr} ("%r2","%r1");
# Store the selected blocks address and the emit address through %r4,
# using pointer-sized stores for the selected ABI.
&{$z? \&stmg:\&stm} ("%r2","%r3","0(%r4)");
lghi ("%r2",1); # %r2 = 1 on the path where a key was supplied
LABEL (".Lno_key");
br ("%r14");
SIZE ("poly1305_init",".-poly1305_init");
}
################
# static void poly1305_blocks(void *ctx, const unsigned char *inp,
# size_t len, u32 padbit)
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));
GLOBL ("poly1305_blocks");
TYPE ("poly1305_blocks","\@function");
ALIGN (16);
LABEL ("poly1305_blocks");
LABEL (".Lpoly1305_blocks");
&{$z? \<gr:\<r} ("%r0",$len);
jz (".Lno_data");
&