#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# October 2015
#
# ChaCha20 for PowerPC/AltiVec.
#
# June 2018
#
# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for
# processors that can't issue more than one vector instruction per
# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x
# interleave would perform better. Incidentally PowerISA 2.07 (first
# implemented by POWER8) defined new usable instructions, hence 4xVSX
# code path...
#
# Performance in cycles per byte out of large buffer.
#
# IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX
#
# Freescale e300 13.6/+115% - -
# PPC74x0/G4e 6.81/+310% 3.81 -
# PPC970/G5 9.29/+160% ? -
# POWER7 8.62/+61% 3.35 -
# POWER8 8.70/+51% 2.91 2.09
# POWER9 8.80/+29% 4.44(*) 2.45(**)
#
# (*) this is trade-off result, it's possible to improve it, but
# then it would negatively affect all others;
# (**) POWER9 seems to be "allergic" to mixing vector and integer
# instructions, which is why switch to vector-only code pays
# off that much;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
$UCMP ="cmpld";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
$UCMP ="cmplw";
} else { die "nonsense $flavour"; }
$LITTLE_ENDIAN = ($flavour=~/le$/