#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA512 block transform for x86. September 2007.
#
# May 2013.
#
# Add SSSE3 code path, 20-25% improvement [over original SSE2 code].
#
# Performance in clock cycles per processed byte (less is better):
#
#               gcc     icc     x86 asm SIMD(*) x86_64(**)
# Pentium       100     97      61      -       -
# PIII          75      77      56      -       -
# P4            116     95      82      34.6    30.8
# AMD K8        54      55      36      20.7    9.57
# Core2         66      57      40      15.9    9.97
# Westmere      70      -       38      12.2    9.58
# Sandy Bridge  58      -       35      11.9    11.2
# Ivy Bridge    50      -       33      11.5    8.17
# Haswell       46      -       29      11.3    7.66
# Skylake       40      -       26      13.3    7.25
# Bulldozer     121     -       50      14.0    13.5
# VIA Nano      91      -       52      33      14.7
# Atom          126     -       68      48(***) 14.7
# Silvermont    97      -       58      42(***) 17.5
# Goldmont      80      -       48      19.5    12.0
#
# (*) whichever best applicable.
# (**) x86_64 assembler performance is presented for reference
# purposes, the results are for integer-only code.
# (***) paddq is incredibly slow on Atom.
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
# though it does not use 128-bit operations. The latter means that
# SSE2-aware kernel is no longer required to execute the code. Another
# difference is that new code optimizes amount of writes, but at the
# cost of increased data cache "footprint" by 1/2KB.
# Work out the directory this script was started from (everything up to
# and including the last / or \ in $0) so that x86asm.pl can be loaded
# either from that directory or from ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; it is removed
# from @ARGV and STDOUT is redirected to it.  The 3-argument form of open
# keeps the file name from ever being parsed as part of the mode, and the
# explicit check stops the run instead of silently losing the generated
# code when the file cannot be created.
$output=pop;
open(STDOUT,">",$output) or die "can't open $output: $!";

# Hand the first argument to the perlasm back-end, together with a flag
# that is true when the last remaining argument is literally "386".
# NOTE(review): presumably the first argument selects the assembler
# flavour and the flag restricts output to i386 instructions - confirm
# against x86asm.pl.
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# The SSE2/MMX code path is enabled only when -DOPENSSL_IA32_SSE2 appears
# among the remaining arguments; in that case the capability vector
# symbol is declared as an external label.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Stack-frame operands for the integer-only code path.  Each 64-bit
# quantity is addressed as two 32-bit halves: "lo" at offset N and "hi"
# at offset N+4 from esp.  T sits at offset 0, A..H follow at 8-byte
# steps.
$Tlo=&DWP(0,"esp");  $Thi=&DWP(4,"esp");
$Alo=&DWP(8,"esp");  $Ahi=&DWP(8+4,"esp");
$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");

# Name of the register used as base for the K512 table accesses.
$K512="ebp";

# Stack-frame operands for the SSE2 code path, one quad-word per working
# variable.  Note that this layout starts with A at offset 0, i.e. it is
# shifted by 8 bytes relative to the integer layout above.
$Asse2=&QWP(0,"esp");
$Bsse2=&QWP(8,"esp");
$Csse2=&QWP(16,"esp");
$Dsse2=&QWP(24,"esp");
$Esse2=&QWP(32,"esp");
$Fsse2=&QWP(40,"esp");
$Gsse2=&QWP(48,"esp");
$Hsse2=&QWP(56,"esp");

$A="mm0";       # B-D and
$E="mm4";       # F-H are commonly loaded to respectively mm1-mm3 and
                # mm5-mm7, but it's done on an on-demand basis...
$BxC="mm2";     # ... except for B^C
sub BODY_00_15_sse2 {
my $phase=shift;
#&movq ("mm5",$Fsse2); # load f
#&movq ("mm6",$Gsse2); # load g
&movq ("mm1",$E); # %mm1 is sliding right
&pxor ("mm5","mm6"); # f^=g
&psrlq ("mm1",14);
&movq ($Esse2,$E); # modulo-scheduled save e
&pand ("mm5",$E); # f&=e
&psllq ($E,23); # $E is sliding left
&movq ($A,"mm3") if ($phase<2);
&movq (&QWP(8*9,"esp"),"mm7") # save X[i]
&movq ("mm3","mm1"); # %mm3 is T1
&psrlq ("mm1",4);
&pxor ("mm5","mm6"); # Ch(e,f,g)
&pxor ("mm3",$E);
&psllq ($E,23);
&pxor ("mm3","mm1");
&movq ($Asse2,$A); # modulo-scheduled save a
&paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g)
&pxor ("mm3",$E);
&psrlq ("mm1",23);
&paddq ("mm7",$Hsse2); # X[i]+=h
&pxor ("mm3","mm1");
&psllq ($E,4);
&paddq ("mm7",QWP(0,$K512)); # X[i]+=K512[i]
&pxor ("mm3",$E); # T1=Sigma1_512(e)
&movq ($E,$Dsse2); # e = load d, e in next round
&paddq ("mm3","mm7"); # T1+=X[i]
&movq ("mm5",$A); # %mm5 is sliding right
&psrlq ("mm5",28);
&paddq ($E,"mm3"); # d += T1
&movq ("mm6",$A); # %mm6 is sliding left
&movq ("mm7","mm5");
&psllq ("mm6",25);
&movq ("mm1",$Bsse2); # load b
&psrlq ("mm5",6);