#!/usr/bin/env perl
# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for x86_64.
#
# February 2018.
#
# This module implements radix 2^51 multiplication and squaring, and
# radix 2^64 multiplication, squaring, addition, subtraction and final
# reduction. Latter radix is used on ADCX/ADOX-capable processors such
# as Broadwell. On a related note, one should mention that there are
# vector implementations that provide significantly better performance
# on some processors(*), but they are large and overly complex. Which
# in combination with them being effectively processor-specific makes
# the undertaking hard to justify. The goal for this implementation
# is rather versatility and simplicity [and ultimately formal
# verification].
#
# (*) For example sandy2x should provide ~30% improvement on Sandy
# Bridge, but only nominal ~5% on Haswell [and big loss on
# Broadwell and successors].
#
######################################################################
# Improvement coefficients:
#
# amd64-51(*) gcc-5.x(**)
#
# P4 +22% +40%
# Sandy Bridge -3% +11%
# Haswell -1% +13%
# Broadwell(***) +30% +35%
# Skylake(***) +33% +47%
# Silvermont +20% +26%
# Goldmont +40% +50%
# Bulldozer +20% +9%
# Ryzen(***) +43% +40%
# VIA +170% +120%
#
# (*)	amd64-51 is a popular assembly implementation with 2^51 radix,
# only multiplication and squaring subroutines were linked
# for comparison, but not complete ladder step; gain on most
# processors is because this module refrains from shld, and
# minor regression on others is because this does result in
# higher instruction count;
# (**) compiler is free to inline functions, in assembly one would
# need to implement ladder step to do that, and it will improve
# performance by several percent;
# (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
# C implementation, so that comparison is always against
# 2^51 radix;
# Command-line convention shared by OpenSSL perlasm scripts:
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Windows is recognized either from the flavour (nasm/masm/mingw64) or from
# a .asm output extension.  NOTE(review): $flavour and $output may be undef
# here, so these matches could warn under "use warnings"; this script runs
# without strictures, matching the rest of the perlasm tree.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator either next to this script or in
# the perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Pipe everything printed below through the translator, which converts the
# flavour-neutral "perlasm" into the target assembler's syntax.  Aliasing
# *STDOUT means plain print statements feed the pipe.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Probe the toolchain to decide whether ADCX/ADOX (Broadwell+) code paths
# may be emitted; $addx stays false if no probe succeeds.  Each probe runs
# the assembler and scrapes a version number from its banner, comparing it
# numerically against the first release known to accept these instructions
# (the convention used throughout OpenSSL's perlasm scripts).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$addx = ($1>=2.23);	# binutils 2.23 added ADCX/ADOX support
}
# nasm assembles ADCX/ADOX from version 2.10 onwards.
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
$addx = ($1>=2.10);
}
# MASM (ml64) from Visual Studio 2013 (toolset version 12) onwards.
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./) {
$addx = ($1>=12);
}
# clang/LLVM from 3.3 onwards; the minor version is scaled into the
# fractional part so multi-digit minors compare correctly.
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
$addx = ($ver>=3.03);
}
$code.=<<___;
.text
.globl x25519_fe51_mul
.type x25519_fe51_mul,\@function,3
.align 32
x25519_fe51_mul:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
lea -8*5(%rsp),%rsp
.cfi_adjust_cfa_offset 40
.Lfe51_mul_body:
mov 8*0(%rsi),%rax # f[0]
mov 8*0(%rdx),%r11 # load g[0-4]
mov 8*1(%rdx),%r12
mov 8*2(%rdx),%r13
mov 8*3(%rdx),%rbp
mov 8*4(%rdx),%r14
mov %rdi,8*4(%rsp) # offload 1st argument
mov %rax,%rdi
mulq %r11 # f[0]*g[0]
mov %r11,8*0(%rsp) # offload g[0]
mov %rax,%rbx # %rbx:%rcx = h0
mov %rdi,%rax
mov %rdx,%rcx
mulq %r12 # f[0]*g[1]
mov %r12,8*1(%rsp) # offload g[1]
mov %rax,%r8 # %r8:%r9 = h1
mov %rdi,%rax
lea (%r14,%r14,8),%r15
mov %rdx,%r9
mulq %r13 # f[0]*g[2]
mov %r13,8*2(%rsp) # offload g[2]
mov %rax,%r10 # %r10:%r11 = h2
mov %rdi,%rax
lea (%r14,%r15,2),%rdi # g[4]*19
mov %rdx,%r11
mulq %rbp # f[0]*g[3]
mov %rax,%r12 # %r12:%r13 = h3
mov 8*0(%rsi),%rax # f[0]
mov %rdx,%r13
mulq %r14 # f[0]*g[4]
mov %rax,%r14 # %r14:%r15 = h4
mov 8*1(%rsi),%rax # f[1]
mov %rdx,%r15
mulq %rdi # f[1]*g[4]*19
add %rax,%rbx
mov 8*2(%rsi),%rax # f[2]
adc %rdx,%rcx
mulq %rdi # f[2]*g[4]*19
add %rax,%r8
mov 8*3(%rsi),%rax # f[3]
adc %rdx,%r9
mulq %rdi # f[3]*g[4]*19
add %rax,%r10
mov 8*4(%rsi),%rax # f[4]
adc %rdx,%r11
mulq %rdi # f[4]*g[4]*19
imulq \$19,%rbp,%rdi # g[3]*19
add %rax,%r12
mov 8*1(%rsi),%rax # f[1]
adc %rdx,%r13
mulq %rbp # f[1]*g[3]
mov 8*2(%rsp),%rbp # g[2]
add %rax,%r14
mov 8*2(%rsi),%rax # f[2]
adc %rdx,%r15
mulq %rdi # f[2]*g[3]*19
add %rax,%rbx
mov 8*3(%rsi)