From 3645eb0be22a4cea4300ab5afbf248d195d0f45b Mon Sep 17 00:00:00 2001 From: Jerry Shih Date: Thu, 28 Sep 2023 13:51:17 +0800 Subject: Update for Zvkb extension. https://github.com/riscv/riscv-crypto/blob/c8ddeb7e64a3444dda0438316af1238aeed72041/doc/vector/riscv-crypto-vector-zvkb.adoc Create `RISCV_HAS_ZVKB()` macro. Use zvkb for SM4 instead of zvbb. Use zvkb for ghash instead of zvbb. We could just use the zvbb's subset `zvkb` for flexibility. Signed-off-by: Jerry Shih Signed-off-by: Phoebe Chen Reviewed-by: Tomas Mraz Reviewed-by: Paul Dale Reviewed-by: Hugo Landau (Merged from https://github.com/openssl/openssl/pull/21923) --- crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl | 378 ---------------------------- crypto/modes/asm/ghash-riscv64-zvkb-zvbc.pl | 378 ++++++++++++++++++++++++++++ crypto/modes/asm/ghash-riscv64-zvkg.pl | 19 +- crypto/modes/build.info | 4 +- crypto/modes/gcm128.c | 22 +- crypto/perlasm/riscv.pm | 13 +- crypto/sm4/asm/sm4-riscv64-zvksed.pl | 6 +- 7 files changed, 417 insertions(+), 403 deletions(-) delete mode 100644 crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl create mode 100644 crypto/modes/asm/ghash-riscv64-zvkb-zvbc.pl (limited to 'crypto') diff --git a/crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl b/crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl deleted file mode 100644 index c64211c3ab..0000000000 --- a/crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl +++ /dev/null @@ -1,378 +0,0 @@ -#! /usr/bin/env perl -# This file is dual-licensed, meaning that you can use it under your -# choice of either of the following two licenses: -# -# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. -# -# Licensed under the Apache License 2.0 (the "License"). You can obtain -# a copy in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html -# -# or -# -# Copyright (c) 2023, Christoph Müllner -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# - RV64I -# - RISC-V vector ('V') with VLEN >= 128 -# - Vector Bit-manipulation used in Cryptography ('Zvbb') -# - Vector Carryless Multiplication ('Zvbc') - -use strict; -use warnings; - -use FindBin qw($Bin); -use lib "$Bin"; -use lib "$Bin/../../perlasm"; -use riscv; - -# $output is the last argument if it looks like a file (it has an extension) -# $flavour is the first argument if it doesn't look like a file -my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; -my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; - -$output and open STDOUT,">$output"; - -my $code=<<___; -.text -___ - -################################################################################ -# void gcm_init_rv64i_zvbb_zvbc(u128 Htable[16], const u64 H[2]); -# -# input: H: 128-bit H - secret parameter E(K, 0^128) -# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvbb_zvbc and -# gcm_ghash_rv64i_zvbb_zvbc -{ -my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2"); -my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6"); - -$code .= <<___; -.p2align 3 -.globl gcm_init_rv64i_zvbb_zvbc -.type gcm_init_rv64i_zvbb_zvbc,\@function -gcm_init_rv64i_zvbb_zvbc: - # Load/store data in reverse order. - # This is needed as a part of endianness swap. - add $H, $H, 8 - li $TMP0, -8 - li $TMP1, 63 - la $TMP2, Lpolymod - - @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu - - @{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0 - @{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2) - - # Shift one left and get the carry bits. - @{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1 - @{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1 - - # Use the fact that the polynomial degree is no more than 128, - # i.e. only the LSB of the upper half could be set. - # Thanks to this we don't need to do the full reduction here. - # Instead simply subtract the reduction polynomial. - # This idea was taken from x86 ghash implementation in OpenSSL. - @{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1 - @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 - - @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2 - @{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t - - # Need to set the mask to 3, if the carry bit is set. - @{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3 - @{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0 - @{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0 - @{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3 - - @{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t - - @{[vse64_v $V1, $Htable]} # vse64.v v1, (a0) - ret -.size gcm_init_rv64i_zvbb_zvbc,.-gcm_init_rv64i_zvbb_zvbc -___ -} - -################################################################################ -# void gcm_gmult_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16]); -# -# input: Xi: current hash value -# Htable: preprocessed H -# output: Xi: next hash value Xi = (Xi * H mod f) -{ -my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4"); -my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6"); - -$code .= <<___; -.text -.p2align 3 -.globl gcm_gmult_rv64i_zvbb_zvbc -.type gcm_gmult_rv64i_zvbb_zvbc,\@function -gcm_gmult_rv64i_zvbb_zvbc: - ld $TMP0, ($Htable) - ld $TMP1, 8($Htable) - li $TMP2, 63 - la $TMP3, Lpolymod - ld $TMP3, 8($TMP3) - - # Load/store data in reverse order. - # This is needed as a part of endianness swap. 
- add $Xi, $Xi, 8 - li $TMP4, -8 - - @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu - - @{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4 - @{[vrev8_v $V5, $V5]} # vrev8.v v5, v5 - - # Multiplication - - # Do two 64x64 multiplications in one go to save some time - # and simplify things. - - # A = a1a0 (t1, t0) - # B = b1b0 (v5) - # C = c1c0 (256 bit) - # c1 = a1b1 + (a0b1)h + (a1b0)h - # c0 = a0b0 + (a0b1)l + (a1b0)h - - # v1 = (a0b1)l,(a0b0)l - @{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0 - # v3 = (a0b1)h,(a0b0)h - @{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0 - - # v4 = (a1b1)l,(a1b0)l - @{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1 - # v2 = (a1b1)h,(a1b0)h - @{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1 - - # Is there a better way to do this? - # Would need to swap the order of elements within a vector register. - @{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1 - @{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1 - @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 - @{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1 - - @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1 - # v2 += (a0b1)h - @{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t - # v2 += (a1b1)l - @{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t - - @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2 - # v1 += (a0b0)h,0 - @{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t - # v1 += (a1b0)l,0 - @{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t - - # Now the 256bit product should be stored in (v2,v1) - # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l - # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l - - # Reduction - # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0] - # This is a slight variation of the Gueron's Montgomery reduction. - # The difference being the order of some operations has been changed, - # to make a better use of vclmul(h) instructions. 
- - # First step: - # c1 += (c0 * P)l - # vmv.v.i v0, 2 - @{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t - @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t - @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t - - # Second step: - # D = d1,d0 is final result - # We want: - # m1 = c1 + (c1 * P)h - # m0 = (c1 * P)l + (c0 * P)h + c0 - # d1 = c3 + m1 - # d0 = c2 + m0 - - #v3 = (c1 * P)l, 0 - @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t - #v4 = (c1 * P)h, (c0 * P)h - @{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3 - - @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1 - @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 - - @{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4 - @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t - - # XOR in the upper upper part of the product - @{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1 - - @{[vrev8_v $V2, $V2]} # vrev8.v v2, v2 - @{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4 - ret -.size gcm_gmult_rv64i_zvbb_zvbc,.-gcm_gmult_rv64i_zvbb_zvbc -___ -} - -################################################################################ -# void gcm_ghash_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16], -# const u8 *inp, size_t len); -# -# input: Xi: current hash value -# Htable: preprocessed H -# inp: pointer to input data -# len: length of input data in bytes (multiple of block size) -# output: Xi: Xi+1 (next hash value Xi) -{ -my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6"); -my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7"); - -$code .= <<___; -.p2align 3 -.globl gcm_ghash_rv64i_zvbb_zvbc -.type gcm_ghash_rv64i_zvbb_zvbc,\@function -gcm_ghash_rv64i_zvbb_zvbc: - ld $TMP0, ($Htable) - ld $TMP1, 8($Htable) - li $TMP2, 63 - la $TMP3, Lpolymod - ld $TMP3, 8($TMP3) - - # Load/store data in reverse order. - # This is needed as a part of endianness swap. - add $Xi, $Xi, 8 - add $inp, $inp, 8 - li $M8, -8 - - @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu - - @{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4 - -Lstep: - # Read input data - @{[vlse64_v $Vinp, $inp, $M8]} # vle64.v v0, (a2) - add $inp, $inp, 16 - add $len, $len, -16 - # XOR them into Xi - @{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v0, v0, v1 - - @{[vrev8_v $V5, $V5]} # vrev8.v v5, v5 - - # Multiplication - - # Do two 64x64 multiplications in one go to save some time - # and simplify things. - - # A = a1a0 (t1, t0) - # B = b1b0 (v5) - # C = c1c0 (256 bit) - # c1 = a1b1 + (a0b1)h + (a1b0)h - # c0 = a0b0 + (a0b1)l + (a1b0)h - - # v1 = (a0b1)l,(a0b0)l - @{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0 - # v3 = (a0b1)h,(a0b0)h - @{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0 - - # v4 = (a1b1)l,(a1b0)l - @{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1 - # v2 = (a1b1)h,(a1b0)h - @{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1 - - # Is there a better way to do this? - # Would need to swap the order of elements within a vector register. 
- @{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1 - @{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1 - @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 - @{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1 - - @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1 - # v2 += (a0b1)h - @{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t - # v2 += (a1b1)l - @{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t - - @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2 - # v1 += (a0b0)h,0 - @{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t - # v1 += (a1b0)l,0 - @{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t - - # Now the 256bit product should be stored in (v2,v1) - # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l - # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l - - # Reduction - # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0] - # This is a slight variation of the Gueron's Montgomery reduction. - # The difference being the order of some operations has been changed, - # to make a better use of vclmul(h) instructions. - - # First step: - # c1 += (c0 * P)l - # vmv.v.i v0, 2 - @{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t - @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t - @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t - - # Second step: - # D = d1,d0 is final result - # We want: - # m1 = c1 + (c1 * P)h - # m0 = (c1 * P)l + (c0 * P)h + c0 - # d1 = c3 + m1 - # d0 = c2 + m0 - - #v3 = (c1 * P)l, 0 - @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t - #v4 = (c1 * P)h, (c0 * P)h - @{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3 - - @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1 - @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 - - @{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4 - @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t - - # XOR in the upper upper part of the product - @{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1 - - @{[vrev8_v $V5, $V2]} # vrev8.v v2, v2 - - bnez $len, Lstep - - @{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v2, (a0), t4 - ret -.size gcm_ghash_rv64i_zvbb_zvbc,.-gcm_ghash_rv64i_zvbb_zvbc -___ -} - -$code .= <<___; -.p2align 4 -Lpolymod: - .dword 0x0000000000000001 - .dword 0xc200000000000000 -.size Lpolymod,.-Lpolymod -___ - -print $code; - -close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/modes/asm/ghash-riscv64-zvkb-zvbc.pl b/crypto/modes/asm/ghash-riscv64-zvkb-zvbc.pl new file mode 100644 index 0000000000..5eb748bdc2 --- /dev/null +++ b/crypto/modes/asm/ghash-riscv64-zvkb-zvbc.pl @@ -0,0 +1,378 @@ +#! /usr/bin/env perl +# This file is dual-licensed, meaning that you can use it under your +# choice of either of the following two licenses: +# +# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You can obtain +# a copy in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# or +# +# Copyright (c) 2023, Christoph Müllner +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# - RV64I +# - RISC-V Vector ('V') with VLEN >= 128 +# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') +# - RISC-V Vector Carryless Multiplication extension ('Zvbc') + +use strict; +use warnings; + +use FindBin qw($Bin); +use lib "$Bin"; +use lib "$Bin/../../perlasm"; +use riscv; + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$output and open STDOUT,">$output"; + +my $code=<<___; +.text +___ + +################################################################################ +# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]); +# +# input: H: 128-bit H - secret parameter E(K, 0^128) +# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and +# gcm_ghash_rv64i_zvkb_zvbc +{ +my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2"); +my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6"); + +$code .= <<___; +.p2align 3 +.globl gcm_init_rv64i_zvkb_zvbc +.type gcm_init_rv64i_zvkb_zvbc,\@function +gcm_init_rv64i_zvkb_zvbc: + # Load/store data in reverse order. + # This is needed as a part of endianness swap. + add $H, $H, 8 + li $TMP0, -8 + li $TMP1, 63 + la $TMP2, Lpolymod + + @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu + + @{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0 + @{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2) + + # Shift one left and get the carry bits. + @{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1 + @{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1 + + # Use the fact that the polynomial degree is no more than 128, + # i.e. only the LSB of the upper half could be set. + # Thanks to this we don't need to do the full reduction here. + # Instead simply subtract the reduction polynomial. + # This idea was taken from x86 ghash implementation in OpenSSL. + @{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1 + @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 + + @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2 + @{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t + + # Need to set the mask to 3, if the carry bit is set. 
+ @{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3 + @{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0 + @{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0 + @{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3 + + @{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t + + @{[vse64_v $V1, $Htable]} # vse64.v v1, (a0) + ret +.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc +___ +} + +################################################################################ +# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]); +# +# input: Xi: current hash value +# Htable: preprocessed H +# output: Xi: next hash value Xi = (Xi * H mod f) +{ +my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4"); +my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6"); + +$code .= <<___; +.text +.p2align 3 +.globl gcm_gmult_rv64i_zvkb_zvbc +.type gcm_gmult_rv64i_zvkb_zvbc,\@function +gcm_gmult_rv64i_zvkb_zvbc: + ld $TMP0, ($Htable) + ld $TMP1, 8($Htable) + li $TMP2, 63 + la $TMP3, Lpolymod + ld $TMP3, 8($TMP3) + + # Load/store data in reverse order. + # This is needed as a part of endianness swap. + add $Xi, $Xi, 8 + li $TMP4, -8 + + @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu + + @{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4 + @{[vrev8_v $V5, $V5]} # vrev8.v v5, v5 + + # Multiplication + + # Do two 64x64 multiplications in one go to save some time + # and simplify things. + + # A = a1a0 (t1, t0) + # B = b1b0 (v5) + # C = c1c0 (256 bit) + # c1 = a1b1 + (a0b1)h + (a1b0)h + # c0 = a0b0 + (a0b1)l + (a1b0)h + + # v1 = (a0b1)l,(a0b0)l + @{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0 + # v3 = (a0b1)h,(a0b0)h + @{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0 + + # v4 = (a1b1)l,(a1b0)l + @{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1 + # v2 = (a1b1)h,(a1b0)h + @{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1 + + # Is there a better way to do this? + # Would need to swap the order of elements within a vector register. + @{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1 + @{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1 + @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 + @{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1 + + @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1 + # v2 += (a0b1)h + @{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t + # v2 += (a1b1)l + @{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t + + @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2 + # v1 += (a0b0)h,0 + @{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t + # v1 += (a1b0)l,0 + @{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t + + # Now the 256bit product should be stored in (v2,v1) + # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l + # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l + + # Reduction + # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0] + # This is a slight variation of the Gueron's Montgomery reduction. + # The difference being the order of some operations has been changed, + # to make a better use of vclmul(h) instructions. 
+ + # First step: + # c1 += (c0 * P)l + # vmv.v.i v0, 2 + @{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t + @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t + @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t + + # Second step: + # D = d1,d0 is final result + # We want: + # m1 = c1 + (c1 * P)h + # m0 = (c1 * P)l + (c0 * P)h + c0 + # d1 = c3 + m1 + # d0 = c2 + m0 + + #v3 = (c1 * P)l, 0 + @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t + #v4 = (c1 * P)h, (c0 * P)h + @{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3 + + @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1 + @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 + + @{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4 + @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t + + # XOR in the upper upper part of the product + @{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1 + + @{[vrev8_v $V2, $V2]} # vrev8.v v2, v2 + @{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4 + ret +.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc +___ +} + +################################################################################ +# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16], +# const u8 *inp, size_t len); +# +# input: Xi: current hash value +# Htable: preprocessed H +# inp: pointer to input data +# len: length of input data in bytes (multiple of block size) +# output: Xi: Xi+1 (next hash value Xi) +{ +my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6"); +my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7"); + +$code .= <<___; +.p2align 3 +.globl gcm_ghash_rv64i_zvkb_zvbc +.type gcm_ghash_rv64i_zvkb_zvbc,\@function +gcm_ghash_rv64i_zvkb_zvbc: + ld $TMP0, ($Htable) + ld $TMP1, 8($Htable) + li $TMP2, 63 + la $TMP3, Lpolymod + ld $TMP3, 8($TMP3) + + # Load/store data in reverse order. + # This is needed as a part of endianness swap. + add $Xi, $Xi, 8 + add $inp, $inp, 8 + li $M8, -8 + + @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu + + @{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4 + +Lstep: + # Read input data + @{[vlse64_v $Vinp, $inp, $M8]} # vle64.v v0, (a2) + add $inp, $inp, 16 + add $len, $len, -16 + # XOR them into Xi + @{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v0, v0, v1 + + @{[vrev8_v $V5, $V5]} # vrev8.v v5, v5 + + # Multiplication + + # Do two 64x64 multiplications in one go to save some time + # and simplify things. + + # A = a1a0 (t1, t0) + # B = b1b0 (v5) + # C = c1c0 (256 bit) + # c1 = a1b1 + (a0b1)h + (a1b0)h + # c0 = a0b0 + (a0b1)l + (a1b0)h + + # v1 = (a0b1)l,(a0b0)l + @{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0 + # v3 = (a0b1)h,(a0b0)h + @{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0 + + # v4 = (a1b1)l,(a1b0)l + @{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1 + # v2 = (a1b1)h,(a1b0)h + @{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1 + + # Is there a better way to do this? + # Would need to swap the order of elements within a vector register. 
+ @{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1 + @{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1 + @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 + @{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1 + + @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1 + # v2 += (a0b1)h + @{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t + # v2 += (a1b1)l + @{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t + + @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2 + # v1 += (a0b0)h,0 + @{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t + # v1 += (a1b0)l,0 + @{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t + + # Now the 256bit product should be stored in (v2,v1) + # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l + # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l + + # Reduction + # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0] + # This is a slight variation of the Gueron's Montgomery reduction. + # The difference being the order of some operations has been changed, + # to make a better use of vclmul(h) instructions. + + # First step: + # c1 += (c0 * P)l + # vmv.v.i v0, 2 + @{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t + @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t + @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t + + # Second step: + # D = d1,d0 is final result + # We want: + # m1 = c1 + (c1 * P)h + # m0 = (c1 * P)l + (c0 * P)h + c0 + # d1 = c3 + m1 + # d0 = c2 + m0 + + #v3 = (c1 * P)l, 0 + @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t + #v4 = (c1 * P)h, (c0 * P)h + @{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3 + + @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1 + @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1 + + @{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4 + @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t + + # XOR in the upper upper part of the product + @{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1 + + @{[vrev8_v $V5, $V2]} # vrev8.v v2, v2 + + bnez $len, Lstep + + @{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v2, (a0), t4 + ret +.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc +___ +} + +$code .= <<___; +.p2align 4 +Lpolymod: + .dword 0x0000000000000001 + .dword 0xc200000000000000 +.size Lpolymod,.-Lpolymod +___ + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/modes/asm/ghash-riscv64-zvkg.pl b/crypto/modes/asm/ghash-riscv64-zvkg.pl index c3217598e4..8423ae9cf8 100644 --- a/crypto/modes/asm/ghash-riscv64-zvkg.pl +++ b/crypto/modes/asm/ghash-riscv64-zvkg.pl @@ -35,8 +35,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# - RV64I -# - RISC-V vector ('V') with VLEN >= 128 -# - RISC-V vector crypto GHASH extension ('Zvkg') +# - RISC-V Vector ('V') with VLEN >= 128 +# - RISC-V Vector GCM/GMAC extension ('Zvkg') +# +# Optional: +# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') use strict; use warnings; @@ -59,7 +62,7 @@ ___ ################################################################################ # void gcm_init_rv64i_zvkg(u128 Htable[16], const u64 H[2]); -# void gcm_init_rv64i_zvkg_zvbb(u128 Htable[16], const u64 H[2]); +# void gcm_init_rv64i_zvkg_zvkb(u128 Htable[16], const u64 H[2]); # # input: H: 128-bit H - secret parameter E(K, 0^128) # output: Htable: Copy of secret parameter (in normalized byte order) @@ -88,15 +91,15 @@ my ($Htable,$H,$V0) = ("a0","a1","v0"); $code .= <<___; .p2align 3 -.globl gcm_init_rv64i_zvkg_zvbb -.type gcm_init_rv64i_zvkg_zvbb,\@function -gcm_init_rv64i_zvkg_zvbb: - @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu +.globl gcm_init_rv64i_zvkg_zvkb +.type gcm_init_rv64i_zvkg_zvkb,\@function +gcm_init_rv64i_zvkg_zvkb: + @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, ta, ma @{[vle64_v $V0, $H]} # vle64.v v0, (a1) @{[vrev8_v $V0, $V0]} # vrev8.v v0, v0 @{[vse64_v $V0, $Htable]} # vse64.v v0, (a0) ret -.size gcm_init_rv64i_zvkg_zvbb,.-gcm_init_rv64i_zvkg_zvbb +.size gcm_init_rv64i_zvkg_zvkb,.-gcm_init_rv64i_zvkg_zvkb ___ } diff --git a/crypto/modes/build.info b/crypto/modes/build.info index 7b188fba81..9ebb5cc7a8 100644 --- a/crypto/modes/build.info +++ b/crypto/modes/build.info @@ -43,7 +43,7 @@ IF[{- !$disabled{asm} -}] $MODESASM_c64xplus=ghash-c64xplus.s $MODESDEF_c64xplus=GHASH_ASM - $MODESASM_riscv64=ghash-riscv64.s ghash-riscv64-zvbb-zvbc.s ghash-riscv64-zvkg.s + $MODESASM_riscv64=ghash-riscv64.s ghash-riscv64-zvkb-zvbc.s ghash-riscv64-zvkg.s $MODESDEF_riscv64=GHASH_ASM # Now that we have defined all the arch specific variables, use the @@ -91,5 +91,5 @@ GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl INCLUDE[ghash-s390x.o]=.. GENERATE[ghash-c64xplus.S]=asm/ghash-c64xplus.pl GENERATE[ghash-riscv64.s]=asm/ghash-riscv64.pl -GENERATE[ghash-riscv64-zvbb-zvbc.s]=asm/ghash-riscv64-zvbb-zvbc.pl +GENERATE[ghash-riscv64-zvkb-zvbc.s]=asm/ghash-riscv64-zvkb-zvbc.pl GENERATE[ghash-riscv64-zvkg.s]=asm/ghash-riscv64-zvkg.pl diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c index 4b49d202a4..6f293ef794 100644 --- a/crypto/modes/gcm128.c +++ b/crypto/modes/gcm128.c @@ -413,14 +413,14 @@ void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len); void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len); -/* Zvbb/Zvbc (vector crypto with vclmul) based routines. */ -void gcm_init_rv64i_zvbb_zvbc(u128 Htable[16], const u64 Xi[2]); -void gcm_gmult_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16]); -void gcm_ghash_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16], +/* zvkb/Zvbc (vector crypto with vclmul) based routines. */ +void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len); /* Zvkg (vector crypto with vgmul.vv and vghsh.vv). 
*/ void gcm_init_rv64i_zvkg(u128 Htable[16], const u64 Xi[2]); -void gcm_init_rv64i_zvkg_zvbb(u128 Htable[16], const u64 Xi[2]); +void gcm_init_rv64i_zvkg_zvkb(u128 Htable[16], const u64 Xi[2]); void gcm_gmult_rv64i_zvkg(u64 Xi[2], const u128 Htable[16]); void gcm_ghash_rv64i_zvkg(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len); @@ -524,16 +524,16 @@ static void gcm_get_funcs(struct gcm_funcs_st *ctx) ctx->ghash = gcm_ghash_4bit; if (RISCV_HAS_ZVKG() && riscv_vlen() >= 128) { - if (RISCV_HAS_ZVBB()) - ctx->ginit = gcm_init_rv64i_zvkg_zvbb; + if (RISCV_HAS_ZVKB()) + ctx->ginit = gcm_init_rv64i_zvkg_zvkb; else ctx->ginit = gcm_init_rv64i_zvkg; ctx->gmult = gcm_gmult_rv64i_zvkg; ctx->ghash = gcm_ghash_rv64i_zvkg; - } else if (RISCV_HAS_ZVBB() && RISCV_HAS_ZVBC() && riscv_vlen() >= 128) { - ctx->ginit = gcm_init_rv64i_zvbb_zvbc; - ctx->gmult = gcm_gmult_rv64i_zvbb_zvbc; - ctx->ghash = gcm_ghash_rv64i_zvbb_zvbc; + } else if (RISCV_HAS_ZVKB() && RISCV_HAS_ZVBC() && riscv_vlen() >= 128) { + ctx->ginit = gcm_init_rv64i_zvkb_zvbc; + ctx->gmult = gcm_gmult_rv64i_zvkb_zvbc; + ctx->ghash = gcm_ghash_rv64i_zvkb_zvbc; } else if (RISCV_HAS_ZBC()) { if (RISCV_HAS_ZBKB()) { ctx->ginit = gcm_init_rv64i_zbc__zbkb; diff --git a/crypto/perlasm/riscv.pm b/crypto/perlasm/riscv.pm index 14434e2848..8d602d8493 100644 --- a/crypto/perlasm/riscv.pm +++ b/crypto/perlasm/riscv.pm @@ -746,7 +746,18 @@ sub vxor_vv { # Vector crypto instructions -## Zvbb instructions +## Zvbb and Zvkb instructions +## +## vandn (also in zvkb) +## vbrev +## vbrev8 (also in zvkb) +## vrev8 (also in zvkb) +## vclz +## vctz +## vcpop +## vrol (also in zvkb) +## vror (also in zvkb) +## vwsll sub vrev8_v { # vrev8.v vd, vs2, vm diff --git a/crypto/sm4/asm/sm4-riscv64-zvksed.pl b/crypto/sm4/asm/sm4-riscv64-zvksed.pl index ba600d53d7..0734e5fa4c 100644 --- a/crypto/sm4/asm/sm4-riscv64-zvksed.pl +++ b/crypto/sm4/asm/sm4-riscv64-zvksed.pl @@ -36,9 +36,9 @@ # The generated code of this file depends on the following RISC-V extensions: # - RV64I -# - RISC-V vector ('V') with VLEN >= 128 -# - Vector Bit-manipulation used in Cryptography ('Zvbb') -# - Vector ShangMi Suite: SM4 Block Cipher ('Zvksed') +# - RISC-V Vector ('V') with VLEN >= 128 +# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') +# - RISC-V Vector SM4 Block Cipher extension ('Zvksed') use strict; use warnings; -- cgit v1.2.3
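
For reference, the dispatch change in crypto/modes/gcm128.c comes down to preferring Zvkg (which implements GHASH directly via vghsh.vv/vgmul.vv) and falling back to the carry-less-multiply path, which is now gated on Zvkb + Zvbc instead of the full Zvbb. The standalone sketch below mirrors that selection order only; it is not part of the patch. The RISCV_HAS_*() macros and riscv_vlen() are stubbed with hypothetical constant values purely so the sketch compiles outside the OpenSSL tree — in the real code they come from OpenSSL's RISC-V capability-detection layer.

    /*
     * Illustrative sketch (assumption-laden, not OpenSSL code): mirrors the
     * backend selection order established by the gcm128.c hunk above after
     * the Zvbb -> Zvkb change.
     */
    #include <stdio.h>

    /* Hypothetical stand-ins for the real capability probes. */
    #define RISCV_HAS_ZVKG()  1
    #define RISCV_HAS_ZVKB()  1
    #define RISCV_HAS_ZVBC()  1
    #define RISCV_HAS_ZBC()   0
    #define RISCV_HAS_ZBKB()  0
    static int riscv_vlen(void) { return 128; }

    static const char *select_ghash_backend(void)
    {
        /*
         * Prefer the dedicated GCM/GMAC extension (Zvkg); Zvkb only upgrades
         * the init routine (byte reversal via vrev8.v).
         */
        if (RISCV_HAS_ZVKG() && riscv_vlen() >= 128)
            return RISCV_HAS_ZVKB() ? "zvkg (init: zvkg_zvkb)" : "zvkg";

        /*
         * Next best: the vclmul-based path, now requiring only Zvkb + Zvbc
         * rather than the full Zvbb.
         */
        if (RISCV_HAS_ZVKB() && RISCV_HAS_ZVBC() && riscv_vlen() >= 128)
            return "zvkb_zvbc";

        /* Scalar carry-less multiply fallbacks. */
        if (RISCV_HAS_ZBC())
            return RISCV_HAS_ZBKB() ? "zbc__zbkb" : "zbc";

        return "generic 4-bit table";
    }

    int main(void)
    {
        printf("selected GHASH backend: %s\n", select_ghash_backend());
        return 0;
    }

The ordering reflects the design choice visible in the hunk: Zvkg handles the whole GHASH block in hardware, so it wins whenever present, while the Zvkb+Zvbc routine reconstructs the multiplication from vclmul/vclmulh and is only the second choice; requiring Zvkb (a subset of Zvbb) rather than Zvbb itself lets more hardware qualify for that path.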