summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Müllner <christoph.muellner@vrull.eu>2023-01-18 13:11:19 +0100
committerHugo Landau <hlandau@openssl.org>2023-10-26 15:55:49 +0100
commit003f5698146b81f3185d7f17d60a7351c69e236d (patch)
treee85a5d7947b4de58b752f7f583002d938c1b421a
parentcdea67193da8aab0f1a49d2b7ce144ad21bfc51d (diff)
riscv: GCM: Provide a Zvbb/Zvbc-based implementation
The RISC-V vector crypto extensions feature a Zvbc extension that provides a carryless multiplication ('vclmul.vv') instruction. This patch provides an implementation that utilizes this extension if available. Tested on QEMU and no regressions observed. Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu> Reviewed-by: Tomas Mraz <tomas@openssl.org> Reviewed-by: Paul Dale <pauli@openssl.org> Reviewed-by: Hugo Landau <hlandau@openssl.org> (Merged from https://github.com/openssl/openssl/pull/21923)
-rw-r--r--crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl378
-rw-r--r--crypto/modes/build.info3
-rw-r--r--crypto/modes/gcm128.c11
-rw-r--r--crypto/perlasm/riscv.pm202
-rw-r--r--include/crypto/riscv_arch.def2
5 files changed, 594 insertions, 2 deletions
diff --git a/crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl b/crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl
new file mode 100644
index 0000000000..5b150ab068
--- /dev/null
+++ b/crypto/modes/asm/ghash-riscv64-zvbb-zvbc.pl
@@ -0,0 +1,378 @@
+#! /usr/bin/env perl
+# This file is dual-licensed, meaning that you can use it under your
+# choice of either of the following two licenses:
+#
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You can obtain
+# a copy in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# or
+#
+# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# - RV64I
+# - RISC-V vector ('V') with VLEN >= 128
+# - Vector Bit-manipulation used in Cryptography ('Zvbb')
+# - Vector Carryless Multiplication ('Zvbc')
+
+use strict;
+use warnings;
+
+use FindBin qw($Bin);
+use lib "$Bin";
+use lib "$Bin/../../perlasm";
+use riscv;
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+# Redirect STDOUT to $output: 3-arg open never parses the name for a mode, and a failed open is fatal.
+if ($output) { open(STDOUT, '>', $output) or die "can't open $output: $!"; }
+
+my $code=<<___;
+.text
+___
+
+################################################################################
+# void gcm_init_rv64i_zvbb_zvbc(u128 Htable[16], const u64 H[2]);
+#
+# input: H: 128-bit H - secret parameter E(K, 0^128)
+# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvbb_zvbc and
+# gcm_ghash_rv64i_zvbb_zvbc
+{
+my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
+my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_init_rv64i_zvbb_zvbc
+.type gcm_init_rv64i_zvbb_zvbc,\@function
+gcm_init_rv64i_zvbb_zvbc:
+ # Load/store data in reverse order.
+ # This is needed as a part of endianness swap.
+ add $H, $H, 8
+ li $TMP0, -8
+ li $TMP1, 63
+ la $TMP2, Lpolymod
+
+ @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
+
+ @{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0
+ @{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2)
+
+ # Shift one left and get the carry bits.
+ @{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1
+ @{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1
+
+ # Use the fact that the polynomial degree is no more than 128,
+ # i.e. only the LSB of the upper half could be set.
+ # Thanks to this we don't need to do the full reduction here.
+ # Instead simply subtract the reduction polynomial.
+ # This idea was taken from x86 ghash implementation in OpenSSL.
+ @{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1
+ @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
+
+ @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
+ @{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t
+
+ # Need to set the mask to 3, if the carry bit is set.
+ @{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
+ @{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0
+ @{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0
+ @{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
+
+ @{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t
+
+ @{[vse64_v $V1, $Htable]} # vse64.v v1, (a0)
+ ret
+.size gcm_init_rv64i_zvbb_zvbc,.-gcm_init_rv64i_zvbb_zvbc
+___
+}
+
+################################################################################
+# void gcm_gmult_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16]);
+#
+# input: Xi: current hash value
+# Htable: preprocessed H
+# output: Xi: next hash value Xi = (Xi * H mod f)
+{
+my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
+my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
+
+$code .= <<___;
+.text
+.p2align 3
+.globl gcm_gmult_rv64i_zvbb_zvbc
+.type gcm_gmult_rv64i_zvbb_zvbc,\@function
+gcm_gmult_rv64i_zvbb_zvbc:
+ ld $TMP0, ($Htable)
+ ld $TMP1, 8($Htable)
+ li $TMP2, 63
+ la $TMP3, Lpolymod
+ ld $TMP3, 8($TMP3)
+
+ # Load/store data in reverse order.
+ # This is needed as a part of endianness swap.
+ add $Xi, $Xi, 8
+ li $TMP4, -8
+
+ @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
+
+ @{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4
+ @{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
+
+ # Multiplication
+
+ # Do two 64x64 multiplications in one go to save some time
+ # and simplify things.
+
+ # A = a1a0 (t1, t0)
+ # B = b1b0 (v5)
+ # C = c1c0 (256 bit)
+ # c1 = a1b1 + (a0b1)h + (a1b0)h
+ # c0 = a0b0 + (a0b1)l + (a1b0)l
+
+ # v1 = (a0b1)l,(a0b0)l
+ @{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
+ # v3 = (a0b1)h,(a0b0)h
+ @{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
+
+ # v4 = (a1b1)l,(a1b0)l
+ @{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
+ # v2 = (a1b1)h,(a1b0)h
+ @{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
+
+ # Is there a better way to do this?
+ # Would need to swap the order of elements within a vector register.
+ @{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
+ @{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
+ @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
+ @{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
+
+ @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
+ # v2 += (a0b1)h
+ @{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
+ # v2 += (a1b1)l
+ @{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
+
+ @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
+ # v1 += (a0b0)h,0
+ @{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
+ # v1 += (a1b0)l,0
+ @{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
+
+ # Now the 256bit product should be stored in (v2,v1)
+ # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
+ # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
+
+ # Reduction
+ # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
+ # This is a slight variation of the Gueron's Montgomery reduction.
+ # The difference being the order of some operations has been changed,
+ # to make a better use of vclmul(h) instructions.
+
+ # First step:
+ # c1 += (c0 * P)l
+ # vmv.v.i v0, 2
+ @{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
+ @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
+ @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
+
+ # Second step:
+ # D = d1,d0 is final result
+ # We want:
+ # m1 = c1 + (c1 * P)h
+ # m0 = (c1 * P)l + (c0 * P)h + c0
+ # d1 = c3 + m1
+ # d0 = c2 + m0
+
+ #v3 = (c1 * P)l, 0
+ @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
+ #v4 = (c1 * P)h, (c0 * P)h
+ @{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
+
+ @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
+ @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
+
+ @{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
+ @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
+
+ # XOR in the upper part of the product
+ @{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
+
+ @{[vrev8_v $V2, $V2]} # vrev8.v v2, v2
+ @{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4
+ ret
+.size gcm_gmult_rv64i_zvbb_zvbc,.-gcm_gmult_rv64i_zvbb_zvbc
+___
+}
+
+################################################################################
+# void gcm_ghash_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16],
+# const u8 *inp, size_t len);
+#
+# input: Xi: current hash value
+# Htable: preprocessed H
+# inp: pointer to input data
+# len: length of input data in bytes (multiple of block size)
+# output: Xi: Xi+1 (next hash value Xi)
+{
+my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
+my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_ghash_rv64i_zvbb_zvbc
+.type gcm_ghash_rv64i_zvbb_zvbc,\@function
+gcm_ghash_rv64i_zvbb_zvbc:
+ ld $TMP0, ($Htable)
+ ld $TMP1, 8($Htable)
+ li $TMP2, 63
+ la $TMP3, Lpolymod
+ ld $TMP3, 8($TMP3)
+
+ # Load/store data in reverse order.
+ # This is needed as a part of endianness swap.
+ add $Xi, $Xi, 8
+ add $inp, $inp, 8
+ li $M8, -8
+
+ @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
+
+ @{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4
+
+Lstep:
+ # Read input data
+ @{[vlse64_v $Vinp, $inp, $M8]} # vlse64.v v7, (a2), t4
+ add $inp, $inp, 16
+ add $len, $len, -16
+ # XOR them into Xi
+ @{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v5, v5, v7
+
+ @{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
+
+ # Multiplication
+
+ # Do two 64x64 multiplications in one go to save some time
+ # and simplify things.
+
+ # A = a1a0 (t1, t0)
+ # B = b1b0 (v5)
+ # C = c1c0 (256 bit)
+ # c1 = a1b1 + (a0b1)h + (a1b0)h
+ # c0 = a0b0 + (a0b1)l + (a1b0)l
+
+ # v1 = (a0b1)l,(a0b0)l
+ @{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
+ # v3 = (a0b1)h,(a0b0)h
+ @{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
+
+ # v4 = (a1b1)l,(a1b0)l
+ @{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
+ # v2 = (a1b1)h,(a1b0)h
+ @{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
+
+ # Is there a better way to do this?
+ # Would need to swap the order of elements within a vector register.
+ @{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
+ @{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
+ @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
+ @{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
+
+ @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
+ # v2 += (a0b1)h
+ @{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
+ # v2 += (a1b1)l
+ @{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
+
+ @{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
+ # v1 += (a0b0)h,0
+ @{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
+ # v1 += (a1b0)l,0
+ @{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
+
+ # Now the 256bit product should be stored in (v2,v1)
+ # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
+ # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
+
+ # Reduction
+ # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
+ # This is a slight variation of the Gueron's Montgomery reduction.
+ # The difference being the order of some operations has been changed,
+ # to make a better use of vclmul(h) instructions.
+
+ # First step:
+ # c1 += (c0 * P)l
+ # vmv.v.i v0, 2
+ @{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
+ @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
+ @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
+
+ # Second step:
+ # D = d1,d0 is final result
+ # We want:
+ # m1 = c1 + (c1 * P)h
+ # m0 = (c1 * P)l + (c0 * P)h + c0
+ # d1 = c3 + m1
+ # d0 = c2 + m0
+
+ #v3 = (c1 * P)l, 0
+ @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
+ #v4 = (c1 * P)h, (c0 * P)h
+ @{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
+
+ @{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
+ @{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
+
+ @{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
+ @{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
+
+ # XOR in the upper part of the product
+ @{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
+
+ @{[vrev8_v $V5, $V2]} # vrev8.v v5, v2
+
+ bnez $len, Lstep
+
+ @{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v5, (a0), t4
+ ret
+.size gcm_ghash_rv64i_zvbb_zvbc,.-gcm_ghash_rv64i_zvbb_zvbc
+___
+}
+
+$code .= <<___;
+.p2align 4
+Lpolymod:
+ .dword 0x0000000000000001
+ .dword 0xc200000000000000
+.size Lpolymod,.-Lpolymod
+___
+
+print $code;
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/modes/build.info b/crypto/modes/build.info
index c79f75c5c4..aebf853791 100644
--- a/crypto/modes/build.info
+++ b/crypto/modes/build.info
@@ -43,7 +43,7 @@ IF[{- !$disabled{asm} -}]
$MODESASM_c64xplus=ghash-c64xplus.s
$MODESDEF_c64xplus=GHASH_ASM
- $MODESASM_riscv64=ghash-riscv64.s
+ $MODESASM_riscv64=ghash-riscv64.s ghash-riscv64-zvbb-zvbc.s
$MODESDEF_riscv64=GHASH_ASM
# Now that we have defined all the arch specific variables, use the
@@ -91,3 +91,4 @@ GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl
INCLUDE[ghash-s390x.o]=..
GENERATE[ghash-c64xplus.S]=asm/ghash-c64xplus.pl
GENERATE[ghash-riscv64.s]=asm/ghash-riscv64.pl
+GENERATE[ghash-riscv64-zvbb-zvbc.s]=asm/ghash-riscv64-zvbb-zvbc.pl
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index 77ff5dd06b..e475be9bd4 100644
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -413,6 +413,11 @@ void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
const u8 *inp, size_t len);
void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
const u8 *inp, size_t len);
+/* Zvbb/Zvbc (vector crypto with vclmul) based routines. */
+void gcm_init_rv64i_zvbb_zvbc(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len);
# endif
#endif
@@ -512,7 +517,11 @@ static void gcm_get_funcs(struct gcm_funcs_st *ctx)
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
- if (RISCV_HAS_ZBC()) {
+ if (RISCV_HAS_ZVBB() && RISCV_HAS_ZVBC() && riscv_vlen() >= 128) {
+ ctx->ginit = gcm_init_rv64i_zvbb_zvbc;
+ ctx->gmult = gcm_gmult_rv64i_zvbb_zvbc;
+ ctx->ghash = gcm_ghash_rv64i_zvbb_zvbc;
+ } else if (RISCV_HAS_ZBC()) {
if (RISCV_HAS_ZBKB()) {
ctx->ginit = gcm_init_rv64i_zbc__zbkb;
ctx->gmult = gcm_gmult_rv64i_zbc__zbkb;
diff --git a/crypto/perlasm/riscv.pm b/crypto/perlasm/riscv.pm
index 90540b7dde..8443f6c29c 100644
--- a/crypto/perlasm/riscv.pm
+++ b/crypto/perlasm/riscv.pm
@@ -77,6 +77,29 @@ sub read_reg {
return $1;
}
+my @vregs = map("v$_",(0..31)); # the 32 accepted vector register names, v0..v31
+my %vreglookup;
+@vreglookup{@vregs} = @vregs; # name => name; read_vreg only tests key existence
+
+sub read_vreg { # maps a vector register name (any case) to its number; dies on anything else
+ my $vreg = lc shift;
+ if (!exists($vreglookup{$vreg})) {
+ my $trace = "";
+ if ($have_stacktrace) { # set outside this hunk; presumably true when Devel::StackTrace loaded
+ $trace = Devel::StackTrace->new->as_string;
+ }
+ die("Unknown vector register ".$vreg."\n".$trace);
+ }
+ if (!($vreg =~ /^v([0-9]+)$/)) { # every key in %vreglookup matches; this match is what sets $1
+ my $trace = "";
+ if ($have_stacktrace) {
+ $trace = Devel::StackTrace->new->as_string;
+ }
+ die("Could not process vector register ".$vreg."\n".$trace);
+ }
+ return $1; # the digits captured from "vN"
+}
+
# Helper functions
sub brev8_rv64i {
@@ -256,4 +279,183 @@ sub rev8 {
return ".word ".($template | ($rs << 15) | ($rd << 7));
}
+# Vector instructions
+
+sub vle64_v {
+ # vle64.v vd, (rs1) -- args: vector reg name, base reg name; returns ".word <encoding>"
+ my $template = 0b0000001_00000_00000_111_00000_0000111;
+ my $vd = read_vreg shift;
+ my $rs1 = read_reg shift;
+ return ".word ".($template | ($rs1 << 15) | ($vd << 7));
+}
+
+sub vlse64_v {
+ # vlse64.v vd, (rs1), rs2 -- rs1 is the base register, rs2 the stride register (bit 20)
+ my $template = 0b0000101_00000_00000_111_00000_0000111;
+ my $vd = read_vreg shift;
+ my $rs1 = read_reg shift;
+ my $rs2 = read_reg shift;
+ return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($vd << 7));
+}
+
+sub vmerge_vim {
+ # vmerge.vim vd, vs2, imm, v0 -- imm is not range-checked; it must fit the 5-bit field at bit 15
+ my $template = 0b0101110_00000_00000_011_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $imm = shift;
+ return ".word ".($template | ($vs2 << 20) | ($imm << 15) | ($vd << 7));
+}
+
+sub vmv_v_i {
+ # vmv.v.i vd, imm -- imm is not range-checked; values outside 0..31 would spill into higher fields
+ my $template = 0b0101111_00000_00000_011_00000_1010111;
+ my $vd = read_vreg shift;
+ my $imm = shift;
+ return ".word ".($template | ($imm << 15) | ($vd << 7));
+}
+
+sub vmv_v_v {
+ # vmv.v.v vd, vs1 -- vs1 goes to bit 15; the field at bit 20 stays zero as set by the template
+ my $template = 0b0101111_00000_00000_000_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs1 = read_vreg shift;
+ return ".word ".($template | ($vs1 << 15) | ($vd << 7));
+}
+
+sub vor_vv_v0t {
+ # vor.vv vd, vs2, vs1, v0.t -- masked form: template bit 25 is 0; argument order is (vd, vs2, vs1)
+ my $template = 0b0010100_00000_00000_000_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $vs1 = read_vreg shift;
+ return ".word ".($template | ($vs2 << 20) | ($vs1 << 15) | ($vd << 7));
+}
+
+sub vse64_v {
+ # vse64.v vs3, (rs1) -- store: the register at bits 11:7 (named $vd below) is the data source
+ my $template = 0b0000001_00000_00000_111_00000_0100111;
+ my $vd = read_vreg shift;
+ my $rs1 = read_reg shift;
+ return ".word ".($template | ($rs1 << 15) | ($vd << 7));
+}
+
+sub vsetivli__x0_2_e64_m1_tu_mu {
+ # vsetivli x0, 2, e64, m1, tu, mu -- fixed encoding; takes no operands
+ return ".word 0xc1817057";
+}
+
+sub vslidedown_vi {
+ # vslidedown.vi vd, vs2, uimm -- uimm is not range-checked; it must fit the 5-bit field at bit 15
+ my $template = 0b0011111_00000_00000_011_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $uimm = shift;
+ return ".word ".($template | ($vs2 << 20) | ($uimm << 15) | ($vd << 7));
+}
+
+sub vslideup_vi_v0t {
+ # vslideup.vi vd, vs2, uimm, v0.t -- same as vslideup_vi but with template bit 25 cleared
+ my $template = 0b0011100_00000_00000_011_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $uimm = shift;
+ return ".word ".($template | ($vs2 << 20) | ($uimm << 15) | ($vd << 7));
+}
+
+sub vslideup_vi {
+ # vslideup.vi vd, vs2, uimm -- unmasked (template bit 25 set); uimm is not range-checked
+ my $template = 0b0011101_00000_00000_011_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $uimm = shift;
+ return ".word ".($template | ($vs2 << 20) | ($uimm << 15) | ($vd << 7));
+}
+
+sub vsll_vi {
+ # vsll.vi vd, vs2, uimm -- unmasked (template bit 25 set); uimm is not range-checked
+ my $template = 0b1001011_00000_00000_011_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $uimm = shift;
+ return ".word ".($template | ($vs2 << 20) | ($uimm << 15) | ($vd << 7));
+}
+
+sub vsrl_vx {
+ # vsrl.vx vd, vs2, rs1 -- rs1 is a scalar register name (resolved by read_reg), placed at bit 15
+ my $template = 0b1010001_00000_00000_100_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $rs1 = read_reg shift;
+ return ".word ".($template | ($vs2 << 20) | ($rs1 << 15) | ($vd << 7));
+}
+
+sub vsse64_v {
+ # vsse64.v vs3, (rs1), rs2 -- vs3 is the data source, rs1 the base, rs2 the stride register
+ my $template = 0b0000101_00000_00000_111_00000_0100111;
+ my $vs3 = read_vreg shift;
+ my $rs1 = read_reg shift;
+ my $rs2 = read_reg shift;
+ return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($vs3 << 7));
+}
+
+sub vxor_vv_v0t {
+ # vxor.vv vd, vs2, vs1, v0.t -- same as vxor_vv but with template bit 25 cleared (masked form)
+ my $template = 0b0010110_00000_00000_000_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $vs1 = read_vreg shift;
+ return ".word ".($template | ($vs2 << 20) | ($vs1 << 15) | ($vd << 7));
+}
+
+sub vxor_vv {
+ # vxor.vv vd, vs2, vs1 -- argument order is (vd, vs2, vs1): vs2 goes to bit 20, vs1 to bit 15
+ my $template = 0b0010111_00000_00000_000_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $vs1 = read_vreg shift;
+ return ".word ".($template | ($vs2 << 20) | ($vs1 << 15) | ($vd << 7));
+}
+
+# Vector crypto instructions
+
+## Zvbb instructions
+
+sub vrev8_v {
+ # vrev8.v vd, vs2 -- bits 19:15 are fixed to 01001 by the template; only vs2 and vd vary
+ my $template = 0b0100101_00000_01001_010_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ return ".word ".($template | ($vs2 << 20) | ($vd << 7));
+}
+
+## Zvbc instructions
+
+sub vclmulh_vx {
+ # vclmulh.vx vd, vs2, rs1 -- rs1 is a scalar register; template differs from vclmul_vx in bit 26
+ my $template = 0b0011011_00000_00000_110_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $rs1 = read_reg shift;
+ return ".word ".($template | ($vs2 << 20) | ($rs1 << 15) | ($vd << 7));
+}
+
+sub vclmul_vx_v0t {
+ # vclmul.vx vd, vs2, rs1, v0.t -- same as vclmul_vx but with template bit 25 cleared (masked form)
+ my $template = 0b0011000_00000_00000_110_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $rs1 = read_reg shift;
+ return ".word ".($template | ($vs2 << 20) | ($rs1 << 15) | ($vd << 7));
+}
+
+sub vclmul_vx {
+ # vclmul.vx vd, vs2, rs1 -- unmasked form (template bit 25 set); rs1 is a scalar register
+ my $template = 0b0011001_00000_00000_110_00000_1010111;
+ my $vd = read_vreg shift;
+ my $vs2 = read_vreg shift;
+ my $rs1 = read_reg shift;
+ return ".word ".($template | ($vs2 << 20) | ($rs1 << 15) | ($vd << 7));
+}
+
1;
diff --git a/include/crypto/riscv_arch.def b/include/crypto/riscv_arch.def
index b355fa4ddc..58262fbbb8 100644
--- a/include/crypto/riscv_arch.def
+++ b/include/crypto/riscv_arch.def
@@ -33,6 +33,8 @@ RISCV_DEFINE_CAP(ZKSH, 0, 11)
RISCV_DEFINE_CAP(ZKR, 0, 12)
RISCV_DEFINE_CAP(ZKT, 0, 13)
RISCV_DEFINE_CAP(V, 0, 14)
+RISCV_DEFINE_CAP(ZVBB, 0, 15)
+RISCV_DEFINE_CAP(ZVBC, 0, 16)
/*
* In the future ...