From 5e16a6276bf4624fb15ec26b49219af5b2ed19d1 Mon Sep 17 00:00:00 2001
From: Phoebe Chen
Date: Tue, 12 Sep 2023 06:44:05 -0700
Subject: riscv: Provide vector crypto implementation of AES-CBC mode.

To accelerate AES-128/192/256-CBC block cipher encryption, we use the
vaesz, vaesem and vaesef instructions, each of which performs a single
round of AES encryption. Similarly, to accelerate AES-128/192/256-CBC
block cipher decryption, we use the vaesz, vaesdm and vaesdf
instructions, each of which performs a single round of AES decryption.

Furthermore, we optimize the key and initialization vector (IV)
handling by keeping the round keys in vector registers for the whole
loop.

Signed-off-by: Phoebe Chen
Reviewed-by: Tomas Mraz
Reviewed-by: Paul Dale
Reviewed-by: Hugo Landau
(Merged from https://github.com/openssl/openssl/pull/21923)
---
 crypto/aes/asm/aes-riscv64-zvkned.pl | 536 ++++++++++++++++++++++++++++++++++-
 1 file changed, 534 insertions(+), 2 deletions(-)

diff --git a/crypto/aes/asm/aes-riscv64-zvkned.pl b/crypto/aes/asm/aes-riscv64-zvkned.pl
index 1225e39d2b..319808b51c 100644
--- a/crypto/aes/asm/aes-riscv64-zvkned.pl
+++ b/crypto/aes/asm/aes-riscv64-zvkned.pl
@@ -11,6 +11,7 @@
 # or
 #
 # Copyright (c) 2023, Christoph Müllner
+# Copyright (c) 2023, Phoebe Chen
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -35,8 +36,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 # - RV64I
-# - RISC-V vector ('V') with VLEN >= 128
-# - RISC-V vector crypto AES extension ('Zvkned')
+# - RISC-V Vector ('V') with VLEN >= 128
+# - RISC-V Vector AES block cipher extension ('Zvkned')
 
 use strict;
 use warnings;
@@ -57,6 +58,533 @@ my $code=<<___;
 .text
 ___
 
+{
+###############################################################################
+# void rv64i_zvkned_cbc_encrypt(const unsigned char *in, unsigned char *out,
+#                               size_t length, const AES_KEY *key,
+#                               unsigned char *ivec, const int enc);
+my ($INP, $OUTP, $LEN, $KEYP, $IVP, $ENC) = ("a0", "a1", "a2", "a3", "a4", "a5");
+my ($T0, $T1, $rounds) = ("t0", "t1", "t2");
+my ($v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7,
+    $v8, $v9, $v10, $v11, $v12, $v13, $v14, $v15,
+    $v16, $v17, $v18, $v19, $v20, $v21, $v22, $v23,
+    $v24, $v25, $v26, $v27, $v28, $v29, $v30, $v31,
+) = map("v$_",(0..31));
+
+# Load all 11 round keys to v1-v11 registers.
+sub aes_128_load_key {
+    my $code=<<___;
+    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
+    @{[vle32_v $v1, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v2, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v3, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v4, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v5, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v6, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v7, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v8, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v9, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v10, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v11, $KEYP]}
+___
+
+    return $code;
+}
+
+# Load all 13 round keys to v1-v13 registers.
+sub aes_192_load_key {
+    my $code=<<___;
+    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
+    @{[vle32_v $v1, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v2, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v3, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v4, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v5, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v6, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v7, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v8, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v9, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v10, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v11, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v12, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v13, $KEYP]}
+___
+
+    return $code;
+}
+
+# Load all 15 round keys to v1-v15 registers.
+sub aes_256_load_key {
+    my $code=<<___;
+    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
+    @{[vle32_v $v1, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v2, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v3, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v4, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v5, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v6, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v7, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v8, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v9, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v10, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v11, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v12, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v13, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v14, $KEYP]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v15, $KEYP]}
+___
+
+    return $code;
+}
+
+# aes-128 encryption with round keys v1-v11
+sub aes_128_encrypt {
+    my $code=<<___;
+    @{[vaesz_vs $v24, $v1]}    # with round key w[ 0, 3]
+    @{[vaesem_vs $v24, $v2]}   # with round key w[ 4, 7]
+    @{[vaesem_vs $v24, $v3]}   # with round key w[ 8,11]
+    @{[vaesem_vs $v24, $v4]}   # with round key w[12,15]
+    @{[vaesem_vs $v24, $v5]}   # with round key w[16,19]
+    @{[vaesem_vs $v24, $v6]}   # with round key w[20,23]
+    @{[vaesem_vs $v24, $v7]}   # with round key w[24,27]
+    @{[vaesem_vs $v24, $v8]}   # with round key w[28,31]
+    @{[vaesem_vs $v24, $v9]}   # with round key w[32,35]
+    @{[vaesem_vs $v24, $v10]}  # with round key w[36,39]
+    @{[vaesef_vs $v24, $v11]}  # with round key w[40,43]
+___
+
+    return $code;
+}
+
+# aes-128 decryption with round keys v1-v11
+sub aes_128_decrypt {
+    my $code=<<___;
+    @{[vaesz_vs $v24, $v11]}   # with round key w[40,43]
+    @{[vaesdm_vs $v24, $v10]}  # with round key w[36,39]
+    @{[vaesdm_vs $v24, $v9]}   # with round key w[32,35]
+    @{[vaesdm_vs $v24, $v8]}   # with round key w[28,31]
+    @{[vaesdm_vs $v24, $v7]}   # with round key w[24,27]
+    @{[vaesdm_vs $v24, $v6]}   # with round key w[20,23]
+    @{[vaesdm_vs $v24, $v5]}   # with round key w[16,19]
+    @{[vaesdm_vs $v24, $v4]}   # with round key w[12,15]
+    @{[vaesdm_vs $v24, $v3]}   # with round key w[ 8,11]
+    @{[vaesdm_vs $v24, $v2]}   # with round key w[ 4, 7]
+    @{[vaesdf_vs $v24, $v1]}   # with round key w[ 0, 3]
+___
+
+    return $code;
+}
+
+# aes-192 encryption with round keys v1-v13
+sub aes_192_encrypt {
+    my $code=<<___;
+    @{[vaesz_vs $v24, $v1]}    # with round key w[ 0, 3]
+    @{[vaesem_vs $v24, $v2]}   # with round key w[ 4, 7]
+    @{[vaesem_vs $v24, $v3]}   # with round key w[ 8,11]
+    @{[vaesem_vs $v24, $v4]}   # with round key w[12,15]
+    @{[vaesem_vs $v24, $v5]}   # with round key w[16,19]
+    @{[vaesem_vs $v24, $v6]}   # with round key w[20,23]
+    @{[vaesem_vs $v24, $v7]}   # with round key w[24,27]
+    @{[vaesem_vs $v24, $v8]}   # with round key w[28,31]
+    @{[vaesem_vs $v24, $v9]}   # with round key w[32,35]
+    @{[vaesem_vs $v24, $v10]}  # with round key w[36,39]
+    @{[vaesem_vs $v24, $v11]}  # with round key w[40,43]
+    @{[vaesem_vs $v24, $v12]}  # with round key w[44,47]
+    @{[vaesef_vs $v24, $v13]}  # with round key w[48,51]
+___
+
+    return $code;
+}
+
+# aes-192 decryption with round keys v1-v13
+sub aes_192_decrypt {
+    my $code=<<___;
+    @{[vaesz_vs $v24, $v13]}   # with round key w[48,51]
+    @{[vaesdm_vs $v24, $v12]}  # with round key w[44,47]
+    @{[vaesdm_vs $v24, $v11]}  # with round key w[40,43]
+    @{[vaesdm_vs $v24, $v10]}  # with round key w[36,39]
+    @{[vaesdm_vs $v24, $v9]}   # with round key w[32,35]
+    @{[vaesdm_vs $v24, $v8]}   # with round key w[28,31]
+    @{[vaesdm_vs $v24, $v7]}   # with round key w[24,27]
+    @{[vaesdm_vs $v24, $v6]}   # with round key w[20,23]
+    @{[vaesdm_vs $v24, $v5]}   # with round key w[16,19]
+    @{[vaesdm_vs $v24, $v4]}   # with round key w[12,15]
+    @{[vaesdm_vs $v24, $v3]}   # with round key w[ 8,11]
+    @{[vaesdm_vs $v24, $v2]}   # with round key w[ 4, 7]
+    @{[vaesdf_vs $v24, $v1]}   # with round key w[ 0, 3]
+___
+
+    return $code;
+}
+
+# aes-256 encryption with round keys v1-v15
+sub aes_256_encrypt {
+    my $code=<<___;
+    @{[vaesz_vs $v24, $v1]}    # with round key w[ 0, 3]
+    @{[vaesem_vs $v24, $v2]}   # with round key w[ 4, 7]
+    @{[vaesem_vs $v24, $v3]}   # with round key w[ 8,11]
+    @{[vaesem_vs $v24, $v4]}   # with round key w[12,15]
+    @{[vaesem_vs $v24, $v5]}   # with round key w[16,19]
+    @{[vaesem_vs $v24, $v6]}   # with round key w[20,23]
+    @{[vaesem_vs $v24, $v7]}   # with round key w[24,27]
+    @{[vaesem_vs $v24, $v8]}   # with round key w[28,31]
+    @{[vaesem_vs $v24, $v9]}   # with round key w[32,35]
+    @{[vaesem_vs $v24, $v10]}  # with round key w[36,39]
+    @{[vaesem_vs $v24, $v11]}  # with round key w[40,43]
+    @{[vaesem_vs $v24, $v12]}  # with round key w[44,47]
+    @{[vaesem_vs $v24, $v13]}  # with round key w[48,51]
+    @{[vaesem_vs $v24, $v14]}  # with round key w[52,55]
+    @{[vaesef_vs $v24, $v15]}  # with round key w[56,59]
+___
+
+    return $code;
+}
+
+# aes-256 decryption with round keys v1-v15
+sub aes_256_decrypt {
+    my $code=<<___;
+    @{[vaesz_vs $v24, $v15]}   # with round key w[56,59]
+    @{[vaesdm_vs $v24, $v14]}  # with round key w[52,55]
+    @{[vaesdm_vs $v24, $v13]}  # with round key w[48,51]
+    @{[vaesdm_vs $v24, $v12]}  # with round key w[44,47]
+    @{[vaesdm_vs $v24, $v11]}  # with round key w[40,43]
+    @{[vaesdm_vs $v24, $v10]}  # with round key w[36,39]
+    @{[vaesdm_vs $v24, $v9]}   # with round key w[32,35]
+    @{[vaesdm_vs $v24, $v8]}   # with round key w[28,31]
+    @{[vaesdm_vs $v24, $v7]}   # with round key w[24,27]
+    @{[vaesdm_vs $v24, $v6]}   # with round key w[20,23]
+    @{[vaesdm_vs $v24, $v5]}   # with round key w[16,19]
+    @{[vaesdm_vs $v24, $v4]}   # with round key w[12,15]
+    @{[vaesdm_vs $v24, $v3]}   # with round key w[ 8,11]
+    @{[vaesdm_vs $v24, $v2]}   # with round key w[ 4, 7]
+    @{[vaesdf_vs $v24, $v1]}   # with round key w[ 0, 3]
+___
+
+    return $code;
+}
+
+$code .= <<___;
+.p2align 3
+.globl rv64i_zvkned_cbc_encrypt
+.type rv64i_zvkned_cbc_encrypt,\@function
+rv64i_zvkned_cbc_encrypt:
+    # check whether the length is a multiple of 16 and >= 16
+    li $T1, 16
+    blt $LEN, $T1, L_end
+    andi $T1, $LEN, 15
+    bnez $T1, L_end
+
+    # Load number of rounds
+    lwu $rounds, 240($KEYP)
+
+    # Get proper routine for key size
+    li $T0, 10
+    beq $rounds, $T0, L_cbc_enc_128
+
+    li $T0, 12
+    beq $rounds, $T0, L_cbc_enc_192
+
+    li $T0, 14
+    beq $rounds, $T0, L_cbc_enc_256
+
+    ret
+.size rv64i_zvkned_cbc_encrypt,.-rv64i_zvkned_cbc_encrypt
+___
+
+$code .= <<___;
+.p2align 3
+L_cbc_enc_128:
+    # Load all 11 round keys to v1-v11 registers.
+    @{[aes_128_load_key]}
+
+    # Load IV.
+    @{[vle32_v $v16, ($IVP)]}
+
+    @{[vle32_v $v24, ($INP)]}
+    @{[vxor_vv $v24, $v24, $v16]}
+    j 2f
+
+1:
+    @{[vle32_v $v17, ($INP)]}
+    @{[vxor_vv $v24, $v24, $v17]}
+
+2:
+    # AES body
+    @{[aes_128_encrypt]}
+
+    @{[vse32_v $v24, ($OUTP)]}
+
+    addi $INP, $INP, 16
+    addi $OUTP, $OUTP, 16
+    addi $LEN, $LEN, -16
+
+    bnez $LEN, 1b
+
+    @{[vse32_v $v24, ($IVP)]}
+
+    ret
+.size L_cbc_enc_128,.-L_cbc_enc_128
+___
+
+$code .= <<___;
+.p2align 3
+L_cbc_enc_192:
+    # Load all 13 round keys to v1-v13 registers.
+    @{[aes_192_load_key]}
+
+    # Load IV.
+    @{[vle32_v $v16, ($IVP)]}
+
+    @{[vle32_v $v24, ($INP)]}
+    @{[vxor_vv $v24, $v24, $v16]}
+    j 2f
+
+1:
+    @{[vle32_v $v17, ($INP)]}
+    @{[vxor_vv $v24, $v24, $v17]}
+
+2:
+    # AES body
+    @{[aes_192_encrypt]}
+
+    @{[vse32_v $v24, ($OUTP)]}
+
+    addi $INP, $INP, 16
+    addi $OUTP, $OUTP, 16
+    addi $LEN, $LEN, -16
+
+    bnez $LEN, 1b
+
+    @{[vse32_v $v24, ($IVP)]}
+
+    ret
+.size L_cbc_enc_192,.-L_cbc_enc_192
+___
+
+$code .= <<___;
+.p2align 3
+L_cbc_enc_256:
+    # Load all 15 round keys to v1-v15 registers.
+    @{[aes_256_load_key]}
+
+    # Load IV.
+    @{[vle32_v $v16, ($IVP)]}
+
+    @{[vle32_v $v24, ($INP)]}
+    @{[vxor_vv $v24, $v24, $v16]}
+    j 2f
+
+1:
+    @{[vle32_v $v17, ($INP)]}
+    @{[vxor_vv $v24, $v24, $v17]}
+
+2:
+    # AES body
+    @{[aes_256_encrypt]}
+
+    @{[vse32_v $v24, ($OUTP)]}
+
+    addi $INP, $INP, 16
+    addi $OUTP, $OUTP, 16
+    addi $LEN, $LEN, -16
+
+    bnez $LEN, 1b
+
+    @{[vse32_v $v24, ($IVP)]}
+
+    ret
+.size L_cbc_enc_256,.-L_cbc_enc_256
+___
+
+###############################################################################
+# void rv64i_zvkned_cbc_decrypt(const unsigned char *in, unsigned char *out,
+#                               size_t length, const AES_KEY *key,
+#                               unsigned char *ivec, const int enc);
+my ($INP, $OUTP, $LEN, $KEYP, $IVP, $ENC) = ("a0", "a1", "a2", "a3", "a4", "a5");
+my ($T0, $T1, $rounds) = ("t0", "t1", "t2");
+my ($v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7,
+    $v8, $v9, $v10, $v11, $v12, $v13, $v14, $v15,
+    $v16, $v17, $v18, $v19, $v20, $v21, $v22, $v23,
+    $v24, $v25, $v26, $v27, $v28, $v29, $v30, $v31,
+) = map("v$_",(0..31));
+
+$code .= <<___;
+.p2align 3
+.globl rv64i_zvkned_cbc_decrypt
+.type rv64i_zvkned_cbc_decrypt,\@function
+rv64i_zvkned_cbc_decrypt:
+    # check whether the length is a multiple of 16 and >= 16
+    li $T1, 16
+    blt $LEN, $T1, L_end
+    andi $T1, $LEN, 15
+    bnez $T1, L_end
+
+    # Load number of rounds
+    lwu $rounds, 240($KEYP)
+
+    # Get proper routine for key size
+    li $T0, 10
+    beq $rounds, $T0, L_cbc_dec_128
+
+    li $T0, 12
+    beq $rounds, $T0, L_cbc_dec_192
+
+    li $T0, 14
+    beq $rounds, $T0, L_cbc_dec_256
+
+    ret
+.size rv64i_zvkned_cbc_decrypt,.-rv64i_zvkned_cbc_decrypt
+___
+
+$code .= <<___;
+.p2align 3
+L_cbc_dec_128:
+    # Load all 11 round keys to v1-v11 registers.
+    @{[aes_128_load_key]}
+
+    # Load IV.
+    @{[vle32_v $v16, ($IVP)]}
+
+    @{[vle32_v $v24, ($INP)]}
+    @{[vmv_v_v $v17, $v24]}
+    j 2f
+
+1:
+    @{[vle32_v $v24, ($INP)]}
+    @{[vmv_v_v $v17, $v24]}
+    addi $OUTP, $OUTP, 16
+
+2:
+    # AES body
+    @{[aes_128_decrypt]}
+
+    @{[vxor_vv $v24, $v24, $v16]}
+    @{[vse32_v $v24, ($OUTP)]}
+    @{[vmv_v_v $v16, $v17]}
+
+    addi $LEN, $LEN, -16
+    addi $INP, $INP, 16
+
+    bnez $LEN, 1b
+
+    @{[vse32_v $v16, ($IVP)]}
+
+    ret
+.size L_cbc_dec_128,.-L_cbc_dec_128
+___
+
+$code .= <<___;
+.p2align 3
+L_cbc_dec_192:
+    # Load all 13 round keys to v1-v13 registers.
+    @{[aes_192_load_key]}
+
+    # Load IV.
+    @{[vle32_v $v16, ($IVP)]}
+
+    @{[vle32_v $v24, ($INP)]}
+    @{[vmv_v_v $v17, $v24]}
+    j 2f
+
+1:
+    @{[vle32_v $v24, ($INP)]}
+    @{[vmv_v_v $v17, $v24]}
+    addi $OUTP, $OUTP, 16
+
+2:
+    # AES body
+    @{[aes_192_decrypt]}
+
+    @{[vxor_vv $v24, $v24, $v16]}
+    @{[vse32_v $v24, ($OUTP)]}
+    @{[vmv_v_v $v16, $v17]}
+
+    addi $LEN, $LEN, -16
+    addi $INP, $INP, 16
+
+    bnez $LEN, 1b
+
+    @{[vse32_v $v16, ($IVP)]}
+
+    ret
+.size L_cbc_dec_192,.-L_cbc_dec_192
+___
+
+$code .= <<___;
+.p2align 3
+L_cbc_dec_256:
+    # Load all 15 round keys to v1-v15 registers.
+    @{[aes_256_load_key]}
+
+    # Load IV.
+    @{[vle32_v $v16, ($IVP)]}
+
+    @{[vle32_v $v24, ($INP)]}
+    @{[vmv_v_v $v17, $v24]}
+    j 2f
+
+1:
+    @{[vle32_v $v24, ($INP)]}
+    @{[vmv_v_v $v17, $v24]}
+    addi $OUTP, $OUTP, 16
+
+2:
+    # AES body
+    @{[aes_256_decrypt]}
+
+    @{[vxor_vv $v24, $v24, $v16]}
+    @{[vse32_v $v24, ($OUTP)]}
+    @{[vmv_v_v $v16, $v17]}
+
+    addi $LEN, $LEN, -16
+    addi $INP, $INP, 16
+
+    bnez $LEN, 1b
+
+    @{[vse32_v $v16, ($IVP)]}
+
+    ret
+.size L_cbc_dec_256,.-L_cbc_dec_256
+___
+
+}
+
 ################################################################################
 # int rv64i_zvkned_set_encrypt_key(const unsigned char *userKey, const int bits,
 #                                  AES_KEY *key)
@@ -627,6 +1155,10 @@ L_fail_m2:
     li a0, -2
     ret
 .size L_fail_m2,.-L_fail_m2
+
+L_end:
+    ret
+.size L_end,.-L_end
 ___
 
 print $code;
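
The patch is self-describing assembly, but the chaining structure is easier to audit against a scalar reference. The following C sketch is not part of the commit: it is a minimal model of what the L_cbc_enc_* and L_cbc_dec_* loops compute, with hypothetical aes_encrypt_block()/aes_decrypt_block() primitives standing in for the vaesz/vaesem/vaesef and vaesz/vaesdm/vaesdf round sequences (round keys assumed already expanded, as aes_*_load_key keeps them resident in v1-v15).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical single-block primitives; in the patch these are the
 * vaesz/vaesem/vaesef (encrypt) and vaesz/vaesdm/vaesdf (decrypt)
 * sequences operating on v24 with round keys held in v1..v15. */
void aes_encrypt_block(const uint8_t in[16], uint8_t out[16], const void *key);
void aes_decrypt_block(const uint8_t in[16], uint8_t out[16], const void *key);

static void xor16(uint8_t *dst, const uint8_t *src)
{
    for (int i = 0; i < 16; i++)
        dst[i] ^= src[i];
}

/* Mirrors L_cbc_enc_*: C[i] = E(P[i] ^ C[i-1]), with C[-1] = IV.  The
 * previous ciphertext stays in `block` across iterations, just as the
 * assembly leaves it in v24 and XORs the next plaintext into it. */
void cbc_encrypt_ref(const uint8_t *in, uint8_t *out, size_t len,
                     const void *key, uint8_t ivec[16])
{
    uint8_t block[16];

    if (len < 16 || (len & 15) != 0)    /* same early-out as L_end */
        return;
    memcpy(block, ivec, 16);
    while (len > 0) {
        xor16(block, in);               /* chain with previous ciphertext */
        aes_encrypt_block(block, block, key);
        memcpy(out, block, 16);         /* vse32.v v24, (OUTP) */
        in += 16; out += 16; len -= 16;
    }
    memcpy(ivec, block, 16);            /* last ciphertext becomes next IV */
}

/* Mirrors L_cbc_dec_*: P[i] = D(C[i]) ^ C[i-1].  The ciphertext is saved
 * before decryption (vmv.v.v v17, v24) because the chaining value for the
 * next block is the ciphertext, which also keeps in-place (in == out)
 * operation correct. */
void cbc_decrypt_ref(const uint8_t *in, uint8_t *out, size_t len,
                     const void *key, uint8_t ivec[16])
{
    uint8_t prev[16], saved[16], block[16];

    if (len < 16 || (len & 15) != 0)
        return;
    memcpy(prev, ivec, 16);
    while (len > 0) {
        memcpy(saved, in, 16);          /* keep C[i] for the next block */
        aes_decrypt_block(in, block, key);
        xor16(block, prev);             /* vxor.vv v24, v24, v16 */
        memcpy(out, block, 16);
        memcpy(prev, saved, 16);        /* vmv.v.v v16, v17 */
        in += 16; out += 16; len -= 16;
    }
    memcpy(ivec, prev, 16);             /* write back IV for the caller */
}

Keeping all round keys resident in vector registers pays off here because CBC encryption is inherently serial: each block depends on the previous ciphertext, so the loop cannot be parallelized across blocks, and avoiding per-block key reloads trims the work inside that serial loop.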