summaryrefslogtreecommitdiffstats
path: root/crypto/aes
diff options
context:
space:
mode:
authorHongren (Zenithal) Zheng <i@zenithal.me>2022-05-13 22:24:43 +0800
committerPauli <pauli@openssl.org>2022-09-05 10:20:30 +1000
commitb1b889d1b3fc92a56ead5536bee06f3415b78482 (patch)
treee89c78da40126d629f847f8086a78034e802cdef /crypto/aes
parent3e139f3d85396cab0bac5d263472b3223a51b76a (diff)
Add AES implementation in riscv32 zkn asm
Reviewed-by: Tomas Mraz <tomas@openssl.org> Reviewed-by: Paul Dale <pauli@openssl.org> (Merged from https://github.com/openssl/openssl/pull/18308)
Diffstat (limited to 'crypto/aes')
-rw-r--r--crypto/aes/asm/aes-riscv32-zkn.pl1061
1 files changed, 1061 insertions, 0 deletions
diff --git a/crypto/aes/asm/aes-riscv32-zkn.pl b/crypto/aes/asm/aes-riscv32-zkn.pl
new file mode 100644
index 0000000000..7a20f66e5c
--- /dev/null
+++ b/crypto/aes/asm/aes-riscv32-zkn.pl
@@ -0,0 +1,1061 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$output and open STDOUT,">$output";
+
+################################################################################
+# Utility functions to help with keeping track of which registers to stack/
+# unstack when entering / exiting routines.
+################################################################################
+{
+ # Callee-saved registers
+ my @callee_saved = map("x$_",(2,8,9,18..27));
+ # Caller-saved registers
+ my @caller_saved = map("x$_",(1,5..7,10..17,28..31));
+ my @must_save;
+ sub use_reg {
+ my $reg = shift;
+ if (grep(/^$reg$/, @callee_saved)) {
+ push(@must_save, $reg);
+ } elsif (!grep(/^$reg$/, @caller_saved)) {
+ # Register is not usable!
+ die("Unusable register ".$reg);
+ }
+ return $reg;
+ }
+ sub use_regs {
+ return map(use_reg("x$_"), @_);
+ }
+ sub save_regs {
+ my $ret = '';
+ my $stack_reservation = ($#must_save + 1) * 8;
+ my $stack_offset = $stack_reservation;
+ if ($stack_reservation % 16) {
+ $stack_reservation += 8;
+ }
+ $ret.=" addi sp,sp,-$stack_reservation\n";
+ foreach (@must_save) {
+ $stack_offset -= 8;
+ $ret.=" sw $_,$stack_offset(sp)\n";
+ }
+ return $ret;
+ }
+ sub load_regs {
+ my $ret = '';
+ my $stack_reservation = ($#must_save + 1) * 8;
+ my $stack_offset = $stack_reservation;
+ if ($stack_reservation % 16) {
+ $stack_reservation += 8;
+ }
+ foreach (@must_save) {
+ $stack_offset -= 8;
+ $ret.=" lw $_,$stack_offset(sp)\n";
+ }
+ $ret.=" addi sp,sp,$stack_reservation\n";
+ return $ret;
+ }
+ sub clear_regs {
+ @must_save = ();
+ }
+}
+
+################################################################################
+# util for encoding scalar crypto extension instructions
+################################################################################
+
+my @regs = map("x$_",(0..31));
+my %reglookup;
+@reglookup{@regs} = @regs;
+
+# Takes a register name, possibly an alias, and converts it to a register index
+# from 0 to 31
+sub read_reg {
+ my $reg = lc shift;
+ if (!exists($reglookup{$reg})) {
+ die("Unknown register ".$reg);
+ }
+ my $regstr = $reglookup{$reg};
+ if (!($regstr =~ /^x([0-9]+)$/)) {
+ die("Could not process register ".$reg);
+ }
+ return $1;
+}
+
+sub aes32dsi {
+ # Encoding for aes32dsi rd, rs1, rs2, bs instruction on RV32
+ # bs_XXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+ my $template = 0b00_10101_00000_00000_000_00000_0110011;
+ my $rd = read_reg shift;
+ my $rs1 = read_reg shift;
+ my $rs2 = read_reg shift;
+ my $bs = shift;
+
+ return ".word ".($template | ($bs << 30) | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
+}
+
+sub aes32dsmi {
+ # Encoding for aes32dsmi rd, rs1, rs2, bs instruction on RV32
+ # bs_XXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+ my $template = 0b00_10111_00000_00000_000_00000_0110011;
+ my $rd = read_reg shift;
+ my $rs1 = read_reg shift;
+ my $rs2 = read_reg shift;
+ my $bs = shift;
+
+ return ".word ".($template | ($bs << 30) | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
+}
+
+sub aes32esi {
+ # Encoding for aes32esi rd, rs1, rs2, bs instruction on RV32
+ # bs_XXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+ my $template = 0b00_10001_00000_00000_000_00000_0110011;
+ my $rd = read_reg shift;
+ my $rs1 = read_reg shift;
+ my $rs2 = read_reg shift;
+ my $bs = shift;
+
+ return ".word ".($template | ($bs << 30) | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
+}
+
+sub aes32esmi {
+ # Encoding for aes32esmi rd, rs1, rs2, bs instruction on RV32
+ # bs_XXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+ my $template = 0b00_10011_00000_00000_000_00000_0110011;
+ my $rd = read_reg shift;
+ my $rs1 = read_reg shift;
+ my $rs2 = read_reg shift;
+ my $bs = shift;
+
+ return ".word ".($template | ($bs << 30) | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
+}
+
+sub rori {
+ # Encoding for ror rd, rs1, imm instruction on RV64
+ # XXXXXXX_shamt_ rs1 _XXX_ rd _XXXXXXX
+ my $template = 0b0110000_00000_00000_101_00000_0010011;
+ my $rd = read_reg shift;
+ my $rs1 = read_reg shift;
+ my $shamt = shift;
+
+ return ".word ".($template | ($shamt << 20) | ($rs1 << 15) | ($rd << 7));
+}
+
+################################################################################
+# Register assignment for rv32i_zkne_encrypt and rv32i_zknd_decrypt
+################################################################################
+
+# Registers initially to hold AES state (called s0-s3 or y0-y3 elsewhere)
+my ($Q0,$Q1,$Q2,$Q3) = use_regs(6..9);
+
+# Function arguments (x10-x12 are a0-a2 in the ABI)
+# Input block pointer, output block pointer, key pointer
+my ($INP,$OUTP,$KEYP) = use_regs(10..12);
+
+# Registers initially to hold Key
+my ($T0,$T1,$T2,$T3) = use_regs(13..16);
+
+# Loop counter
+my ($loopcntr) = use_regs(30);
+
+################################################################################
+# Utility for rv32i_zkne_encrypt and rv32i_zknd_decrypt
+################################################################################
+
+# outer product of whole state into one column of key
+sub outer {
+ my $inst = shift;
+ my $key = shift;
+ # state 0 to 3
+ my $s0 = shift;
+ my $s1 = shift;
+ my $s2 = shift;
+ my $s3 = shift;
+ my $ret = '';
+$ret .= <<___;
+ @{[$inst->($key,$key,$s0,0)]}
+ @{[$inst->($key,$key,$s1,1)]}
+ @{[$inst->($key,$key,$s2,2)]}
+ @{[$inst->($key,$key,$s3,3)]}
+___
+ return $ret;
+}
+
+sub aes32esmi4 {
+ return outer(\&aes32esmi, @_)
+}
+
+sub aes32esi4 {
+ return outer(\&aes32esi, @_)
+}
+
+sub aes32dsmi4 {
+ return outer(\&aes32dsmi, @_)
+}
+
+sub aes32dsi4 {
+ return outer(\&aes32dsi, @_)
+}
+
+################################################################################
+# void rv32i_zkne_encrypt(const unsigned char *in, unsigned char *out,
+# const AES_KEY *key);
+################################################################################
+my $code .= <<___;
+.text
+.balign 16
+.globl rv32i_zkne_encrypt
+.type rv32i_zkne_encrypt,\@function
+rv32i_zkne_encrypt:
+___
+
+$code .= save_regs();
+
+$code .= <<___;
+ # Load input to block cipher
+ lw $Q0,0($INP)
+ lw $Q1,4($INP)
+ lw $Q2,8($INP)
+ lw $Q3,12($INP)
+
+ # Load key
+ lw $T0,0($KEYP)
+ lw $T1,4($KEYP)
+ lw $T2,8($KEYP)
+ lw $T3,12($KEYP)
+
+ # Load number of rounds
+ lw $loopcntr,240($KEYP)
+
+ # initial transformation
+ xor $Q0,$Q0,$T0
+ xor $Q1,$Q1,$T1
+ xor $Q2,$Q2,$T2
+ xor $Q3,$Q3,$T3
+
+ # The main loop only executes the first N-2 rounds, each loop consumes two rounds
+ add $loopcntr,$loopcntr,-2
+ srli $loopcntr,$loopcntr,1
+1:
+ # Grab next key in schedule
+ add $KEYP,$KEYP,16
+ lw $T0,0($KEYP)
+ lw $T1,4($KEYP)
+ lw $T2,8($KEYP)
+ lw $T3,12($KEYP)
+
+ @{[aes32esmi4 $T0,$Q0,$Q1,$Q2,$Q3]}
+ @{[aes32esmi4 $T1,$Q1,$Q2,$Q3,$Q0]}
+ @{[aes32esmi4 $T2,$Q2,$Q3,$Q0,$Q1]}
+ @{[aes32esmi4 $T3,$Q3,$Q0,$Q1,$Q2]}
+ # now T0~T3 hold the new state
+
+ # Grab next key in schedule
+ add $KEYP,$KEYP,16
+ lw $Q0,0($KEYP)
+ lw $Q1,4($KEYP)
+ lw $Q2,8($KEYP)
+ lw $Q3,12($KEYP)
+
+ @{[aes32esmi4 $Q0,$T0,$T1,$T2,$T3]}
+ @{[aes32esmi4 $Q1,$T1,$T2,$T3,$T0]}
+ @{[aes32esmi4 $Q2,$T2,$T3,$T0,$T1]}
+ @{[aes32esmi4 $Q3,$T3,$T0,$T1,$T2]}
+ # now Q0~Q3 hold the new state
+
+ add $loopcntr,$loopcntr,-1
+ bgtz $loopcntr,1b
+
+# final two rounds
+ # Grab next key in schedule
+ add $KEYP,$KEYP,16
+ lw $T0,0($KEYP)
+ lw $T1,4($KEYP)
+ lw $T2,8($KEYP)
+ lw $T3,12($KEYP)
+
+ @{[aes32esmi4 $T0,$Q0,$Q1,$Q2,$Q3]}
+ @{[aes32esmi4 $T1,$Q1,$Q2,$Q3,$Q0]}
+ @{[aes32esmi4 $T2,$Q2,$Q3,$Q0,$Q1]}
+ @{[aes32esmi4 $T3,$Q3,$Q0,$Q1,$Q2]}
+ # now T0~T3 hold the new state
+
+ # Grab next key in schedule
+ add $KEYP,$KEYP,16
+ lw $Q0,0($KEYP)
+ lw $Q1,4($KEYP)
+ lw $Q2,8($KEYP)
+ lw $Q3,12($KEYP)
+
+ # no mix column now
+ @{[aes32esi4 $Q0,$T0,$T1,$T2,$T3]}
+ @{[aes32esi4 $Q1,$T1,$T2,$T3,$T0]}
+ @{[aes32esi4 $Q2,$T2,$T3,$T0,$T1]}
+ @{[aes32esi4 $Q3,$T3,$T0,$T1,$T2]}
+ # now Q0~Q3 hold the new state
+
+ sw $Q0,0($OUTP)
+ sw $Q1,4($OUTP)
+ sw $Q2,8($OUTP)
+ sw $Q3,12($OUTP)
+
+ # Pop registers and return
+___
+
+$code .= load_regs();
+
+$code .= <<___;
+ ret
+___
+
+################################################################################
+# void rv32i_zknd_decrypt(const unsigned char *in, unsigned char *out,
+# const AES_KEY *key);
+################################################################################
+$code .= <<___;
+.text
+.balign 16
+.globl rv32i_zknd_decrypt
+.type rv32i_zknd_decrypt,\@function
+rv32i_zknd_decrypt:
+___
+
+$code .= save_regs();
+
+$code .= <<___;
+ # Load input to block cipher
+ lw $Q0,0($INP)
+ lw $Q1,4($INP)
+ lw $Q2,8($INP)
+ lw $Q3,12($INP)
+
+ # Load number of rounds
+ lw $loopcntr,240($KEYP)
+
+ # Load the last key
+ # use T0 as temporary now
+ slli $T0,$loopcntr,4
+ add $KEYP,$KEYP,$T0
+ # Load key
+ lw $T0,0($KEYP)
+ lw $T1,4($KEYP)
+ lw $T2,8($KEYP)
+ lw $T3,12($KEYP)
+
+ # initial transformation
+ xor $Q0,$Q0,$T0
+ xor $Q1,$Q1,$T1
+ xor $Q2,$Q2,$T2
+ xor $Q3,$Q3,$T3
+
+ # The main loop only executes the first N-2 rounds, each loop consumes two rounds
+ add $loopcntr,$loopcntr,-2
+ srli $loopcntr,$loopcntr,1
+1:
+ # Grab next key in schedule
+ add $KEYP,$KEYP,-16
+ lw $T0,0($KEYP)
+ lw $T1,4($KEYP)
+ lw $T2,8($KEYP)
+ lw $T3,12($KEYP)
+
+ @{[aes32dsmi4 $T0,$Q0,$Q3,$Q2,$Q1]}
+ @{[aes32dsmi4 $T1,$Q1,$Q0,$Q3,$Q2]}
+ @{[aes32dsmi4 $T2,$Q2,$Q1,$Q0,$Q3]}
+ @{[aes32dsmi4 $T3,$Q3,$Q2,$Q1,$Q0]}
+ # now T0~T3 hold the new state
+
+ # Grab next key in schedule
+ add $KEYP,$KEYP,-16
+ lw $Q0,0($KEYP)
+ lw $Q1,4($KEYP)
+ lw $Q2,8($KEYP)
+ lw $Q3,12($KEYP)
+
+ @{[aes32dsmi4 $Q0,$T0,$T3,$T2,$T1]}
+ @{[aes32dsmi4 $Q1,$T1,$T0,$T3,$T2]}
+ @{[aes32dsmi4 $Q2,$T2,$T1,$T0,$T3]}
+ @{[aes32dsmi4 $Q3,$T3,$T2,$T1,$T0]}
+ # now Q0~Q3 hold the new state
+
+ add $loopcntr,$loopcntr,-1
+ bgtz $loopcntr,1b
+
+# final two rounds
+ # Grab next key in schedule
+ add $KEYP,$KEYP,-16
+ lw $T0,0($KEYP)
+ lw $T1,4($KEYP)
+ lw $T2,8($KEYP)
+ lw $T3,12($KEYP)
+
+ @{[aes32dsmi4 $T0,$Q0,$Q3,$Q2,$Q1]}
+ @{[aes32dsmi4 $T1,$Q1,$Q0,$Q3,$Q2]}
+ @{[aes32dsmi4 $T2,$Q2,$Q1,$Q0,$Q3]}
+ @{[aes32dsmi4 $T3,$Q3,$Q2,$Q1,$Q0]}
+ # now T0~T3 hold the new state
+
+ # Grab next key in schedule
+ add $KEYP,$KEYP,-16
+ lw $Q0,0($KEYP)
+ lw $Q1,4($KEYP)
+ lw $Q2,8($KEYP)
+ lw $Q3,12($KEYP)
+
+ # no mix column now
+ @{[aes32dsi4 $Q0,$T0,$T3,$T2,$T1]}
+ @{[aes32dsi4 $Q1,$T1,$T0,$T3,$T2]}
+ @{[aes32dsi4 $Q2,$T2,$T1,$T0,$T3]}
+ @{[aes32dsi4 $Q3,$T3,$T2,$T1,$T0]}
+ # now Q0~Q3 hold the new state
+
+ sw $Q0,0($OUTP)
+ sw $Q1,4($OUTP)
+ sw $Q2,8($OUTP)
+ sw $Q3,12($OUTP)
+
+ # Pop registers and return
+___
+
+$code .= load_regs();
+
+$code .= <<___;
+ ret
+___
+
+clear_regs();
+
+################################################################################
+# Register assignment for rv32i_zkn[e/d]_set_[en/de]crypt
+################################################################################
+
+# Function arguments (x10-x12 are a0-a2 in the ABI)
+# Pointer to user key, number of bits in key, key pointer
+my ($UKEY,$BITS,$KEYP) = use_regs(10..12);
+
+# Temporaries
+my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8) = use_regs(13..17,28..31);
+
+################################################################################
+# utility functions for rv32i_zkne_set_encrypt_key
+################################################################################
+
+my @rcon = (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36);
+
+# do 4 sbox on 4 bytes of rs, (possibly mix), then xor with rd
+sub sbox4 {
+ my $inst = shift;
+ my $rd = shift;
+ my $rs = shift;
+ my $ret = <<___;
+ @{[$inst->($rd,$rd,$rs,0)]}
+ @{[$inst->($rd,$rd,$rs,1)]}
+ @{[$inst->($rd,$rd,$rs,2)]}
+ @{[$inst->($rd,$rd,$rs,3)]}
+___
+ return $ret;
+}
+
+sub fwdsbox4 {
+ return sbox4(\&aes32esi, @_);
+}
+
+sub ke128enc {
+ my $zbkb = shift;
+ my $rnum = 0;
+ my $ret = '';
+$ret .= <<___;
+ lw $T0,0($UKEY)
+ lw $T1,4($UKEY)
+ lw $T2,8($UKEY)
+ lw $T3,12($UKEY)
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+___
+ while($rnum < 10) {
+$ret .= <<___;
+ # use T4 to store rcon
+ li $T4,$rcon[$rnum]
+ # as xor is associative and commutative
+ # we fist xor T0 with RCON, then use T0 to
+ # xor the result of each SBOX result of T3
+ xor $T0,$T0,$T4
+ # use T4 to store rotated T3
+___
+ # right rotate by 8
+ if ($zbkb) {
+$ret .= <<___;
+ @{[rori $T4,$T3,8]}
+___
+ } else {
+$ret .= <<___;
+ srli $T4,$T3,8
+ slli $T5,$T3,24
+ or $T4,$T4,$T5
+___
+ }
+$ret .= <<___;
+ # update T0
+ @{[fwdsbox4 $T0,$T4]}
+
+ # update new T1~T3
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+
+ add $KEYP,$KEYP,16
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+___
+ $rnum++;
+ }
+ return $ret;
+}
+
+sub ke192enc {
+ my $zbkb = shift;
+ my $rnum = 0;
+ my $ret = '';
+$ret .= <<___;
+ lw $T0,0($UKEY)
+ lw $T1,4($UKEY)
+ lw $T2,8($UKEY)
+ lw $T3,12($UKEY)
+ lw $T4,16($UKEY)
+ lw $T5,20($UKEY)
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+ sw $T4,16($KEYP)
+ sw $T5,20($KEYP)
+___
+ while($rnum < 8) {
+$ret .= <<___;
+ # see the comment in ke128enc
+ li $T6,$rcon[$rnum]
+ xor $T0,$T0,$T6
+___
+ # right rotate by 8
+ if ($zbkb) {
+$ret .= <<___;
+ @{[rori $T6,$T5,8]}
+___
+ } else {
+$ret .= <<___;
+ srli $T6,$T5,8
+ slli $T7,$T5,24
+ or $T6,$T6,$T7
+___
+ }
+$ret .= <<___;
+ @{[fwdsbox4 $T0,$T6]}
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+___
+ if ($rnum != 7) {
+ # note that (8+1)*24 = 216, (12+1)*16 = 208
+ # thus the last 8 bytes can be dropped
+$ret .= <<___;
+ xor $T4,$T4,$T3
+ xor $T5,$T5,$T4
+___
+ }
+$ret .= <<___;
+ add $KEYP,$KEYP,24
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+___
+ if ($rnum != 7) {
+$ret .= <<___;
+ sw $T4,16($KEYP)
+ sw $T5,20($KEYP)
+___
+ }
+ $rnum++;
+ }
+ return $ret;
+}
+
+sub ke256enc {
+ my $zbkb = shift;
+ my $rnum = 0;
+ my $ret = '';
+$ret .= <<___;
+ lw $T0,0($UKEY)
+ lw $T1,4($UKEY)
+ lw $T2,8($UKEY)
+ lw $T3,12($UKEY)
+ lw $T4,16($UKEY)
+ lw $T5,20($UKEY)
+ lw $T6,24($UKEY)
+ lw $T7,28($UKEY)
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+ sw $T4,16($KEYP)
+ sw $T5,20($KEYP)
+ sw $T6,24($KEYP)
+ sw $T7,28($KEYP)
+___
+ while($rnum < 7) {
+$ret .= <<___;
+ # see the comment in ke128enc
+ li $T8,$rcon[$rnum]
+ xor $T0,$T0,$T8
+___
+ # right rotate by 8
+ if ($zbkb) {
+$ret .= <<___;
+ @{[rori $T8,$T7,8]}
+___
+ } else {
+$ret .= <<___;
+ srli $T8,$T7,8
+ slli $BITS,$T7,24
+ or $T8,$T8,$BITS
+___
+ }
+$ret .= <<___;
+ @{[fwdsbox4 $T0,$T8]}
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+
+ add $KEYP,$KEYP,32
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+___
+ if ($rnum != 6) {
+ # note that (7+1)*32 = 256, (14+1)*16 = 240
+ # thus the last 16 bytes can be dropped
+$ret .= <<___;
+ # for aes256, T3->T4 needs 4sbox but no rotate/rcon
+ @{[fwdsbox4 $T4,$T3]}
+ xor $T5,$T5,$T4
+ xor $T6,$T6,$T5
+ xor $T7,$T7,$T6
+ sw $T4,16($KEYP)
+ sw $T5,20($KEYP)
+ sw $T6,24($KEYP)
+ sw $T7,28($KEYP)
+___
+ }
+ $rnum++;
+ }
+ return $ret;
+}
+
+################################################################################
+# void rv32i_zkne_set_encrypt_key(const unsigned char *userKey, const int bits,
+# AES_KEY *key)
+################################################################################
+sub AES_set_common {
+ my ($ke128, $ke192, $ke256) = @_;
+ my $ret = '';
+$ret .= <<___;
+ bnez $UKEY,1f # if (!userKey || !key) return -1;
+ bnez $KEYP,1f
+ li a0,-1
+ ret
+1:
+ # Determine number of rounds from key size in bits
+ li $T0,128
+ bne $BITS,$T0,1f
+ li $T1,10 # key->rounds = 10 if bits == 128
+ sw $T1,240($KEYP) # store key->rounds
+$ke128
+ j 4f
+1:
+ li $T0,192
+ bne $BITS,$T0,2f
+ li $T1,12 # key->rounds = 12 if bits == 192
+ sw $T1,240($KEYP) # store key->rounds
+$ke192
+ j 4f
+2:
+ li $T1,14 # key->rounds = 14 if bits == 256
+ li $T0,256
+ beq $BITS,$T0,3f
+ li a0,-2 # If bits != 128, 192, or 256, return -2
+ j 5f
+3:
+ sw $T1,240($KEYP) # store key->rounds
+$ke256
+4: # return 0
+ li a0,0
+5: # return a0
+___
+ return $ret;
+}
+$code .= <<___;
+.text
+.balign 16
+.globl rv32i_zkne_set_encrypt_key
+.type rv32i_zkne_set_encrypt_key,\@function
+rv32i_zkne_set_encrypt_key:
+___
+
+$code .= save_regs();
+$code .= AES_set_common(ke128enc(0), ke192enc(0),ke256enc(0));
+$code .= load_regs();
+$code .= <<___;
+ ret
+___
+
+################################################################################
+# void rv32i_zbkb_zkne_set_encrypt_key(const unsigned char *userKey,
+# const int bits, AES_KEY *key)
+################################################################################
+$code .= <<___;
+.text
+.balign 16
+.globl rv32i_zbkb_zkne_set_encrypt_key
+.type rv32i_zbkb_zkne_set_encrypt_key,\@function
+rv32i_zbkb_zkne_set_encrypt_key:
+___
+
+$code .= save_regs();
+$code .= AES_set_common(ke128enc(1), ke192enc(1),ke256enc(1));
+$code .= load_regs();
+$code .= <<___;
+ ret
+___
+
+################################################################################
+# utility functions for rv32i_zknd_zkne_set_decrypt_key
+################################################################################
+
+sub invm4 {
+ # fwd sbox then inv sbox then mix column
+ # the result is only mix column
+ # this simulates aes64im T0
+ my $rd = shift;
+ my $tmp = shift;
+ my $rs = shift;
+ my $ret = <<___;
+ li $tmp,0
+ li $rd,0
+ @{[fwdsbox4 $tmp,$rs]}
+ @{[sbox4(\&aes32dsmi, $rd,$tmp)]}
+___
+ return $ret;
+}
+
+sub ke128dec {
+ my $zbkb = shift;
+ my $rnum = 0;
+ my $ret = '';
+$ret .= <<___;
+ lw $T0,0($UKEY)
+ lw $T1,4($UKEY)
+ lw $T2,8($UKEY)
+ lw $T3,12($UKEY)
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+___
+ while($rnum < 10) {
+$ret .= <<___;
+ # see comments in ke128enc
+ li $T4,$rcon[$rnum]
+ xor $T0,$T0,$T4
+___
+ # right rotate by 8
+ if ($zbkb) {
+$ret .= <<___;
+ @{[rori $T4,$T3,8]}
+___
+ } else {
+$ret .= <<___;
+ srli $T4,$T3,8
+ slli $T5,$T3,24
+ or $T4,$T4,$T5
+___
+ }
+$ret .= <<___;
+ @{[fwdsbox4 $T0,$T4]}
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+ add $KEYP,$KEYP,16
+___
+ # need to mixcolumn only for [1:N-1] round keys
+ # this is from the fact that aes32dsmi subwords first then mix column
+ # intuitively decryption needs to first mix column then subwords
+ # however, for merging datapaths (encryption first subwords then mix column)
+ # aes32dsmi chooses to inverse the order of them, thus
+ # transform should then be done on the round key
+ if ($rnum < 9) {
+$ret .= <<___;
+ # T4 and T5 are temp variables
+ @{[invm4 $T5,$T4,$T0]}
+ sw $T5,0($KEYP)
+ @{[invm4 $T5,$T4,$T1]}
+ sw $T5,4($KEYP)
+ @{[invm4 $T5,$T4,$T2]}
+ sw $T5,8($KEYP)
+ @{[invm4 $T5,$T4,$T3]}
+ sw $T5,12($KEYP)
+___
+ } else {
+$ret .= <<___;
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+___
+ }
+ $rnum++;
+ }
+ return $ret;
+}
+
+sub ke192dec {
+ my $zbkb = shift;
+ my $rnum = 0;
+ my $ret = '';
+$ret .= <<___;
+ lw $T0,0($UKEY)
+ lw $T1,4($UKEY)
+ lw $T2,8($UKEY)
+ lw $T3,12($UKEY)
+ lw $T4,16($UKEY)
+ lw $T5,20($UKEY)
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+ # see the comment in ke128dec
+ # T7 and T6 are temp variables
+ @{[invm4 $T7,$T6,$T4]}
+ sw $T7,16($KEYP)
+ @{[invm4 $T7,$T6,$T5]}
+ sw $T7,20($KEYP)
+___
+ while($rnum < 8) {
+$ret .= <<___;
+ # see the comment in ke128enc
+ li $T6,$rcon[$rnum]
+ xor $T0,$T0,$T6
+___
+ # right rotate by 8
+ if ($zbkb) {
+$ret .= <<___;
+ @{[rori $T6,$T5,8]}
+___
+ } else {
+$ret .= <<___;
+ srli $T6,$T5,8
+ slli $T7,$T5,24
+ or $T6,$T6,$T7
+___
+ }
+$ret .= <<___;
+ @{[fwdsbox4 $T0,$T6]}
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+
+ add $KEYP,$KEYP,24
+___
+ if ($rnum < 7) {
+$ret .= <<___;
+ xor $T4,$T4,$T3
+ xor $T5,$T5,$T4
+
+ # see the comment in ke128dec
+ # T7 and T6 are temp variables
+ @{[invm4 $T7,$T6,$T0]}
+ sw $T7,0($KEYP)
+ @{[invm4 $T7,$T6,$T1]}
+ sw $T7,4($KEYP)
+ @{[invm4 $T7,$T6,$T2]}
+ sw $T7,8($KEYP)
+ @{[invm4 $T7,$T6,$T3]}
+ sw $T7,12($KEYP)
+ @{[invm4 $T7,$T6,$T4]}
+ sw $T7,16($KEYP)
+ @{[invm4 $T7,$T6,$T5]}
+ sw $T7,20($KEYP)
+___
+ } else { # rnum == 7
+$ret .= <<___;
+ # the reason for dropping T4/T5 is in ke192enc
+ # the reason for not invm4 is in ke128dec
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+___
+ }
+ $rnum++;
+ }
+ return $ret;
+}
+
+sub ke256dec {
+ my $zbkb = shift;
+ my $rnum = 0;
+ my $ret = '';
+$ret .= <<___;
+ lw $T0,0($UKEY)
+ lw $T1,4($UKEY)
+ lw $T2,8($UKEY)
+ lw $T3,12($UKEY)
+ lw $T4,16($UKEY)
+ lw $T5,20($UKEY)
+ lw $T6,24($UKEY)
+ lw $T7,28($UKEY)
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+ # see the comment in ke128dec
+ # BITS and T8 are temp variables
+ # BITS are not used anymore
+ @{[invm4 $T8,$BITS,$T4]}
+ sw $T8,16($KEYP)
+ @{[invm4 $T8,$BITS,$T5]}
+ sw $T8,20($KEYP)
+ @{[invm4 $T8,$BITS,$T6]}
+ sw $T8,24($KEYP)
+ @{[invm4 $T8,$BITS,$T7]}
+ sw $T8,28($KEYP)
+___
+ while($rnum < 7) {
+$ret .= <<___;
+ # see the comment in ke128enc
+ li $T8,$rcon[$rnum]
+ xor $T0,$T0,$T8
+___
+ # right rotate by 8
+ if ($zbkb) {
+$ret .= <<___;
+ @{[rori $T8,$T7,8]}
+___
+ } else {
+$ret .= <<___;
+ srli $T8,$T7,8
+ slli $BITS,$T7,24
+ or $T8,$T8,$BITS
+___
+ }
+$ret .= <<___;
+ @{[fwdsbox4 $T0,$T8]}
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+
+ add $KEYP,$KEYP,32
+___
+ if ($rnum < 6) {
+$ret .= <<___;
+ # for aes256, T3->T4 needs 4sbox but no rotate/rcon
+ @{[fwdsbox4 $T4,$T3]}
+ xor $T5,$T5,$T4
+ xor $T6,$T6,$T5
+ xor $T7,$T7,$T6
+
+ # see the comment in ke128dec
+ # T8 and BITS are temp variables
+ @{[invm4 $T8,$BITS,$T0]}
+ sw $T8,0($KEYP)
+ @{[invm4 $T8,$BITS,$T1]}
+ sw $T8,4($KEYP)
+ @{[invm4 $T8,$BITS,$T2]}
+ sw $T8,8($KEYP)
+ @{[invm4 $T8,$BITS,$T3]}
+ sw $T8,12($KEYP)
+ @{[invm4 $T8,$BITS,$T4]}
+ sw $T8,16($KEYP)
+ @{[invm4 $T8,$BITS,$T5]}
+ sw $T8,20($KEYP)
+ @{[invm4 $T8,$BITS,$T6]}
+ sw $T8,24($KEYP)
+ @{[invm4 $T8,$BITS,$T7]}
+ sw $T8,28($KEYP)
+___
+ } else {
+$ret .= <<___;
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+ # last 16 bytes are dropped
+ # see the comment in ke256enc
+___
+ }
+ $rnum++;
+ }
+ return $ret;
+}
+
+################################################################################
+# void rv32i_zknd_zkne_set_decrypt_key(const unsigned char *userKey, const int bits,
+# AES_KEY *key)
+################################################################################
+# a note on naming: set_decrypt_key needs aes32esi thus add zkne on name
+$code .= <<___;
+.text
+.balign 16
+.globl rv32i_zknd_zkne_set_decrypt_key
+.type rv32i_zknd_zkne_set_decrypt_key,\@function
+rv32i_zknd_zkne_set_decrypt_key:
+___
+$code .= save_regs();
+$code .= AES_set_common(ke128dec(0), ke192dec(0),ke256dec(0));
+$code .= load_regs();
+$code .= <<___;
+ ret
+___
+
+################################################################################
+# void rv32i_zbkb_zknd_zkne_set_decrypt_key(const unsigned char *userKey,
+# const int bits, AES_KEY *key)
+################################################################################
+$code .= <<___;
+.text
+.balign 16
+.globl rv32i_zbkb_zknd_zkne_set_decrypt_key
+.type rv32i_zbkb_zknd_zkne_set_decrypt_key,\@function
+rv32i_zbkb_zknd_zkne_set_decrypt_key:
+___
+
+$code .= save_regs();
+$code .= AES_set_common(ke128dec(1), ke192dec(1),ke256dec(1));
+$code .= load_regs();
+$code .= <<___;
+ ret
+___
+
+
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";