summaryrefslogtreecommitdiffstats
path: root/crypto/aes
diff options
context:
space:
mode:
authorHenry Brausen <henry.brausen@vrull.eu>2022-01-28 01:13:04 -0700
committerPauli <pauli@openssl.org>2022-05-19 16:32:49 +1000
commitb3504b600c028a00f36cdbfedc928a48df9818ff (patch)
treeac4885d461554f0dfe9204527a51dca20969d68c /crypto/aes
parentec26144288fd6dce6dd76bd9e2b192b495033723 (diff)
Add AES implementation in generic riscv64 asm
This implementation is based on the four-table approach, along the same lines as the non-constant-time implementation in aes_core.c. The implementation is in perlasm. Utility functions are defined to automatically stack/unstack registers as needed for prologues and epilogues. See riscv-elf-psabi-doc at https://github.com/riscv-non-isa/riscv-elf-psabi-doc/ for ABI details. Reviewed-by: Philipp Tomsich <philipp.tomsich@vrull.eu> Signed-off-by: Henry Brausen <henry.brausen@vrull.eu> Reviewed-by: Tomas Mraz <tomas@openssl.org> Reviewed-by: Paul Dale <pauli@openssl.org> (Merged from https://github.com/openssl/openssl/pull/17640)
Diffstat (limited to 'crypto/aes')
-rw-r--r--crypto/aes/asm/aes-riscv64.pl1709
-rw-r--r--crypto/aes/build.info5
2 files changed, 1714 insertions, 0 deletions
diff --git a/crypto/aes/asm/aes-riscv64.pl b/crypto/aes/asm/aes-riscv64.pl
new file mode 100644
index 0000000000..525eba4b46
--- /dev/null
+++ b/crypto/aes/asm/aes-riscv64.pl
@@ -0,0 +1,1709 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+# Send the generated assembly to $output when one was given.  The
+# three-argument form of open keeps the file name from being parsed as a
+# mode, and a failed open now stops the script instead of silently leaving
+# STDOUT pointed at its previous destination.
+if ($output) {
+    open(STDOUT, '>', $output) or die("can't open $output: $!");
+}
+
+################################################################################
+# Utility functions to help with keeping track of which registers to stack/
+# unstack when entering / exiting routines.
+################################################################################
+{
+ # Callee-saved registers
+ my @callee_saved = map("x$_",(2,8,9,18..27));
+ # Caller-saved registers
+ my @caller_saved = map("x$_",(1,5..7,10..17,28..31));
+ # Callee-saved registers requested so far; save_regs()/load_regs() emit one
+ # stack slot per entry, in this order
+ my @must_save;
+
+ # use_reg(name)
+ # Registers one "xN" register for use by the routine being generated and
+ # returns the name unchanged.  A callee-saved register is queued for
+ # stacking (once, however often it is requested); a caller-saved register
+ # needs no bookkeeping; any other name aborts generation.
+ sub use_reg {
+     my $reg = shift;
+     # Compare names as plain strings so that nothing in the name is ever
+     # interpreted as a regular expression
+     if (grep { $_ eq $reg } @callee_saved) {
+         push(@must_save, $reg) unless grep { $_ eq $reg } @must_save;
+     } elsif (!grep { $_ eq $reg } @caller_saved) {
+         # Register is not usable!
+         die("Unusable register ".$reg);
+     }
+     return $reg;
+ }
+ # use_regs(numbers...)
+ # Registers xN via use_reg() for every register number given and returns
+ # the resulting register names in the same order.
+ sub use_regs {
+     my @names;
+     foreach my $num (@_) {
+         push(@names, use_reg("x".$num));
+     }
+     return @names;
+ }
+ # save_regs()
+ # Returns the prologue text: one stack-pointer adjustment followed by an
+ # "sd" for every register queued by use_reg(), highest slot first.
+ sub save_regs {
+     my $used_bytes = 8 * scalar(@must_save);
+     # The frame is rounded up to a multiple of 16 bytes; the padding slot,
+     # if any, sits above the saved registers and is never written
+     my $frame_bytes = ($used_bytes % 16) ? $used_bytes + 8 : $used_bytes;
+     my @lines = (" addi sp,sp,-$frame_bytes");
+     my $slot = $used_bytes;
+     foreach my $reg (@must_save) {
+         $slot -= 8;
+         push(@lines, " sd $reg,$slot(sp)");
+     }
+     return join("\n", @lines)."\n";
+ }
+ # load_regs()
+ # Returns the epilogue text matching save_regs(): an "ld" for every queued
+ # register from the same slots, then the stack-pointer adjustment that
+ # releases the (16-byte rounded) frame.
+ sub load_regs {
+     my $used_bytes = 8 * scalar(@must_save);
+     my $frame_bytes = ($used_bytes % 16) ? $used_bytes + 8 : $used_bytes;
+     my $asm = '';
+     my $slot = $used_bytes;
+     foreach my $reg (@must_save) {
+         $slot -= 8;
+         $asm .= " ld $reg,$slot(sp)\n";
+     }
+     return $asm." addi sp,sp,$frame_bytes\n";
+ }
+ # clear_regs()
+ # Empties the list of registers to stack, ready for the next routine's
+ # register assignment.
+ sub clear_regs {
+     @must_save = ();
+ }
+}
+
+################################################################################
+# Register assignment for AES_encrypt and AES_decrypt
+################################################################################
+
+# Registers to hold AES state (called s0-s3 or y0-y3 elsewhere); x8 and x9 are callee-saved and get queued for stacking
+my ($Q0,$Q1,$Q2,$Q3) = use_regs(6..9);
+
+# Function arguments (x10-x12 are a0-a2 in the ABI)
+# Input block pointer, output block pointer, key pointer
+my ($INP,$OUTP,$KEYP) = use_regs(10..12);
+
+# Temporaries (x18-x27 are in the callee-saved list, so requesting them here queues them for stacking)
+my ($T0,$T1,$T2,$T3) = use_regs(13..16);
+my ($T4,$T5,$T6,$T7,$T8,$T9,$T10,$T11) = use_regs(17..24);
+my ($T12,$T13,$T14,$T15) = use_regs(25..28);
+
+# Register to hold table offset (the lookup helpers put TBL + byte offset here and load through it)
+my ($I0) = use_regs(29);
+
+# Loop counter
+my ($loopcntr) = use_regs(30);
+
+# Lookup table address register
+my ($TBL) = use_regs(31);
+
+# Lookup table mask register (loaded with ~0xFFF and ANDed into TBL after each round)
+my ($MSK) = use_regs(5);
+
+# Aliases for readability: the final round key is loaded into the loop counter and key pointer registers
+my $K0 = $loopcntr;
+my $K1 = $KEYP;
+
+################################################################################
+# Table lookup utility functions for AES_encrypt and AES_decrypt
+################################################################################
+
+# do_lookup([destination regs], [state regs], [temporary regs], shamt)
+# do_lookup returns the assembly text that loads four entries from the AES
+# encryption/decryption table addressed by TBL and stores the result in the
+# specified destination register set
+# Ds->[0] = Table[(Qs->[0] >> shamt) & 0xFF]
+# Ds->[1] = Table[(Qs->[1] >> shamt) & 0xFF]
+# Ds->[2] = Table[(Qs->[2] >> shamt) & 0xFF]
+# Ds->[3] = Table[(Qs->[3] >> shamt) & 0xFF]
+# Four temporary regs are used to generate these lookups. The temporary regs
+# can be equal to the destination regs, but only if they appear in the same
+# order. I.e. do_lookup([A,B,C,D],[...],[A,B,C,D],...) is OK
+# I0 is clobbered by the generated code.
+sub do_lookup {
+ # (destination regs, state regs, temporary regs, shift amount)
+ my ($Ds, $Qs, $Ts, $shamt) = @_;
+
+ my $ret = '';
+
+ # AES encryption/decryption table entries have word-sized (4-byte) entries.
+ # To convert the table index into a byte offset, we compute
+ # ((Qs->[i] >> shamt) & 0xFF) << 2
+ # However, to save work, we compute the equivalent expression
+ # (Qs->[i] >> (shamt-2)) & 0x3FC
+ # When shamt < 2 the net shift is to the left, by 2-shamt bits (the same
+ # form do_enc_lookup uses).
+ if ($shamt < 2) {
+$ret .= <<___;
+
+ slli $Ts->[0],$Qs->[0],2-$shamt
+ slli $Ts->[1],$Qs->[1],2-$shamt
+ slli $Ts->[2],$Qs->[2],2-$shamt
+ slli $Ts->[3],$Qs->[3],2-$shamt
+___
+ } else {
+$ret .= <<___;
+
+ srli $Ts->[0],$Qs->[0],$shamt-2
+ srli $Ts->[1],$Qs->[1],$shamt-2
+ srli $Ts->[2],$Qs->[2],$shamt-2
+ srli $Ts->[3],$Qs->[3],$shamt-2
+___
+ }
+
+$ret .= <<___;
+
+ andi $Ts->[0],$Ts->[0],0x3FC
+ andi $Ts->[1],$Ts->[1],0x3FC
+ andi $Ts->[2],$Ts->[2],0x3FC
+ andi $Ts->[3],$Ts->[3],0x3FC
+
+ # Index into table.
+ add $I0,$TBL,$Ts->[0]
+ lwu $Ds->[0],0($I0)
+ add $I0,$TBL,$Ts->[1]
+ lwu $Ds->[1],0($I0)
+ add $I0,$TBL,$Ts->[2]
+ lwu $Ds->[2],0($I0)
+ add $I0,$TBL,$Ts->[3]
+ lwu $Ds->[3],0($I0)
+
+___
+
+ return $ret;
+}
+
+# Same arguments and generated code as do_lookup(), except that every word
+# load (lwu) is turned into a byte load (lbu), so each destination register
+# receives a single byte. Used in the final round of AES_encrypt.
+sub do_lookup_byte {
+ (my $asm = do_lookup(@_)) =~ s/lwu/lbu/g;
+ return $asm;
+}
+
+# do_lookup_Td4([destination regs], [state regs], [temporary regs])
+# Used in final phase of AES_decrypt
+# Ds->[0] = Table[(Qs->[0]) &0xFF]
+# Ds->[1] = Table[(Qs->[1] >> 8 )&0xFF]
+# Ds->[2] = Table[(Qs->[2] >> 16)&0xFF]
+# Ds->[3] = Table[(Qs->[3] >> 24)&0xFF]
+# Four temporary regs are used to generate these lookups. The temporary regs
+# can be equal to the destination regs, but only if they appear in the same
+# order. I.e. do_lookup([A,B,C,D],[...],[A,B,C,D],...) is OK
+sub do_lookup_Td4 {
+ # (destination regs, state regs, temporary regs)
+ my ($Ds, $Qs, $Ts) = @_;
+
+ my @asm;
+
+ # State word k contributes its byte k: words 1-3 are shifted down by
+ # 8*k bits first, word 0 is masked directly from the state register.
+ foreach my $k (1..3) {
+     push(@asm, " srli $Ts->[$k],$Qs->[$k],".(8*$k));
+ }
+ push(@asm, "");
+ push(@asm, " andi $Ts->[0],$Qs->[0],0xFF");
+ foreach my $k (1..3) {
+     push(@asm, " andi $Ts->[$k],$Ts->[$k],0xFF");
+ }
+ push(@asm, "");
+
+ # Byte-wide table: the masked value is used as the offset unscaled.
+ foreach my $k (0..3) {
+     push(@asm, " add $I0,$TBL,$Ts->[$k]");
+     push(@asm, " lbu $Ds->[$k],0($I0)");
+ }
+ push(@asm, "");
+
+ return join("\n", @asm)."\n";
+}
+
+################################################################################
+# void AES_encrypt(const unsigned char *in, unsigned char *out,
+# const AES_KEY *key);
+################################################################################
+my $code .= <<___; # section directives and entry label for AES_encrypt
+.text
+.balign 16
+.globl AES_encrypt
+.type AES_encrypt,\@function
+AES_encrypt:
+___
+# Prologue: stack the callee-saved registers that were requested through use_regs()
+$code .= save_regs();
+# Block and first round key are read as two 64-bit words each; Q1/Q3 are the upper halves of Q0/Q2
+$code .= <<___;
+
+ # Load input to block cipher
+ ld $Q0,0($INP)
+ ld $Q2,8($INP)
+
+
+ # Load key
+ ld $T0,0($KEYP)
+ ld $T2,8($KEYP)
+
+
+ # Load number of rounds
+ lwu $loopcntr,240($KEYP)
+
+ # Load address of substitution table and wrap-around mask
+ la $TBL,AES_Te0
+ li $MSK,~0xFFF
+
+ # y = n xor k, stored in Q0-Q3
+
+ xor $Q0,$Q0,$T0
+ xor $Q2,$Q2,$T2
+ srli $Q1,$Q0,32
+ srli $Q3,$Q2,32
+
+ # The main loop only executes the first N-1 rounds.
+ add $loopcntr,$loopcntr,-1
+
+ # Do Nr - 1 rounds (final round is special)
+
+1:
+___
+
+# Lookup in table Te0 (TBL points at AES_Te0; index is the low byte of each state word)
+$code .= do_lookup(
+ [$T4,$T5,$T6,$T7], # Destination registers
+ [$Q0,$Q1,$Q2,$Q3], # State registers
+ [$T0,$T1,$T2,$T3], # Temporaries
+ 0 # Shift amount
+);
+# Step TBL forward by 1024 bytes (256 four-byte entries) to reach the next table
+$code .= <<___;
+ add $TBL,$TBL,1024
+___
+
+# Lookup in table Te1 (state list rotated by one word; index is bits 15:8)
+$code .= do_lookup(
+ [$T8,$T9,$T10,$T11],
+ [$Q1,$Q2,$Q3,$Q0],
+ [$T0,$T1,$T2,$T3],
+ 8
+);
+
+$code .= <<___;
+ add $TBL,$TBL,1024
+___
+
+# Lookup in table Te2 (state list rotated by two words; index is bits 23:16)
+$code .= do_lookup(
+ [$T12,$T13,$T14,$T15],
+ [$Q2,$Q3,$Q0,$Q1],
+ [$T0,$T1,$T2,$T3],
+ 16
+);
+
+$code .= <<___;
+ add $TBL,$TBL,1024
+___
+
+# Lookup in table Te3 (state list rotated by three words; index is bits 31:24; results overwrite the temporaries)
+$code .= do_lookup(
+ [$T0,$T1,$T2,$T3],
+ [$Q3,$Q0,$Q1,$Q2],
+ [$T0,$T1,$T2,$T3],
+ 24
+);
+# XOR the four lookups per column, add the next round key, rewind TBL, loop; then set up the final round
+$code .= <<___;
+
+ # Combine table lookups
+ xor $T4,$T4,$T8
+ xor $T5,$T5,$T9
+ xor $T6,$T6,$T10
+ xor $T7,$T7,$T11
+
+ xor $T4,$T4,$T12
+ xor $T5,$T5,$T13
+ xor $T6,$T6,$T14
+ xor $T7,$T7,$T15
+
+ xor $T0,$T0,$T4
+ xor $T1,$T1,$T5
+ xor $T2,$T2,$T6
+ xor $T3,$T3,$T7
+
+ # Update key ptr to point to next key in schedule
+ add $KEYP,$KEYP,16
+
+ # Grab next key in schedule
+ ld $T4,0($KEYP)
+ ld $T6,8($KEYP)
+
+ # Round TBL back to 4k boundary
+ and $TBL,$TBL,$MSK
+
+ add $loopcntr,$loopcntr,-1
+
+ xor $Q0,$T0,$T4
+ xor $Q2,$T2,$T6
+ srli $T5,$T4,32
+ xor $Q1,$T1,$T5
+ srli $T7,$T6,32
+ xor $Q3,$T3,$T7
+
+ bgtz $loopcntr,1b
+
+#================================FINAL ROUND====================================
+
+# In the final round, all lookup table accesses would appear as follows:
+#
+# ... compute index I0
+# add I0,TBL,T0
+# lbu T0,1(I0)
+#
+# Instead of indexing with a 1 offset, we can add 1 to the TBL pointer, and use
+# a 0 offset when indexing in the following code. This enables some instruction
+# fusion opportunities.
+
+ add $TBL,$TBL,1
+
+ ld $K0,16($KEYP)
+ ld $K1,24($KEYP)
+___
+# Final round: all four byte lookups go through the same TBL (AES_Te0 + 1); only state rotation and shift differ
+$code .= do_lookup_byte(
+ [$T4,$T5,$T6,$T7],
+ [$Q0,$Q1,$Q2,$Q3],
+ [$T0,$T1,$T2,$T3],
+ 0
+);
+
+$code .= do_lookup_byte(
+ [$T8,$T9,$T10,$T11],
+ [$Q1,$Q2,$Q3,$Q0],
+ [$T0,$T1,$T2,$T3],
+ 8
+);
+
+$code .= do_lookup_byte(
+ [$T12,$T13,$T14,$T15],
+ [$Q2,$Q3,$Q0,$Q1],
+ [$T0,$T1,$T2,$T3],
+ 16
+);
+
+$code .= do_lookup_byte(
+ [$T0,$T1,$T2,$T3],
+ [$Q3,$Q0,$Q1,$Q2],
+ [$T0,$T1,$T2,$T3],
+ 24
+);
+# Shift each looked-up byte to its lane, merge into two 64-bit words, add the last round key, store the block
+$code .= <<___;
+
+ # Combine table lookups into T0 and T2
+
+ slli $T5,$T5,32
+ slli $T7,$T7,32
+ slli $T8,$T8,8
+ slli $T9,$T9,8+32
+ slli $T10,$T10,8
+ slli $T11,$T11,8+32
+ slli $T12,$T12,16
+ slli $T13,$T13,16+32
+ slli $T14,$T14,16
+ slli $T15,$T15,16+32
+
+ slli $T0,$T0,24
+ slli $T1,$T1,24+32
+ slli $T2,$T2,24
+ slli $T3,$T3,24+32
+
+ xor $T4,$T4,$T0
+ xor $T5,$T5,$T1
+ xor $T6,$T6,$T2
+ xor $T7,$T7,$T3
+
+ xor $T8,$T8,$T12
+ xor $T9,$T9,$T13
+ xor $T10,$T10,$T14
+ xor $T11,$T11,$T15
+
+ xor $T0,$T4,$T8
+ xor $T1,$T5,$T9
+ xor $T2,$T6,$T10
+ xor $T3,$T7,$T11
+
+
+ xor $T0,$T0,$T1
+ # T0 = [T1 T13 T9 T5 T0 T12 T8 T4]
+ xor $T0,$T0,$K0 # XOR in key
+
+ xor $T2,$T2,$T3
+ # T2 = [T3 T15 T11 T7 T2 T14 T10 T6]
+ xor $T2,$T2,$K1 # XOR in key
+
+ sd $T0,0($OUTP)
+ sd $T2,8($OUTP)
+
+ # Pop registers and return
+2:
+___
+# Epilogue: reload the registers stacked by save_regs() and release the frame
+$code .= load_regs();
+
+$code .= <<___;
+ ret
+___
+
+################################################################################
+# void AES_decrypt(const unsigned char *in, unsigned char *out,
+# const AES_KEY *key);
+################################################################################
+$code .= <<___; # section directives and entry label for AES_decrypt
+.text
+.balign 16
+.globl AES_decrypt
+.type AES_decrypt,\@function
+AES_decrypt:
+___
+# Prologue: same register set, and therefore same stack frame, as AES_encrypt
+$code .= save_regs();
+# Block and first round key are read as two 64-bit words each; Q1/Q3 are the upper halves of Q0/Q2
+$code .= <<___;
+
+ # Load input to block cipher
+ ld $Q0,0($INP)
+ ld $Q2,8($INP)
+
+ # Load key
+ # Note that key is assumed in BE byte order
+ # (This routine was written against a key scheduling implementation that
+ # placed keys in BE byte order.)
+ ld $T0,0($KEYP)
+ ld $T2,8($KEYP)
+
+ # Load number of rounds
+ lwu $loopcntr,240($KEYP)
+
+ # Load address of substitution table and wrap-around mask
+ la $TBL,AES_Td0
+ li $MSK,~0xFFF
+
+ xor $Q0,$Q0,$T0
+ xor $Q2,$Q2,$T2
+ srli $Q1,$Q0,32
+ srli $Q3,$Q2,32
+
+ # The main loop only executes the first N-1 rounds.
+ add $loopcntr,$loopcntr,-1
+
+ # Do Nr - 1 rounds (final round is special)
+1:
+___
+
+# Lookup in Td0 (TBL points at AES_Td0; index is the low byte of each state word)
+$code .= do_lookup(
+ [$T4,$T5,$T6,$T7], # Destination registers
+ [$Q0,$Q1,$Q2,$Q3], # State registers
+ [$T0,$T1,$T2,$T3], # Temporaries
+ 0 # Shift amount
+);
+# Step TBL forward by 1024 bytes (256 four-byte entries) to reach the next table
+$code .= <<___;
+ add $TBL,$TBL,1024
+___
+
+# Lookup in Td1 (state list rotated the opposite way to AES_encrypt: Q3 first; index is bits 15:8)
+$code .= do_lookup(
+ [$T8,$T9,$T10,$T11],
+ [$Q3,$Q0,$Q1,$Q2],
+ [$T0,$T1,$T2,$T3],
+ 8
+);
+
+$code .= <<___;
+ add $TBL,$TBL,1024
+___
+
+# Lookup in Td2 (state list rotated by two words; index is bits 23:16)
+$code .= do_lookup(
+ [$T12,$T13,$T14,$T15],
+ [$Q2,$Q3,$Q0,$Q1],
+ [$T0,$T1,$T2,$T3],
+ 16
+);
+
+$code .= <<___;
+ add $TBL,$TBL,1024
+___
+
+# Lookup in Td3 (state list starts at Q1; index is bits 31:24; results overwrite the temporaries)
+$code .= do_lookup(
+ [$T0,$T1,$T2,$T3],
+ [$Q1,$Q2,$Q3,$Q0],
+ [$T0,$T1,$T2,$T3],
+ 24
+);
+# XOR the four lookups per column, add the next round key, rewind TBL, loop; then switch TBL to AES_Td4
+$code .= <<___;
+ xor $T4,$T4,$T8
+ xor $T5,$T5,$T9
+ xor $T6,$T6,$T10
+ xor $T7,$T7,$T11
+
+ xor $T4,$T4,$T12
+ xor $T5,$T5,$T13
+ xor $T6,$T6,$T14
+ xor $T7,$T7,$T15
+
+ xor $T0,$T0,$T4
+ xor $T1,$T1,$T5
+ xor $T2,$T2,$T6
+ xor $T3,$T3,$T7
+
+ # Update key ptr to point to next key in schedule
+ add $KEYP,$KEYP,16
+
+ # Grab next key in schedule
+ ld $T4,0($KEYP)
+ ld $T6,8($KEYP)
+
+ # Round TBL back to 4k boundary
+ and $TBL,$TBL,$MSK
+
+ add $loopcntr,$loopcntr,-1
+
+ xor $Q0,$T0,$T4
+ xor $Q2,$T2,$T6
+ srli $T5,$T4,32
+ xor $Q1,$T1,$T5
+ srli $T7,$T6,32
+ xor $Q3,$T3,$T7
+
+ bgtz $loopcntr,1b
+
+#================================FINAL ROUND====================================
+
+ la $TBL,AES_Td4
+
+ # K0,K1 are aliases for loopcntr,KEYP
+ # As these registers will no longer be used after these loads, reuse them
+ # to store the final key in the schedule.
+ ld $K0,16($KEYP)
+ ld $K1,24($KEYP)
+___
+# Final round: each call yields the four bytes of one output word (bytes 0..3 taken from the listed state words)
+$code .= do_lookup_Td4(
+ [$T4,$T5,$T6,$T7],
+ [$Q0,$Q3,$Q2,$Q1],
+ [$T0,$T1,$T2,$T3]
+);
+
+$code .= do_lookup_Td4(
+ [$T8,$T9,$T10,$T11],
+ [$Q1,$Q0,$Q3,$Q2],
+ [$T0,$T1,$T2,$T3]
+);
+
+$code .= do_lookup_Td4(
+ [$T12,$T13,$T14,$T15],
+ [$Q2,$Q1,$Q0,$Q3],
+ [$T0,$T1,$T2,$T3]
+);
+
+$code .= do_lookup_Td4(
+ [$T0,$T1,$T2,$T3],
+ [$Q3,$Q2,$Q1,$Q0],
+ [$T0,$T1,$T2,$T3]
+);
+# Shift each looked-up byte to its lane, merge into two 64-bit words, add the last round key, store the block
+$code .= <<___;
+
+ # T0-T15 now contain the decrypted block, minus xoring with the final round
+ # key. We pack T0-T15 into the two 64-bit registers T0 and T4, then xor
+ # in the key and store.
+
+ slli $T5,$T5,8
+ slli $T6,$T6,16
+ slli $T7,$T7,24
+ slli $T8,$T8,32
+ slli $T9,$T9,8+32
+ slli $T10,$T10,16+32
+ slli $T11,$T11,32+24
+ slli $T13,$T13,8
+ slli $T14,$T14,16
+ slli $T15,$T15,24
+ slli $T0,$T0,32
+ slli $T1,$T1,8+32
+ slli $T2,$T2,16+32
+ slli $T3,$T3,24+32
+
+ xor $T4,$T4,$T5
+ xor $T6,$T6,$T7
+ xor $T8,$T8,$T9
+ xor $T10,$T10,$T11
+
+ xor $T12,$T12,$T13
+ xor $T14,$T14,$T15
+ xor $T0,$T0,$T1
+ xor $T2,$T2,$T3
+
+ xor $T4,$T4,$T6
+ xor $T8,$T8,$T10
+ xor $T12,$T12,$T14
+ xor $T0,$T0,$T2
+
+ xor $T4,$T4,$T8
+ # T4 = [T11 T10 T9 T8 T7 T6 T5 T4]
+ xor $T4,$T4,$K0 # xor in key
+
+ xor $T0,$T0,$T12
+ # T0 = [T3 T2 T1 T0 T15 T14 T13 T12]
+ xor $T0,$T0,$K1 # xor in key
+
+ sd $T4,0($OUTP)
+ sd $T0,8($OUTP)
+
+ # Pop registers and return
+___
+# Epilogue: reload the registers stacked by save_regs() and release the frame
+$code .= load_regs();
+
+$code .= <<___;
+ ret
+___
+
+clear_regs(); # forget the AES_encrypt/AES_decrypt save list before assigning registers for the next routine
+
+################################################################################
+# Register assignment for AES_set_encrypt_key
+################################################################################
+
+# Function arguments (x10-x12 are a0-a2 in the ABI)
+# Pointer to user key, number of bits in key, key pointer
+my ($UKEY,$BITS,$KEYP) = use_regs(10..12);
+
+# Temporaries (of these only x8 is in the callee-saved list and gets stacked)
+my ($T0,$T1,$T2,$T3) = use_regs(6..8,13);
+my ($T4,$T5,$T6,$T7,$T8,$T9,$T10,$T11) = use_regs(14..17,28..31);
+
+# Pointer into rcon table (x9, callee-saved)
+my ($RCON) = use_regs(9);
+
+# Register to hold table offset and used as a temporary (x18, callee-saved)
+my ($I0) = use_regs(18);
+
+# Loop counter (x19, callee-saved); counts in steps of 4 so it doubles as a byte offset into the rcon table
+my ($loopcntr) = use_regs(19);
+
+# Lookup table address register (x20, callee-saved)
+my ($TBL) = use_regs(20);
+
+# Calculates dest = [
+# S[(in>>shifts[3])&0xFF],
+# S[(in>>shifts[2])&0xFF],
+# S[(in>>shifts[1])&0xFF],
+# S[(in>>shifts[0])&0xFF]
+# ]
+# This routine spreads accesses across Te0-Te3 to help bring those tables
+# into cache, in anticipation of running AES_[en/de]crypt.
+sub do_enc_lookup {
+ # (destination reg, input reg, shifts array, temporary regs)
+ # Returns the assembly text; the generated code clobbers TBL, I0 and the
+ # four temporaries, and leaves the assembled 32-bit value in dest.
+ my ($dest, $in, $shifts, $Ts) = @_;
+
+ my $ret = '';
+
+$ret .= <<___;
+
+ # Round TBL back to 4k boundary
+ srli $TBL,$TBL,12
+ slli $TBL,$TBL,12
+
+ # Offset by 1 byte, since Te0[x] = S[x].[03, 01, 01, 02]
+ # So that, later on, a 0-offset lbu yields S[x].01 == S[x]
+ addi $TBL,$TBL,1
+___
+
+ # Turn byte number shifts->[i] of the input into a word-table byte offset:
+ # ((in >> shift) & 0xFF) << 2 == (in >> (shift-2)) & 0x3FC, which becomes a
+ # left shift by 2-shift when shift < 2.  The loop index is lexical so that
+ # no package-global $i is created or clobbered.
+ foreach my $i (0..3) {
+ if ($shifts->[$i] < 2) {
+ $ret .= " slli $Ts->[$i],$in,2-$shifts->[$i]\n";
+ } else {
+ $ret .= " srli $Ts->[$i],$in,$shifts->[$i]-2\n";
+ }
+ }
+
+$ret .= <<___;
+
+ andi $Ts->[0],$Ts->[0],0x3FC
+ andi $Ts->[1],$Ts->[1],0x3FC
+ andi $Ts->[2],$Ts->[2],0x3FC
+ andi $Ts->[3],$Ts->[3],0x3FC
+
+ # Index into tables Te0-Te3 (spread access across tables to help bring
+ # them into cache for later)
+
+ add $I0,$TBL,$Ts->[0]
+ lbu $Ts->[0],0($I0)
+
+ add $TBL,$TBL,1025 # yes, 1025
+ add $I0,$TBL,$Ts->[1]
+ lbu $Ts->[1],0($I0)
+
+ add $TBL,$TBL,1025
+ add $I0,$TBL,$Ts->[2]
+ lbu $Ts->[2],0($I0)
+
+ add $TBL,$TBL,1022
+ add $I0,$TBL,$Ts->[3]
+ lbu $Ts->[3],0($I0)
+
+ slli $Ts->[1],$Ts->[1],8
+ slli $Ts->[2],$Ts->[2],16
+ slli $Ts->[3],$Ts->[3],24
+
+ xor $Ts->[0],$Ts->[0],$Ts->[1]
+ xor $Ts->[2],$Ts->[2],$Ts->[3]
+ xor $dest,$Ts->[0],$Ts->[2]
+___
+
+ return $ret;
+}
+
+################################################################################
+# void AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# AES_KEY *key)
+################################################################################
+# Section directives and entry label for AES_set_encrypt_key
+$code .= <<___;
+.text
+.balign 16
+.globl AES_set_encrypt_key
+.type AES_set_encrypt_key,\@function
+AES_set_encrypt_key:
+___
+# Prologue: stack the callee-saved registers requested for this routine
+$code .= save_regs();
+# Argument checks, round-count selection and copy of the user key into the
+# start of the schedule.  Both error exits (-1 for a NULL pointer, -2 for an
+# unsupported key size) leave through label 5 so that the registers stacked
+# by save_regs() are reloaded and the frame is released before returning.
+# The NULL check rejects the call when either pointer is zero.
+$code .= <<___;
+ beqz $UKEY,6f # if (!userKey || !key) return -1;
+ bnez $KEYP,1f
+6:
+ li a0,-1
+ j 5f
+1:
+ la $RCON,AES_rcon
+ la $TBL,AES_Te0
+ li $T8,128
+ li $T9,192
+ li $T10,256
+
+ # Determine number of rounds from key size in bits
+ bne $BITS,$T8,1f
+ li $T3,10 # key->rounds = 10 if bits == 128
+ j 3f
+1:
+ bne $BITS,$T9,2f
+ li $T3,12 # key->rounds = 12 if bits == 192
+ j 3f
+2:
+ li $T3,14 # key->rounds = 14 if bits == 256
+ beq $BITS,$T10,3f
+ li a0,-2 # If bits != 128, 192, or 256, return -2
+ j 5f
+3:
+ ld $T0,0($UKEY)
+ ld $T2,8($UKEY)
+
+ sw $T3,240($KEYP)
+
+ li $loopcntr,0 # == i*4
+
+ srli $T1,$T0,32
+ srli $T3,$T2,32
+
+ sd $T0,0($KEYP)
+ sd $T2,8($KEYP)
+
+ # if bits == 128
+ # jump into loop
+ beq $BITS,$T8,1f
+
+ ld $T4,16($UKEY)
+ srli $T5,$T4,32
+ sd $T4,16($KEYP)
+
+ # if bits == 192
+ # jump into loop
+ beq $BITS,$T9,2f
+
+ ld $T6,24($UKEY)
+ srli $T7,$T6,32
+ sd $T6,24($KEYP)
+
+ # bits == 256
+ j 3f
+___
+
+$code .= <<___; # 128-bit path: the first 1: is the entry (steps KEYP past the copied user key), the second 1: is the loop head
+1:
+ addi $KEYP,$KEYP,16
+1:
+___
+$code .= do_enc_lookup($T4,$T3,[8,16,24,0],[$T4,$T5,$T6,$T7]); # T4 = substituted bytes 1,2,3,0 of T3, packed low to high
+# Fold in the rcon word, chain the four key words, store 16 bytes; the loop ends when the rcon offset reaches 10*4
+$code .= <<___;
+ add $T5,$RCON,$loopcntr # rcon[i] (i increments by 4 so it can double as
+ # a word offset)
+ lwu $T5,0($T5)
+
+ addi $loopcntr,$loopcntr,4
+ li $I0,10*4
+
+ xor $T0,$T0,$T4
+ xor $T0,$T0,$T5
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+
+ addi $KEYP,$KEYP,16
+
+
+ bne $loopcntr,$I0,1b
+ j 4f
+___
+$code .= <<___; # 192-bit path: the first 2: is the entry (steps KEYP past 24 bytes of user key), the second 2: is the loop head
+2:
+ addi $KEYP,$KEYP,24
+2:
+___
+$code .= do_enc_lookup($T6,$T5,[8,16,24,0],[$T6,$T7,$T8,$T9]); # T6 = substituted bytes 1,2,3,0 of T5; T8/T9 (128/192) are dead by now
+# Six words per iteration; the exit test after the first four words fires when the rcon offset reaches 8*4
+$code .= <<___;
+ add $T7,$RCON,$loopcntr # rcon[i] (i increments by 4 so it can double as
+ # a word offset)
+ lwu $T7,0($T7)
+
+ addi $loopcntr,$loopcntr,4
+ li $I0,8*4
+
+ xor $T0,$T0,$T6
+ xor $T0,$T0,$T7
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+
+ beq $loopcntr,$I0,4f
+
+ xor $T4,$T4,$T3
+ xor $T5,$T5,$T4
+ sw $T4,16($KEYP)
+ sw $T5,20($KEYP)
+
+ addi $KEYP,$KEYP,24
+ j 2b
+___
+$code .= <<___; # 256-bit path: the first 3: is the entry (steps KEYP past 32 bytes of user key), the second 3: is the loop head
+3:
+ addi $KEYP,$KEYP,32
+3:
+___
+$code .= do_enc_lookup($T8,$T7,[8,16,24,0],[$T8,$T9,$T10,$T11]); # T8 = substituted bytes 1,2,3,0 of T7
+# Eight words per iteration; the exit test after the first four words fires when the rcon offset reaches 7*4
+$code .= <<___;
+ add $T9,$RCON,$loopcntr # rcon[i] (i increments by 4 so it can double as
+ # a word offset)
+ lwu $T9,0($T9)
+
+ addi $loopcntr,$loopcntr,4
+ li $I0,7*4
+
+ xor $T0,$T0,$T8
+ xor $T0,$T0,$T9
+ xor $T1,$T1,$T0
+ xor $T2,$T2,$T1
+ xor $T3,$T3,$T2
+
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+
+ beq $loopcntr,$I0,4f
+___
+$code .= do_enc_lookup($T8,$T3,[0,8,16,24],[$T8,$T9,$T10,$T11]); # second half: bytes of T3 substituted in place (shifts 0,8,16,24), no rcon
+$code .= <<___;
+ xor $T4,$T4,$T8
+ xor $T5,$T5,$T4
+ xor $T6,$T6,$T5
+ xor $T7,$T7,$T6
+ sw $T4,16($KEYP)
+ sw $T5,20($KEYP)
+ sw $T6,24($KEYP)
+ sw $T7,28($KEYP)
+
+ addi $KEYP,$KEYP,32
+ j 3b
+
+4: # return 0
+ li a0,0
+5: # return a0
+___
+$code .= load_regs(); # common epilogue for the success path (label 4) and for every "j 5f" error exit
+$code .= <<___;
+ ret
+___
+
+clear_regs(); # forget AES_set_encrypt_key's save list before assigning registers for the next routine
+
+################################################################################
+# Register assignment for AES_set_decrypt_key
+################################################################################
+
+# Function arguments (x10-x12 are a0-a2 in the ABI)
+# Pointer to user key, number of bits in key, key pointer
+my ($UKEY,$BITS,$KEYP) = use_regs(10..12);
+
+# Temporaries (x8 and x9 are in the callee-saved list and get stacked)
+my ($T0,$T1,$T2,$T3) = use_regs(6..8,9);
+my ($T4,$T5,$T6,$T7,$T8) = use_regs(13..17);
+# Second address register, used next to I0 while swapping round keys (x18, callee-saved)
+my ($I1) = use_regs(18);
+
+# Register to hold table offset and used as a temporary (x19, callee-saved)
+my ($I0) = use_regs(19);
+
+# Loop counter (x20, callee-saved)
+my ($loopcntr) = use_regs(20);
+
+# Lookup table address register (x21, callee-saved)
+my ($TBL) = use_regs(21);
+
+# Calculates dest = [
+# Td0[Te1[(in >> 24) & 0xff] & 0xff] ^
+# Td1[Te1[(in >> 16) & 0xff] & 0xff] ^
+# Td2[Te1[(in >> 8) & 0xff] & 0xff] ^
+# Td3[Te1[(in ) & 0xff] & 0xff]
+# ]
+sub do_dec_lookup {
+ # (destination reg, input reg, temporary regs); the generated code clobbers TBL, I0 and the four temporaries
+ my ($dest, $in, $Ts) = @_;
+ # NOTE(review): the code below indexes AES_Te2 (not Te1) and pairs the low input byte with Td0 -- confirm against the formula above
+ my $ret = '';
+ # Two stages: the low byte of four AES_Te2 entries, scaled by 4, then indexes AES_Td0 and the tables 1024 bytes apart after it
+$ret .= <<___;
+
+ la $TBL,AES_Te2
+
+ slli $Ts->[0],$in,2
+ srli $Ts->[1],$in,8-2
+ srli $Ts->[2],$in,16-2
+ srli $Ts->[3],$in,24-2
+
+ andi $Ts->[0],$Ts->[0],0x3FC
+ andi $Ts->[1],$Ts->[1],0x3FC
+ andi $Ts->[2],$Ts->[2],0x3FC
+ andi $Ts->[3],$Ts->[3],0x3FC
+
+ # Index into table Te2
+
+ add $I0,$TBL,$Ts->[0]
+ lwu $Ts->[0],0($I0)
+
+ add $I0,$TBL,$Ts->[1]
+ lwu $Ts->[1],0($I0)
+
+ add $I0,$TBL,$Ts->[2]
+ lwu $Ts->[2],0($I0)
+
+ add $I0,$TBL,$Ts->[3]
+ lwu $Ts->[3],0($I0)
+
+ andi $Ts->[0],$Ts->[0],0xFF
+ andi $Ts->[1],$Ts->[1],0xFF
+ andi $Ts->[2],$Ts->[2],0xFF
+ andi $Ts->[3],$Ts->[3],0xFF
+
+ slli $Ts->[0],$Ts->[0],2
+ slli $Ts->[1],$Ts->[1],2
+ slli $Ts->[2],$Ts->[2],2
+ slli $Ts->[3],$Ts->[3],2
+
+ la $TBL,AES_Td0
+
+ # Lookup in Td0-Td3
+
+ add $I0,$TBL,$Ts->[0]
+ lwu $Ts->[0],0($I0)
+
+ add $TBL,$TBL,1024
+ add $I0,$TBL,$Ts->[1]
+ lwu $Ts->[1],0($I0)
+
+ add $TBL,$TBL,1024
+ add $I0,$TBL,$Ts->[2]
+ lwu $Ts->[2],0($I0)
+
+ add $TBL,$TBL,1024
+ add $I0,$TBL,$Ts->[3]
+ lwu $Ts->[3],0($I0)
+
+ xor $Ts->[0],$Ts->[0],$Ts->[1]
+ xor $Ts->[2],$Ts->[2],$Ts->[3]
+ xor $dest,$Ts->[0],$Ts->[2]
+___
+
+ return $ret;
+}
+
+################################################################################
+# void AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# AES_KEY *key)
+################################################################################
+$code .= <<___; # entry: build the encryption schedule first, keeping the key pointer and ra on a private 16-byte frame
+.text
+.balign 16
+.globl AES_set_decrypt_key
+.type AES_set_decrypt_key,\@function
+AES_set_decrypt_key:
+ # Call AES_set_encrypt_key first
+ addi sp,sp,-16
+ sd $KEYP,0(sp) # We need to hold onto this!
+ sd ra,8(sp)
+ jal ra,AES_set_encrypt_key
+ ld $KEYP,0(sp)
+ ld ra,8(sp)
+ addi sp,sp,16
+ bgez a0,1f # If error, return error
+ ret
+1:
+___
+$code .= save_regs(); # callee-saved registers are stacked only after the private frame above has been released
+$code .= <<___; # swap round keys end-for-end (T4 counts up, T5 down, 16 bytes per step), then start the per-round loop
+
+ li $T4,0
+ lwu $T8,240($KEYP)
+ slli $T5,$T8,4
+ # Invert order of round keys
+1:
+ add $I0,$KEYP,$T4
+ ld $T0,0($I0)
+ ld $T1,8($I0)
+ add $I1,$KEYP,$T5
+ ld $T2,0($I1)
+ ld $T3,8($I1)
+ addi $T4,$T4,16
+ addi $T5,$T5,-16
+ sd $T0,0($I1)
+ sd $T1,8($I1)
+ sd $T2,0($I0)
+ sd $T3,8($I0)
+ blt $T4,$T5,1b
+
+ li $loopcntr,1
+
+1:
+ addi $KEYP,$KEYP,16
+ lwu $T0,0($KEYP)
+ lwu $T1,4($KEYP)
+ lwu $T2,8($KEYP)
+ lwu $T3,12($KEYP)
+___
+$code .= do_dec_lookup($T0,$T0,[$T4,$T5,$T6,$T7]); # each of the four key words is transformed in place;
+$code .= do_dec_lookup($T1,$T1,[$T4,$T5,$T6,$T7]); # T4-T7 are scratch and T8 (the round count) survives
+$code .= do_dec_lookup($T2,$T2,[$T4,$T5,$T6,$T7]);
+$code .= do_dec_lookup($T3,$T3,[$T4,$T5,$T6,$T7]);
+$code .= <<___; # store the words back; the loop covers round keys 1 .. rounds-1, so the first and last are left as swapped
+ sw $T0,0($KEYP)
+ sw $T1,4($KEYP)
+ sw $T2,8($KEYP)
+ sw $T3,12($KEYP)
+ addi $loopcntr,$loopcntr,1
+ blt $loopcntr,$T8,1b
+___
+$code .= load_regs(); # epilogue: reload the stacked registers and release the frame
+$code .= <<___;
+ li a0,0
+ ret
+___
+$code .= <<___;
+
+.section .rodata
+.p2align 12
+.type AES_Te0,\@object
+AES_Te0:
+.word 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U
+.word 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U
+.word 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U
+.word 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU
+.word 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU
+.word 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU
+.word 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U
+.word 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU
+.word 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU
+.word 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U
+.word 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U
+.word 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU
+.word 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU
+.word 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU
+.word 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU
+.word 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU
+.word 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U
+.word 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU
+.word 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU
+.word 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U
+.word 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U
+.word 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U
+.word 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U
+.word 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U
+.word 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU
+.word 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U
+.word 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU
+.word 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU
+.word 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U
+.word 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U