diff options
author | Henry Brausen <henry.brausen@vrull.eu> | 2022-01-28 01:13:04 -0700 |
---|---|---|
committer | Pauli <pauli@openssl.org> | 2022-05-19 16:32:49 +1000 |
commit | b3504b600c028a00f36cdbfedc928a48df9818ff (patch) | |
tree | ac4885d461554f0dfe9204527a51dca20969d68c | |
parent | ec26144288fd6dce6dd76bd9e2b192b495033723 (diff) |
Add AES implementation in generic riscv64 asm
This implementation is based on the four-table approach, along the same
lines as the non-constant-time implementation in aes_core.c. The
implementation is in perlasm.
Utility functions are defined to automatically stack/unstack registers
as needed for prologues and epilogues. See riscv-elf-psabi-doc at
https://github.com/riscv-non-isa/riscv-elf-psabi-doc/ for ABI details.
Reviewed-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
Signed-off-by: Henry Brausen <henry.brausen@vrull.eu>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17640)
-rw-r--r-- | crypto/aes/asm/aes-riscv64.pl | 1709 | ||||
-rw-r--r-- | crypto/aes/build.info | 5 |
2 files changed, 1714 insertions, 0 deletions
diff --git a/crypto/aes/asm/aes-riscv64.pl b/crypto/aes/asm/aes-riscv64.pl new file mode 100644 index 0000000000..525eba4b46 --- /dev/null +++ b/crypto/aes/asm/aes-riscv64.pl @@ -0,0 +1,1709 @@ +#! /usr/bin/env perl +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$output and open STDOUT,">$output"; + +################################################################################ +# Utility functions to help with keeping track of which registers to stack/ +# unstack when entering / exiting routines. +################################################################################ +{ + # Callee-saved registers + my @callee_saved = map("x$_",(2,8,9,18..27)); + # Caller-saved registers + my @caller_saved = map("x$_",(1,5..7,10..17,28..31)); + my @must_save; + sub use_reg { + my $reg = shift; + if (grep(/^$reg$/, @callee_saved)) { + push(@must_save, $reg); + } elsif (!grep(/^$reg$/, @caller_saved)) { + # Register is not usable! 
+ die("Unusable register ".$reg); + } + return $reg; + } + sub use_regs { + return map(use_reg("x$_"), @_); + } + sub save_regs { + my $ret = ''; + my $stack_reservation = ($#must_save + 1) * 8; + my $stack_offset = $stack_reservation; + if ($stack_reservation % 16) { + $stack_reservation += 8; + } + $ret.=" addi sp,sp,-$stack_reservation\n"; + foreach (@must_save) { + $stack_offset -= 8; + $ret.=" sd $_,$stack_offset(sp)\n"; + } + return $ret; + } + sub load_regs { + my $ret = ''; + my $stack_reservation = ($#must_save + 1) * 8; + my $stack_offset = $stack_reservation; + if ($stack_reservation % 16) { + $stack_reservation += 8; + } + foreach (@must_save) { + $stack_offset -= 8; + $ret.=" ld $_,$stack_offset(sp)\n"; + } + $ret.=" addi sp,sp,$stack_reservation\n"; + return $ret; + } + sub clear_regs { + @must_save = (); + } +} + +################################################################################ +# Register assignment for AES_encrypt and AES_decrypt +################################################################################ + +# Registers to hold AES state (called s0-s3 or y0-y3 elsewhere) +my ($Q0,$Q1,$Q2,$Q3) = use_regs(6..9); + +# Function arguments (x10-x12 are a0-a2 in the ABI) +# Input block pointer, output block pointer, key pointer +my ($INP,$OUTP,$KEYP) = use_regs(10..12); + +# Temporaries +my ($T0,$T1,$T2,$T3) = use_regs(13..16); +my ($T4,$T5,$T6,$T7,$T8,$T9,$T10,$T11) = use_regs(17..24); +my ($T12,$T13,$T14,$T15) = use_regs(25..28); + +# Register to hold table offset +my ($I0) = use_regs(29); + +# Loop counter +my ($loopcntr) = use_regs(30); + +# Lookup table address register +my ($TBL) = use_regs(31); + +# Lookup table mask register +my ($MSK) = use_regs(5); + +# Aliases for readability +my $K0 = $loopcntr; +my $K1 = $KEYP; + +################################################################################ +# Table lookup utility functions for AES_encrypt and AES_decrypt 
+################################################################################ + +# do_lookup([destination regs], [state regs], [temporary regs], shamt) +# do_lookup loads four entries from an AES encryption/decryption table +# and stores the result in the specified destination register set +# Ds->[0] = Table[Qs->[0] >> shamt] +# Ds->[1] = Table[Qs->[1] >> shamt] +# Ds->[2] = Table[Qs->[2] >> shamt] +# Ds->[3] = Table[Qs->[3] >> shamt] +# Four temporary regs are used to generate these lookups. The temporary regs +# can be equal to the destination regs, but only if they appear in the same +# order. I.e. do_lookup([A,B,C,D],[...],[A,B,C,D],...) is OK +sub do_lookup { + # (destination regs, state regs, temporary regs, shift amount) + my ($Ds, $Qs, $Ts, $shamt) = @_; + + my $ret = ''; + + # AES encryption/decryption table entries have word-sized (4-byte) entries. + # To convert the table index into a byte offset, we compute + # ((Qs->[i] >> shamt) & 0xFF) << 2 + # However, to save work, we compute the equivalent expression + # (Qs->[i] >> (shamt-2)) & 0x3FC + if ($shamt < 2) { +$ret .= <<___; + + slli $Ts->[0],$Qs->[0],$shamt+2 + slli $Ts->[1],$Qs->[1],$shamt+2 + slli $Ts->[2],$Qs->[2],$shamt+2 + slli $Ts->[3],$Qs->[3],$shamt+2 +___ + } else { +$ret .= <<___; + + srli $Ts->[0],$Qs->[0],$shamt-2 + srli $Ts->[1],$Qs->[1],$shamt-2 + srli $Ts->[2],$Qs->[2],$shamt-2 + srli $Ts->[3],$Qs->[3],$shamt-2 +___ + } + +$ret .= <<___; + + andi $Ts->[0],$Ts->[0],0x3FC + andi $Ts->[1],$Ts->[1],0x3FC + andi $Ts->[2],$Ts->[2],0x3FC + andi $Ts->[3],$Ts->[3],0x3FC + + # Index into table. + add $I0,$TBL,$Ts->[0] + lwu $Ds->[0],0($I0) + add $I0,$TBL,$Ts->[1] + lwu $Ds->[1],0($I0) + add $I0,$TBL,$Ts->[2] + lwu $Ds->[2],0($I0) + add $I0,$TBL,$Ts->[3] + lwu $Ds->[3],0($I0) + +___ + + return $ret; +} + +# Identical to do_lookup(), but loads only a single byte into each destination +# register (replaces lwu with lbu). Used in the final round of AES_encrypt. 
+sub do_lookup_byte { + my $ret = do_lookup(@_); + $ret =~ s/lwu/lbu/g; + return $ret; +} + +# do_lookup_Td4([destination regs], [state regs], [temporary regs]) +# Used in final phase of AES_decrypt +# Ds->[0] = Table[(Qs->[0]) &0xFF] +# Ds->[1] = Table[(Qs->[1] >> 8 )&0xFF] +# Ds->[2] = Table[(Qs->[2] >> 16)&0xFF] +# Ds->[3] = Table[(Qs->[3] >> 24)&0xFF] +# Four temporary regs are used to generate these lookups. The temporary regs +# can be equal to the destination regs, but only if they appear in the same +# order. I.e. do_lookup([A,B,C,D],[...],[A,B,C,D],...) is OK +sub do_lookup_Td4 { + my ($Ds, $Qs, $Ts) = @_; + + my $ret = ''; + +$ret .= <<___; + srli $Ts->[1],$Qs->[1],8 + srli $Ts->[2],$Qs->[2],16 + srli $Ts->[3],$Qs->[3],24 + + andi $Ts->[0],$Qs->[0],0xFF + andi $Ts->[1],$Ts->[1],0xFF + andi $Ts->[2],$Ts->[2],0xFF + andi $Ts->[3],$Ts->[3],0xFF + + add $I0,$TBL,$Ts->[0] + lbu $Ds->[0],0($I0) + add $I0,$TBL,$Ts->[1] + lbu $Ds->[1],0($I0) + add $I0,$TBL,$Ts->[2] + lbu $Ds->[2],0($I0) + add $I0,$TBL,$Ts->[3] + lbu $Ds->[3],0($I0) + +___ + + return $ret; +} + +################################################################################ +# void AES_encrypt(const unsigned char *in, unsigned char *out, +# const AES_KEY *key); +################################################################################ +my $code .= <<___; +.text +.balign 16 +.globl AES_encrypt +.type AES_encrypt,\@function +AES_encrypt: +___ + +$code .= save_regs(); + +$code .= <<___; + + # Load input to block cipher + ld $Q0,0($INP) + ld $Q2,8($INP) + + + # Load key + ld $T0,0($KEYP) + ld $T2,8($KEYP) + + + # Load number of rounds + lwu $loopcntr,240($KEYP) + + # Load address of substitution table and wrap-around mask + la $TBL,AES_Te0 + li $MSK,~0xFFF + + # y = n xor k, stored in Q0-Q3 + + xor $Q0,$Q0,$T0 + xor $Q2,$Q2,$T2 + srli $Q1,$Q0,32 + srli $Q3,$Q2,32 + + # The main loop only executes the first N-1 rounds. 
+ add $loopcntr,$loopcntr,-1 + + # Do Nr - 1 rounds (final round is special) + +1: +___ + +# Lookup in table Te0 +$code .= do_lookup( + [$T4,$T5,$T6,$T7], # Destination registers + [$Q0,$Q1,$Q2,$Q3], # State registers + [$T0,$T1,$T2,$T3], # Temporaries + 0 # Shift amount +); + +$code .= <<___; + add $TBL,$TBL,1024 +___ + +# Lookup in table Te1 +$code .= do_lookup( + [$T8,$T9,$T10,$T11], + [$Q1,$Q2,$Q3,$Q0], + [$T0,$T1,$T2,$T3], + 8 +); + +$code .= <<___; + add $TBL,$TBL,1024 +___ + +# Lookup in table Te2 +$code .= do_lookup( + [$T12,$T13,$T14,$T15], + [$Q2,$Q3,$Q0,$Q1], + [$T0,$T1,$T2,$T3], + 16 +); + +$code .= <<___; + add $TBL,$TBL,1024 +___ + +# Lookup in table Te3 +$code .= do_lookup( + [$T0,$T1,$T2,$T3], + [$Q3,$Q0,$Q1,$Q2], + [$T0,$T1,$T2,$T3], + 24 +); + +$code .= <<___; + + # Combine table lookups + xor $T4,$T4,$T8 + xor $T5,$T5,$T9 + xor $T6,$T6,$T10 + xor $T7,$T7,$T11 + + xor $T4,$T4,$T12 + xor $T5,$T5,$T13 + xor $T6,$T6,$T14 + xor $T7,$T7,$T15 + + xor $T0,$T0,$T4 + xor $T1,$T1,$T5 + xor $T2,$T2,$T6 + xor $T3,$T3,$T7 + + # Update key ptr to point to next key in schedule + add $KEYP,$KEYP,16 + + # Grab next key in schedule + ld $T4,0($KEYP) + ld $T6,8($KEYP) + + # Round TBL back to 4k boundary + and $TBL,$TBL,$MSK + + add $loopcntr,$loopcntr,-1 + + xor $Q0,$T0,$T4 + xor $Q2,$T2,$T6 + srli $T5,$T4,32 + xor $Q1,$T1,$T5 + srli $T7,$T6,32 + xor $Q3,$T3,$T7 + + bgtz $loopcntr,1b + +#================================FINAL ROUND==================================== + +# In the final round, all lookup table accesses would appear as follows: +# +# ... compute index I0 +# add I0,TBL,T0 +# lbu T0,1(I0) +# +# Instead of indexing with a 1 offset, we can add 1 to the TBL pointer, and use +# a 0 offset when indexing in the following code. This enables some instruction +# fusion opportunities. 
+ + add $TBL,$TBL,1 + + ld $K0,16($KEYP) + ld $K1,24($KEYP) +___ + +$code .= do_lookup_byte( + [$T4,$T5,$T6,$T7], + [$Q0,$Q1,$Q2,$Q3], + [$T0,$T1,$T2,$T3], + 0 +); + +$code .= do_lookup_byte( + [$T8,$T9,$T10,$T11], + [$Q1,$Q2,$Q3,$Q0], + [$T0,$T1,$T2,$T3], + 8 +); + +$code .= do_lookup_byte( + [$T12,$T13,$T14,$T15], + [$Q2,$Q3,$Q0,$Q1], + [$T0,$T1,$T2,$T3], + 16 +); + +$code .= do_lookup_byte( + [$T0,$T1,$T2,$T3], + [$Q3,$Q0,$Q1,$Q2], + [$T0,$T1,$T2,$T3], + 24 +); + +$code .= <<___; + + # Combine table lookups into T0 and T2 + + slli $T5,$T5,32 + slli $T7,$T7,32 + slli $T8,$T8,8 + slli $T9,$T9,8+32 + slli $T10,$T10,8 + slli $T11,$T11,8+32 + slli $T12,$T12,16 + slli $T13,$T13,16+32 + slli $T14,$T14,16 + slli $T15,$T15,16+32 + + slli $T0,$T0,24 + slli $T1,$T1,24+32 + slli $T2,$T2,24 + slli $T3,$T3,24+32 + + xor $T4,$T4,$T0 + xor $T5,$T5,$T1 + xor $T6,$T6,$T2 + xor $T7,$T7,$T3 + + xor $T8,$T8,$T12 + xor $T9,$T9,$T13 + xor $T10,$T10,$T14 + xor $T11,$T11,$T15 + + xor $T0,$T4,$T8 + xor $T1,$T5,$T9 + xor $T2,$T6,$T10 + xor $T3,$T7,$T11 + + + xor $T0,$T0,$T1 + # T0 = [T1 T13 T9 T5 T0 T12 T8 T4] + xor $T0,$T0,$K0 # XOR in key + + xor $T2,$T2,$T3 + # T2 = [T3 T15 T11 T7 T2 T14 T10 T6] + xor $T2,$T2,$K1 # XOR in key + + sd $T0,0($OUTP) + sd $T2,8($OUTP) + + # Pop registers and return +2: +___ + +$code .= load_regs(); + +$code .= <<___; + ret +___ + +################################################################################ +# void AES_decrypt(const unsigned char *in, unsigned char *out, +# const AES_KEY *key); +################################################################################ +$code .= <<___; +.text +.balign 16 +.globl AES_decrypt +.type AES_decrypt,\@function +AES_decrypt: +___ + +$code .= save_regs(); + +$code .= <<___; + + # Load input to block cipher + ld $Q0,0($INP) + ld $Q2,8($INP) + + # Load key + # Note that key is assumed in BE byte order + # (This routine was written against a key scheduling implementation that + # placed keys in BE byte order.) 
+ ld $T0,0($KEYP) + ld $T2,8($KEYP) + + # Load number of rounds + lwu $loopcntr,240($KEYP) + + # Load address of substitution table and wrap-around mask + la $TBL,AES_Td0 + li $MSK,~0xFFF + + xor $Q0,$Q0,$T0 + xor $Q2,$Q2,$T2 + srli $Q1,$Q0,32 + srli $Q3,$Q2,32 + + # The main loop only executes the first N-1 rounds. + add $loopcntr,$loopcntr,-1 + + # Do Nr - 1 rounds (final round is special) +1: +___ + +# Lookup in Td0 +$code .= do_lookup( + [$T4,$T5,$T6,$T7], # Destination registers + [$Q0,$Q1,$Q2,$Q3], # State registers + [$T0,$T1,$T2,$T3], # Temporaries + 0 # Shift amount +); + +$code .= <<___; + add $TBL,$TBL,1024 +___ + +# Lookup in Td1 +$code .= do_lookup( + [$T8,$T9,$T10,$T11], + [$Q3,$Q0,$Q1,$Q2], + [$T0,$T1,$T2,$T3], + 8 +); + +$code .= <<___; + add $TBL,$TBL,1024 +___ + +# Lookup in Td2 +$code .= do_lookup( + [$T12,$T13,$T14,$T15], + [$Q2,$Q3,$Q0,$Q1], + [$T0,$T1,$T2,$T3], + 16 +); + +$code .= <<___; + add $TBL,$TBL,1024 +___ + +# Lookup in Td3 +$code .= do_lookup( + [$T0,$T1,$T2,$T3], + [$Q1,$Q2,$Q3,$Q0], + [$T0,$T1,$T2,$T3], + 24 +); + +$code .= <<___; + xor $T4,$T4,$T8 + xor $T5,$T5,$T9 + xor $T6,$T6,$T10 + xor $T7,$T7,$T11 + + xor $T4,$T4,$T12 + xor $T5,$T5,$T13 + xor $T6,$T6,$T14 + xor $T7,$T7,$T15 + + xor $T0,$T0,$T4 + xor $T1,$T1,$T5 + xor $T2,$T2,$T6 + xor $T3,$T3,$T7 + + # Update key ptr to point to next key in schedule + add $KEYP,$KEYP,16 + + # Grab next key in schedule + ld $T4,0($KEYP) + ld $T6,8($KEYP) + + # Round TBL back to 4k boundary + and $TBL,$TBL,$MSK + + add $loopcntr,$loopcntr,-1 + + xor $Q0,$T0,$T4 + xor $Q2,$T2,$T6 + srli $T5,$T4,32 + xor $Q1,$T1,$T5 + srli $T7,$T6,32 + xor $Q3,$T3,$T7 + + bgtz $loopcntr,1b + +#================================FINAL ROUND==================================== + + la $TBL,AES_Td4 + + # K0,K1 are aliases for loopcntr,KEYP + # As these registers will no longer be used after these loads, reuse them + # to store the final key in the schedule. 
+ ld $K0,16($KEYP) + ld $K1,24($KEYP) +___ + +$code .= do_lookup_Td4( + [$T4,$T5,$T6,$T7], + [$Q0,$Q3,$Q2,$Q1], + [$T0,$T1,$T2,$T3] +); + +$code .= do_lookup_Td4( + [$T8,$T9,$T10,$T11], + [$Q1,$Q0,$Q3,$Q2], + [$T0,$T1,$T2,$T3] +); + +$code .= do_lookup_Td4( + [$T12,$T13,$T14,$T15], + [$Q2,$Q1,$Q0,$Q3], + [$T0,$T1,$T2,$T3] +); + +$code .= do_lookup_Td4( + [$T0,$T1,$T2,$T3], + [$Q3,$Q2,$Q1,$Q0], + [$T0,$T1,$T2,$T3] +); + +$code .= <<___; + + # T0-T15 now contain the decrypted block, minus xoring with the final round + # key. We pack T0-T15 into the two 64-bit registers T0 and T4, then xor + # in the key and store. + + slli $T5,$T5,8 + slli $T6,$T6,16 + slli $T7,$T7,24 + slli $T8,$T8,32 + slli $T9,$T9,8+32 + slli $T10,$T10,16+32 + slli $T11,$T11,32+24 + slli $T13,$T13,8 + slli $T14,$T14,16 + slli $T15,$T15,24 + slli $T0,$T0,32 + slli $T1,$T1,8+32 + slli $T2,$T2,16+32 + slli $T3,$T3,24+32 + + xor $T4,$T4,$T5 + xor $T6,$T6,$T7 + xor $T8,$T8,$T9 + xor $T10,$T10,$T11 + + xor $T12,$T12,$T13 + xor $T14,$T14,$T15 + xor $T0,$T0,$T1 + xor $T2,$T2,$T3 + + xor $T4,$T4,$T6 + xor $T8,$T8,$T10 + xor $T12,$T12,$T14 + xor $T0,$T0,$T2 + + xor $T4,$T4,$T8 + # T4 = [T11 T10 T9 T8 T7 T6 T5 T4] + xor $T4,$T4,$K0 # xor in key + + xor $T0,$T0,$T12 + # T0 = [T3 T2 T1 T0 T15 T14 T13 T12] + xor $T0,$T0,$K1 # xor in key + + sd $T4,0($OUTP) + sd $T0,8($OUTP) + + # Pop registers and return +___ + +$code .= load_regs(); + +$code .= <<___; + ret +___ + +clear_regs(); + +################################################################################ +# Register assignment for AES_set_encrypt_key +################################################################################ + +# Function arguments (x10-x12 are a0-a2 in the ABI) +# Pointer to user key, number of bits in key, key pointer +my ($UKEY,$BITS,$KEYP) = use_regs(10..12); + +# Temporaries +my ($T0,$T1,$T2,$T3) = use_regs(6..8,13); +my ($T4,$T5,$T6,$T7,$T8,$T9,$T10,$T11) = use_regs(14..17,28..31); + +# Pointer into rcon table +my ($RCON) = 
use_regs(9); + +# Register to hold table offset and used as a temporary +my ($I0) = use_regs(18); + +# Loop counter +my ($loopcntr) = use_regs(19); + +# Lookup table address register +my ($TBL) = use_regs(20); + +# Calculates dest = [ +# S[(in>>shifts[3])&0xFF], +# S[(in>>shifts[2])&0xFF], +# S[(in>>shifts[1])&0xFF], +# S[(in>>shifts[0])&0xFF] +# ] +# This routine spreads accesses across Te0-Te3 to help bring those tables +# into cache, in anticipation of running AES_[en/de]crypt. +sub do_enc_lookup { + # (destination reg, input reg, shifts array, temporary regs) + my ($dest, $in, $shifts, $Ts) = @_; + + my $ret = ''; + +$ret .= <<___; + + # Round TBL back to 4k boundary + srli $TBL,$TBL,12 + slli $TBL,$TBL,12 + + # Offset by 1 byte, since Te0[x] = S[x].[03, 01, 01, 02] + # So that, later on, a 0-offset lbu yields S[x].01 == S[x] + addi $TBL,$TBL,1 +___ + + for ($i = 0; $i < 4; $i++) { + if ($shifts->[$i] < 2) { + $ret .= " slli $Ts->[$i],$in,2-$shifts->[$i]\n"; + } else { + $ret .= " srli $Ts->[$i],$in,$shifts->[$i]-2\n"; + } + } + +$ret .= <<___; + + andi $Ts->[0],$Ts->[0],0x3FC + andi $Ts->[1],$Ts->[1],0x3FC + andi $Ts->[2],$Ts->[2],0x3FC + andi $Ts->[3],$Ts->[3],0x3FC + + # Index into tables Te0-Te3 (spread access across tables to help bring + # them into cache for later) + + add $I0,$TBL,$Ts->[0] + lbu $Ts->[0],0($I0) + + add $TBL,$TBL,1025 # yes, 1025 + add $I0,$TBL,$Ts->[1] + lbu $Ts->[1],0($I0) + + add $TBL,$TBL,1025 + add $I0,$TBL,$Ts->[2] + lbu $Ts->[2],0($I0) + + add $TBL,$TBL,1022 + add $I0,$TBL,$Ts->[3] + lbu $Ts->[3],0($I0) + + slli $Ts->[1],$Ts->[1],8 + slli $Ts->[2],$Ts->[2],16 + slli $Ts->[3],$Ts->[3],24 + + xor $Ts->[0],$Ts->[0],$Ts->[1] + xor $Ts->[2],$Ts->[2],$Ts->[3] + xor $dest,$Ts->[0],$Ts->[2] +___ + + return $ret; +} + +################################################################################ +# void AES_set_encrypt_key(const unsigned char *userKey, const int bits, +# AES_KEY *key) 
+################################################################################ +$code .= <<___; +.text +.balign 16 +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,\@function +AES_set_encrypt_key: +___ +$code .= save_regs(); +$code .= <<___; + bnez $UKEY,1f # if (!userKey || !key) return -1; + bnez $KEYP,1f + li a0,-1 + ret +1: + la $RCON,AES_rcon + la $TBL,AES_Te0 + li $T8,128 + li $T9,192 + li $T10,256 + + # Determine number of rounds from key size in bits + bne $BITS,$T8,1f + li $T3,10 # key->rounds = 10 if bits == 128 + j 3f +1: + bne $BITS,$T9,2f + li $T3,12 # key->rounds = 12 if bits == 192 + j 3f +2: + li $T3,14 # key->rounds = 14 if bits == 256 + beq $BITS,$T10,3f + li a0,-2 # If bits != 128, 192, or 256, return -2 + j 5f +3: + ld $T0,0($UKEY) + ld $T2,8($UKEY) + + sw $T3,240($KEYP) + + li $loopcntr,0 # == i*4 + + srli $T1,$T0,32 + srli $T3,$T2,32 + + sd $T0,0($KEYP) + sd $T2,8($KEYP) + + # if bits == 128 + # jump into loop + beq $BITS,$T8,1f + + ld $T4,16($UKEY) + srli $T5,$T4,32 + sd $T4,16($KEYP) + + # if bits == 192 + # jump into loop + beq $BITS,$T9,2f + + ld $T6,24($UKEY) + srli $T7,$T6,32 + sd $T6,24($KEYP) + + # bits == 256 + j 3f +___ + +$code .= <<___; +1: + addi $KEYP,$KEYP,16 +1: +___ +$code .= do_enc_lookup($T4,$T3,[8,16,24,0],[$T4,$T5,$T6,$T7]); + +$code .= <<___; + add $T5,$RCON,$loopcntr # rcon[i] (i increments by 4 so it can double as + # a word offset) + lwu $T5,0($T5) + + addi $loopcntr,$loopcntr,4 + li $I0,10*4 + + xor $T0,$T0,$T4 + xor $T0,$T0,$T5 + xor $T1,$T1,$T0 + xor $T2,$T2,$T1 + xor $T3,$T3,$T2 + + sw $T0,0($KEYP) + sw $T1,4($KEYP) + sw $T2,8($KEYP) + sw $T3,12($KEYP) + + addi $KEYP,$KEYP,16 + + + bne $loopcntr,$I0,1b + j 4f +___ +$code .= <<___; +2: + addi $KEYP,$KEYP,24 +2: +___ +$code .= do_enc_lookup($T6,$T5,[8,16,24,0],[$T6,$T7,$T8,$T9]); + +$code .= <<___; + add $T7,$RCON,$loopcntr # rcon[i] (i increments by 4 so it can double as + # a word offset) + lwu $T7,0($T7) + + addi $loopcntr,$loopcntr,4 + li $I0,8*4 + + xor 
$T0,$T0,$T6 + xor $T0,$T0,$T7 + xor $T1,$T1,$T0 + xor $T2,$T2,$T1 + xor $T3,$T3,$T2 + + sw $T0,0($KEYP) + sw $T1,4($KEYP) + sw $T2,8($KEYP) + sw $T3,12($KEYP) + + beq $loopcntr,$I0,4f + + xor $T4,$T4,$T3 + xor $T5,$T5,$T4 + sw $T4,16($KEYP) + sw $T5,20($KEYP) + + addi $KEYP,$KEYP,24 + j 2b +___ +$code .= <<___; +3: + addi $KEYP,$KEYP,32 +3: +___ +$code .= do_enc_lookup($T8,$T7,[8,16,24,0],[$T8,$T9,$T10,$T11]); + +$code .= <<___; + add $T9,$RCON,$loopcntr # rcon[i] (i increments by 4 so it can double as + # a word offset) + lwu $T9,0($T9) + + addi $loopcntr,$loopcntr,4 + li $I0,7*4 + + xor $T0,$T0,$T8 + xor $T0,$T0,$T9 + xor $T1,$T1,$T0 + xor $T2,$T2,$T1 + xor $T3,$T3,$T2 + + sw $T0,0($KEYP) + sw $T1,4($KEYP) + sw $T2,8($KEYP) + sw $T3,12($KEYP) + + beq $loopcntr,$I0,4f +___ +$code .= do_enc_lookup($T8,$T3,[0,8,16,24],[$T8,$T9,$T10,$T11]); +$code .= <<___; + xor $T4,$T4,$T8 + xor $T5,$T5,$T4 + xor $T6,$T6,$T5 + xor $T7,$T7,$T6 + sw $T4,16($KEYP) + sw $T5,20($KEYP) + sw $T6,24($KEYP) + sw $T7,28($KEYP) + + addi $KEYP,$KEYP,32 + j 3b + +4: # return 0 + li a0,0 +5: # return a0 +___ +$code .= load_regs(); +$code .= <<___; + ret +___ + +clear_regs(); + +################################################################################ +# Register assignment for AES_set_decrypt_key +################################################################################ + +# Function arguments (x10-x12 are a0-a2 in the ABI) +# Pointer to user key, number of bits in key, key pointer +my ($UKEY,$BITS,$KEYP) = use_regs(10..12); + +# Temporaries +my ($T0,$T1,$T2,$T3) = use_regs(6..8,9); +my ($T4,$T5,$T6,$T7,$T8) = use_regs(13..17); + +my ($I1) = use_regs(18); + +# Register to hold table offset and used as a temporary +my ($I0) = use_regs(19); + +# Loop counter +my ($loopcntr) = use_regs(20); + +# Lookup table address register +my ($TBL) = use_regs(21); + +# Calculates dest = [ +# Td0[Te1[(in >> 24) & 0xff] & 0xff] ^ +# Td1[Te1[(in >> 16) & 0xff] & 0xff] ^ +# Td2[Te1[(in >> 8) & 0xff] & 
0xff] ^ +# Td3[Te1[(in ) & 0xff] & 0xff] +# ] +sub do_dec_lookup { + # (destination reg, input reg, temporary regs) + my ($dest, $in, $Ts) = @_; + + my $ret = ''; + +$ret .= <<___; + + la $TBL,AES_Te2 + + slli $Ts->[0],$in,2 + srli $Ts->[1],$in,8-2 + srli $Ts->[2],$in,16-2 + srli $Ts->[3],$in,24-2 + + andi $Ts->[0],$Ts->[0],0x3FC + andi $Ts->[1],$Ts->[1],0x3FC + andi $Ts->[2],$Ts->[2],0x3FC + andi $Ts->[3],$Ts->[3],0x3FC + + # Index into table Te2 + + add $I0,$TBL,$Ts->[0] + lwu $Ts->[0],0($I0) + + add $I0,$TBL,$Ts->[1] + lwu $Ts->[1],0($I0) + + add $I0,$TBL,$Ts->[2] + lwu $Ts->[2],0($I0) + + add $I0,$TBL,$Ts->[3] + lwu $Ts->[3],0($I0) + + andi $Ts->[0],$Ts->[0],0xFF + andi $Ts->[1],$Ts->[1],0xFF + andi $Ts->[2],$Ts->[2],0xFF + andi $Ts->[3],$Ts->[3],0xFF + + slli $Ts->[0],$Ts->[0],2 + slli $Ts->[1],$Ts->[1],2 + slli $Ts->[2],$Ts->[2],2 + slli $Ts->[3],$Ts->[3],2 + + la $TBL,AES_Td0 + + # Lookup in Td0-Td3 + + add $I0,$TBL,$Ts->[0] + lwu $Ts->[0],0($I0) + + add $TBL,$TBL,1024 + add $I0,$TBL,$Ts->[1] + lwu $Ts->[1],0($I0) + + add $TBL,$TBL,1024 + add $I0,$TBL,$Ts->[2] + lwu $Ts->[2],0($I0) + + add $TBL,$TBL,1024 + add $I0,$TBL,$Ts->[3] + lwu $Ts->[3],0($I0) + + xor $Ts->[0],$Ts->[0],$Ts->[1] + xor $Ts->[2],$Ts->[2],$Ts->[3] + xor $dest,$Ts->[0],$Ts->[2] +___ + + return $ret; +} + +################################################################################ +# void AES_set_decrypt_key(const unsigned char *userKey, const int bits, +# AES_KEY *key) +################################################################################ +$code .= <<___; +.text +.balign 16 +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,\@function +AES_set_decrypt_key: + # Call AES_set_encrypt_key first + addi sp,sp,-16 + sd $KEYP,0(sp) # We need to hold onto this! 
+ sd ra,8(sp) + jal ra,AES_set_encrypt_key + ld $KEYP,0(sp) + ld ra,8(sp) + addi sp,sp,16 + bgez a0,1f # If error, return error + ret +1: +___ +$code .= save_regs(); +$code .= <<___; + + li $T4,0 + lwu $T8,240($KEYP) + slli $T5,$T8,4 + # Invert order of round keys +1: + add $I0,$KEYP,$T4 + ld $T0,0($I0) + ld $T1,8($I0) + add $I1,$KEYP,$T5 + ld $T2,0($I1) + ld $T3,8($I1) + addi $T4,$T4,16 + addi $T5,$T5,-16 + sd $T0,0($I1) + sd $T1,8($I1) + sd $T2,0($I0) + sd $T3,8($I0) + blt $T4,$T5,1b + + li $loopcntr,1 + +1: + addi $KEYP,$KEYP,16 + lwu $T0,0($KEYP) + lwu $T1,4($KEYP) + lwu $T2,8($KEYP) + lwu $T3,12($KEYP) +___ +$code .= do_dec_lookup($T0,$T0,[$T4,$T5,$T6,$T7]); +$code .= do_dec_lookup($T1,$T1,[$T4,$T5,$T6,$T7]); +$code .= do_dec_lookup($T2,$T2,[$T4,$T5,$T6,$T7]); +$code .= do_dec_lookup($T3,$T3,[$T4,$T5,$T6,$T7]); +$code .= <<___; + sw $T0,0($KEYP) + sw $T1,4($KEYP) + sw $T2,8($KEYP) + sw $T3,12($KEYP) + addi $loopcntr,$loopcntr,1 + blt $loopcntr,$T8,1b +___ +$code .= load_regs(); +$code .= <<___; + li a0,0 + ret +___ +$code .= <<___; + +.section .rodata +.p2align 12 +.type AES_Te0,\@object +AES_Te0: +.word 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U +.word 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U +.word 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U +.word 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU +.word 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU +.word 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU +.word 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U +.word 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU +.word 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU +.word 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U +.word 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U +.word 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU +.word 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU +.word 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU +.word 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU +.word 
0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU +.word 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U +.word 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU +.word 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU +.word 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U +.word 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U +.word 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U +.word 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U +.word 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U +.word 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU +.word 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U +.word 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU +.word 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU +.word 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U +.word 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U +.word 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U +.word 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU +.word 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U +.word 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU +.word 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU +.word 0xac6464 |