From 1d23bbccd3dc966254368bcd102bb685c641b14f Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Sat, 17 Jun 2017 20:29:52 +0200
Subject: Add sha/asm/keccak1600-c64x.pl

[skip ci]

Reviewed-by: Bernd Edlinger
(Merged from https://github.com/openssl/openssl/pull/3708)
---
 crypto/sha/asm/keccak1600-c64x.pl | 882 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 882 insertions(+)
 create mode 100755 crypto/sha/asm/keccak1600-c64x.pl

diff --git a/crypto/sha/asm/keccak1600-c64x.pl b/crypto/sha/asm/keccak1600-c64x.pl
new file mode 100755
index 0000000000..585f64b739
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-c64x.pl
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [ABI- and endian-neutral] Keccak-1600 for C64x.
+#
+# June 2017.
+#
+# This is a straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c)
+# with bit interleaving. 64-bit values are simply split between the A-
+# and B-register files, with the A-file holding the least significant
+# halves. This works out perfectly, because all operations, including
+# the cross-communication in rotate operations, are always complementary.
+# Performance is remarkable for a 32-bit processor: 10.9 cycles per
+# processed byte for r=1088, which corresponds to SHA3-256. This is >15x
+# faster than compiler-generated KECCAK_1X_ALT code, and >10x faster
+# than other variants. On average the processor ends up issuing ~4.5
+# instructions per cycle...
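+#
+# [Editorial sketch, not part of the original module.] The interleaved
+# representation can be modelled as follows: a 64-bit lane is kept as
+# two 32-bit words, the A-file word holding the even-numbered bits (the
+# "least significant halves" above) and the B-file word the odd-numbered
+# ones. A 64-bit rotation then reduces to a pair of 32-bit rotations,
+# swapping the halves when the amount is odd, which is what the ROL64
+# generator below emits for the two register files:
+#
+#	sub rol32 {
+#	    my ($x, $n) = @_;
+#	    $n ? (($x << $n) | ($x >> (32 - $n))) & 0xffffffff : $x;
+#	}
+#	sub rol64_interleaved {		# returns (lo', hi')
+#	    my ($lo, $hi, $rot) = @_;	# lo ~ A-file half, hi ~ B-file half
+#	    my $h = int($rot / 2);
+#	    ($rot & 1) ? (rol32($hi, $h + 1), rol32($lo, $h))
+#	               : (rol32($lo, $h),    rol32($hi, $h));
+#	}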
+ +my @A = map([ $_, ($_+1), ($_+2), ($_+3), ($_+4) ], (5,10,16,21,26)); + $A[1][4] = 31; # B14 is reserved, A14 is used as iota[] + ($A[3][0],$A[4][1]) = ($A[4][1],$A[3][0]); +my @C = (0..4,$A[3][0],$A[4][0]); +my $iotas = "A14"; + +my @rhotates = ([ 0, 1, 62, 28, 27 ], + [ 36, 44, 6, 55, 20 ], + [ 3, 10, 43, 25, 39 ], + [ 41, 45, 15, 21, 8 ], + [ 18, 2, 61, 56, 14 ]); + +sub ROL64 { + my ($src,$rot,$dst,$p) = @_; + + if ($rot&1) { +$code.=<<___; +$p ROTL B$src,$rot/2+1,A$dst +|| ROTL A$src,$rot/2, B$dst +___ + } else { +$code.=<<___; +$p ROTL A$src,$rot/2,A$dst +|| ROTL B$src,$rot/2,B$dst +___ + } +} + +######################################################################## +# Stack frame layout +# +# SP--->+------+------+ +# | | | +# +1--->+------+------+<- -9 below 4 slots are used by KeccakF1600_int +# | | | +# +2--->+------+------+<- -8 +# | | | +# +3--->+------+------+<- -7 +# | A2 | A3 | A3:A2 are preserved by KeccakF1600_int +# +4--->+------+------+<- -6 +# | B2 | B3 | B3:B2 are preserved by KeccakF1600_int +# +5--->+------+------+<- -5 below is ABI-compliant layout +# | A10 | A11 | +# +6--->+------+------+<- -4 +# | A12 | A13 | +# +7--->+------+------+<- -3 +# | A14 | B3 | +# +8--->+------+------+<- -2 +# | B10 | B11 | +# +9--->+------+------+<- -1 +# | B12 | B13 | +# +------+------+<---FP +# | A15 | +# +------+-- + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg KeccakF1600,_KeccakF1600 + .asg SHA3_absorb,_SHA3_absorb + .asg SHA3_squeeze,_SHA3_squeeze + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .align 32 +_KeccakF1600_int: + .asmfunc + STDW A3:A2,*FP[-7] +|| STDW B3:B2,*SP[4] +_KeccakF1600_cheat: + .if __TI_EABI__ + ADDKPC _KeccakF1600_int,B0 +|| MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas + MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas + .else + ADDKPC _KeccakF1600_int,B0 +|| MVKL (iotas-_KeccakF1600_int),$iotas + MVKH (iotas-_KeccakF1600_int),$iotas + .endif + ADD B0,$iotas,$iotas +loop?: + XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta +|| XOR B$A[0][2],B$A[1][2],B$C[2] +|| XOR A$A[0][3],A$A[1][3],A$C[3] +|| XOR B$A[0][3],B$A[1][3],B$C[3] +|| XOR A$A[0][0],A$A[1][0],A$C[0] +|| XOR B$A[0][0],B$A[1][0],B$C[0] + XOR A$A[2][2],A$C[2],A$C[2] +|| XOR B$A[2][2],B$C[2],B$C[2] +|| XOR A$A[2][3],A$C[3],A$C[3] +|| XOR B$A[2][3],B$C[3],B$C[3] +|| XOR A$A[2][0],A$C[0],A$C[0] +|| XOR B$A[2][0],B$C[0],B$C[0] + XOR A$A[3][2],A$C[2],A$C[2] +|| XOR B$A[3][2],B$C[2],B$C[2] +|| XOR A$A[3][3],A$C[3],A$C[3] +|| XOR B$A[3][3],B$C[3],B$C[3] +|| XOR A$A[3][0],A$C[0],A$C[0] +|| XOR B$A[3][0],B$C[0],B$C[0] + XOR A$A[4][2],A$C[2],A$C[2] +|| XOR B$A[4][2],B$C[2],B$C[2] +|| XOR A$A[4][3],A$C[3],A$C[3] +|| XOR B$A[4][3],B$C[3],B$C[3] +|| XOR A$A[4][0],A$C[0],A$C[0] +|| XOR B$A[4][0],B$C[0],B$C[0] + XOR A$A[0][4],A$A[1][4],A$C[4] +|| XOR B$A[0][4],B$A[1][4],B$C[4] +|| XOR A$A[0][1],A$A[1][1],A$C[1] +|| XOR B$A[0][1],B$A[1][1],B$C[1] +|| STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data + STDW B$A[3][0]:B$A[4][0],*SP[2] +|| XOR A$A[2][4],A$C[4],A$C[4] +|| XOR B$A[2][4],B$C[4],B$C[4] +|| XOR A$A[2][1],A$C[1],A$C[1] +|| XOR B$A[2][1],B$C[1],B$C[1] +|| ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1) +|| ROTL A$C[2],0,B$C[5] + XOR A$A[3][4],A$C[4],A$C[4] +|| XOR B$A[3][4],B$C[4],B$C[4] +|| XOR A$A[3][1],A$C[1],A$C[1] +|| XOR B$A[3][1],B$C[1],B$C[1] +|| ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1) +|| ROTL A$C[3],0,B$C[6] + XOR A$A[4][4],A$C[4],A$C[4] +|| XOR B$A[4][4],B$C[4],B$C[4] +|| XOR A$A[4][1],A$C[1],A$C[1] +|| XOR 
B$A[4][1],B$C[1],B$C[1] +|| XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1) +|| XOR B$C[0],B$C[5],B$C[5] + XOR A$C[5],A$A[0][1],A$A[0][1] +|| XOR B$C[5],B$A[0][1],B$A[0][1] +|| XOR A$C[5],A$A[1][1],A$A[1][1] +|| XOR B$C[5],B$A[1][1],B$A[1][1] +|| XOR A$C[5],A$A[2][1],A$A[2][1] +|| XOR B$C[5],B$A[2][1],B$A[2][1] + XOR A$C[5],A$A[3][1],A$A[3][1] +|| XOR B$C[5],B$A[3][1],B$A[3][1] +|| XOR A$C[5],A$A[4][1],A$A[4][1] +|| XOR B$C[5],B$A[4][1],B$A[4][1] +|| ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1) +|| ROTL A$C[4],0,B$C[5] +|| XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1) +|| XOR B$C[1],B$C[6],B$C[6] + XOR A$C[6],A$A[0][2],A$A[0][2] +|| XOR B$C[6],B$A[0][2],B$A[0][2] +|| XOR A$C[6],A$A[1][2],A$A[1][2] +|| XOR B$C[6],B$A[1][2],B$A[1][2] +|| XOR A$C[6],A$A[2][2],A$A[2][2] +|| XOR B$C[6],B$A[2][2],B$A[2][2] +|| ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1) +|| ROTL A$C[1],0,B$C[1] + XOR A$C[6],A$A[3][2],A$A[3][2] +|| XOR B$C[6],B$A[3][2],B$A[3][2] +|| XOR A$C[6],A$A[4][2],A$A[4][2] +|| XOR B$C[6],B$A[4][2],B$A[4][2] +|| ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1) +|| ROTL A$C[0],0,B$C[6] +|| XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1) +|| XOR B$C[5],B$C[2],B$C[2] + XOR A$C[2],A$A[0][3],A$A[0][3] +|| XOR B$C[2],B$A[0][3],B$A[0][3] +|| XOR A$C[2],A$A[1][3],A$A[1][3] +|| XOR B$C[2],B$A[1][3],B$A[1][3] +|| XOR A$C[2],A$A[2][3],A$A[2][3] +|| XOR B$C[2],B$A[2][3],B$A[2][3] + XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1) +|| XOR B$C[6],B$C[3],B$C[3] +|| LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data +|| LDDW *SP[2],B$A[3][0]:B$A[4][0] +|| XOR A$C[2],A$A[3][3],A$A[3][3] +|| XOR B$C[2],B$A[3][3],B$A[3][3] + XOR A$C[2],A$A[4][3],A$A[4][3] +|| XOR B$C[2],B$A[4][3],B$A[4][3] +|| XOR A$C[3],A$A[0][4],A$A[0][4] +|| XOR B$C[3],B$A[0][4],B$A[0][4] +|| XOR A$C[3],A$A[1][4],A$A[1][4] +|| XOR B$C[3],B$A[1][4],B$A[1][4] + XOR A$C[3],A$A[2][4],A$A[2][4] +|| XOR B$C[3],B$A[2][4],B$A[2][4] +|| XOR A$C[3],A$A[3][4],A$A[3][4] +|| XOR B$C[3],B$A[3][4],B$A[3][4] +|| XOR A$C[3],A$A[4][4],A$A[4][4] +|| XOR B$C[3],B$A[4][4],B$A[4][4] + XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1) +|| XOR B$C[1],B$C[4],B$C[4] +|| MV A$A[0][1],A$C[1] ; Rho+Pi, "early start" +|| MV B$A[0][1],B$C[1] +___ + &ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||"); +$code.=<<___; + XOR A$C[4],A$A[0][0],A$A[0][0] +|| XOR B$C[4],B$A[0][0],B$A[0][0] +|| XOR A$C[4],A$A[1][0],A$A[1][0] +|| XOR B$C[4],B$A[1][0],B$A[1][0] +|| MV A$A[0][3],A$C[3] +|| MV B$A[0][3],B$C[3] +___ + &ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||"); +$code.=<<___; + XOR A$C[4],A$A[2][0],A$A[2][0] +|| XOR B$C[4],B$A[2][0],B$A[2][0] +|| XOR A$C[4],A$A[3][0],A$A[3][0] +|| XOR B$C[4],B$A[3][0],B$A[3][0] +|| MV A$A[0][2],A$C[2] +|| MV B$A[0][2],B$C[2] +___ + &ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||"); +$code.=<<___; + XOR A$C[4],A$A[4][0],A$A[4][0] +|| XOR B$C[4],B$A[4][0],B$A[4][0] +|| MV A$A[0][4],A$C[4] +|| MV B$A[0][4],B$C[4] +___ + &ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||"); + + &ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]); +$code.=<<___; +|| LDW *${iotas}++[2],A$C[0] +___ + &ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]); +$code.=<<___; +|| LDW *${iotas}[-1],B$C[0] +___ + &ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]); + &ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]); + + &ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]); + &ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]); + &ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]); + &ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]); + + &ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]); + &ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]); + &ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]); + 
&ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]); + + &ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]); + &ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]); + &ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]); + &ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]); + + #&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below + &ROL64 ($C[1], $rhotates[0][1],$A[2][0]); + &ROL64 ($C[4], $rhotates[0][4],$A[3][0]); + &ROL64 ($C[2], $rhotates[0][2],$A[4][0]); +$code.=<<___; +|| ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota +|| ANDN B$A[0][2],B$A[0][1],B$C[4] +|| ANDN A$A[0][3],A$A[0][2],A$C[1] +|| ANDN B$A[0][3],B$A[0][2],B$C[1] +|| ANDN A$A[0][4],A$A[0][3],A$C[2] +|| ANDN B$A[0][4],B$A[0][3],B$C[2] +___ + &ROL64 ($C[3], $rhotates[0][3],$A[1][0]); +$code.=<<___; +|| ANDN A$A[0][0],A$A[0][4],A$C[3] +|| ANDN B$A[0][0],B$A[0][4],B$C[3] +|| XOR A$C[4],A$A[0][0],A$A[0][0] +|| XOR B$C[4],B$A[0][0],B$A[0][0] +|| ANDN A$A[0][1],A$A[0][0],A$C[4] +|| ANDN B$A[0][1],B$A[0][0],B$C[4] + XOR A$C[1],A$A[0][1],A$A[0][1] +|| XOR B$C[1],B$A[0][1],B$A[0][1] +|| XOR A$C[2],A$A[0][2],A$A[0][2] +|| XOR B$C[2],B$A[0][2],B$A[0][2] +|| XOR A$C[3],A$A[0][3],A$A[0][3] +|| XOR B$C[3],B$A[0][3],B$A[0][3] + XOR A$C[4],A$A[0][4],A$A[0][4] +|| XOR B$C[4],B$A[0][4],B$A[0][4] +|| XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++]; +|| XOR B$C[0],B$A[0][0],B$A[0][0] +|| EXTU $iotas,24,24,A0 ; A0 is A$C[0], as we done? + + ANDN A$A[1][2],A$A[1][1],A$C[4] +|| ANDN B$A[1][2],B$A[1][1],B$C[4] +|| ANDN A$A[1][3],A$A[1][2],A$C[1] +|| ANDN B$A[1][3],B$A[1][2],B$C[1] +|| ANDN A$A[1][4],A$A[1][3],A$C[2] +|| ANDN B$A[1][4],B$A[1][3],B$C[2] + ANDN A$A[1][0],A$A[1][4],A$C[3] +|| ANDN B$A[1][0],B$A[1][4],B$C[3] +|| XOR A$C[4],A$A[1][0],A$A[1][0] +|| XOR B$C[4],B$A[1][0],B$A[1][0] +|| ANDN A$A[1][1],A$A[1][0],A$C[4] +|| ANDN B$A[1][1],B$A[1][0],B$C[4] + XOR A$C[1],A$A[1][1],A$A[1][1] +|| XOR B$C[1],B$A[1][1],B$A[1][1] +|| XOR A$C[2],A$A[1][2],A$A[1][2] +|| XOR B$C[2],B$A[1][2],B$A[1][2] +|| XOR A$C[3],A$A[1][3],A$A[1][3] +|| XOR B$C[3],B$A[1][3],B$A[1][3] + XOR A$C[4],A$A[1][4],A$A[1][4] +|| XOR B$C[4],B$A[1][4],B$A[1][4] + +|| ANDN A$A[2][2],A$A[2][1],A$C[4] +|| ANDN B$A[2][2],B$A[2][1],B$C[4] +|| ANDN A$A[2][3],A$A[2][2],A$C[1] +|| ANDN B$A[2][3],B$A[2][2],B$C[1] + ANDN A$A[2][4],A$A[2][3],A$C[2] +|| ANDN B$A[2][4],B$A[2][3],B$C[2] +|| ANDN A$A[2][0],A$A[2][4],A$C[3] +|| ANDN B$A[2][0],B$A[2][4],B$C[3] +|| XOR A$C[4],A$A[2][0],A$A[2][0] +|| XOR B$C[4],B$A[2][0],B$A[2][0] + ANDN A$A[2][1],A$A[2][0],A$C[4] +|| ANDN B$A[2][1],B$A[2][0],B$C[4] +|| XOR A$C[1],A$A[2][1],A$A[2][1] +|| XOR B$C[1],B$A[2][1],B$A[2][1] +|| XOR A$C[2],A$A[2][2],A$A[2][2] +|| XOR B$C[2],B$A[2][2],B$A[2][2] + XOR A$C[3],A$A[2][3],A$A[2][3] +|| XOR B$C[3],B$A[2][3],B$A[2][3] +|| XOR A$C[4],A$A[2][4],A$A[2][4] +|| XOR B$C[4],B$A[2][4],B$A[2][4] + + ANDN A$A[3][2],A$A[3][1],A$C[4] +|| ANDN B$A[3][2],B$A[3][1],B$C[4] +|| ANDN A$A[3][3],A$A[3][2],A$C[1] +|| ANDN B$A[3][3],B$A[3][2],B$C[1] +|| ANDN A$A[3][4],A$A[3][3],A$C[2] +|| ANDN B$A[3][4],B$A[3][3],B$C[2] + ANDN A$A[3][0],A$A[3][4],A$C[3] +|| ANDN B$A[3][0],B$A[3][4],B$C[3] +|| XOR A$C[4],A$A[3][0],A$A[3][0] +|| XOR B$C[4],B$A[3][0],B$A[3][0] +|| ANDN A$A[3][1],A$A[3][0],A$C[4] +|| ANDN B$A[3][1],B$A[3][0],B$C[4] + XOR A$C[1],A$A[3][1],A$A[3][1] +|| XOR B$C[1],B$A[3][1],B$A[3][1] +|| XOR A$C[2],A$A[3][2],A$A[3][2] +|| XOR B$C[2],B$A[3][2],B$A[3][2] +|| XOR A$C[3],A$A[3][3],A$A[3][3] +||[A0] BNOP loop? 
+ XOR B$C[3],B$A[3][3],B$A[3][3] +|| XOR A$C[4],A$A[3][4],A$A[3][4] +|| XOR B$C[4],B$A[3][4],B$A[3][4] +||[!A0] LDDW *FP[-7],A3:A2 +||[!A0] LDDW *SP[4], RA:B2 + + ANDN A$A[4][2],A$A[4][1],A$C[4] +|| ANDN B$A[4][2],B$A[4][1],B$C[4] +|| ANDN A$A[4][3],A$A[4][2],A$C[1] +|| ANDN B$A[4][3],B$A[4][2],B$C[1] +|| ANDN A$A[4][4],A$A[4][3],A$C[2] +|| ANDN B$A[4][4],B$A[4][3],B$C[2] + ANDN A$A[4][0],A$A[4][4],A$C[3] +|| ANDN B$A[4][0],B$A[4][4],B$C[3] +|| XOR A$C[4],A$A[4][0],A$A[4][0] +|| XOR B$C[4],B$A[4][0],B$A[4][0] +|| ANDN A$A[4][1],A$A[4][0],A$C[4] +|| ANDN B$A[4][1],B$A[4][0],B$C[4] + XOR A$C[1],A$A[4][1],A$A[4][1] +|| XOR B$C[1],B$A[4][1],B$A[4][1] +|| XOR A$C[2],A$A[4][2],A$A[4][2] +|| XOR B$C[2],B$A[4][2],B$A[4][2] +|| XOR A$C[3],A$A[4][3],A$A[4][3] +|| XOR B$C[3],B$A[4][3],B$A[4][3] + XOR A$C[4],A$A[4][4],A$A[4][4] +|| XOR B$C[4],B$A[4][4],B$A[4][4] +;;===== branch to loop? is taken here + + BNOP RA,5 + .endasmfunc + + .newblock + .global _KeccakF1600 + .align 32 +_KeccakF1600: + .asmfunc stack_usage(80) + STW FP,*SP--(80) ; save frame pointer +|| MV SP,FP + STDW B13:B12,*SP[9] +|| STDW A13:A12,*FP[-4] + STDW B11:B10,*SP[8] +|| STDW A11:A10,*FP[-5] + STW RA, *SP[15] +|| STW A14,*FP[-6] +|| MV A4,A2 +|| ADD 4,A4,B2 + + LDW *A2++[2],A$A[0][0] ; load A[5][5] +|| LDW *B2++[2],B$A[0][0] + LDW *A2++[2],A$A[0][1] +|| LDW *B2++[2],B$A[0][1] + LDW *A2++[2],A$A[0][2] +|| LDW *B2++[2],B$A[0][2] + LDW *A2++[2],A$A[0][3] +|| LDW *B2++[2],B$A[0][3] + LDW *A2++[2],A$A[0][4] +|| LDW *B2++[2],B$A[0][4] + + LDW *A2++[2],A$A[1][0] +|| LDW *B2++[2],B$A[1][0] + LDW *A2++[2],A$A[1][1] +|| LDW *B2++[2],B$A[1][1] + LDW *A2++[2],A$A[1][2] +|| LDW *B2++[2],B$A[1][2] + LDW *A2++[2],A$A[1][3] +|| LDW *B2++[2],B$A[1][3] + LDW *A2++[2],A$A[1][4] +|| LDW *B2++[2],B$A[1][4] + + LDW *A2++[2],A$A[2][0] +|| LDW *B2++[2],B$A[2][0] + LDW *A2++[2],A$A[2][1] +|| LDW *B2++[2],B$A[2][1] + LDW *A2++[2],A$A[2][2] +|| LDW *B2++[2],B$A[2][2] + LDW *A2++[2],A$A[2][3] +|| LDW *B2++[2],B$A[2][3] + LDW *A2++[2],A$A[2][4] +|| LDW *B2++[2],B$A[2][4] + + LDW *A2++[2],A$A[3][0] +|| LDW *B2++[2],B$A[3][0] + LDW *A2++[2],A$A[3][1] +|| LDW *B2++[2],B$A[3][1] + LDW *A2++[2],A$A[3][2] +|| LDW *B2++[2],B$A[3][2] + LDW *A2++[2],A$A[3][3] +|| LDW *B2++[2],B$A[3][3] + LDW *A2++[2],A$A[3][4] +|| LDW *B2++[2],B$A[3][4] +|| BNOP _KeccakF1600_int + + ADDKPC ret?,RA +|| LDW *A2++[2],A$A[4][0] +|| LDW *B2++[2],B$A[4][0] + LDW *A2++[2],A$A[4][1] +|| LDW *B2++[2],B$A[4][1] + LDW *A2++[2],A$A[4][2] +|| LDW *B2++[2],B$A[4][2] + LDW *A2++[2],A$A[4][3] +|| LDW *B2++[2],B$A[4][3] + LDW *A2,A$A[4][4] +|| LDW *B2,B$A[4][4] +|| ADDK -192,A2 ; rewind +|| ADDK -192,B2 + + .align 16 +ret?: + STW A$A[0][0],*A2++[2] ; store A[5][5] +|| STW B$A[0][0],*B2++[2] + STW A$A[0][1],*A2++[2] +|| STW B$A[0][1],*B2++[2] + STW A$A[0][2],*A2++[2] +|| STW B$A[0][2],*B2++[2] + STW A$A[0][3],*A2++[2] +|| STW B$A[0][3],*B2++[2] + STW A$A[0][4],*A2++[2] +|| STW B$A[0][4],*B2++[2] + + STW A$A[1][0],*A2++[2] +|| STW B$A[1][0],*B2++[2] + STW A$A[1][1],*A2++[2] +|| STW B$A[1][1],*B2++[2] + STW A$A[1][2],*A2++[2] +|| STW B$A[1][2],*B2++[2] + STW A$A[1][3],*A2++[2] +|| STW B$A[1][3],*B2++[2] + STW A$A[1][4],*A2++[2] +|| STW B$A[1][4],*B2++[2] + + STW A$A[2][0],*A2++[2] +|| STW B$A[2][0],*B2++[2] + STW A$A[2][1],*A2++[2] +|| STW B$A[2][1],*B2++[2] + STW A$A[2][2],*A2++[2] +|| STW B$A[2][2],*B2++[2] + STW A$A[2][3],*A2++[2] +|| STW B$A[2][3],*B2++[2] + STW A$A[2][4],*A2++[2] +|| STW B$A[2][4],*B2++[2] + + STW A$A[3][0],*A2++[2] +|| STW B$A[3][0],*B2++[2] + STW A$A[3][1],*A2++[2] +|| STW 
B$A[3][1],*B2++[2] + STW A$A[3][2],*A2++[2] +|| STW B$A[3][2],*B2++[2] + STW A$A[3][3],*A2++[2] +|| STW B$A[3][3],*B2++[2] + STW A$A[3][4],*A2++[2] +|| STW B$A[3][4],*B2++[2] + + LDW *SP[15],RA +|| LDW *FP[-6],A14 + + STW A$A[4][0],*A2++[2] +|| STW B$A[4][0],*B2++[2] + STW A$A[4][1],*A2++[2] +|| STW B$A[4][1],*B2++[2] + STW A$A[4][2],*A2++[2] +|| STW B$A[4][2],*B2++[2] + STW A$A[4][3],*A2++[2] +|| STW B$A[4][3],*B2++[2] + STW A$A[4][4],*A2 +|| STW B$A[4][4],*B2 +|| ADDK -192,A2 ; rewind + + MV A2,A4 ; return original A4 +|| LDDW *SP[8], B11:B10 +|| LDDW *FP[-5],A11:A10 + LDDW *SP[9], B13:B12 +|| LDDW *FP[-4],A13:A12 +|| BNOP RA + LDW *++SP(80),FP ; restore frame pointer + NOP 4 ; wait till FP is committed + .endasmfunc + + .newblock + .asg B2,BSZ + .asg A2,INP + .asg A3,LEN + .global _SHA3_absorb + .align 32 +_SHA3_absorb: + .asmfunc stack_usage(80) + STW FP,*SP--(80) ; save frame pointer +|| MV SP,FP + STDW B13:B12,*SP[9] +|| STDW A13:A12,*FP[-4] + STDW B11:B10,*SP[8] +|| STDW A11:A10,*FP[-5] + STW RA, *SP[15] +|| STW A14,*FP[-6] + + STW A4,*SP[1] ; save A[][] +|| MV B4,INP ; reassign arguments +|| MV A6,LEN +|| MV B6,BSZ +|| ADD 4,A4,B4 + + LDW *A4++[2],A$A[0][0] ; load A[5][5] +|| LDW *B4++[2],B$A[0][0] + LDW *A4++[2],A$A[0][1] +|| LDW *B4++[2],B$A[0][1] + LDW *A4++[2],A$A[0][2] +|| LDW *B4++[2],B$A[0][2] + LDW *A4++[2],A$A[0][3] +|| LDW *B4++[2],B$A[0][3] + LDW *A4++[2],A$A[0][4] +|| LDW *B4++[2],B$A[0][4] + + LDW *A4++[2],A$A[1][0] +|| LDW *B4++[2],B$A[1][0] + LDW *A4++[2],A$A[1][1] +|| LDW *B4++[2],B$A[1][1] + LDW *A4++[2],A$A[1][2] +|| LDW *B4++[2],B$A[1][2] + LDW *A4++[2],A$A[1][3] +|| LDW *B4++[2],B$A[1][3] + LDW *A4++[2],A$A[1][4] +|| LDW *B4++[2],B$A[1][4] + + LDW *A4++[2],A$A[2][0] +|| LDW *B4++[2],B$A[2][0] + LDW *A4++[2],A$A[2][1] +|| LDW *B4++[2],B$A[2][1] + LDW *A4++[2],A$A[2][2] +|| LDW *B4++[2],B$A[2][2] + LDW *A4++[2],A$A[2][3] +|| LDW *B4++[2],B$A[2][3] + LDW *A4++[2],A$A[2][4] +|| LDW *B4++[2],B$A[2][4] + + LDW *A4++[2],A$A[3][0] +|| LDW *B4++[2],B$A[3][0] + LDW *A4++[2],A$A[3][1] +|| LDW *B4++[2],B$A[3][1] + LDW *A4++[2],A$A[3][2] +|| LDW *B4++[2],B$A[3][2] + LDW *A4++[2],A$A[3][3] +|| LDW *B4++[2],B$A[3][3] + LDW *A4++[2],A$A[3][4] +|| LDW *B4++[2],B$A[3][4] + + LDW *A4++[2],A$A[4][0] +|| LDW *B4++[2],B$A[4][0] + LDW *A4++[2],A$A[4][1] +|| LDW *B4++[2],B$A[4][1] + LDW *A4++[2],A$A[4][2] +|| LDW *B4++[2],B$A[4][2] + LDW *A4++[2],A$A[4][3] +|| LDW *B4++[2],B$A[4][3] + LDW *A4,A$A[4][4] +|| LDW *B4,B$A[4][4] +|| ADDKPC loop?,RA + STDW RA:BSZ,*SP[4] + +loop?: + CMPLTU LEN,BSZ,A0 ; len < bsz? +|| SHRU BSZ,3,BSZ + [A0] BNOP ret? +||[A0] ZERO BSZ +||[A0] LDW *SP[1],A2 ; pull A[][] + [BSZ] LDNDW *INP++,A1:A0 +||[BSZ] SUB LEN,8,LEN +||[BSZ] SUB BSZ,1,BSZ + NOP 4 +___ +for ($y = 0; $y < 5; $y++) { + for ($x = 0; $x < ($y<4 ? 
5 : 4); $x++) { +$code.=<<___; + .if .BIG_ENDIAN + SWAP2 A0,A1 +|| SWAP2 A1,A0 + SWAP4 A0,A0 + SWAP4 A1,A1 +||[!BSZ]BNOP _KeccakF1600_cheat +||[!BSZ]STDW LEN:INP,*SP[3] +|| DEAL A0,A0 + .else + [!BSZ]BNOP _KeccakF1600_cheat +||[!BSZ]STDW LEN:INP,*SP[3] +|| DEAL A0,A0 + .endif + [BSZ] LDNDW *INP++,A1:A0 +|| DEAL A1,A1 + [BSZ] SUB LEN,8,LEN +||[BSZ] SUB BSZ,1,BSZ + PACK2 A1,A0,A0 +|| PACKH2 A1,A0,A1 + XOR A0,A$A[$y][$x],A$A[$y][$x] + XOR A1,B$A[$y][$x],B$A[$y][$x] +___ + } +} +$code.=<<___; + .if .BIG_ENDIAN + SWAP2 A0,A1 +|| SWAP2 A1,A0 + SWAP4 A0,A0 + SWAP4 A1,A1 + .endif + BNOP _KeccakF1600_cheat +|| STDW LEN:INP,*SP[3] +|| DEAL A0,A0 + DEAL A1,A1 + NOP + PACK2 A1,A0,A0 +|| PACKH2 A1,A0,A1 + XOR A0,A$A[4][4],A$A[4][4] + XOR A1,B$A[4][4],B$A[4][4] + + .align 16 +ret?: + MV LEN,A4 ; return value +|| ADD 4,A2,B2 + + STW A$A[0][0],*A2++[2] ; store A[5][5] +|| STW B$A[0][0],*B2++[2] + STW A$A[0][1],*A2++[2] +|| STW B$A[0][1],*B2++[2] + STW A$A[0][2],*A2++[2] +|| STW B$A[0][2],*B2++[2] + STW A$A[0][3],*A2++[2] +|| STW B$A[0][3],*B2++[2] + STW A$A[0][4],*A2++[2] +|| STW B$A[0][4],*B2++[2] + + STW A$A[1][0],*A2++[2] +|| STW B$A[1][0],*B2++[2] + STW A$A[1][1],*A2++[2] +|| STW B$A[1][1],*B2++[2] + STW A$A[1][2],*A2++[2] +|| STW B$A[1][2],*B2++[2] + STW A$A[1][3],*A2++[2] +|| STW B$A[1][3],*B2++[2] + STW A$A[1][4],*A2++[2] +|| STW B$A[1][4],*B2++[2] + + STW A$A[2][0],*A2++[2] +|| STW B$A[2][0],*B2++[2] + STW A$A[2][1],*A2++[2] +|| STW B$A[2][1],*B2++[2] + STW A$A[2][2],*A2++[2] +|| STW B$A[2][2],*B2++[2] + STW A$A[2][3],*A2++[2] +|| STW B$A[2][3],*B2++[2] + STW A$A[2][4],*A2++[2] +|| STW B$A[2][4],*B2++[2] + + LDW *SP[15],RA +|| LDW *FP[-6],A14 + + STW A$A[3][0],*A2++[2] +|| STW B$A[3][0],*B2++[2] + STW A$A[3][1],*A2++[2] +|| STW B$A[3][1],*B2++[2] + STW A$A[3][2],*A2++[2] +|| STW B$A[3][2],*B2++[2] + STW A$A[3][3],*A2++[2] +|| STW B$A[3][3],*B2++[2] + STW A$A[3][4],*A2++[2] +|| STW B$A[3][4],*B2++[2] + + LDDW *SP[8], B11:B10 +|| LDDW *FP[-5],A11:A10 + LDDW *SP[9], B13:B12 +|| LDDW *FP[-4],A13:A12 + BNOP RA +|| LDW *++SP(80),FP ; restore frame pointer + + STW A$A[4][0],*A2++[2] +|| STW B$A[4][0],*B2++[2] + STW A$A[4][1],*A2++[2] +|| STW B$A[4][1],*B2++[2] + STW A$A[4][2],*A2++[2] +|| STW B$A[4][2],*B2++[2] + STW A$A[4][3],*A2++[2] +|| STW B$A[4][3],*B2++[2] + STW A$A[4][4],*A2++[2] +|| STW B$A[4][4],*B2++[2] + .endasmfunc + + .newblock + .global _SHA3_squeeze + .asg A12,OUT + .asg A13,LEN + .asg A14,BSZ + .align 32 +_SHA3_squeeze: + .asmfunc stack_usage(24) + STW FP,*SP--(24) ; save frame pointer +|| MV SP,FP + STW RA, *SP[5] +|| STW A14,*FP[-2] + STDW A13:A12,*FP[-2] +|| MV B4,OUT ; reassign arguments + MV A6,LEN +|| MV B6,BSZ + +loop?: + LDW *SP[5],RA ; reload RA +|| SHRU BSZ,3,A1 +|| MV A4,A8 +|| ADD 4,A4,B8 +block?: + CMPLTU LEN,8,A0 ; len < 8? + [A0] BNOP tail? + LDW *A8++[2],A9 +|| LDW *B8++[2],B9 +|| SUB LEN,8,LEN ; len -= 8 + MV LEN,A0 +|| SUB A1,1,A1 ; bsz-- +|| NOP 4 + .if .BIG_ENDIAN + SWAP4 A9,A9 +|| SWAP4 B9,B9 + SWAP2 A9,A9 +|| SWAP2 B9,B9 + .endif + [!A0] BNOP ret? +||[!A0] ZERO A1 + PACK2 B9,A9,B7 +||[A1] BNOP block? 
+	PACKH2	B9,A9,B9
+||	SHFL	B7,B7
+	SHFL	B9,B9
+	STNW	B7,*OUT++
+	STNW	B9,*OUT++
+	NOP
+
+	BNOP	_KeccakF1600,4
+	ADDKPC	loop?,RA
+
+	.align	16
+tail?:
+	.if	.BIG_ENDIAN
+	SWAP4	A9,A9
+||	SWAP4	B9,B9
+	SWAP2	A9,A9
+||	SWAP2	B9,B9
+	.endif
+	PACK2	B9,A9,B7
+	PACKH2	B9,A9,B9
+||	SHFL	B7,B7
+	SHFL	B9,B9
+
+	STB	B7,*OUT++
+||	SHRU	B7,8,B7
+||	ADD	LEN,7,A0
+  [A0]	STB	B7,*OUT++
+||[A0]	SHRU	B7,8,B7
+||[A0]	SUB	A0,1,A0
+  [A0]	STB	B7,*OUT++
+||[A0]	SHRU	B7,8,B7
+||[A0]	SUB	A0,1,A0
+  [A0]	STB	B7,*OUT++
+||[A0]	SUB	A0,1,A0
+  [A0]	STB	B9,*OUT++
+||[A0]	SHRU	B9,8,B9
+||[A0]	SUB	A0,1,A0
+  [A0]	STB	B9,*OUT++
+||[A0]	SHRU	B9,8,B9
+||[A0]	SUB	A0,1,A0
+  [A0]	STB	B9,*OUT++
+
+ret?:
+	LDDW	*FP[-2],A13:A12
+	BNOP	RA
+||	LDW	*FP[-2],A14
+	LDW	*++SP(24),FP		; restore frame pointer
+	NOP	4			; wait till FP is committed
+	.endasmfunc
+
+	.if	__TI_EABI__
+	.sect	".text:sha_asm.const"
+	.else
+	.sect	".const:sha_asm"
+	.endif
+	.align	256
+	.uword	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+iotas:
+	.uword	0x00000001, 0x00000000
+	.uword	0x00000000, 0x00000089
+	.uword	0x00000000, 0x8000008b
+	.uword	0x00000000, 0x80008080
+	.uword	0x00000001, 0x0000008b
+	.uword	0x00000001, 0x00008000
+	.uword	0x00000001, 0x80008088
+	.uword	0x00000001, 0x80000082
+	.uword	0x00000000, 0x0000000b
+	.uword	0x00000000, 0x0000000a
+	.uword	0x00000001, 0x00008082
+	.uword	0x00000000, 0x00008003
+	.uword	0x00000001, 0x0000808b
+	.uword	0x00000001, 0x8000000b
+	.uword	0x00000001, 0x8000008a
+	.uword	0x00000001, 0x80000081
+	.uword	0x00000000, 0x80000081
+	.uword	0x00000000, 0x80000008
+	.uword	0x00000000, 0x00000083
+	.uword	0x00000000, 0x80008003
+	.uword	0x00000001, 0x80008088
+	.uword	0x00000000, 0x80000088
+	.uword	0x00000001, 0x00008000
+	.uword	0x00000000, 0x80008082
+
+	.cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by "
+	.align	4
+___
+
+print $code;