#!/usr/bin/env perl
# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv4.
#
# June 2017.
#
# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
# interleaving. How does it compare to Keccak Code Package? It's as
# fast, but several times smaller, and is endian- and ISA-neutral. ISA
# neutrality means that minimum ISA requirement is ARMv4, yet it can
# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
# register layout taken from Keccak Code Package. It's also as fast,
# in fact faster by 10-15% on some processors, and endian-neutral.
#
########################################################################
# Numbers are cycles per processed byte. Non-NEON results account even
# for input bit interleaving [which takes ~1/4-1/3 of time].
#
# r=1600(*),NEON r=1088(**),NEON
#
# Cortex-A5 80/+220%, 24 110, 36
# Cortex-A7 71/+180%, 23 99, 34
# Cortex-A8 48/+290%, 20 67, 30
# Cortex-A9 48/+290%, 17 66, 26
# Cortex-A15 34/+210%, 12 47, 18
# Snapdragon S4 44/+230%, 16 59, 24
#
# (*) Not used in real life, meaningful as estimate for single absorb
# operation performance. Percentage after slash is improvement
# over compiler-generated KECCAK_1X reference code.
# (**) Corresponds to SHA3-256, 8KB message size.
my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));
########################################################################
# Stack layout
# ----->+-----------------------+
# | uint64_t A[5][5] |
# | ... |
# +200->+-----------------------+
# | uint64_t D[5] |
# | ... |
# +240->+-----------------------+
# | uint64_t T[2][5] |
# | ... |
# +320->+-----------------------+
# | saved lr |
# +324->+-----------------------+
# | loop counter |
# +328->+-----------------------+
# | ...
my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
my @D = map(8*$_, (25..29));
my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35));
$code.=<<___;
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.type iotas32, %object
.align 5
iotas32:
.long 0x00000001, 0x00000000
.long 0x00000000, 0x00000089
.long 0x00000000, 0x8000008b
.long 0x00000000, 0x80008080
.long 0x00000001, 0x0000008b
.long 0x00000001, 0x00008000
.long 0x00000001, 0x80008088
.long 0x00000001, 0x80000082
.long 0x00000000, 0x0000000b
.long 0x00000000, 0x0000000a
.long 0x00000001, 0x00008082