author     Daniel Hu <Daniel.Hu@arm.com>  2022-02-14 14:36:34 +0000
committer  Tomas Mraz <tomas@openssl.org>  2022-11-21 10:49:51 +0100
commit     2535075bf0bd1a599a7f483d06b3ef019104ee7c (patch)
tree       d128f6c4c1f82a8c29dfc4c9f2939d95de6aba9b
parent     553e125aff68274e9a5883fd7d51a7c57e60734e (diff)
SM4 optimization for ARM by ASIMD
This patch optimizes SM4 for ARM processors using ASIMD instructions.

It improves performance when both of the following conditions are met:
1) the input is at least 4 blocks of data;
2) the cipher mode allows parallelism, i.e. ECB, CTR, GCM, or CBC decryption.

The patch implements the SM4 SBOX lookup in vector registers, which gives
constant processing time compared with the existing C implementation. It is
only enabled for the N1/V1 micro-architectures. In the ideal scenario,
performance can reach up to 2.7x.

When either of the two conditions above is not met, e.g. single-block input,
CFB/OFB mode, or CBC encryption, performance can drop by about 50%.

The assembly code has been reviewed internally by ARM engineer
Fangming.Fang@arm.com.

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17951)

(cherry picked from commit 4908787f21f4f5fa24b721ed3ebbc4d3e93ef70c)
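As background for the assembly below: the constant-time property mentioned in the
commit message comes from performing the SBOX lookup with the AArch64 TBL
instruction instead of indexing a table in memory. A TBL lookup can cover at most
64 table bytes (four 16-byte vector registers) and returns 0 for any out-of-range
index, so the 256-byte SM4 SBOX is split into four 64-byte quarters and looked up
at idx, idx-64, idx-128 and idx-192; exactly one of the four results is non-zero
and the results are summed. The scalar C sketch below only models that arithmetic;
the helper names tbl64 and sm4_sbox_lookup are illustrative and not part of this
patch, and the real code processes 16 lanes per TBL in parallel with no
data-dependent memory access, which a scalar C model cannot demonstrate.

#include <stdint.h>

/* Model of a 64-byte TBL lookup: out-of-range indices read as zero. */
static uint8_t tbl64(const uint8_t quarter[64], uint8_t idx)
{
    return idx < 64 ? quarter[idx] : 0;
}

/*
 * SBOX lookup via four 64-byte quarters of the 256-byte table.
 * Exactly one of the four lookups is non-zero, so their sum is SBOX[idx].
 */
uint8_t sm4_sbox_lookup(const uint8_t sbox[256], uint8_t idx)
{
    uint8_t r = 0;

    r += tbl64(sbox,       idx);                  /* indices   0..63  */
    r += tbl64(sbox + 64,  (uint8_t)(idx - 64));  /* indices  64..127 */
    r += tbl64(sbox + 128, (uint8_t)(idx - 128)); /* indices 128..191 */
    r += tbl64(sbox + 192, (uint8_t)(idx - 192)); /* indices 192..255 */
    return r;
}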
-rw-r--r--  crypto/evp/e_sm4.c                                      |   24
-rwxr-xr-x  crypto/sm4/asm/vpsm4-armv8.pl                           | 1118
-rw-r--r--  crypto/sm4/build.info                                   |    6
-rw-r--r--  include/crypto/sm4_platform.h                           |   29
-rw-r--r--  providers/implementations/ciphers/cipher_sm4_gcm_hw.c  |    7
-rw-r--r--  providers/implementations/ciphers/cipher_sm4_hw.c      |   24
6 files changed, 1206 insertions(+), 2 deletions(-)
diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index bff79ff197..c8e8cfe9c9 100644
--- a/crypto/evp/e_sm4.c
+++ b/crypto/evp/e_sm4.c
@@ -77,6 +77,17 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
# endif
} else
#endif
+#ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_decrypt_key(key, &dat->ks.ks);
+ dat->block = (block128_f) vpsm4_decrypt;
+ dat->stream.cbc = NULL;
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt;
+ else if (mode == EVP_CIPH_ECB_MODE)
+ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt;
+ } else
+#endif
{
dat->block = (block128_f) ossl_sm4_decrypt;
ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
@@ -105,6 +116,19 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
(void)0; /* terminate potentially open 'else' */
} else
#endif
+#ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_encrypt_key(key, &dat->ks.ks);
+ dat->block = (block128_f) vpsm4_encrypt;
+ dat->stream.cbc = NULL;
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt;
+ else if (mode == EVP_CIPH_ECB_MODE)
+ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt;
+ else if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks;
+ } else
+#endif
{
dat->block = (block128_f) ossl_sm4_encrypt;
ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
new file mode 100755
index 0000000000..095d9dae64
--- /dev/null
+++ b/crypto/sm4/asm/vpsm4-armv8.pl
@@ -0,0 +1,1118 @@
+#! /usr/bin/env perl
+# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# This module implements SM4 with ASIMD on aarch64
+#
+# Feb 2022
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$prefix="vpsm4";
+my @vtmp=map("v$_",(0..3));
+my @data=map("v$_",(4..7));
+my @datax=map("v$_",(8..11));
+my ($rk0,$rk1)=("v12","v13");
+my ($rka,$rkb)=("v14","v15");
+my @vtmpx=map("v$_",(12..15));
+my @sbox=map("v$_",(16..31));
+my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
+my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
+my ($ptr,$counter)=("x10","w11");
+my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
+
+sub rev32() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+$code.=<<___;
+#ifndef __ARMEB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+ } else {
+$code.=<<___;
+#ifndef __ARMEB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+ }
+}
+
+sub transpose() {
+ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
+
+$code.=<<___;
+ zip1 $vt0.4s,$dat0.4s,$dat1.4s
+ zip2 $vt1.4s,$dat0.4s,$dat1.4s
+ zip1 $vt2.4s,$dat2.4s,$dat3.4s
+ zip2 $vt3.4s,$dat2.4s,$dat3.4s
+ zip1 $dat0.2d,$vt0.2d,$vt2.2d
+ zip2 $dat1.2d,$vt0.2d,$vt2.2d
+ zip1 $dat2.2d,$vt1.2d,$vt3.2d
+ zip2 $dat3.2d,$vt1.2d,$vt3.2d
+___
+}
+
+# sbox operation for 4 lanes of words
+sub sbox() {
+ my $dat = shift;
+
+$code.=<<___;
+ movi @vtmp[0].16b,#64
+ movi @vtmp[1].16b,#128
+ movi @vtmp[2].16b,#192
+ sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
+ sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
+ sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
+ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
+ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
+ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
+ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
+ add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
+ add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
+ add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
+
+ ushr @vtmp[0].4s,$dat.4s,32-2
+ sli @vtmp[0].4s,$dat.4s,2
+ ushr @vtmp[2].4s,$dat.4s,32-10
+ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
+ sli @vtmp[2].4s,$dat.4s,10
+ eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
+ ushr @vtmp[0].4s,$dat.4s,32-18
+ sli @vtmp[0].4s,$dat.4s,18
+ ushr @vtmp[2].4s,$dat.4s,32-24
+ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
+ sli @vtmp[2].4s,$dat.4s,24
+ eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
+___
+}
+
+# sbox operation for 8 lanes of words
+sub sbox_double() {
+ my $dat = shift;
+ my $datx = shift;
+
+$code.=<<___;
+ movi @vtmp[3].16b,#64
+ sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
+ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
+ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
+ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
+ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
+ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
+ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
+ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
+ add $dat.2d,@vtmp[2].2d,$dat.2d
+ add $dat.2d,@vtmp[1].2d,$dat.2d
+
+ sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
+ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
+ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
+ tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
+ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
+ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
+ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
+ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
+ add $datx.2d,@vtmp[2].2d,$datx.2d
+ add $datx.2d,@vtmp[1].2d,$datx.2d
+
+ ushr @vtmp[0].4s,$dat.4s,32-2
+ sli @vtmp[0].4s,$dat.4s,2
+ ushr @vtmp[2].4s,$datx.4s,32-2
+ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
+ sli @vtmp[2].4s,$datx.4s,2
+
+ ushr @vtmp[0].4s,$dat.4s,32-10
+ eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
+ sli @vtmp[0].4s,$dat.4s,10
+ ushr @vtmp[2].4s,$datx.4s,32-10
+ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
+ sli @vtmp[2].4s,$datx.4s,10
+
+ ushr @vtmp[0].4s,$dat.4s,32-18
+ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
+ sli @vtmp[0].4s,$dat.4s,18
+ ushr @vtmp[2].4s,$datx.4s,32-18
+ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
+ sli @vtmp[2].4s,$datx.4s,18
+
+ ushr @vtmp[0].4s,$dat.4s,32-24
+ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
+ sli @vtmp[0].4s,$dat.4s,24
+ ushr @vtmp[2].4s,$datx.4s,32-24
+ eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
+ sli @vtmp[2].4s,$datx.4s,24
+ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
+___
+}
+
+# sbox operation for a single word
+sub sbox_1word () {
+ my $word = shift;
+
+$code.=<<___;
+ movi @vtmp[1].16b,#64
+ movi @vtmp[2].16b,#128
+ movi @vtmp[3].16b,#192
+ mov @vtmp[0].s[0],$word
+
+ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
+ sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
+ sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
+
+ tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
+ tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
+ tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
+ tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
+
+ mov $word,@vtmp[0].s[0]
+ mov $wtmp0,@vtmp[1].s[0]
+ mov $wtmp2,@vtmp[2].s[0]
+ add $wtmp0,$word,$wtmp0
+ mov $word,@vtmp[3].s[0]
+ add $wtmp0,$wtmp0,$wtmp2
+ add $wtmp0,$wtmp0,$word
+
+ eor $word,$wtmp0,$wtmp0,ror #32-2
+ eor $word,$word,$wtmp0,ror #32-10
+ eor $word,$word,$wtmp0,ror #32-18
+ eor $word,$word,$wtmp0,ror #32-24
+___
+}
+
+# sm4 for one block of data, in scalar registers word0/word1/word2/word3
+sub sm4_1blk () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor $tmpw,$word2,$word3
+ eor $wtmp2,$wtmp0,$word1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word0,$word0,$tmpw
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor $tmpw,$word2,$word3
+ eor $wtmp2,$word0,$wtmp1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor $word1,$word1,$tmpw
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor $tmpw,$word0,$word1
+ eor $wtmp2,$wtmp0,$word3
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word2,$word2,$tmpw
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor $tmpw,$word0,$word1
+ eor $wtmp2,$word2,$wtmp1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word3,$word3,$tmpw
+___
+}
+
+# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
+sub sm4_4blks () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ dup $rk0.4s,$wtmp0
+ dup $rk1.4s,$wtmp1
+
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor $rka.16b,@data[2].16b,@data[3].16b
+ eor $rk0.16b,@data[1].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,$rk0.16b
+___
+ &sbox($rk0);
+$code.=<<___;
+ eor @data[0].16b,@data[0].16b,$rk0.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor $rka.16b,$rka.16b,@data[0].16b
+ eor $rk1.16b,$rka.16b,$rk1.16b
+___
+ &sbox($rk1);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor @data[1].16b,@data[1].16b,$rk1.16b
+
+ dup $rk0.4s,$wtmp0
+ dup $rk1.4s,$wtmp1
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor $rka.16b,@data[0].16b,@data[1].16b
+ eor $rk0.16b,@data[3].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,$rk0.16b
+___
+ &sbox($rk0);
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,$rk0.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor $rka.16b,$rka.16b,@data[2].16b
+ eor $rk1.16b,$rka.16b,$rk1.16b
+___
+ &sbox($rk1);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,$rk1.16b
+___
+}
+
+# sm4 for 8 lanes of data, in neon registers
+# data0/data1/data2/data3 datax0/datax1/datax2/datax3
+sub sm4_8blks () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ dup $rk0.4s,$wtmp0
+ eor $rka.16b,@data[2].16b,@data[3].16b
+ eor $rkb.16b,@datax[2].16b,@datax[3].16b
+ eor @vtmp[0].16b,@data[1].16b,$rk0.16b
+ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,@vtmp[0].16b
+ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[0].16b,@data[0].16b,$rk0.16b
+ eor @datax[0].16b,@datax[0].16b,$rk1.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ dup $rk1.4s,$wtmp1
+ eor $rka.16b,$rka.16b,@data[0].16b
+ eor $rkb.16b,$rkb.16b,@datax[0].16b
+ eor $rk0.16b,$rka.16b,$rk1.16b
+ eor $rk1.16b,$rkb.16b,$rk1.16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor @data[1].16b,@data[1].16b,$rk0.16b
+ eor @datax[1].16b,@datax[1].16b,$rk1.16b
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ dup $rk0.4s,$wtmp0
+ eor $rka.16b,@data[0].16b,@data[1].16b
+ eor $rkb.16b,@datax[0].16b,@datax[1].16b
+ eor @vtmp[0].16b,@data[3].16b,$rk0.16b
+ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,@vtmp[0].16b
+ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,$rk0.16b
+ eor @datax[2].16b,@datax[2].16b,$rk1.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ dup $rk1.4s,$wtmp1
+ eor $rka.16b,$rka.16b,@data[2].16b
+ eor $rkb.16b,$rkb.16b,@datax[2].16b
+ eor $rk0.16b,$rka.16b,$rk1.16b
+ eor $rk1.16b,$rkb.16b,$rk1.16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,$rk0.16b
+ eor @datax[3].16b,@datax[3].16b,$rk1.16b
+___
+}
+
+sub encrypt_1blk_norev() {
+ my $dat = shift;
+
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+ mov $word0,$dat.s[0]
+ mov $word1,$dat.s[1]
+ mov $word2,$dat.s[2]
+ mov $word3,$dat.s[3]
+10:
+___
+ &sm4_1blk($ptr);
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+ mov $dat.s[0],$word3
+ mov $dat.s[1],$word2
+ mov $dat.s[2],$word1
+ mov $dat.s[3],$word0
+___
+}
+
+sub encrypt_1blk() {
+ my $dat = shift;
+
+ &encrypt_1blk_norev($dat);
+ &rev32($dat,$dat);
+}
+
+sub encrypt_4blks() {
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+10:
+___
+ &sm4_4blks($ptr);
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+___
+ &rev32(@vtmp[3],@data[0]);
+ &rev32(@vtmp[2],@data[1]);
+ &rev32(@vtmp[1],@data[2]);
+ &rev32(@vtmp[0],@data[3]);
+}
+
+sub encrypt_8blks() {
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+10:
+___
+ &sm4_8blks($ptr);
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+___
+ &rev32(@vtmp[3],@data[0]);
+ &rev32(@vtmp[2],@data[1]);
+ &rev32(@vtmp[1],@data[2]);
+ &rev32(@vtmp[0],@data[3]);
+ &rev32(@data[3],@datax[0]);
+ &rev32(@data[2],@datax[1]);
+ &rev32(@data[1],@datax[2]);
+ &rev32(@data[0],@datax[3]);
+}
+
+sub load_sbox () {
+ my $data = shift;
+
+$code.=<<___;
+ adr $ptr,.Lsbox
+ ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
+ ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
+ ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
+ ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
+___
+}
+
+$code=<<___;
+#include "arm_arch.h"
+.arch armv8-a
+.text
+
+.type _vpsm4_consts,%object
+.align 7
+_vpsm4_consts:
+.Lsbox:
+ .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
+ .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
+ .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
+ .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
+ .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
+ .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
+ .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
+ .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
+ .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
+ .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
+ .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
+ .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
+ .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
+ .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
+ .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
+ .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
+.Lck:
+ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
+.Lshuffles:
+ .dword 0x0B0A090807060504,0x030201000F0E0D0C
+
+.size _vpsm4_consts,.-_vpsm4_consts
+___
+
+{{{
+my ($key,$keys,$enc)=("x0","x1","w2");
+my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
+my ($vkey,$vfk,$vmap)=("v5","v6","v7");
+$code.=<<___;
+.type _vpsm4_set_key,%function
+.align 4
+_vpsm4_set_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$vkey.4s},[$key]
+___
+ &load_sbox();
+ &rev32($vkey,$vkey);
+$code.=<<___;
+ adr $pointer,.Lshuffles
+ ld1 {$vmap.4s},[$pointer]
+ adr $pointer,.Lfk
+ ld1 {$vfk.4s},[$pointer]
+ eor $vkey.16b,$vkey.16b,$vfk.16b
+ mov $schedules,#32
+ adr $pointer,.Lck
+ movi @vtmp[0].16b,#64
+ cbnz $enc,1f
+ add $keys,$keys,124
+1:
+ mov $wtmp,$vkey.s[1]
+ ldr $roundkey,[$pointer],#4
+ eor $roundkey,$roundkey,$wtmp
+ mov $wtmp,$vkey.s[2]
+ eor $roundkey,$roundkey,$wtmp
+ mov $wtmp,$vkey.s[3]
+ eor $roundkey,$roundkey,$wtmp
+ // sbox lookup
+ mov @data[0].s[0],$roundkey
+ tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
+ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
+ tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
+ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
+ tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
+ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
+ tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
+ mov $wtmp,@vtmp[1].s[0]
+ eor $roundkey,$wtmp,$wtmp,ror #19
+ eor $roundkey,$roundkey,$wtmp,ror #9
+ mov $wtmp,$vkey.s[0]
+ eor $roundkey,$roundkey,$wtmp
+ mov $vkey.s[0],$roundkey
+ cbz $enc,2f
+ str $roundkey,[$keys],#4
+ b 3f
+2:
+ str $roundkey,[$keys],#-4
+3:
+ tbl $vkey.16b,{$vkey.16b},$vmap.16b
+ subs $schedules,$schedules,#1
+ b.ne 1b
+ ret
+.size _vpsm4_set_key,.-_vpsm4_set_key
+___
+}}}
+
+
+{{{
+$code.=<<___;
+.type _vpsm4_enc_4blks,%function
+.align 4
+_vpsm4_enc_4blks:
+ AARCH64_VALID_CALL_TARGET
+___
+ &encrypt_4blks();
+$code.=<<___;
+ ret
+.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
+___
+}}}
+
+{{{
+$code.=<<___;
+.type _vpsm4_enc_8blks,%function
+.align 4
+_vpsm4_enc_8blks:
+ AARCH64_VALID_CALL_TARGET
+___
+ &encrypt_8blks();
+$code.=<<___;
+ ret
+.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
+___
+}}}
+
+
+{{{
+my ($key,$keys)=("x0","x1");
+$code.=<<___;
+.globl ${prefix}_set_encrypt_key
+.type ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,1
+ bl _vpsm4_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+___
+}}}
+
+{{{
+my ($key,$keys)=("x0","x1");
+$code.=<<___;
+.globl ${prefix}_set_decrypt_key
+.type ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,0
+ bl _vpsm4_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+
+{{{
+sub gen_block () {
+ my $dir = shift;
+ my ($inp,$outp,$rk)=map("x$_",(0..2));
+
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {@data[0].16b},[$inp]
+___
+ &load_sbox();
+ &rev32(@data[0],@data[0]);
+$code.=<<___;
+ mov $rks,x2
+___
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ st1 {@data[0].16b},[$outp]
+ ret
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+
+{{{
+my ($enc) = ("w4");
+my @dat=map("v$_",(16..23));
+
+$code.=<<___;
+.globl ${prefix}_ecb_encrypt
+.type ${prefix}_ecb_encrypt,%function
+.align 5
+${prefix}_ecb_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ // convert length into blocks
+ lsr x2,x2,4
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+___
+ &load_sbox();
+$code.=<<___;
+.Lecb_8_blocks_process:
+ cmp $blocks,#8
+ b.lt .Lecb_4_blocks_process
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],@datax[3]);
+$code.=<<___;
+ bl _vpsm4_enc_8blks
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lecb_8_blocks_process
+ b 100f
+.Lecb_4_blocks_process:
+ cmp $blocks,#4
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ sub $blocks,$blocks,#4
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].16b},[$inp]
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ st1 {@data[0].16b},[$outp]
+ b 100f
+1: // process last 2 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
+ cmp $blocks,#2
+ b.gt 1f
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
+ b 100f
+1: // process last 3 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
+
+{{{
+my ($len,$ivp,$enc)=("x2","x4","w5");
+my $ivec0=("v3");
+my $ivec1=("v15");
+
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ lsr $len,$len,4
+___
+ &load_sbox();
+$code.=<<___;
+ cbz $enc,.Ldec
+ ld1 {$ivec0.4s},[$ivp]
+.Lcbc_4_blocks_enc:
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+ eor @data[0].16b,@data[0].16b,$ivec0.16b
+___
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &encrypt_1blk_norev(@data[0]);
+$code.=<<___;
+ eor @data[1].16b,@data[1].16b,@data[0].16b
+___
+ &encrypt_1blk_norev(@data[1]);
+ &rev32(@data[0],@data[0]);
+
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,@data[1].16b
+___
+ &encrypt_1blk_norev(@data[2]);
+ &rev32(@data[1],@data[1]);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,@data[2].16b
+___
+ &encrypt_1blk_norev(@data[3]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ orr $ivec0.16b,@data[3].16b,@data[3].16b
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.ne .Lcbc_4_blocks_enc
+ b 2f
+1:
+ subs $blocks,$blocks,#1
+ b.lt 2f
+ ld1 {@data[0].4s},[$inp],#16
+ eor $ivec0.16b,$ivec0.16b,@data[0].16b
+___
+ &rev32($ivec0,$ivec0);
+ &encrypt_1blk($ivec0);
+$code.=<<___;
+ st1 {$ivec0.16b},[$outp],#16
+ b 1b
+2:
+ // save back IV
+ st1 {$ivec0.16b},[$ivp]
+ ret
+
+.Ldec:
+ // decryption mode starts
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+.Lcbc_8_blocks_dec:
+ cmp $blocks,#8
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
+ add $ptr,$inp,#64
+ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],$data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],$datax[3]);
+$code.=<<___;
+ bl _vpsm4_enc_8blks
+___
+ &transpose(@vtmp,@datax);
+ &transpose(@data,@datax);
+$code.=<<___;
+ ld1 {$ivec1.16b},[$ivp]
+ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+ // note ivec1 and vtmpx[3] are reusing the same register
+ // care needs to be taken to avoid conflict
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
+ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
+ // save back IV
+ st1 {$vtmpx[3].16b}, [$ivp]
+ eor @data[0].16b,@data[0].16b,$datax[3].16b
+ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
+ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
+ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+1:
+ ld1 {$ivec1.16b},[$ivp]
+.Lcbc_4_blocks_dec:
+ cmp $blocks,#4
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],$data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ orr $ivec1.16b,@data[3].16b,@data[3].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
+ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+ st1 {@vtmp[3].16b}, [$ivp]
+ b 100f
+1: // last block
+ subs $blocks,$blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp],#16
+ // save back IV
+ st1 {$data[0].16b}, [$ivp]
+___
+ &rev32(@datax[0],@data[0]);
+ &encrypt_1blk(@datax[0]);
+$code.=<<___;
+ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
+ st1 {@datax[0].16b},[$outp],#16
+ b 100f
+1: // last two blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
+ add $ptr,$inp,#16
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
+ subs $blocks,$blocks,1
+ b.gt 1f
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
+ // save back IV
+ st1 {@data[1].16b}, [$ivp]
+ b 100f
+1: // last 3 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
+ // save back IV
+ st1 {@data[2].16b}, [$ivp]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+
+{{{
+my ($ivp)=("x4");
+my ($ctr)=("w5");
+my $ivec=("v3");
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$ivec.4s},[$ivp]
+___
+ &rev32($ivec,$ivec);
+ &load_sbox();
+$code.=<<___;
+ cmp $blocks,#1
+ b.ne 1f
+ // fast path for a single block without
+ // context-saving overhead
+___
+ &encrypt_1blk($ivec);
+$code.=<<___;
+ ld1 {@data[0].16b},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+ st1 {@data[0].16b},[$outp]
+ ret
+1:
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ mov $word0,$ivec.s[0]
+ mov $word1,$ivec.s[1]
+ mov $word2,$ivec.s[2]
+ mov $ctr,$ivec.s[3]
+.Lctr32_4_blocks_process:
+ cmp $blocks,#4
+ b.lt 1f
+ dup @data[0].4s,$word0
+ dup @data[1].4s,$word1
+ dup @data[2].4s,$word2
+ mov @data[3].s[0],$ctr
+ add $ctr,$ctr,#1
+ mov $data[3].s[1],$ctr
+ add $ctr,$ctr,#1
+ mov @data[3].s[2],$ctr
+ add $ctr,$ctr,#1
+ mov @data[3].s[3],$ctr
+ add $ctr,$ctr,#1
+ cmp $blocks,#8
+ b.ge .Lctr32_8_blocks_process
+ bl _vpsm4_enc_4blks
+ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+.Lctr32_8_blocks_process:
+ dup @datax[0].4s,$word0
+ dup @datax[1].4s,$word1
+ dup @datax[2].4s,$word2
+ mov @datax[3].s[0],$ctr
+ add $ctr,$ctr,#1
+ mov $datax[3].s[1],$ctr
+ add $ctr,$ctr,#1
+ mov @datax[3].s[2],$ctr
+ add $ctr,$ctr,#1
+ mov @datax[3].s[3],$ctr
+ add $ctr,$ctr,#1
+ bl _vpsm4_enc_8blks
+ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ eor @data[0].16b,@data[0].16b,@datax[0].16b
+ eor @data[1].16b,@data[1].16b,@datax[1].16b
+