author    zhuchen <zhuchen@loongson.cn>	2022-09-29 20:14:00 +0800
committer Pauli <pauli@openssl.org>	2022-10-12 18:02:12 +1100
commit    ef917549f5867d269d359155ff67b8ccb5e66a76 (patch)
tree      ec62cb300e8c31adaa44d3618821c176e52faca4 /crypto/aes
parent    7f2d6188c7b16ef7a4deeeedb56f42014156b9f8 (diff)
Add vpaes-loongarch64.pl module.
Add 128-bit LSX (vector extension) optimization code for the LoongArch64 architecture to AES. Testing on a 3A5000 shows a performance improvement of about 40%~50%.

Signed-off-by: zhuchen <zhuchen@loongson.cn>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/19364)
Diffstat (limited to 'crypto/aes')
-rw-r--r--  crypto/aes/asm/vpaes-loongarch64.pl  1003
-rw-r--r--  crypto/aes/build.info                   6
2 files changed, 1009 insertions, 0 deletions
diff --git a/crypto/aes/asm/vpaes-loongarch64.pl b/crypto/aes/asm/vpaes-loongarch64.pl
new file mode 100644
index 0000000000..286adc25f3
--- /dev/null
+++ b/crypto/aes/asm/vpaes-loongarch64.pl
@@ -0,0 +1,1003 @@
+#! /usr/bin/env perl
+# Copyright 2015-2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+##
+######################################################################
+
+# LoongArch64 LSX adaptation by <zhuchen@loongson.cn>,
+# <lujingfeng@loongson.cn> and <shichenlong@loongson.cn>
+#
+
+($zero,$ra,$tp,$sp)=map("\$r$_",(0..3));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
+($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9)=map("\$r$_",(12..21));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$r$_",(23..30));
+($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10,$vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19)=map("\$vr$_",(0..19));
+($fp)=map("\$r$_",(22));
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output" or die "error opening $output: $!";
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %vr0.
+##
+## Inputs:
+## %vr0 = input
+## %vr9-%vr15 as in _vpaes_preheat
+## (%a2) = scheduled keys
+##
+## Output in %vr0
+## Clobbers %vr1-%vr5, $a5, $a6, $a7, $t0, $t5
+## Preserves %vr6 - %vr8 so you get some local vectors
+##
+##
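+## Rough dataflow (a sketch in informal notation, per the papers cited
+## above): .Lenc_entry computes the GF(2^8) inverse of each byte with
+## 4-bit table lookups (vshuf.b against Lk_inv), and .Lenc_loop then
+## finishes SubBytes with the sb1/sb2 output tables, adds the round
+## key, and folds in ShiftRows+MixColumns as byte rotations taken from
+## Lk_mc_forward/Lk_mc_backward (the "2A+3B+C+D" running sum noted in
+## the loop comments).
+##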
+#.type _vpaes_encrypt_core,\@abi-omnipotent
+.align 4
+_vpaes_encrypt_core:
+.cfi_startproc
+ move $a5,$a2
+ li.d $a7,0x10
+ ld.w $t5,$a2,240
+ vori.b $vr1,$vr9,0
+ la.local $t0,Lk_ipt
+ vld $vr2,$t0,0 # iptlo
+ vandn.v $vr1,$vr1,$vr0
+ vld $vr5,$a5,0 # round0 key
+ vsrli.w $vr1,$vr1,4
+ vand.v $vr0,$vr0,$vr9
+ vshuf.b $vr2,$vr0,$vr2,$vr0
+ vld $vr0,$t0,16 # ipthi
+ vshuf.b $vr0,$vr1,$vr0,$vr1
+ vxor.v $vr2,$vr2,$vr5
+ addi.d $a5,$a5,16
+ vxor.v $vr0,$vr0,$vr2
+ la.local $a6,Lk_mc_backward
+ b .Lenc_entry
+
+.align 4
+.Lenc_loop:
+ # middle of middle round
+ vori.b $vr4,$vr13,0 # 4 : sb1u
+ vori.b $vr0,$vr12,0 # 0 : sb1t
+ vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sb1u
+ vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+ vxor.v $vr4,$vr4,$vr5 # 4 = sb1u + k
+ vori.b $vr5,$vr15,0 # 4 : sb2u
+ vxor.v $vr0,$vr0,$vr4 # 0 = A
+ add.d $t0,$a7,$a6 # Lk_mc_forward[]
+ vld $vr1,$t0,-0x40
+ vshuf.b $vr5,$vr2,$vr5,$vr2 # 4 = sb2u
+ vld $vr4,$t0,0 # Lk_mc_backward[]
+ vori.b $vr2,$vr14,0 # 2 : sb2t
+ vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = sb2t
+ vori.b $vr3,$vr0,0 # 3 = A
+ vxor.v $vr2,$vr5,$vr2 # 2 = 2A
+ vshuf.b $vr0,$vr1,$vr0,$vr1 # 0 = B
+ addi.d $a5,$a5,16 # next key
+ vxor.v $vr0,$vr0,$vr2 # 0 = 2A+B
+ vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = D
+ addi.d $a7,$a7,16 # next mc
+ vxor.v $vr3,$vr3,$vr0 # 3 = 2A+B+D
+ vshuf.b $vr0,$vr1,$vr0,$vr1 # 0 = 2B+C
+ andi $a7,$a7,0x30 # ... mod 4
+ addi.d $t5,$t5,-1 # nr--
+ vxor.v $vr0,$vr0,$vr3 # 0 = 2A+3B+C+D
+
+.Lenc_entry:
+ # top of round
+ vori.b $vr1,$vr9,0 # 1 : i
+ vori.b $vr5,$vr11,0 # 2 : a/k
+ vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
+ vsrli.w $vr1,$vr1,4 # 1 = i
+ vand.v $vr0,$vr0,$vr9 # 0 = k
+ vshuf.b $vr5,$vr0,$vr5,$vr0 # 2 = a/k
+ vori.b $vr3,$vr10,0 # 3 : 1/i
+ vxor.v $vr0,$vr0,$vr1 # 0 = j
+ vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+ vori.b $vr4,$vr10,0 # 4 : 1/j
+ vxor.v $vr3,$vr3,$vr5 # 3 = iak = 1/i + a/k
+ vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+ vori.b $vr2,$vr10,0 # 2 : 1/iak
+ vxor.v $vr4,$vr4,$vr5 # 4 = jak = 1/j + a/k
+ vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+ vori.b $vr3,$vr10,0 # 3 : 1/jak
+ vxor.v $vr2,$vr2,$vr0 # 2 = io
+ vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
+ vld $vr5,$a5, 0
+ vxor.v $vr3,$vr3,$vr1 # 3 = jo
+ bnez $t5,.Lenc_loop
+
+ # middle of last round
+ vld $vr4,$a6, -0x60 # 3 : sbou Lk_sbo
+ vld $vr0,$a6, -0x50 # 0 : sbot Lk_sbo+16
+ vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+ vxor.v $vr4,$vr4,$vr5 # 4 = sb1u + k
+ vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+ add.d $t0,$a7,$a6 # Lk_sr[]
+ vld $vr1,$t0, 0x40
+ vxor.v $vr0,$vr0,$vr4 # 0 = A
+ vshuf.b $vr0,$vr1,$vr0,$vr1
+ jr $ra
+.cfi_endproc
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
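+## Sketch of the loop below: inverse MixColumns is accumulated through
+## four sbox output table pairs, roughly
+##   ch = MC(MC(MC(ch ^ sb9[x]) ^ sbd[x]) ^ sbb[x]) ^ sbe[x]
+## where each MC is the byte rotation fetched from Lk_mc_forward[3]
+## (held in %vr5 and rotated four bytes each round).
+##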
+#.type _vpaes_decrypt_core,\@abi-omnipotent
+.align 4
+_vpaes_decrypt_core:
+.cfi_startproc
+ move $a5,$a2 # load key
+ ld.w $t5,$a2,240
+ vori.b $vr1,$vr9,0
+ la.local $t0,Lk_dipt
+ vld $vr2,$t0,0 # iptlo
+ vandn.v $vr1,$vr1,$vr0
+ move $a7,$t5
+ vsrli.w $vr1,$vr1,4
+ vld $vr5,$a5,0 # round0 key
+ slli.d $a7,$a7,4
+ vand.v $vr0,$vr9,$vr0
+ vshuf.b $vr2,$vr0,$vr2,$vr0
+ vld $vr0,$t0,16 # ipthi
+ xori $a7,$a7,0x30
+ la.local $a6,Lk_dsbd
+ vshuf.b $vr0,$vr1,$vr0,$vr1
+ andi $a7,$a7,0x30
+ vxor.v $vr2,$vr2,$vr5
+ la.local $t0,Lk_mc_forward
+ vld $vr5,$t0,48
+ vxor.v $vr0,$vr0,$vr2
+ addi.d $a5,$a5,16
+ add.d $a7,$a7,$a6
+ b .Ldec_entry
+
+.align 4
+.Ldec_loop:
+##
+## Inverse mix columns
+##
+ vld $vr4,$a6,-0x20 # 4 : sb9u
+ vld $vr1,$a6,-0x10 # 0 : sb9t
+ vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sb9u
+ vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sb9t
+ vxor.v $vr0,$vr0,$vr4
+ vld $vr4,$a6,0x0 # 4 : sbdu
+ vxor.v $vr0,$vr0,$vr1 # 0 = ch
+ vld $vr1,$a6,0x10 # 0 : sbdt
+ vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbdu
+ vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
+ vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbdt
+ vxor.v $vr0,$vr0,$vr4 # 4 = ch
+ vld $vr4,$a6,0x20 # 4 : sbbu
+ vxor.v $vr0,$vr0,$vr1 # 0 = ch
+ vld $vr1,$a6,0x30 # 0 : sbbt
+ vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbbu
+ vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
+ vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbbt
+ vxor.v $vr0,$vr0,$vr4 # 4 = ch
+ vld $vr4,$a6,0x40 # 4 : sbeu
+ vxor.v $vr0,$vr0,$vr1 # 0 = ch
+ vld $vr1,$a6,0x50 # 0 : sbet
+ vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbeu
+ vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
+ vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbet
+ vxor.v $vr0,$vr0,$vr4 # 4 = ch
+ addi.d $a5,$a5, 16 # next round key
+ vbsrl.v $vr16,$vr5,0xc
+ vbsll.v $vr5,$vr5,0x4
+ vor.v $vr5,$vr5,$vr16
+ vxor.v $vr0,$vr0,$vr1 # 0 = ch
+ addi.d $t5,$t5,-1 # nr--
+
+.Ldec_entry:
+ # top of round
+ vori.b $vr1,$vr9,0 # 1 : i
+ vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
+ vori.b $vr2,$vr11,0 # 2 : a/k
+ vsrli.w $vr1,$vr1,4 # 1 = i
+ vand.v $vr0,$vr0,$vr9 # 0 = k
+ vshuf.b $vr2,$vr0,$vr2,$vr0 # 2 = a/k
+ vori.b $vr3,$vr10,0 # 3 : 1/i
+ vxor.v $vr0,$vr0,$vr1 # 0 = j
+ vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+ vori.b $vr4,$vr10,0 # 4 : 1/j
+ vxor.v $vr3,$vr3,$vr2 # 3 = iak = 1/i + a/k
+ vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+ vxor.v $vr4,$vr4,$vr2 # 4 = jak = 1/j + a/k
+ vori.b $vr2,$vr10,0 # 2 : 1/iak
+ vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+ vori.b $vr3,$vr10,0 # 3 : 1/jak
+ vxor.v $vr2,$vr2,$vr0 # 2 = io
+ vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
+ vld $vr0,$a5,0
+ vxor.v $vr3,$vr3,$vr1 # 3 = jo
+ bnez $t5,.Ldec_loop
+
+ # middle of last round
+ vld $vr4,$a6,0x60 # 3 : sbou
+ vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+ vxor.v $vr4,$vr4,$vr0 # 4 = sb1u + k
+ vld $vr0,$a6,0x70 # 0 : sbot
+ vld $vr2,$a7,-0x160 # Lk_sr-.Lk_dsbd=-0x160
+ vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+ vxor.v $vr0,$vr0,$vr4 # 0 = A
+ vshuf.b $vr0,$vr2,$vr0,$vr2
+ jr $ra
+.cfi_endproc
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+#.type _vpaes_schedule_core,\@abi-omnipotent
+.align 4
+_vpaes_schedule_core:
+.cfi_startproc
+ # a0 = key
+ # a1 = size in bits
+ # a2 = buffer
+ # a3 = direction. 0=encrypt, 1=decrypt
+
+ addi.d $sp,$sp,-48
+ st.d $ra,$sp,40
+ st.d $fp,$sp,32
+
+ bl _vpaes_preheat # load the tables
+ la.local $t0,Lk_rcon
+ vld $vr8,$t0,0 # load rcon
+ vld $vr0,$a0,0 # load key (unaligned)
+
+ # input transform
+ vori.b $vr3,$vr0,0
+ la.local $a7,Lk_ipt
+ bl _vpaes_schedule_transform
+ vori.b $vr7,$vr0,0
+
+ la.local $a6,Lk_sr
+ bnez $a3,.Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ vst $vr0,$a2,0
+ b .Lschedule_go
+
+.Lschedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ add.d $t2,$a4,$a6
+ vld $vr1,$t2,0
+ vshuf.b $vr3,$vr1,$vr3,$vr1
+ vst $vr3,$a2,0
+ xori $a4,$a4,0x30
+
+.Lschedule_go:
+ li.d $t6,192
+ bltu $t6,$a1,.Lschedule_256
+ beq $t6,$a1,.Lschedule_192
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
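+## A sketch of the loop below in C-like pseudocode:
+##   i = 10;
+##   for (;;) {
+##     vr7 = schedule_round(vr7);   # _vpaes_schedule_round
+##     if (--i == 0) break;         # last key via .Lschedule_mangle_last
+##     write_key(mangle(vr7));      # _vpaes_schedule_mangle
+##   }
+##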
+.Lschedule_128:
+ li.w $a1,10
+
+.Loop_schedule_128:
+ bl _vpaes_schedule_round
+ addi.w $a1,$a1,-1
+ beqz $a1,.Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+ b .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %vr7 as before, and the short, low side is in
+## the high bits of %vr6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
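+## Key-count check (informal): the zeroth key is stored before
+## .Lschedule_go, and each of the 4 trips through .Loop_schedule_192
+## mangles out 3 keys (the last one via .Lschedule_mangle_last),
+## giving 1 + 4*3 = 13 = nr+1 round keys for nr = 12.
+##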
+.align 4
+.Lschedule_192:
+	vld	$vr0,$a0,8			# load key part 2
+	bl	_vpaes_schedule_transform	# input transform
+	vaddi.du	$vr6,$vr0,0x0		# save short part
+	vxor.v	$vr4,$vr4,$vr4			# clear 4
+	vpackod.d	$vr6,$vr6,$vr4		# clobber low side with zeros
+ li.w $a1,4
+
+.Loop_schedule_192:
+ bl _vpaes_schedule_round
+ vbsrl.v $vr16,$vr6,0x8
+ vbsll.v $vr0,$vr0,0x8
+ vor.v $vr0,$vr0,$vr16
+
+ bl _vpaes_schedule_mangle # save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle # save key n+1
+ bl _vpaes_schedule_round
+ addi.w $a1,$a1,-1
+ beqz $a1,.Lschedule_mangle_last
+ bl _vpaes_schedule_mangle # save key n+2
+ bl _vpaes_schedule_192_smear
+ b .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %vr6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
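+## Key-count check (informal): key 0 is written before .Lschedule_go,
+## trips 1-6 through .Loop_schedule_256 write a low and a high key
+## each, and trip 7 writes one low key plus the final key via
+## .Lschedule_mangle_last: 1 + 6*2 + 2 = 15 = nr+1 keys for nr = 14.
+##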
+.align 4
+.Lschedule_256:
+ vld $vr0,$a0,16 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform # input transform
+ addi.w $a1,$zero,7
+
+.Loop_schedule_256:
+ bl _vpaes_schedule_mangle # output low result
+ vori.b $vr6,$vr0,0 # save cur_lo in vr6
+
+ # high round
+ bl _vpaes_schedule_round
+ addi.d $a1,$a1,-1
+ beqz $a1,.Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ # low round. swap vr7 and vr6
+ vshuf4i.w $vr0,$vr0,0xFF
+ vori.b $vr5,$vr7,0
+ vori.b $vr7,$vr6,0
+ bl _vpaes_schedule_low_round
+ vori.b $vr7,$vr5,0
+
+ b .Loop_schedule_256
+
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %vr0
+## when encrypting, outputs out(%vr0) ^ 63
+## when decrypting, outputs unskew(%vr0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+.Lschedule_mangle_last:
+ # schedule last round key from vr0
+ la.local $a7,Lk_deskew # prepare to deskew
+ bnez $a3,.Lschedule_mangle_last_dec
+
+ # encrypting
+ add.d $t0,$a4,$a6
+ vld $vr1,$t0,0
+ vshuf.b $vr0,$vr1,$vr0,$vr1 # output permute
+ la.local $a7,Lk_opt # prepare to output transform
+ addi.d $a2,$a2,32
+
+.Lschedule_mangle_last_dec:
+ addi.d $a2,$a2,-16
+ la.local $t0,Lk_s63
+ vld $vr16,$t0,0
+ vxor.v $vr0,$vr0,$vr16
+ bl _vpaes_schedule_transform # output transform
+ vst $vr0,$a2,0 # save last key
+
+ # cleanup
+ vxor.v $vr0,$vr0,$vr0
+ vxor.v $vr1,$vr1,$vr1
+ vxor.v $vr2,$vr2,$vr2
+ vxor.v $vr3,$vr3,$vr3
+ vxor.v $vr4,$vr4,$vr4
+ vxor.v $vr5,$vr5,$vr5
+ vxor.v $vr6,$vr6,$vr6
+ vxor.v $vr7,$vr7,$vr7
+ ld.d $ra,$sp,40
+ ld.d $fp,$sp,32
+ addi.d $sp,$sp,48
+ jr $ra
+.cfi_endproc
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %vr7: high side, b a x y
+## %vr6: low side, d c 0 0
+##	(%vr1 is zeroed internally and supplies the zero low half)
+##
+## Outputs:
+## %vr6: b+c+d b+c 0 0
+## %vr0: b+c+d b+c b a
+##
+#.type _vpaes_schedule_192_smear,\@abi-omnipotent
+.align 4
+_vpaes_schedule_192_smear:
+.cfi_startproc
+ vshuf4i.w $vr1,$vr6,0x80 # d c 0 0 -> c 0 0 0
+ vshuf4i.w $vr0,$vr7,0xFE # b a _ _ -> b b b a
+ vxor.v $vr6,$vr6,$vr1 # -> c+d c 0 0
+ vxor.v $vr1,$vr1,$vr1
+ vxor.v $vr6,$vr6,$vr0 # -> b+c+d b+c b a
+ vori.b $vr0,$vr6,0
+ vilvh.d $vr6,$vr6,$vr1 # clobber low side with zeros
+ jr $ra
+.cfi_endproc
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %vr0, %vr7
+##
+## Specifically, runs subbytes on the high dword of %vr0
+## then rotates it by one byte and xors into the low dword of
+## %vr7.
+##
+## Adds rcon from low byte of %vr8, then rotates %vr8 for
+## next rcon.
+##
+## Smears the dwords of %vr7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %vr7 = %vr0.
+## Clobbers %vr1-%vr4, %a7.
+##
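+## Note on the rcon extraction below (a sketch of intent): the
+## vbsrl.v/vbsll.v/vor.v pairs emulate x86 palignr; %vr1 picks up the
+## top byte of %vr8 (the next rcon) and %vr8 is rotated left one byte
+## so the following round sees a fresh rcon.
+##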
+#.type _vpaes_schedule_round,\@abi-omnipotent
+.align 4
+_vpaes_schedule_round:
+.cfi_startproc
+ # extract rcon from vr8
+ vxor.v $vr1,$vr1,$vr1
+ vbsrl.v $vr16,$vr8,0xf
+ vbsll.v $vr1,$vr1,0x1
+ vor.v $vr1,$vr1,$vr16
+ vbsrl.v $vr16,$vr8,0xf
+ vbsll.v $vr8,$vr8,0x1
+ vor.v $vr8,$vr8,$vr16
+
+ vxor.v $vr7,$vr7,$vr1
+
+ # rotate
+	vshuf4i.w	$vr0,$vr0,0xff	# broadcast the highest word of $vr0 to all word lanes
+ vbsrl.v $vr16,$vr0,0x1
+ vbsll.v $vr0,$vr0,0xf
+ vor.v $vr0,$vr0,$vr16
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear vr7
+ vaddi.du $vr1,$vr7,0x0
+ vbsll.v $vr7,$vr7,0x4
+ vxor.v $vr7,$vr7,$vr1
+ vaddi.du $vr1,$vr7,0x0
+ vbsll.v $vr7,$vr7,0x8
+ vxor.v $vr7,$vr7,$vr1
+ vxori.b $vr7,$vr7,0x5B
+
+ # subbytes
+ vaddi.du $vr1,$vr9,0x0
+ vandn.v $vr1,$vr1,$vr0
+ vsrli.w $vr1,$vr1,0x4 # 1 = i
+ vand.v $vr0,$vr0,$vr9 # 0 = k
+ vaddi.du $vr2,$vr11,0x0 # 2 : a/k
+ vshuf.b $vr2,$vr0,$vr2,$vr0 # 2 = a/k
+ vxor.v $vr0,$vr0,$vr1 # 0 = j
+ vaddi.du $vr3,$vr10,0x0 # 3 : 1/i
+ vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+ vxor.v $vr3,$vr3,$vr2 # 3 = iak = 1/i + a/k
+ vaddi.du $vr4,$vr10,0x0 # 4 : 1/j
+ vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+ vxor.v $vr4,$vr4,$vr2 # 4 = jak = 1/j + a/k
+ vaddi.du $vr2,$vr10,0x0 # 2 : 1/iak
+ vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+ vxor.v $vr2,$vr2,$vr0 # 2 = io
+ vaddi.du $vr3,$vr10,0x0 # 3 : 1/jak
+ vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
+ vxor.v $vr3,$vr3,$vr1 # 3 = jo
+ vaddi.du $vr4,$vr13,0x0 # 4 : sbou
+ vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+ vaddi.du $vr0,$vr12,0x0 # 0 : sbot
+ vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+ vxor.v $vr0,$vr0,$vr4 # 0 = sbox output
+
+ # add in smeared stuff
+ vxor.v $vr0,$vr0,$vr7
+ vaddi.du $vr7,$vr0,0x0
+ jr $ra
+.cfi_endproc
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %vr0 according to tables at ($a7)
+##
+## Requires that %vr9 = 0x0F0F... as in preheat
+## Output in %vr0
+## Clobbers %vr1, %vr2
+##
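+## i.e., per byte x (informally):
+##   out = lo_tbl[x & 0x0F] ^ hi_tbl[x >> 4]
+## two 16-entry vshuf.b lookups evaluating one 8->8 bit linear map.
+##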
+#.type _vpaes_schedule_transform,\@abi-omnipotent
+.align 4
+_vpaes_schedule_transform:
+.cfi_startproc
+ vori.b $vr1,$vr9,0
+ vandn.v $vr1,$vr1,$vr0
+ vsrli.w $vr1,$vr1,4
+ vand.v $vr0,$vr0,$vr9
+ vld $vr2,$a7,0 # lo
+ vshuf.b $vr2,$vr0,$vr2,$vr0
+ vld $vr0,$a7,16 # hi
+ vshuf.b $vr0,$vr1,$vr0,$vr1
+ vxor.v $vr0,$vr0,$vr2
+ jr $ra
+.cfi_endproc
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle vr0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%a2), and increments or decrements it
+## Keeps track of round number mod 4 in %a4
+## Preserves vr0
+## Clobbers vr1-vr5
+##
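+## Sketch of the encrypt path below: with t = vr0 ^ Lk_s63 and r the
+## Lk_mc_forward byte rotation, the 0,1,1,1 circulant comes out as
+##   vr3 = r(t) ^ r^2(t) ^ r^3(t)
+## before the final shiftrows permute from Lk_sr.
+##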
+#.type _vpaes_schedule_mangle,\@abi-omnipotent
+.align 4
+_vpaes_schedule_mangle:
+.cfi_startproc
+ vori.b $vr4,$vr0,0 # save vr0 for later
+ la.local $t0,Lk_mc_forward
+ vld $vr5,$t0,0
+ bnez $a3,.Lschedule_mangle_dec
+
+ # encrypting
+ addi.d $a2,$a2,16
+ la.local $t0,Lk_s63
+ vld $vr16,$t0,0
+ vxor.v $vr4,$vr4,$vr16
+ vshuf.b $vr4,$vr5,$vr4,$vr5
+ vori.b $vr3,$vr4,0
+ vshuf.b $vr4,$vr5,$vr4,$vr5
+ vxor.v $vr3,$vr3,$vr4
+ vshuf.b $vr4,$vr5,$vr4,$vr5
+ vxor.v $vr3,$vr3,$vr4
+
+ b .Lschedule_mangle_both
+.align 4
+.Lschedule_mangle_dec:
+ # inverse mix columns
+ la.local $a7,Lk_dksd
+ vori.b $vr1,$vr9,0
+ vandn.v $vr1,$vr1,$vr4
+ vsrli.w $vr1,$vr1,4 # 1 = hi
+ vand.v $vr4,$vr4,$vr9 # 4 = lo
+
+ vld $vr2,$a7,0
+ vshuf.b $vr2,$vr4,$vr2,$vr4
+ vld $vr3,$a7,0x10
+ vshuf.b $vr3,$vr1,$vr3,$vr1
+ vxor.v $vr3,$vr3,$vr2
+ vshuf.b $vr3,$vr5,$vr3,$vr5
+
+ vld $vr2,$a7,0x20
+ vshuf.b $vr2,$vr4,$vr2,$vr4
+ vxor.v $vr2,$vr2,$vr3
+ vld $vr3,$a7,0x30
+ vshuf.b $vr3,$vr1,$vr3,$vr1
+ vxor.v $vr3,$vr3,$vr2
+ vshuf.b $vr3,$vr5,$vr3,$vr5
+
+ vld $vr2,$a7,0x40
+ vshuf.b $vr2,$vr4,$vr2,$vr4
+ vxor.v $vr2,$vr2,$vr3
+ vld $vr3,$a7,0x50
+ vshuf.b $vr3,$vr1,$vr3,$vr1
+ vxor.v $vr3,$vr3,$vr2
+ vshuf.b $vr3,$vr5,$vr3,$vr5
+
+ vld $vr2,$a7,0x60
+ vshuf.b $vr2,$vr4,$vr2,$vr4
+ vxor.v $vr2,$vr2,$vr3
+ vld $vr3,$a7,0x70
+ vshuf.b $vr3,$vr1,$vr3,$vr1
+ vxor.v $vr3,$vr3,$vr2
+
+ addi.d $a2,$a2,-16
+
+.Lschedule_mangle_both:
+ add.d $t2,$a4,$a6
+ vld $vr1,$t2,0
+ vshuf.b $vr3,$vr1,$vr3,$vr1
+ addi.d $a4,$a4,-16
+ andi $a4,$a4,0x30
+ vst $vr3,$a2,0
+ jirl $zero,$ra,0
+.cfi_endproc
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+#
+# Interface to OpenSSL
+#
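+# The corresponding C prototypes (a sketch; they mirror the other
+# vpaes ports in OpenSSL):
+#   int  ${PREFIX}_set_encrypt_key(const unsigned char *userKey,
+#                                  int bits, AES_KEY *key);
+#   int  ${PREFIX}_set_decrypt_key(const unsigned char *userKey,
+#                                  int bits, AES_KEY *key);
+#   void ${PREFIX}_encrypt(const unsigned char *in, unsigned char *out,
+#                          const AES_KEY *key);
+#   void ${PREFIX}_decrypt(const unsigned char *in, unsigned char *out,
+#                          const AES_KEY *key);
+#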
+.globl ${PREFIX}_set_encrypt_key
+#.type ${PREFIX}_set_encrypt_key,\@function,3
+.align 4
+${PREFIX}_set_encrypt_key:
+.cfi_startproc
+___
+$code.=<<___;
+ addi.d $sp,$sp,-48
+ st.d $ra,$sp,40
+ st.d $fp,$sp,32
+ move $t5,$a1
+ srli.w $t5,$t5,0x5
+ addi.w $t5,$t5,0x5
+ st.w $t5,$a2,240 # AES_KEY->rounds = nbits/32+5;
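+	# e.g. nbits=128 -> 9, 192 -> 11, 256 -> 13; this counts middle
+	# rounds only, the cores run the final round separately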
+
+ move $a3,$zero
+ li.d $a4,0x30
+ bl _vpaes_schedule_core
+___
+$code.=<<___;
+ xor $a0,$a0,$a0
+ ld.d $ra,$sp,40
+ ld.d $fp,$sp,32
+ addi.d $sp,$sp,48
+ jirl $zero,$ra,0
+.cfi_endproc
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+
+.globl ${PREFIX}_set_decrypt_key
+#.type ${PREFIX}_set_decrypt_key,\@function,3
+.align 4
+${PREFIX}_set_decrypt_key:
+.cfi_startproc
+
+.Ldec_key_body:
+___
+$code.=<<___;
+ addi.d $sp,$sp,-48
+ st.d $ra,$sp,40
+ st.d $fp,$sp,32
+
+ move $t5,$a1
+ srli.w $t5,$t5,5
+ addi.w $t5,$t5,5
+ st.w $t5,$a2,240 # AES_KEY->rounds = nbits/32+5;
+ slli.w $t5,$t5,4
+ add.d $t0,$a2,$t5
+ addi.d $a2,$t0,16
+
+ li.d $a3,0x1
+ move $a4,$a1
+ srli.w $a4,$a4,1
+ andi $a4,$a4,32
+ xori $a4,$a4,32 # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+ xor $a0,$a0,$a0
+ ld.d $ra,$sp,40
+ ld.d $fp,$sp,32
+ addi.d $sp,$sp,48
+ jirl $zero,$ra,0
+.cfi_endproc
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+
+.globl ${PREFIX}_encrypt
+#.type ${PREFIX}_encrypt,\@function,3
+.align 4
+${PREFIX}_encrypt:
+.cfi_startproc
+.Lenc_body:
+___
+$code.=<<___;
+ addi.d $sp,$sp,-48
+ st.d $ra,$sp,40
+ st.d $fp,$sp,32
+ vld $vr0,$a0,0x0
+ bl _vpaes_preheat
+ bl _vpaes_encrypt_core
+ vst $vr0,$a1,0x0
+.Lenc_epilogue:
+___
+$code.=<<___;
+ ld.d $ra,$sp,40
+ ld.d $fp,$sp,32
+ addi.d $sp,$sp,48
+ jirl $zero,$ra,0
+.cfi_endproc
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+#.type ${PREFIX}_decrypt,\@function,3
+.align 4
+${PREFIX}_decrypt:
+.cfi_startproc
+___
+$code.=<<___;
+ addi.d $sp,$sp,-48
+ st.d $ra,$sp,40
+ st.d $fp,$sp,32
+ vld $vr0,$a0,0x0
+ bl _vpaes_preheat
+ bl _vpaes_decrypt_core
+ vst $vr0,$a1,0x0
+___
+$code.=<<___;
+ ld.d $ra,$sp,40
+ ld.d $fp,$sp,32
+ addi.d $sp,$sp,48
+ jirl $zero,$ra,0
+.cfi_endproc
+.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("$a0","$a1","$a2","$a3","$a4","$a5");
+# void ${PREFIX}_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+#                             size_t length, const AES_KEY *key,
+#                             unsigned char *ivp, const int enc);
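+#
+# Note (a sketch of the code below): $out is carried as the
+# displacement $out-$inp so one pointer bump advances both streams,
+# and $len is processed in whole 16-byte blocks only.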
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+#.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 4
+${PREFIX}_cbc_encrypt:
+.cfi_startproc
+ addi.d $sp,$sp,-48
+ st.d $ra,$sp,40
+ st.d $fp,$sp,32
+
+ ori $t0,$len,0
+ ori $len,$key,0
+ ori $key,$t0,0
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+ addi.d $len,$len,-16
+ blt $len,$zero,.Lcbc_abort
+___
+$code.=<<___;
+ vld $vr6,$ivp,0 # load IV
+ sub.d $out,$out,$inp
+ bl _vpaes_preheat
+ beqz $a5,.Lcbc_dec_loop
+ b .Lcbc_enc_loop
+.align 4
+.Lcbc_enc_loop:
+ vld $vr0,$inp,0
+ vxor.v $vr0,$vr0,$vr6
+ bl _vpaes_encrypt_core
+ vori.b $vr6,$vr0,0
+ add.d $t0,$out,$inp
+ vst $vr0,$t0,0
+ addi.d $inp,$inp,16
+ addi.d $len,$len,-16
+ bge $len,$zero,.Lcbc_enc_loop
+ b .Lcbc_done
+.align 4
+.Lcbc_dec_loop:
+ vld $vr0,$inp,0
+ vori.b $vr7,$vr0,0
+ bl _vpaes_decrypt_core
+ vxor.v $vr0,$vr0,$vr6
+ vori.b $vr6,$vr7,0
+ add.d $t0,$out,$inp
+ vst $vr0,$t0,0
+ addi.d $inp,$inp,16
+ addi.d $len,$len,-16
+ bge $len,$zero,.Lcbc_dec_loop
+.Lcbc_done:
+ vst $vr6,$ivp,0 # save IV
+___
+$code.=<<___;
+.Lcbc_abort:
+ ld.d $ra,$sp,40
+ ld.d $fp,$sp,32
+ addi.d $sp,$sp,48
+ jirl $zero,$ra,0
+.cfi_endproc
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+{
+$code.=<<___;
+##
+## _aes_preheat
+##
+## Fills register $a6 -> Lk_s0F (so the tables can be addressed
+## PC-relative via la.local)
+## and %vr9-%vr15 as specified below.
+##
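+## The fixed offsets below depend on the .rodata layout at the end of
+## this file: Lk_inv sits 0x20 bytes below Lk_s0F, with Lk_ipt, Lk_sb1
+## and Lk_sb2 following contiguously, so one base register reaches all
+## the round tables.
+##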
+#.type _vpaes_preheat,\@abi-omnipotent
+.align 4
+_vpaes_preheat:
+.cfi_startproc
+ la.local $a6,Lk_s0F
+ vld $vr10,$a6,-0x20 # Lk_inv
+ vld $vr11,$a6,-0x10 # Lk_inv+16
+ vld $vr9,$a6,0 # Lk_s0F
+ vld $vr13,$a6,0x30 # Lk_sb1
+ vld $vr12,$a6,0x40 # Lk_sb1+16
+ vld $vr15,$a6,0x50 # Lk_sb2
+ vld $vr14,$a6,0x60 # Lk_sb2+16
+ jirl $zero,$ra,0
+.cfi_endproc
+.size _vpaes_preheat,.-_vpaes_preheat
+___
+}
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+$code.=<<___;
+.section .rodata
+.align 6
+Lk_inv: # inv, inva
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+Lk_s0F: # s0F
+ .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+Lk_ipt: # input transform (lo, hi)
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+Lk_sb1: # sb1u, sb1t
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+Lk_sb2: # sb2u, sb2t
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+Lk_sbo: # sbou, sbot
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+Lk_mc_forward: # mc_forward
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+Lk_mc_backward:# mc_backward
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+Lk_sr: # sr
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+Lk_rcon: # rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_s63: # s63: all equal to 0x63 transformed
+ .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+Lk_opt: # output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+Lk_dksd: # decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb: # decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9: # decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+## Decryption stuff
+## Round function constants
+##
+Lk_dipt: # decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+
+Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+Lk_dsbo: # decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz "Vector Permutation AES for loongarch64/lsx, Mike Hamburg (Stanford University)"
+.align 6
+___
+
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/aes/build.info b/crypto/aes/build.info
index 886c3b3df3..aff318b34e 100644
--- a/crypto/aes/build.info
+++ b/crypto/aes/build.info
@@ -51,6 +51,9 @@ IF[{- !$disabled{asm} -}]
$AESDEF_riscv64=AES_ASM
$AESASM_riscv32=aes_core.c aes_cbc.c aes-riscv32-zkn.s
+ $AESASM_loongarch64=aes_core.c aes_cbc.c vpaes-loongarch64.S
+ $AESDEF_loongarch64=VPAES_ASM
+
# Now that we have defined all the arch specific variables, use the
# appropriate one, and define the appropriate macros
IF[$AESASM_{- $target{asm_arch} -}]
@@ -136,3 +139,6 @@ GENERATE[aes-s390x.S]=asm/aes-s390x.pl
INCLUDE[aes-s390x.o]=..
GENERATE[aes-c64xplus.S]=asm/aes-c64xplus.pl
+
+GENERATE[vpaes-loongarch64.S]=asm/vpaes-loongarch64.pl
+INCLUDE[vpaes-loongarch64.o]=..