summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2016-11-25 13:11:09 +0100
committerDr. Stephen Henson <steve@openssl.org>2017-08-30 21:22:30 +0100
commit5526e5791f1426553b6f4806d1ac82efd6ab33bc (patch)
tree2316cece8462369bf8c06fac688f238ed4a73360
parentfe36a698477e7cb1a49de3f4cba5ad7f89f5ad4c (diff)
Add some C64x assembly modules [by minor adjustments of C64x+ modules].
AES, SHA256 and SHA512 modules can actually replace corresponding C64x+ modules. This is because C64x+ instructions don't actually provide "killer-argument" advantage in these modules. As for SHA1, even though its performance exactly same, C64x+ module is more responsive to interrupts, i.e. doesn't inhibit them for as long periods as C64x module. Reviewed-by: Rich Salz <rsalz@openssl.org> Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Stephen Henson <steve@openssl.org> (Merged from https://github.com/openssl/openssl/pull/4265)
-rw-r--r--crypto/aes/asm/aes-c64x.pl1375
-rw-r--r--crypto/c64xcpuid.pl326
-rw-r--r--crypto/sha/asm/sha1-c64x-large.pl230
-rw-r--r--crypto/sha/asm/sha1-c64x.pl330
-rw-r--r--crypto/sha/asm/sha256-c64x.pl313
-rw-r--r--crypto/sha/asm/sha512-c64x.pl437
6 files changed, 3011 insertions, 0 deletions
diff --git a/crypto/aes/asm/aes-c64x.pl b/crypto/aes/asm/aes-c64x.pl
new file mode 100644
index 0000000000..0817128c1b
--- /dev/null
+++ b/crypto/aes/asm/aes-c64x.pl
@@ -0,0 +1,1375 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x.
+#
+# Even though loops are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned to be ~10 cycles per byte. Discrepancy
+# must be caused by limitations of L1D memory banking(*), see SPRU871
+# TI publication for further details. If any consolation it's still
+# ~20% faster than TI's linear assembly module anyway... Compared to
+# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this
+# code is 3.75x faster and almost 3x smaller (tables included).
+#
+# (*) This means that there might be subtle correlation between data
+# and timing and one can wonder if it can be ... attacked:-(
+# On the other hand this also means that *if* one chooses to
+# implement *4* T-tables variant [instead of 1 T-table as in
+# this implementation, or in addition to], then one ought to
+# *interleave* them. Even though it complicates addressing,
+# references to interleaved tables would be guaranteed not to
+# clash. I reckon that it should be possible to break 8 cycles
+# per byte "barrier," i.e. improve by ~20%, naturally at the
+# cost of 8x increased pressure on L1D. 8x because you'd have
+# to interleave both Te and Td tables...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($TEA,$TEB)=("A5","B5");
+($KPA,$KPB)=("A3","B1");
+@K=("A6","B6","A7","B7");
+@s=("A8","B8","A9","B9");
+@Te0=@Td0=("A16","B16","A17","B17");
+@Te1=@Td1=("A18","B18","A19","B19");
+@Te2=@Td2=("A20","B20","A21","B21");
+@Te3=@Td3=("A22","B22","A23","B23");
+
+$code=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg AES_encrypt,_AES_encrypt
+ .asg AES_decrypt,_AES_decrypt
+ .asg AES_set_encrypt_key,_AES_set_encrypt_key
+ .asg AES_set_decrypt_key,_AES_set_decrypt_key
+ .asg AES_ctr32_encrypt,_AES_ctr32_encrypt
+ .endif
+
+ .asg B3,RA
+ .asg A4,INP
+ .asg B4,OUT
+ .asg A6,KEY
+ .asg A4,RET
+ .asg B15,SP
+
+ .eval 24,EXT0
+ .eval 16,EXT1
+ .eval 8,EXT2
+ .eval 0,EXT3
+ .eval 8,TBL1
+ .eval 16,TBL2
+ .eval 24,TBL3
+
+ .if .BIG_ENDIAN
+ .eval 24-EXT0,EXT0
+ .eval 24-EXT1,EXT1
+ .eval 24-EXT2,EXT2
+ .eval 24-EXT3,EXT3
+ .eval 32-TBL1,TBL1
+ .eval 32-TBL2,TBL2
+ .eval 32-TBL3,TBL3
+ .endif
+
+ .global _AES_encrypt
+_AES_encrypt:
+ .asmfunc
+ MVK 1,B2
+__encrypt:
+ .if __TI_EABI__
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+|| ADDKPC __encrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .else
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL (AES_Te-__encrypt),$TEA
+|| ADDKPC __encrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH (AES_Te-__encrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .endif
+ LDW *$KPA++[2],$Te0[0] ; zero round key
+|| LDW *$KPB++[2],$Te0[1]
+|| MVK 60,A0
+|| ADD B0,$TEA,$TEA ; AES_Te
+ LDW *KEY[A0],B0 ; rounds
+|| MVK 1024,A0 ; sizeof(AES_Te)
+ LDW *$KPA++[2],$Te0[2]
+|| LDW *$KPB++[2],$Te0[3]
+|| MV $TEA,$TEB
+ NOP
+ .if .BIG_ENDIAN
+ MV A9,$s[0]
+|| MV A8,$s[1]
+|| MV B9,$s[2]
+|| MV B8,$s[3]
+ .else
+ MV A8,$s[0]
+|| MV A9,$s[1]
+|| MV B8,$s[2]
+|| MV B9,$s[3]
+ .endif
+ XOR $Te0[0],$s[0],$s[0]
+|| XOR $Te0[1],$s[1],$s[1]
+|| LDW *$KPA++[2],$K[0] ; 1st round key
+|| LDW *$KPB++[2],$K[1]
+
+ LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+|| EXTU $s[1],EXT1,24,$Te1[1]
+|| EXTU $s[0],EXT3,24,$Te3[0]
+|| SUB B0,1,B0
+;;====================================================================
+enc_loop?:
+ LDW *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0
+|| LDW *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1
+|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[1],EXT3,24,$Te3[1]
+|| EXTU $s[0],EXT1,24,$Te1[0]
+ LDW *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2
+|| LDW *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3
+|| EXTU $s[2],EXT2,24,$Te2[2]
+|| EXTU $s[3],EXT2,24,$Te2[3]
+ LDW *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0
+|| LDW *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1
+|| EXTU $s[3],EXT3,24,$Te3[3]
+|| EXTU $s[2],EXT1,24,$Te1[2]
+ LDW *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0
+|| LDW *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1
+|| EXTU $s[0],EXT2,24,$Te2[0]
+|| EXTU $s[1],EXT2,24,$Te2[1]
+ LDW *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2
+|| LDW *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3
+|| EXTU $s[3],EXT1,24,$Te1[3]
+|| EXTU $s[2],EXT3,24,$Te3[2]
+ LDW *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2
+|| LDW *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3
+|| ROTL $Te1[1],TBL1,$Te3[0] ; t0
+|| ROTL $Te3[0],TBL3,$Te1[1] ; t1
+|| EXTU $s[0],EXT0,24,$Te0[0]
+|| EXTU $s[1],EXT0,24,$Te0[1]
+ LDW *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0
+|| LDW *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1
+|| ROTL $Te3[1],TBL3,$Te1[0] ; t2
+|| ROTL $Te1[0],TBL1,$Te3[1] ; t3
+|| EXTU $s[2],EXT0,24,$Te0[2]
+|| EXTU $s[3],EXT0,24,$Te0[3]
+|| [B0] SUB B0,1,B0
+ LDW *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2
+|| LDW *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3
+|| ROTL $Te2[2],TBL2,$Te2[2] ; t0
+|| ROTL $Te2[3],TBL2,$Te2[3] ; t1
+|| XOR $K[0],$Te3[0],$s[0]
+|| XOR $K[1],$Te1[1],$s[1]
+|| [B0] BNOP enc_loop?
+ ROTL $Te3[3],TBL3,$Te1[2] ; t0
+|| ROTL $Te1[2],TBL1,$Te3[3] ; t1
+|| XOR $K[2],$Te1[0],$s[2]
+|| XOR $K[3],$Te3[1],$s[3]
+|| LDW *$KPA++[2],$K[0] ; next round key
+|| LDW *$KPB++[2],$K[1]
+ ROTL $Te2[0],TBL2,$Te2[0] ; t2
+|| ROTL $Te2[1],TBL2,$Te2[1] ; t3
+|| XOR $s[0],$Te2[2],$s[0]
+|| XOR $s[1],$Te2[3],$s[1]
+|| LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+ ROTL $Te1[3],TBL1,$Te3[2] ; t2
+|| ROTL $Te3[2],TBL3,$Te1[3] ; t3
+|| XOR $s[0],$Te1[2],$s[0]
+|| XOR $s[1],$Te3[3],$s[1]
+ XOR $s[2],$Te2[0],$s[2]
+|| XOR $s[3],$Te2[1],$s[3]
+|| XOR $s[0],$Te0[0],$s[0]
+|| XOR $s[1],$Te0[1],$s[1]
+ XOR $s[2],$Te3[2],$s[2]
+|| XOR $s[3],$Te1[3],$s[3]
+|| EXTU $s[1],EXT1,24,$Te1[1]
+|| EXTU $s[0],EXT3,24,$Te3[0]
+||[!B0] ADD ${TEA},A0,${TEA} ; point to Te4
+||[!B0] ADD ${TEB},A0,${TEB}
+;;====================================================================
+ LDBU *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0
+|| LDBU *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1
+|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[0],EXT0,24,$Te0[0]
+|| EXTU $s[1],EXT0,24,$Te0[1]
+ LDBU *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0
+|| LDBU *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1
+|| EXTU $s[3],EXT3,24,$Te3[3]
+|| EXTU $s[2],EXT1,24,$Te1[2]
+ LDBU *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0
+|| LDBU *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1
+|| EXTU $s[2],EXT2,24,$Te2[2]
+|| EXTU $s[3],EXT2,24,$Te2[3]
+ LDBU *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0
+|| LDBU *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1
+|| EXTU $s[1],EXT3,24,$Te3[1]
+|| EXTU $s[0],EXT1,24,$Te1[0]
+ LDBU *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2
+|| LDBU *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3
+|| EXTU $s[3],EXT1,24,$Te1[3]
+|| EXTU $s[2],EXT3,24,$Te3[2]
+ LDBU *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2
+|| LDBU *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3
+|| EXTU $s[2],EXT0,24,$Te0[2]
+|| EXTU $s[3],EXT0,24,$Te0[3]
+ LDBU *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2
+|| LDBU *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3
+|| EXTU $s[0],EXT2,24,$Te2[0]
+|| EXTU $s[1],EXT2,24,$Te2[1]
+ LDBU *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2
+|| LDBU *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3
+
+ .if .BIG_ENDIAN
+ PACK2 $Te0[0],$Te1[1],$Te0[0]
+|| PACK2 $Te0[1],$Te1[2],$Te0[1]
+ PACK2 $Te2[2],$Te3[3],$Te2[2]
+|| PACK2 $Te2[3],$Te3[0],$Te2[3]
+ PACKL4 $Te0[0],$Te2[2],$Te0[0]
+|| PACKL4 $Te0[1],$Te2[3],$Te0[1]
+ XOR $K[0],$Te0[0],$Te0[0] ; s[0]
+|| XOR $K[1],$Te0[1],$Te0[1] ; s[1]
+
+ PACK2 $Te0[2],$Te1[3],$Te0[2]
+|| PACK2 $Te0[3],$Te1[0],$Te0[3]
+ PACK2 $Te2[0],$Te3[1],$Te2[0]
+|| PACK2 $Te2[1],$Te3[2],$Te2[1]
+|| BNOP RA
+ PACKL4 $Te0[2],$Te2[0],$Te0[2]
+|| PACKL4 $Te0[3],$Te2[1],$Te0[3]
+ XOR $K[2],$Te0[2],$Te0[2] ; s[2]
+|| XOR $K[3],$Te0[3],$Te0[3] ; s[3]
+
+ MV $Te0[0],A9
+|| MV $Te0[1],A8
+ MV $Te0[2],B9
+|| MV $Te0[3],B8
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .else
+ PACK2 $Te1[1],$Te0[0],$Te1[1]
+|| PACK2 $Te1[2],$Te0[1],$Te1[2]
+ PACK2 $Te3[3],$Te2[2],$Te3[3]
+|| PACK2 $Te3[0],$Te2[3],$Te3[0]
+ PACKL4 $Te3[3],$Te1[1],$Te1[1]
+|| PACKL4 $Te3[0],$Te1[2],$Te1[2]
+ XOR $K[0],$Te1[1],$Te1[1] ; s[0]
+|| XOR $K[1],$Te1[2],$Te1[2] ; s[1]
+
+ PACK2 $Te1[3],$Te0[2],$Te1[3]
+|| PACK2 $Te1[0],$Te0[3],$Te1[0]
+ PACK2 $Te3[1],$Te2[0],$Te3[1]
+|| PACK2 $Te3[2],$Te2[1],$Te3[2]
+|| BNOP RA
+ PACKL4 $Te3[1],$Te1[3],$Te1[3]
+|| PACKL4 $Te3[2],$Te1[0],$Te1[0]
+ XOR $K[2],$Te1[3],$Te1[3] ; s[2]
+|| XOR $K[3],$Te1[0],$Te1[0] ; s[3]
+
+ MV $Te1[1],A8
+|| MV $Te1[2],A9
+ MV $Te1[3],B8
+|| MV $Te1[0],B9
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .endif
+ .endasmfunc
+
+ .global _AES_decrypt
+_AES_decrypt:
+ .asmfunc
+ MVK 1,B2
+__decrypt:
+ .if __TI_EABI__
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+|| ADDKPC __decrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .else
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL (AES_Td-__decrypt),$TEA
+|| ADDKPC __decrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH (AES_Td-__decrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .endif
+ LDW *$KPA++[2],$Td0[0] ; zero round key
+|| LDW *$KPB++[2],$Td0[1]
+|| MVK 60,A0
+|| ADD B0,$TEA,$TEA ; AES_Td
+ LDW *KEY[A0],B0 ; rounds
+|| MVK 1024,A0 ; sizeof(AES_Td)
+ LDW *$KPA++[2],$Td0[2]
+|| LDW *$KPB++[2],$Td0[3]
+|| MV $TEA,$TEB
+ NOP
+ .if .BIG_ENDIAN
+ MV A9,$s[0]
+|| MV A8,$s[1]
+|| MV B9,$s[2]
+|| MV B8,$s[3]
+ .else
+ MV A8,$s[0]
+|| MV A9,$s[1]
+|| MV B8,$s[2]
+|| MV B9,$s[3]
+ .endif
+ XOR $Td0[0],$s[0],$s[0]
+|| XOR $Td0[1],$s[1],$s[1]
+|| LDW *$KPA++[2],$K[0] ; 1st round key
+|| LDW *$KPB++[2],$K[1]
+
+ LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+|| EXTU $s[1],EXT3,24,$Td3[1]
+|| EXTU $s[0],EXT1,24,$Td1[0]
+|| SUB B0,1,B0
+;;====================================================================
+dec_loop?:
+ LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0
+|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1
+|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[1],EXT1,24,$Td1[1]
+|| EXTU $s[0],EXT3,24,$Td3[0]
+ LDW *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2
+|| LDW *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3
+|| EXTU $s[2],EXT2,24,$Td2[2]
+|| EXTU $s[3],EXT2,24,$Td2[3]
+ LDW *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0
+|| LDW *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1
+|| EXTU $s[3],EXT1,24,$Td1[3]
+|| EXTU $s[2],EXT3,24,$Td3[2]
+ LDW *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0
+|| LDW *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1
+|| EXTU $s[0],EXT2,24,$Td2[0]
+|| EXTU $s[1],EXT2,24,$Td2[1]
+ LDW *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2
+|| LDW *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3
+|| EXTU $s[3],EXT3,24,$Td3[3]
+|| EXTU $s[2],EXT1,24,$Td1[2]
+ LDW *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2
+|| LDW *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3
+|| ROTL $Td3[1],TBL3,$Td1[0] ; t0
+|| ROTL $Td1[0],TBL1,$Td3[1] ; t1
+|| EXTU $s[0],EXT0,24,$Td0[0]
+|| EXTU $s[1],EXT0,24,$Td0[1]
+ LDW *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0
+|| LDW *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1
+|| ROTL $Td1[1],TBL1,$Td3[0] ; t2
+|| ROTL $Td3[0],TBL3,$Td1[1] ; t3
+|| EXTU $s[2],EXT0,24,$Td0[2]
+|| EXTU $s[3],EXT0,24,$Td0[3]
+|| [B0] SUB B0,1,B0
+ LDW *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2
+|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3
+|| ROTL $Td2[2],TBL2,$Td2[2] ; t0
+|| ROTL $Td2[3],TBL2,$Td2[3] ; t1
+|| XOR $K[0],$Td1[0],$s[0]
+|| XOR $K[1],$Td3[1],$s[1]
+|| [B0] BNOP dec_loop?
+ ROTL $Td1[3],TBL1,$Td3[2] ; t0
+|| ROTL $Td3[2],TBL3,$Td1[3] ; t1
+|| XOR $K[2],$Td3[0],$s[2]
+|| XOR $K[3],$Td1[1],$s[3]
+|| LDW *$KPA++[2],$K[0] ; next round key
+|| LDW *$KPB++[2],$K[1]
+ ROTL $Td2[0],TBL2,$Td2[0] ; t2
+|| ROTL $Td2[1],TBL2,$Td2[1] ; t3
+|| XOR $s[0],$Td2[2],$s[0]
+|| XOR $s[1],$Td2[3],$s[1]
+|| LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+ ROTL $Td3[3],TBL3,$Td1[2] ; t2
+|| ROTL $Td1[2],TBL1,$Td3[3] ; t3
+|| XOR $s[0],$Td3[2],$s[0]
+|| XOR $s[1],$Td1[3],$s[1]
+ XOR $s[2],$Td2[0],$s[2]
+|| XOR $s[3],$Td2[1],$s[3]
+|| XOR $s[0],$Td0[0],$s[0]
+|| XOR $s[1],$Td0[1],$s[1]
+ XOR $s[2],$Td1[2],$s[2]
+|| XOR $s[3],$Td3[3],$s[3]
+|| EXTU $s[1],EXT3,24,$Td3[1]
+|| EXTU $s[0],EXT1,24,$Td1[0]
+||[!B0] ADD ${TEA},A0,${TEA} ; point to Td4
+||[!B0] ADD ${TEB},A0,${TEB}
+;;====================================================================
+ LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0
+|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1
+|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[0],EXT0,24,$Td0[0]
+|| EXTU $s[1],EXT0,24,$Td0[1]
+ LDBU *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0
+|| LDBU *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1
+|| EXTU $s[2],EXT2,24,$Td2[2]
+|| EXTU $s[3],EXT2,24,$Td2[3]
+ LDBU *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0
+|| LDBU *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1
+|| EXTU $s[3],EXT1,24,$Td1[3]
+|| EXTU $s[2],EXT3,24,$Td3[2]
+ LDBU *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0
+|| LDBU *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1
+|| EXTU $s[1],EXT1,24,$Td1[1]
+|| EXTU $s[0],EXT3,24,$Td3[0]
+ LDBU *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2
+|| LDBU *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3
+|| EXTU $s[0],EXT2,24,$Td2[0]
+|| EXTU $s[1],EXT2,24,$Td2[1]
+ LDBU *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2
+|| LDBU *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3
+|| EXTU $s[3],EXT3,24,$Td3[3]
+|| EXTU $s[2],EXT1,24,$Td1[2]
+ LDBU *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2
+|| LDBU *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3
+|| EXTU $s[2],EXT0,24,$Td0[2]
+|| EXTU $s[3],EXT0,24,$Td0[3]
+ LDBU *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2
+|| LDBU *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3
+
+ .if .BIG_ENDIAN
+ PACK2 $Td0[0],$Td1[3],$Td0[0]
+|| PACK2 $Td0[1],$Td1[0],$Td0[1]
+ PACK2 $Td2[2],$Td3[1],$Td2[2]
+|| PACK2 $Td2[3],$Td3[2],$Td2[3]
+ PACKL4 $Td0[0],$Td2[2],$Td0[0]
+|| PACKL4 $Td0[1],$Td2[3],$Td0[1]
+ XOR $K[0],$Td0[0],$Td0[0] ; s[0]
+|| XOR $K[1],$Td0[1],$Td0[1] ; s[1]
+
+ PACK2 $Td0[2],$Td1[1],$Td0[2]
+|| PACK2 $Td0[3],$Td1[2],$Td0[3]
+ PACK2 $Td2[0],$Td3[3],$Td2[0]
+|| PACK2 $Td2[1],$Td3[0],$Td2[1]
+|| BNOP RA
+ PACKL4 $Td0[2],$Td2[0],$Td0[2]
+|| PACKL4 $Td0[3],$Td2[1],$Td0[3]
+ XOR $K[2],$Td0[2],$Td0[2] ; s[2]
+|| XOR $K[3],$Td0[3],$Td0[3] ; s[3]
+
+ MV $Td0[0],A9
+|| MV $Td0[1],A8
+ MV $Td0[2],B9
+|| MV $Td0[3],B8
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .else
+ PACK2 $Td1[3],$Td0[0],$Td1[3]
+|| PACK2 $Td1[0],$Td0[1],$Td1[0]
+ PACK2 $Td3[1],$Td2[2],$Td3[1]
+|| PACK2 $Td3[2],$Td2[3],$Td3[2]
+ PACKL4 $Td3[1],$Td1[3],$Td1[3]
+|| PACKL4 $Td3[2],$Td1[0],$Td1[0]
+ XOR $K[0],$Td1[3],$Td1[3] ; s[0]
+|| XOR $K[1],$Td1[0],$Td1[0] ; s[1]
+
+ PACK2 $Td1[1],$Td0[2],$Td1[1]
+|| PACK2 $Td1[2],$Td0[3],$Td1[2]
+ PACK2 $Td3[3],$Td2[0],$Td3[3]
+|| PACK2 $Td3[0],$Td2[1],$Td3[0]
+|| BNOP RA
+ PACKL4 $Td3[3],$Td1[1],$Td1[1]
+|| PACKL4 $Td3[0],$Td1[2],$Td1[2]
+ XOR $K[2],$Td1[1],$Td1[1] ; s[2]
+|| XOR $K[3],$Td1[2],$Td1[2] ; s[3]
+
+ MV $Td1[3],A8
+|| MV $Td1[0],A9
+ MV $Td1[1],B8
+|| MV $Td1[2],B9
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .endif
+ .endasmfunc
+___
+{
+my @K=(@K,@s); # extended key
+my @Te4=map("B$_",(16..19));
+
+my @Kx9=@Te0; # used in AES_set_decrypt_key
+my @KxB=@Te1;
+my @KxD=@Te2;
+my @KxE=@Te3;
+
+$code.=<<___;
+ .asg OUT,BITS
+
+ .global _AES_set_encrypt_key
+_AES_set_encrypt_key:
+__set_encrypt_key:
+ .asmfunc
+ MV INP,A0
+|| SHRU BITS,5,BITS ; 128-192-256 -> 4-6-8
+|| MV KEY,A1
+ [!A0] B RA
+||[!A0] MVK -1,RET
+||[!A0] MVK 1,A1 ; only one B RA
+ [!A1] B RA
+||[!A1] MVK -1,RET
+||[!A1] MVK 0,A0
+|| MVK 0,B0
+|| MVK 0,A1
+ [A0] LDNDW *INP++,A9:A8
+|| [A0] CMPEQ 4,BITS,B0
+|| [A0] CMPLT 3,BITS,A1
+ [B0] B key128?
+|| [A1] LDNDW *INP++,B9:B8
+|| [A0] CMPEQ 6,BITS,B0
+|| [A0] CMPLT 5,BITS,A1
+ [B0] B key192?
+|| [A1] LDNDW *INP++,B17:B16
+|| [A0] CMPEQ 8,BITS,B0
+|| [A0] CMPLT 7,BITS,A1
+ [B0] B key256?
+|| [A1] LDNDW *INP++,B19:B18
+
+ .if __TI_EABI__
+ [A0] ADD 0,KEY,$KPA
+|| [A0] ADD 4,KEY,$KPB
+|| [A0] MVKL \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+|| [A0] ADDKPC __set_encrypt_key,B6
+ [A0] MVKH \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+ [A0] ADD B6,$TEA,$TEA ; AES_Te4
+ .else
+ [A0] ADD 0,KEY,$KPA
+|| [A0] ADD 4,KEY,$KPB
+|| [A0] MVKL (AES_Te4-__set_encrypt_key),$TEA
+|| [A0] ADDKPC __set_encrypt_key,B6
+ [A0] MVKH (AES_Te4-__set_encrypt_key),$TEA
+ [A0] ADD B6,$TEA,$TEA ; AES_Te4
+ .endif
+ NOP
+ NOP
+
+ BNOP RA,5
+|| MVK -2,RET ; unknown bit length
+|| MVK 0,B0 ; redundant
+;;====================================================================
+;;====================================================================
+key128?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$Te4[2]
+|| MV B8,$K[3]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$Te4[2]
+|| MV B9,$K[3]
+ .endif
+
+ MVK 256,A0
+|| MVK 8,B0
+
+ MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+loop128?:
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[2]
+|| EXTU $K[3],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[3],A0
+|| EXTU $K[3],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[3],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+
+ XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| BDEC loop128?,B0
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| BDEC loop128?,B0
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+ XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+;;====================================================================
+ BNOP RA
+ MV $Te4[2],$K[2]
+|| STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 10,B0 ; rounds
+ STW B0,*++${KPB}[15]
+ MVK 0,RET
+;;====================================================================
+;;====================================================================
+key192?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$K[2]
+|| MV B8,$K[3]
+ MV B17,$Te4[2]
+|| MV B16,$K[5]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$K[2]
+|| MV B9,$K[3]
+ MV B16,$Te4[2]
+|| MV B17,$K[5]
+ .endif
+
+ MVK 256,A0
+|| MVK 6,B0
+ MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+loop192?:
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[4]
+|| EXTU $K[5],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[5],A0
+|| EXTU $K[5],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[5],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ STW $K[4],*$KPA++[2]
+|| STW $K[5],*$KPB++[2]
+
+ XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+ BDEC loop192?,B0
+|| XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+ MV $Te4[2],$K[2]
+|| XOR $K[3],$K[4],$Te4[2] ; K[4]
+ XOR $Te4[2],$K[5],$K[5] ; K[5]
+;;====================================================================
+ BNOP RA
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 12,B0 ; rounds
+ STW B0,*++${KPB}[7]
+ MVK 0,RET
+;;====================================================================
+;;====================================================================
+key256?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$K[2]
+|| MV B8,$K[3]
+ MV B17,$K[4]
+|| MV B16,$K[5]
+|| MV B19,$Te4[2]
+|| MV B18,$K[7]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$K[2]
+|| MV B9,$K[3]
+ MV B16,$K[4]
+|| MV B17,$K[5]
+|| MV B18,$Te4[2]
+|| MV B19,$K[7]
+ .endif
+
+ MVK 256,A0
+|| MVK 6,B0
+ MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+loop256?:
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[6]
+|| EXTU $K[7],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[7],A0
+|| EXTU $K[7],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[7],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ STW $K[4],*$KPA++[2]
+|| STW $K[5],*$KPB++[2]
+ STW $K[6],*$KPA++[2]
+|| STW $K[7],*$KPB++[2]
+|| XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+||[!B0] B done256?
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+||[!B0] B done256?
+ .endif
+ XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+
+ MV $Te4[2],$K[2]
+|| [B0] EXTU $K[3],EXT0,24,$Te4[0]
+|| [B0] SUB B0,1,B0
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[3],A0
+|| EXTU $K[3],EXT1,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT2,24,A0
+|| EXTU $K[3],EXT3,24,$Te4[3]
+
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ NOP 3
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| B loop256?
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ NOP 3
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| B loop256?
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+
+ XOR $Te4[3],$K[4],$Te4[0] ; K[4]
+ XOR $Te4[0],$K[5],$K[5] ; K[5]
+ MV $Te4[0],$K[4]
+|| XOR $K[5],$K[6],$Te4[2] ; K[6]
+ XOR $Te4[2],$K[7],$K[7] ; K[7]
+;;====================================================================
+done256?:
+ BNOP RA
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 14,B0 ; rounds
+ STW B0,*--${KPB}[1]
+ MVK 0,RET
+ .endasmfunc
+
+ .global _AES_set_decrypt_key
+_AES_set_decrypt_key:
+ .asmfunc
+ B __set_encrypt_key ; guarantee local call
+ MV KEY,B30 ; B30 is not modified
+ MV RA, B31 ; B31 is not modified
+ ADDKPC ret?,RA,2
+ret?: ; B0 holds rounds or zero
+ [!B0] BNOP B31 ; return if zero
+ [B0] SHL B0,4,A0 ; offset to last round key
+ [B0] SHRU B0,1,B2
+ [B0] SUB B2,2,B2
+|| [B0] MVK 0x0000001B,B3 ; AES polynomial
+ [B0] MVKH 0x07000000,B3
+|| [B0] MV B30,$KPA
+ [B0] ADD B30,A0,$KPB
+|| [B0] MVK 16,A0 ; sizeof(round key)
+;;====================================================================
+flip_loop?:
+ LDW *${KPA}[0],A16
+|| LDW *${KPB}[0],B16
+ LDW *${KPA}[1],A17
+|| LDW *${KPB}[1],B17
+ LDW *${KPA}[2],A18
+|| LDW *${KPB}[2],B18
+ LDW *${KPA}[3],A19
+|| ADD $KPA,A0,$KPA
+|| LDW *${KPB}[3],B19
+|| SUB $KPB,A0,$KPB
+|| BDEC flip_loop?,B2
+ NOP
+ STW B16,*${KPA}[-4]
+|| STW A16,*${KPB}[4]
+ STW B17,*${KPA}[-3]
+|| STW A17,*${KPB}[5]
+ STW B18,*${KPA}[-2]
+|| STW A18,*${KPB}[6]
+ STW B19,*${KPA}[-1]
+|| STW A19,*${KPB}[7]
+;;====================================================================
+ SUB B0,1,B0 ; skip last round
+|| ADD B30,A0,$KPA ; skip first round
+|| ADD B30,A0,$KPB
+|| MVC GFPGFR,B30 ; save GFPGFR
+ LDW *${KPA}[0],$K[0]
+|| LDW *${KPB}[1],$K[1]
+|| MVC B3,GFPGFR
+ LDW *${KPA}[2],$K[2]
+|| LDW *${KPB}[3],$K[3]
+ MVK 0x00000909,A24
+|| MVK 0x00000B0B,B24
+ MVKH 0x09090000,A24
+|| MVKH 0x0B0B0000,B24
+ SUB B0,1,B0
+
+ GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
+|| GMPY4 $K[1],A24,$Kx9[1]
+|| MVK 0x00000D0D,A25
+|| MVK 0x00000E0E,B25
+ GMPY4 $K[2],A24,$Kx9[2]
+|| GMPY4 $K[3],A24,$Kx9[3]
+|| MVKH 0x0D0D0000,A25
+|| MVKH 0x0E0E0000,B25
+
+ GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
+|| GMPY4 $K[1],B24,$KxB[1]
+ GMPY4 $K[2],B24,$KxB[2]
+|| GMPY4 $K[3],B24,$KxB[3]
+
+;;====================================================================
+invmix_loop?:
+ GMPY4 $K[0],A25,$KxD[0] ; ·0x0D
+|| GMPY4 $K[1],A25,$KxD[1]
+|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16
+|| SWAP2 $Kx9[1],$Kx9[1]
+|| MV $K[0],$s[0] ; this or DINT
+|| MV $K[1],$s[1]
+|| [B0] LDW *${KPA}[4],$K[0]
+|| [B0] LDW *${KPB}[5],$K[1]
+ GMPY4 $K[2],A25,$KxD[2]
+|| GMPY4 $K[3],A25,$KxD[3]
+|| SWAP2 $Kx9[2],$Kx9[2]
+|| SWAP2 $Kx9[3],$Kx9[3]
+|| MV $K[2],$s[2]
+|| MV $K[3],$s[3]
+|| [B0] LDW *${KPA}[6],$K[2]
+|| [B0] LDW *${KPB}[7],$K[3]
+
+ GMPY4 $s[0],B25,$KxE[0] ; ·0x0E
+|| GMPY4 $s[1],B25,$KxE[1]
+|| XOR $Kx9[0],$KxB[0],$KxB[0]
+|| XOR $Kx9[1],$KxB[1],$KxB[1]
+ GMPY4 $s[2],B25,$KxE[2]
+|| GMPY4 $s[3],B25,$KxE[3]
+|| XOR $Kx9[2],$KxB[2],$KxB[2]
+|| XOR $Kx9[3],$KxB[3],$KxB[3]
+
+ ROTL $KxB[0],TBL3,$KxB[0]
+|| ROTL $KxB[1],TBL3,$KxB[1]
+|| SWAP2 $KxD[0],$KxD[0] ; rotate by 16
+|| SWAP2 $KxD[1],$KxD[1]
+ ROTL $KxB[2],TBL3,$KxB[2]
+|| ROTL $KxB[3],TBL3,$KxB[3]
+|| SWAP2 $KxD[2],$KxD[2]
+|| SWAP2 $KxD[3],$KxD[3]
+|| [B0] B invmix_loop?
+
+ XOR $KxE[0],$KxD[0],$KxE[0]
+|| XOR $KxE[1],$KxD[1],$KxE[1]
+|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
+|| [B0] GMPY4 $K[1],A24,$Kx9[1]
+|| ADDAW $KPA,4,$KPA
+ XOR $KxE[2],$KxD[2],$KxE[2]
+|| XOR $KxE[3],$KxD[3],$KxE[3]
+|| [B0] GMPY4 $K[2],A24,$Kx9[2]
+|| [B0] GMPY4 $K[3],A24,$Kx9[3]
+|| ADDAW $KPB,4,$KPB
+
+ XOR $KxB[0],$KxE[0],$KxE[0]
+|| XOR $KxB[1],$KxE[1],$KxE[1]
+|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
+|| [B0] GMPY4 $K[1],B24,$KxB[1]
+ XOR $KxB[2],$KxE[2],$KxE[2]
+|| XOR $KxB[3],$KxE[3],$KxE[3]
+|| [B0] GMPY4 $K[2],B24,$KxB[2]
+|| [B0] GMPY4 $K[3],B24,$KxB[3]
+|| STW $KxE[0],*${KPA}[-4]
+|| STW $KxE[1],*${KPB}[-3]
+ STW $KxE[2],*${KPA}[-2]
+|| STW $KxE[3],*${KPB}[-1]
+|| [B0] SUB B0,1,B0
+;;====================================================================
+ BNOP B31,3
+ MVC B30,GFPGFR ; restore GFPGFR(*)
+ MVK 0,RET
+ .endasmfunc
+___
+# (*) Even though ABI doesn't specify GFPGFR as non-volatile, there
+# are code samples out there that *assume* its default value.
+}
+{
+my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8");
+$code.=<<___;
+ .global _AES_ctr32_encrypt
+_AES_ctr32_encrypt:
+ .asmfunc
+ LDNDW *${ivp}[0],A31:A30 ; load counter value
+|| MV $blocks,A2 ; reassign $blocks
+|| MV RA,B27 ; reassign RA
+|| MV $key,B26 ; reassign $key
+ LDNDW *${ivp}[1],B31:B30
+|| MVK 0,B2 ; don't let __encrypt load input
+|| MVK 0,A1 ; and postpone writing output
+ .if .BIG_ENDIAN
+ NOP
+ .else
+ NOP 4
+ SWAP2 B31,B31 ; keep least significant 32 bits
+ SWAP4 B31,B31 ; in host byte order
+ .endif
+ctr32_loop?:
+ [A2] BNOP __encrypt
+|| [A1] XOR A29,A9,A9 ; input^Ek(counter)
+|| [A1] XOR A28,A8,A8
+|| [A2] LDNDW *INP++,A29:A28 ; load input
+ [!A2] BNOP B27 ; return
+|| [A1] XOR B29,B9,B9
+|| [A1] XOR B28,B8,B8
+|| [A2] LDNDW *INP++,B29:B28
+ .if .BIG_ENDIAN
+ [A1] STNDW A9:A8,*OUT++ ; save output
+|| [A2] MV A31,A9 ; pass counter value to __encrypt
+|| [A2] MV A30,A8 ; pass counter value to __encrypt
+ [A1] STNDW B9:B8,*OUT++
+|| [A2] DMV B31,B30,B9:B8
+|| [A2] ADD B30,1,B30 ; counter++
+ .else
+ [A1] STNDW A9:A8,*OUT++ ; save output
+|| [A2] MV A31,A9
+|| [A2] MV A30,A8
+|| [A2] SWAP2 B31,B0
+|| [A2] ADD B31,1,B31 ; counter++
+ [A1] STNDW B9:B8,*OUT++
+|| [A2] MV B30,B8
+|| [A2] SWAP4 B0,B9
+ .endif
+ [A2] ADDKPC ctr32_loop?,RA ; return to ctr32_loop?
+|| [A2] MV B26,KEY ; pass $key
+|| [A2] SUB A2,1,A2 ; $blocks--
+||[!A1] MVK 1,A1
+ NOP
+ NOP
+ .endasmfunc
+___
+}
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+ .if __TI_EABI__
+ .sect ".text:aes_asm.const"
+ .else
+ .sect ".const:aes_asm"
+ .endif
+ .align 128
+AES_Te:
+ .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84
+ .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
+ .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
+ .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
+ .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
+ .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
+ .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
+ .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
+ .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
+ .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
+ .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
+ .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
+ .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
+ .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
+ .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
+ .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
+ .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
+ .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
+ .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
+ .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
+ .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
+ .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
+ .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
+ .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
+ .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
+ .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
+ .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
+ .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
+ .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
+ .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
+ .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
+ .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
+ .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,