From 780ce3849f9efc5404d94464e0eeff966bebbbf1 Mon Sep 17 00:00:00 2001
From: zhuchen
Date: Mon, 24 Jul 2023 16:03:29 +0800
Subject: Fixed incorrect usage of vshuf.b instruction

The latest revision of the LoongArch64 vector instruction manual states
that the upper three bits of each byte in the control vector of the
vshuf.b instruction are undefined and must not be set, otherwise the
result of the shuffle is unpredictable.  Correct the use of vshuf.b in
the existing vpaes-loongarch64.pl code so that it no longer relies on
this undefined behaviour and cannot produce wrong results on future
LoongArch64 processors.
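As a minimal sketch of the corrected idiom (illustration only, not taken
from the patch; $vr6 and $vr7 are placeholder registers, and the behaviour
described in the comments is an assumption based on the revised manual):
the 16-byte lookup table is paired with an all-zero register, so a lookup
that must produce zero is expressed with index 0x10, which selects a byte
of the zero register, rather than with an index whose high bits are set.

    vldi    $vr18,0                  # all-zero vector, set up once in _vpaes_preheat
    vshuf.b $vr6,$vr18,$vr6,$vr7     # per index byte in $vr7: 0x00-0x0f selects from the
                                     # table in $vr6, 0x10-0x1f selects a zero byte from
                                     # $vr18, and the top three index bits stay clear

This is also why the 0x80 bytes in the Lk_inv constant below become 0x10:
the "return zero" case now goes through the zero half of the table instead
of relying on undefined high control bits.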
Reviewed-by: Paul Dale
Reviewed-by: Tomas Mraz
(Merged from https://github.com/openssl/openssl/pull/21530)
---
 crypto/aes/asm/vpaes-loongarch64.pl | 139 ++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 69 deletions(-)
(limited to 'crypto')

diff --git a/crypto/aes/asm/vpaes-loongarch64.pl b/crypto/aes/asm/vpaes-loongarch64.pl
index 286adc25f3..2f6abba1b7 100644
--- a/crypto/aes/asm/vpaes-loongarch64.pl
+++ b/crypto/aes/asm/vpaes-loongarch64.pl
@@ -62,14 +62,14 @@ _vpaes_encrypt_core:
  ld.w $t5,$a2,240
  vori.b $vr1,$vr9,0
  la.local $t0,Lk_ipt
- vld $vr2,$t0,0 # iptlo
+ vld $vr2,$t0,0 # iptlo
  vandn.v $vr1,$vr1,$vr0
  vld $vr5,$a5,0 # round0 key
  vsrli.w $vr1,$vr1,4
  vand.v $vr0,$vr0,$vr9
- vshuf.b $vr2,$vr0,$vr2,$vr0
+ vshuf.b $vr2,$vr18,$vr2,$vr0
  vld $vr0,$t0,16 # ipthi
- vshuf.b $vr0,$vr1,$vr0,$vr1
+ vshuf.b $vr0,$vr18,$vr0,$vr1
  vxor.v $vr2,$vr2,$vr5
  addi.d $a5,$a5,16
  vxor.v $vr0,$vr0,$vr2
@@ -81,26 +81,26 @@ _vpaes_encrypt_core:
  # middle of middle round
  vori.b $vr4,$vr13,0 # 4 : sb1u
  vori.b $vr0,$vr12,0 # 0 : sb1t
- vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sb1u
- vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+ vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sb1u
+ vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
  vxor.v $vr4,$vr4,$vr5 # 4 = sb1u + k
  vori.b $vr5,$vr15,0 # 4 : sb2u
  vxor.v $vr0,$vr0,$vr4 # 0 = A
  add.d $t0,$a7,$a6 # Lk_mc_forward[]
  vld $vr1,$t0,-0x40
- vshuf.b $vr5,$vr2,$vr5,$vr2 # 4 = sb2u
+ vshuf.b $vr5,$vr18,$vr5,$vr2 # 4 = sb2u
  vld $vr4,$t0,0 # Lk_mc_backward[]
  vori.b $vr2,$vr14,0 # 2 : sb2t
- vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = sb2t
+ vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = sb2t
  vori.b $vr3,$vr0,0 # 3 = A
  vxor.v $vr2,$vr5,$vr2 # 2 = 2A
- vshuf.b $vr0,$vr1,$vr0,$vr1 # 0 = B
+ vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = B
  addi.d $a5,$a5,16 # next key
  vxor.v $vr0,$vr0,$vr2 # 0 = 2A+B
- vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = D
+ vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = D
  addi.d $a7,$a7,16 # next mc
  vxor.v $vr3,$vr3,$vr0 # 3 = 2A+B+D
- vshuf.b $vr0,$vr1,$vr0,$vr1 # 0 = 2B+C
+ vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = 2B+C
  andi $a7,$a7,0x30 # ... mod 4
  addi.d $t5,$t5,-1 # nr--
  vxor.v $vr0,$vr0,$vr3 # 0 = 2A+3B+C+D
@@ -112,33 +112,33 @@ _vpaes_encrypt_core:
  vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
  vsrli.w $vr1,$vr1,4 # 1 = i
  vand.v $vr0,$vr0,$vr9 # 0 = k
- vshuf.b $vr5,$vr0,$vr5,$vr0 # 2 = a/k
+ vshuf.b $vr5,$vr18,$vr5,$vr0 # 2 = a/k
  vori.b $vr3,$vr10,0 # 3 : 1/i
  vxor.v $vr0,$vr0,$vr1 # 0 = j
- vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+ vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
  vori.b $vr4,$vr10,0 # 4 : 1/j
  vxor.v $vr3,$vr3,$vr5 # 3 = iak = 1/i + a/k
- vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+ vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
  vori.b $vr2,$vr10,0 # 2 : 1/iak
  vxor.v $vr4,$vr4,$vr5 # 4 = jak = 1/j + a/k
- vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+ vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
  vori.b $vr3,$vr10,0 # 3 : 1/jak
  vxor.v $vr2,$vr2,$vr0 # 2 = io
- vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
- vld $vr5,$a5, 0
+ vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
+ vld $vr5,$a5,0
  vxor.v $vr3,$vr3,$vr1 # 3 = jo
  bnez $t5,.Lenc_loop

  # middle of last round
  vld $vr4,$a6, -0x60 # 3 : sbou Lk_sbo
  vld $vr0,$a6, -0x50 # 0 : sbot Lk_sbo+16
- vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+ vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbou
  vxor.v $vr4,$vr4,$vr5 # 4 = sb1u + k
- vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+ vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
  add.d $t0,$a7,$a6 # Lk_sr[]
- vld $vr1,$t0, 0x40
+ vld $vr1,$t0,0x40
  vxor.v $vr0,$vr0,$vr4 # 0 = A
- vshuf.b $vr0,$vr1,$vr0,$vr1
+ vshuf.b $vr0,$vr18,$vr0,$vr1
  jr $ra
  .cfi_endproc
  .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
@@ -163,11 +163,11 @@ _vpaes_decrypt_core:
  vld $vr5,$a5,0 # round0 key
  slli.d $a7,$a7,4
  vand.v $vr0,$vr9,$vr0
- vshuf.b $vr2,$vr0,$vr2,$vr0
+ vshuf.b $vr2,$vr18,$vr2,$vr0
  vld $vr0,$t0,16 # ipthi
  xori $a7,$a7,0x30
  la.local $a6,Lk_dsbd
- vshuf.b $vr0,$vr1,$vr0,$vr1
+ vshuf.b $vr0,$vr18,$vr0,$vr1
  andi $a7,$a7,0x30
  vxor.v $vr2,$vr2,$vr5
  la.local $t0,Lk_mc_forward
@@ -184,29 +184,29 @@ ##
  vld $vr4,$a6,-0x20 # 4 : sb9u
  vld $vr1,$a6,-0x10 # 0 : sb9t
- vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sb9u
- vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sb9t
+ vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sb9u
+ vshuf.b $vr1,$vr18,$vr1,$vr3 # 0 = sb9t
  vxor.v $vr0,$vr0,$vr4
  vld $vr4,$a6,0x0 # 4 : sbdu
  vxor.v $vr0,$vr0,$vr1 # 0 = ch
  vld $vr1,$a6,0x10 # 0 : sbdt
- vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbdu
- vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
- vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbdt
+ vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbdu
+ vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
+ vshuf.b $vr1,$vr18,$vr1,$vr3 # 0 = sbdt
  vxor.v $vr0,$vr0,$vr4 # 4 = ch
  vld $vr4,$a6,0x20 # 4 : sbbu
  vxor.v $vr0,$vr0,$vr1 # 0 = ch
  vld $vr1,$a6,0x30 # 0 : sbbt
- vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbbu
- vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
- vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbbt
+ vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbbu
+ vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
+ vshuf.b $vr1,$vr18,$vr1,$vr3 # 0 = sbbt
  vxor.v $vr0,$vr0,$vr4 # 4 = ch
  vld $vr4,$a6,0x40 # 4 : sbeu
  vxor.v $vr0,$vr0,$vr1 # 0 = ch
  vld $vr1,$a6,0x50 # 0 : sbet
- vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbeu
- vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
- vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbet
+ vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbeu
+ vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
+ vshuf.b $vr1,$vr18,$vr1,$vr3 # 0 = sbet
  vxor.v $vr0,$vr0,$vr4 # 4 = ch
  addi.d $a5,$a5, 16 # next round key
  vbsrl.v $vr16,$vr5,0xc
@@ -222,32 +222,32 @@ _vpaes_decrypt_core:
  vori.b $vr2,$vr11,0 # 2 : a/k
  vsrli.w $vr1,$vr1,4 # 1 = i
  vand.v $vr0,$vr0,$vr9 # 0 = k
- vshuf.b $vr2,$vr0,$vr2,$vr0 # 2 = a/k
+ vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = a/k
  vori.b $vr3,$vr10,0 # 3 : 1/i
  vxor.v $vr0,$vr0,$vr1 # 0 = j
- vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+ vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
  vori.b $vr4,$vr10,0 # 4 : 1/j
  vxor.v $vr3,$vr3,$vr2 # 3 = iak = 1/i + a/k
- vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+ vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
  vxor.v $vr4,$vr4,$vr2 # 4 = jak = 1/j + a/k
  vori.b $vr2,$vr10,0 # 2 : 1/iak
- vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+ vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
  vori.b $vr3,$vr10,0 # 3 : 1/jak
  vxor.v $vr2,$vr2,$vr0 # 2 = io
- vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
+ vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
  vld $vr0,$a5,0
  vxor.v $vr3,$vr3,$vr1 # 3 = jo
  bnez $t5,.Ldec_loop

  # middle of last round
  vld $vr4,$a6,0x60 # 3 : sbou
- vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+ vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbou
  vxor.v $vr4,$vr4,$vr0 # 4 = sb1u + k
  vld $vr0,$a6,0x70 # 0 : sbot
  vld $vr2,$a7,-0x160 # Lk_sr-.Lk_dsbd=-0x160
- vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+ vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
  vxor.v $vr0,$vr0,$vr4 # 0 = A
- vshuf.b $vr0,$vr2,$vr0,$vr2
+ vshuf.b $vr0,$vr18,$vr0,$vr2
  jr $ra
  .cfi_endproc
  .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
@@ -292,7 +292,7 @@ _vpaes_schedule_core:
  # decrypting, output zeroth round key after shiftrows
  add.d $t2,$a4,$a6
  vld $vr1,$t2,0
- vshuf.b $vr3,$vr1,$vr3,$vr1
+ vshuf.b $vr3,$vr18,$vr3,$vr1
  vst $vr3,$a2,0

  xori $a4,$a4,0x30
@@ -415,7 +415,7 @@ _vpaes_schedule_core:
  # encrypting
  add.d $t0,$a4,$a6
  vld $vr1,$t0,0
- vshuf.b $vr0,$vr1,$vr0,$vr1 # output permute
+ vshuf.b $vr0,$vr18,$vr0,$vr1 # output permute
  la.local $a7,Lk_opt # prepare to output transform
  addi.d $a2,$a2,32

@@ -530,24 +530,24 @@ _vpaes_schedule_low_round:
  vsrli.w $vr1,$vr1,0x4 # 1 = i
  vand.v $vr0,$vr0,$vr9 # 0 = k
  vaddi.du $vr2,$vr11,0x0 # 2 : a/k
- vshuf.b $vr2,$vr0,$vr2,$vr0 # 2 = a/k
+ vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = a/k
  vxor.v $vr0,$vr0,$vr1 # 0 = j
  vaddi.du $vr3,$vr10,0x0 # 3 : 1/i
- vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+ vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
  vxor.v $vr3,$vr3,$vr2 # 3 = iak = 1/i + a/k
  vaddi.du $vr4,$vr10,0x0 # 4 : 1/j
- vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+ vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
  vxor.v $vr4,$vr4,$vr2 # 4 = jak = 1/j + a/k
  vaddi.du $vr2,$vr10,0x0 # 2 : 1/iak
- vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+ vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
  vxor.v $vr2,$vr2,$vr0 # 2 = io
  vaddi.du $vr3,$vr10,0x0 # 3 : 1/jak
- vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
+ vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
  vxor.v $vr3,$vr3,$vr1 # 3 = jo
  vaddi.du $vr4,$vr13,0x0 # 4 : sbou
- vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+ vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbou
  vaddi.du $vr0,$vr12,0x0 # 0 : sbot
- vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+ vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
  vxor.v $vr0,$vr0,$vr4 # 0 = sbox output

  # add in smeared stuff
@@ -575,9 +575,9 @@ _vpaes_schedule_transform:
  vsrli.w $vr1,$vr1,4
  vand.v $vr0,$vr0,$vr9
  vld $vr2,$a7,0 # lo
- vshuf.b $vr2,$vr0,$vr2,$vr0
+ vshuf.b $vr2,$vr18,$vr2,$vr0
  vld $vr0,$a7,16 # hi
- vshuf.b $vr0,$vr1,$vr0,$vr1
+ vshuf.b $vr0,$vr18,$vr0,$vr1
  vxor.v $vr0,$vr0,$vr2
  jr $ra
  .cfi_endproc
@@ -620,11 +620,11 @@ _vpaes_schedule_mangle:
  la.local $t0,Lk_s63
  vld $vr16,$t0,0
  vxor.v $vr4,$vr4,$vr16
- vshuf.b $vr4,$vr5,$vr4,$vr5
+ vshuf.b $vr4,$vr18,$vr4,$vr5
  vori.b $vr3,$vr4,0
- vshuf.b $vr4,$vr5,$vr4,$vr5
+ vshuf.b $vr4,$vr18,$vr4,$vr5
  vxor.v $vr3,$vr3,$vr4
- vshuf.b $vr4,$vr5,$vr4,$vr5
+ vshuf.b $vr4,$vr18,$vr4,$vr5
  vxor.v $vr3,$vr3,$vr4

  b .Lschedule_mangle_both
@@ -638,33 +638,33 @@ _vpaes_schedule_mangle:
  vand.v $vr4,$vr4,$vr9 # 4 = lo

  vld $vr2,$a7,0
- vshuf.b $vr2,$vr4,$vr2,$vr4
+ vshuf.b $vr2,$vr18,$vr2,$vr4
  vld $vr3,$a7,0x10
- vshuf.b $vr3,$vr1,$vr3,$vr1
+ vshuf.b $vr3,$vr18,$vr3,$vr1
  vxor.v $vr3,$vr3,$vr2
- vshuf.b $vr3,$vr5,$vr3,$vr5
+ vshuf.b $vr3,$vr18,$vr3,$vr5
  vld $vr2,$a7,0x20
- vshuf.b $vr2,$vr4,$vr2,$vr4
+ vshuf.b $vr2,$vr18,$vr2,$vr4
  vxor.v $vr2,$vr2,$vr3
  vld $vr3,$a7,0x30
- vshuf.b $vr3,$vr1,$vr3,$vr1
+ vshuf.b $vr3,$vr18,$vr3,$vr1
  vxor.v $vr3,$vr3,$vr2
- vshuf.b $vr3,$vr5,$vr3,$vr5
+ vshuf.b $vr3,$vr18,$vr3,$vr5
  vld $vr2,$a7,0x40
- vshuf.b $vr2,$vr4,$vr2,$vr4
+ vshuf.b $vr2,$vr18,$vr2,$vr4
  vxor.v $vr2,$vr2,$vr3
  vld $vr3,$a7,0x50
- vshuf.b $vr3,$vr1,$vr3,$vr1
+ vshuf.b $vr3,$vr18,$vr3,$vr1
  vxor.v $vr3,$vr3,$vr2
- vshuf.b $vr3,$vr5,$vr3,$vr5
+ vshuf.b $vr3,$vr18,$vr3,$vr5
  vld $vr2,$a7,0x60
- vshuf.b $vr2,$vr4,$vr2,$vr4
+ vshuf.b $vr2,$vr18,$vr2,$vr4
  vxor.v $vr2,$vr2,$vr3
  vld $vr3,$a7,0x70
- vshuf.b $vr3,$vr1,$vr3,$vr1
+ vshuf.b $vr3,$vr18,$vr3,$vr1
  vxor.v $vr3,$vr3,$vr2

  addi.d $a2,$a2,-16
@@ -672,7 +672,7 @@ _vpaes_schedule_mangle:
  .Lschedule_mangle_both:
  add.d $t2,$a4,$a6
  vld $vr1,$t2,0
- vshuf.b $vr3,$vr1,$vr3,$vr1
+ vshuf.b $vr3,$vr18,$vr3,$vr1
  addi.d $a4,$a4,-16
  andi $a4,$a4,0x30
  vst $vr3,$a2,0
@@ -885,6 +885,7 @@ _vpaes_preheat:
  vld $vr12,$a6,0x40 # Lk_sb1+16
  vld $vr15,$a6,0x50 # Lk_sb2
  vld $vr14,$a6,0x60 # Lk_sb2+16
+ vldi $vr18,0 # $vr18 in this program is equal to 0
  jirl $zero,$ra,0
  .cfi_endproc
  .size _vpaes_preheat,.-_vpaes_preheat
@@ -899,8 +900,8 @@ $code.=<<___;
 .section .rodata
 .align 6
 Lk_inv: # inv, inva
- .quad 0x0E05060F0D080180, 0x040703090A0B0C02
- .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+ .quad 0x0E05060F0D080110, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0710, 0x030D0E0C02050809

 Lk_s0F: # s0F
 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
--
cgit v1.2.3