summaryrefslogtreecommitdiffstats
path: root/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s')
-rw-r--r--vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s449
1 files changed, 449 insertions, 0 deletions
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
new file mode 100644
index 000000000..533014ea3
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
@@ -0,0 +1,449 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Based on CRYPTOGAMS code with the following comment:
+// # ====================================================================
+// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// # project. The module is, however, dual licensed under OpenSSL and
+// # CRYPTOGAMS licenses depending on where you obtain it. For further
+// # details see http://www.openssl.org/~appro/cryptogams/.
+// # ====================================================================
+
+// Code for the perl script that generates the ppc64 assembler
+// can be found in the cryptogams repository at the link below. It is based on
+// the original from openssl.
+
+// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
+
+// The differences in this and the original implementation are
+// due to the calling conventions and initialization of constants.
+
+// +build !gccgo,!appengine
+
+#include "textflag.h"
+
+#define OUT R3
+#define INP R4
+#define LEN R5
+#define KEY R6
+#define CNT R7
+#define TMP R15
+
+#define CONSTBASE R16
+#define BLOCKS R17
+
+DATA consts<>+0x00(SB)/8, $0x3320646e61707865
+DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
+DATA consts<>+0x10(SB)/8, $0x0000000000000001
+DATA consts<>+0x18(SB)/8, $0x0000000000000000
+DATA consts<>+0x20(SB)/8, $0x0000000000000004
+DATA consts<>+0x28(SB)/8, $0x0000000000000000
+DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
+DATA consts<>+0x38(SB)/8, $0x0203000106070405
+DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
+DATA consts<>+0x48(SB)/8, $0x0102030005060704
+DATA consts<>+0x50(SB)/8, $0x6170786561707865
+DATA consts<>+0x58(SB)/8, $0x6170786561707865
+DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
+DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
+DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
+DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
+DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
+DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
+DATA consts<>+0x90(SB)/8, $0x0000000100000000
+DATA consts<>+0x98(SB)/8, $0x0000000300000002
+GLOBL consts<>(SB), RODATA, $0xa0
+
+//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
+TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
+ MOVD out+0(FP), OUT
+ MOVD inp+8(FP), INP
+ MOVD len+16(FP), LEN
+ MOVD key+24(FP), KEY
+ MOVD counter+32(FP), CNT
+
+ // Addressing for constants
+ MOVD $consts<>+0x00(SB), CONSTBASE
+ MOVD $16, R8
+ MOVD $32, R9
+ MOVD $48, R10
+ MOVD $64, R11
+ SRD $6, LEN, BLOCKS
+ // V16
+ LXVW4X (CONSTBASE)(R0), VS48
+ ADD $80,CONSTBASE
+
+ // Load key into V17,V18
+ LXVW4X (KEY)(R0), VS49
+ LXVW4X (KEY)(R8), VS50
+
+ // Load CNT, NONCE into V19
+ LXVW4X (CNT)(R0), VS51
+
+ // Clear V27
+ VXOR V27, V27, V27
+
+ // V28
+ LXVW4X (CONSTBASE)(R11), VS60
+
+ // splat slot from V19 -> V26
+ VSPLTW $0, V19, V26
+
+ VSLDOI $4, V19, V27, V19
+ VSLDOI $12, V27, V19, V19
+
+ VADDUWM V26, V28, V26
+
+ MOVD $10, R14
+ MOVD R14, CTR
+
+loop_outer_vsx:
+ // V0, V1, V2, V3
+ LXVW4X (R0)(CONSTBASE), VS32
+ LXVW4X (R8)(CONSTBASE), VS33
+ LXVW4X (R9)(CONSTBASE), VS34
+ LXVW4X (R10)(CONSTBASE), VS35
+
+ // splat values from V17, V18 into V4-V11
+ VSPLTW $0, V17, V4
+ VSPLTW $1, V17, V5
+ VSPLTW $2, V17, V6
+ VSPLTW $3, V17, V7
+ VSPLTW $0, V18, V8
+ VSPLTW $1, V18, V9
+ VSPLTW $2, V18, V10
+ VSPLTW $3, V18, V11
+
+ // VOR
+ VOR V26, V26, V12
+
+ // splat values from V19 -> V13, V14, V15
+ VSPLTW $1, V19, V13
+ VSPLTW $2, V19, V14
+ VSPLTW $3, V19, V15
+
+ // splat const values
+ VSPLTISW $-16, V27
+ VSPLTISW $12, V28
+ VSPLTISW $8, V29
+ VSPLTISW $7, V30
+
+loop_vsx:
+ VADDUWM V0, V4, V0
+ VADDUWM V1, V5, V1
+ VADDUWM V2, V6, V2
+ VADDUWM V3, V7, V3
+
+ VXOR V12, V0, V12
+ VXOR V13, V1, V13
+ VXOR V14, V2, V14
+ VXOR V15, V3, V15
+
+ VRLW V12, V27, V12
+ VRLW V13, V27, V13
+ VRLW V14, V27, V14
+ VRLW V15, V27, V15
+
+ VADDUWM V8, V12, V8
+ VADDUWM V9, V13, V9
+ VADDUWM V10, V14, V10
+ VADDUWM V11, V15, V11
+
+ VXOR V4, V8, V4
+ VXOR V5, V9, V5
+ VXOR V6, V10, V6
+ VXOR V7, V11, V7
+
+ VRLW V4, V28, V4
+ VRLW V5, V28, V5
+ VRLW V6, V28, V6
+ VRLW V7, V28, V7
+
+ VADDUWM V0, V4, V0
+ VADDUWM V1, V5, V1
+ VADDUWM V2, V6, V2
+ VADDUWM V3, V7, V3
+
+ VXOR V12, V0, V12
+ VXOR V13, V1, V13
+ VXOR V14, V2, V14
+ VXOR V15, V3, V15
+
+ VRLW V12, V29, V12
+ VRLW V13, V29, V13
+ VRLW V14, V29, V14
+ VRLW V15, V29, V15
+
+ VADDUWM V8, V12, V8
+ VADDUWM V9, V13, V9
+ VADDUWM V10, V14, V10
+ VADDUWM V11, V15, V11
+
+ VXOR V4, V8, V4
+ VXOR V5, V9, V5
+ VXOR V6, V10, V6
+ VXOR V7, V11, V7
+
+ VRLW V4, V30, V4
+ VRLW V5, V30, V5
+ VRLW V6, V30, V6
+ VRLW V7, V30, V7
+
+ VADDUWM V0, V5, V0
+ VADDUWM V1, V6, V1
+ VADDUWM V2, V7, V2
+ VADDUWM V3, V4, V3
+
+ VXOR V15, V0, V15
+ VXOR V12, V1, V12
+ VXOR V13, V2, V13
+ VXOR V14, V3, V14
+
+ VRLW V15, V27, V15
+ VRLW V12, V27, V12
+ VRLW V13, V27, V13
+ VRLW V14, V27, V14
+
+ VADDUWM V10, V15, V10
+ VADDUWM V11, V12, V11
+ VADDUWM V8, V13, V8
+ VADDUWM V9, V14, V9
+
+ VXOR V5, V10, V5
+ VXOR V6, V11, V6
+ VXOR V7, V8, V7
+ VXOR V4, V9, V4
+
+ VRLW V5, V28, V5
+ VRLW V6, V28, V6
+ VRLW V7, V28, V7
+ VRLW V4, V28, V4
+
+ VADDUWM V0, V5, V0
+ VADDUWM V1, V6, V1
+ VADDUWM V2, V7, V2
+ VADDUWM V3, V4, V3
+
+ VXOR V15, V0, V15
+ VXOR V12, V1, V12
+ VXOR V13, V2, V13
+ VXOR V14, V3, V14
+
+ VRLW V15, V29, V15
+ VRLW V12, V29, V12
+ VRLW V13, V29, V13
+ VRLW V14, V29, V14
+
+ VADDUWM V10, V15, V10
+ VADDUWM V11, V12, V11
+ VADDUWM V8, V13, V8
+ VADDUWM V9, V14, V9
+
+ VXOR V5, V10, V5
+ VXOR V6, V11, V6
+ VXOR V7, V8, V7
+ VXOR V4, V9, V4
+
+ VRLW V5, V30, V5
+ VRLW V6, V30, V6
+ VRLW V7, V30, V7
+ VRLW V4, V30, V4
+ BC 16, LT, loop_vsx
+
+ VADDUWM V12, V26, V12
+
+ WORD $0x13600F8C // VMRGEW V0, V1, V27
+ WORD $0x13821F8C // VMRGEW V2, V3, V28
+
+ WORD $0x10000E8C // VMRGOW V0, V1, V0
+ WORD $0x10421E8C // VMRGOW V2, V3, V2
+
+ WORD $0x13A42F8C // VMRGEW V4, V5, V29
+ WORD $0x13C63F8C // VMRGEW V6, V7, V30
+
+ XXPERMDI VS32, VS34, $0, VS33
+ XXPERMDI VS32, VS34, $3, VS35
+ XXPERMDI VS59, VS60, $0, VS32
+ XXPERMDI VS59, VS60, $3, VS34
+
+ WORD $0x10842E8C // VMRGOW V4, V5, V4
+ WORD $0x10C63E8C // VMRGOW V6, V7, V6
+
+ WORD $0x13684F8C // VMRGEW V8, V9, V27
+ WORD $0x138A5F8C // VMRGEW V10, V11, V28
+
+ XXPERMDI VS36, VS38, $0, VS37
+ XXPERMDI VS36, VS38, $3, VS39
+ XXPERMDI VS61, VS62, $0, VS36
+ XXPERMDI VS61, VS62, $3, VS38
+
+ WORD $0x11084E8C // VMRGOW V8, V9, V8
+ WORD $0x114A5E8C // VMRGOW V10, V11, V10
+
+ WORD $0x13AC6F8C // VMRGEW V12, V13, V29
+ WORD $0x13CE7F8C // VMRGEW V14, V15, V30
+
+ XXPERMDI VS40, VS42, $0, VS41
+ XXPERMDI VS40, VS42, $3, VS43
+ XXPERMDI VS59, VS60, $0, VS40
+ XXPERMDI VS59, VS60, $3, VS42
+
+ WORD $0x118C6E8C // VMRGOW V12, V13, V12
+ WORD $0x11CE7E8C // VMRGOW V14, V15, V14
+
+ VSPLTISW $4, V27
+ VADDUWM V26, V27, V26
+
+ XXPERMDI VS44, VS46, $0, VS45
+ XXPERMDI VS44, VS46, $3, VS47
+ XXPERMDI VS61, VS62, $0, VS44
+ XXPERMDI VS61, VS62, $3, VS46
+
+ VADDUWM V0, V16, V0
+ VADDUWM V4, V17, V4
+ VADDUWM V8, V18, V8
+ VADDUWM V12, V19, V12
+
+ CMPU LEN, $64
+ BLT tail_vsx
+
+ // Bottom of loop
+ LXVW4X (INP)(R0), VS59
+ LXVW4X (INP)(R8), VS60
+ LXVW4X (INP)(R9), VS61
+ LXVW4X (INP)(R10), VS62
+
+ VXOR V27, V0, V27
+ VXOR V28, V4, V28
+ VXOR V29, V8, V29
+ VXOR V30, V12, V30
+
+ STXVW4X VS59, (OUT)(R0)
+ STXVW4X VS60, (OUT)(R8)
+ ADD $64, INP
+ STXVW4X VS61, (OUT)(R9)
+ ADD $-64, LEN
+ STXVW4X VS62, (OUT)(R10)
+ ADD $64, OUT
+ BEQ done_vsx
+
+ VADDUWM V1, V16, V0
+ VADDUWM V5, V17, V4
+ VADDUWM V9, V18, V8
+ VADDUWM V13, V19, V12
+
+ CMPU LEN, $64
+ BLT tail_vsx
+
+ LXVW4X (INP)(R0), VS59
+ LXVW4X (INP)(R8), VS60
+ LXVW4X (INP)(R9), VS61
+ LXVW4X (INP)(R10), VS62
+ VXOR V27, V0, V27
+
+ VXOR V28, V4, V28
+ VXOR V29, V8, V29
+ VXOR V30, V12, V30
+
+ STXVW4X VS59, (OUT)(R0)
+ STXVW4X VS60, (OUT)(R8)
+ ADD $64, INP
+ STXVW4X VS61, (OUT)(R9)
+ ADD $-64, LEN
+ STXVW4X VS62, (OUT)(V10)
+ ADD $64, OUT
+ BEQ done_vsx
+
+ VADDUWM V2, V16, V0
+ VADDUWM V6, V17, V4
+ VADDUWM V10, V18, V8
+ VADDUWM V14, V19, V12
+
+ CMPU LEN, $64
+ BLT tail_vsx
+
+ LXVW4X (INP)(R0), VS59
+ LXVW4X (INP)(R8), VS60
+ LXVW4X (INP)(R9), VS61
+ LXVW4X (INP)(R10), VS62
+
+ VXOR V27, V0, V27
+ VXOR V28, V4, V28
+ VXOR V29, V8, V29
+ VXOR V30, V12, V30
+
+ STXVW4X VS59, (OUT)(R0)
+ STXVW4X VS60, (OUT)(R8)
+ ADD $64, INP
+ STXVW4X VS61, (OUT)(R9)
+ ADD $-64, LEN
+ STXVW4X VS62, (OUT)(R10)
+ ADD $64, OUT
+ BEQ done_vsx
+
+ VADDUWM V3, V16, V0
+ VADDUWM V7, V17, V4
+ VADDUWM V11, V18, V8
+ VADDUWM V15, V19, V12
+
+ CMPU LEN, $64
+ BLT tail_vsx
+
+ LXVW4X (INP)(R0), VS59
+ LXVW4X (INP)(R8), VS60
+ LXVW4X (INP)(R9), VS61
+ LXVW4X (INP)(R10), VS62
+
+ VXOR V27, V0, V27
+ VXOR V28, V4, V28
+ VXOR V29, V8, V29
+ VXOR V30, V12, V30
+
+ STXVW4X VS59, (OUT)(R0)
+ STXVW4X VS60, (OUT)(R8)
+ ADD $64, INP
+ STXVW4X VS61, (OUT)(R9)
+ ADD $-64, LEN
+ STXVW4X VS62, (OUT)(R10)
+ ADD $64, OUT
+
+ MOVD $10, R14
+ MOVD R14, CTR
+ BNE loop_outer_vsx
+
+done_vsx:
+ // Increment counter by number of 64 byte blocks
+ MOVD (CNT), R14
+ ADD BLOCKS, R14
+ MOVD R14, (CNT)
+ RET
+
+tail_vsx:
+ ADD $32, R1, R11
+ MOVD LEN, CTR
+
+ // Save values on stack to copy from
+ STXVW4X VS32, (R11)(R0)
+ STXVW4X VS36, (R11)(R8)
+ STXVW4X VS40, (R11)(R9)
+ STXVW4X VS44, (R11)(R10)
+ ADD $-1, R11, R12
+ ADD $-1, INP
+ ADD $-1, OUT
+
+looptail_vsx:
+ // Copying the result to OUT
+ // in bytes.
+ MOVBZU 1(R12), KEY
+ MOVBZU 1(INP), TMP
+ XOR KEY, TMP, KEY
+ MOVBU KEY, 1(OUT)
+ BC 16, LT, looptail_vsx
+
+ // Clear the stack values
+ STXVW4X VS48, (R11)(R0)
+ STXVW4X VS48, (R11)(R8)
+ STXVW4X VS48, (R11)(R9)
+ STXVW4X VS48, (R11)(R10)
+ BR done_vsx