summaryrefslogtreecommitdiffstats
path: root/vendor/golang.org/x/crypto
diff options
context:
space:
mode:
authorJesse Duffield <jessedduffield@gmail.com>2020-10-06 21:00:36 +1100
committerJesse Duffield <jessedduffield@gmail.com>2020-10-06 21:58:41 +1100
commit0aed47737c8f366c5e6895c57f0b193ff0eb43a3 (patch)
tree1206e3075357a3f4561ba937859d69df7f3537db /vendor/golang.org/x/crypto
parent6e076472b8fb2d052e0b1c35d3fff3dd263ae722 (diff)
bump go-git to fix invalid merge error
Diffstat (limited to 'vendor/golang.org/x/crypto')
-rw-r--r--vendor/golang.org/x/crypto/chacha20/chacha_generic.go119
-rw-r--r--vendor/golang.org/x/crypto/chacha20/xor.go17
-rw-r--r--vendor/golang.org/x/crypto/poly1305/mac_noasm.go4
-rw-r--r--vendor/golang.org/x/crypto/poly1305/poly1305.go26
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_amd64.go11
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_generic.go21
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_noasm.go13
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go11
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_s390x.go72
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_s390x.s667
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s909
-rw-r--r--vendor/golang.org/x/crypto/ssh/agent/client.go28
-rw-r--r--vendor/golang.org/x/crypto/ssh/certs.go4
-rw-r--r--vendor/golang.org/x/crypto/ssh/cipher.go2
-rw-r--r--vendor/golang.org/x/crypto/ssh/client_auth.go22
-rw-r--r--vendor/golang.org/x/crypto/ssh/kex.go17
-rw-r--r--vendor/golang.org/x/crypto/ssh/keys.go18
-rw-r--r--vendor/golang.org/x/crypto/ssh/mux.go23
18 files changed, 634 insertions, 1350 deletions
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
index 7c498e90d..a2ecf5c32 100644
--- a/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
@@ -42,10 +42,14 @@ type Cipher struct {
// The last len bytes of buf are leftover key stream bytes from the previous
// XORKeyStream invocation. The size of buf depends on how many blocks are
- // computed at a time.
+ // computed at a time by xorKeyStreamBlocks.
buf [bufSize]byte
len int
+ // overflow is set when the counter overflowed, no more blocks can be
+ // generated, and the next XORKeyStream call should panic.
+ overflow bool
+
// The counter-independent results of the first round are cached after they
// are computed the first time.
precompDone bool
@@ -89,6 +93,7 @@ func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) {
return nil, errors.New("chacha20: wrong nonce size")
}
+ key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
c.key = [8]uint32{
binary.LittleEndian.Uint32(key[0:4]),
binary.LittleEndian.Uint32(key[4:8]),
@@ -139,15 +144,18 @@ func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
// SetCounter sets the Cipher counter. The next invocation of XORKeyStream will
// behave as if (64 * counter) bytes had been encrypted so far.
//
-// To prevent accidental counter reuse, SetCounter panics if counter is
-// less than the current value.
+// To prevent accidental counter reuse, SetCounter panics if counter is less
+// than the current value.
+//
+// Note that the execution time of XORKeyStream is not independent of the
+// counter value.
func (s *Cipher) SetCounter(counter uint32) {
// Internally, s may buffer multiple blocks, which complicates this
// implementation slightly. When checking whether the counter has rolled
// back, we must use both s.counter and s.len to determine how many blocks
// we have already output.
outputCounter := s.counter - uint32(s.len)/blockSize
- if counter < outputCounter {
+ if s.overflow || counter < outputCounter {
panic("chacha20: SetCounter attempted to rollback counter")
}
@@ -196,34 +204,52 @@ func (s *Cipher) XORKeyStream(dst, src []byte) {
dst[i] = src[i] ^ b
}
s.len -= len(keyStream)
- src = src[len(keyStream):]
- dst = dst[len(keyStream):]
+ dst, src = dst[len(keyStream):], src[len(keyStream):]
+ }
+ if len(src) == 0 {
+ return
}
- const blocksPerBuf = bufSize / blockSize
- numBufs := (uint64(len(src)) + bufSize - 1) / bufSize
- if uint64(s.counter)+numBufs*blocksPerBuf >= 1<<32 {
+ // If we'd need to let the counter overflow and keep generating output,
+ // panic immediately. If instead we'd only reach the last block, remember
+ // not to generate any more output after the buffer is drained.
+ numBlocks := (uint64(len(src)) + blockSize - 1) / blockSize
+ if s.overflow || uint64(s.counter)+numBlocks > 1<<32 {
panic("chacha20: counter overflow")
+ } else if uint64(s.counter)+numBlocks == 1<<32 {
+ s.overflow = true
}
// xorKeyStreamBlocks implementations expect input lengths that are a
// multiple of bufSize. Platform-specific ones process multiple blocks at a
// time, so have bufSizes that are a multiple of blockSize.
- rem := len(src) % bufSize
- full := len(src) - rem
-
+ full := len(src) - len(src)%bufSize
if full > 0 {
s.xorKeyStreamBlocks(dst[:full], src[:full])
}
+ dst, src = dst[full:], src[full:]
+
+ // If using a multi-block xorKeyStreamBlocks would overflow, use the generic
+ // one that does one block at a time.
+ const blocksPerBuf = bufSize / blockSize
+ if uint64(s.counter)+blocksPerBuf > 1<<32 {
+ s.buf = [bufSize]byte{}
+ numBlocks := (len(src) + blockSize - 1) / blockSize
+ buf := s.buf[bufSize-numBlocks*blockSize:]
+ copy(buf, src)
+ s.xorKeyStreamBlocksGeneric(buf, buf)
+ s.len = len(buf) - copy(dst, buf)
+ return
+ }
// If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
// keep the leftover keystream for the next XORKeyStream invocation.
- if rem > 0 {
+ if len(src) > 0 {
s.buf = [bufSize]byte{}
- copy(s.buf[:], src[full:])
+ copy(s.buf[:], src)
s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
- s.len = bufSize - copy(dst[full:], s.buf[:])
+ s.len = bufSize - copy(dst, s.buf[:])
}
}
@@ -260,7 +286,9 @@ func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
s.precompDone = true
}
- for i := 0; i < len(src); i += blockSize {
+ // A condition of len(src) > 0 would be sufficient, but this also
+ // acts as a bounds check elimination hint.
+ for len(src) >= 64 && len(dst) >= 64 {
// The remainder of the first column round.
fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
@@ -285,49 +313,28 @@ func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
}
- // Finally, add back the initial state to generate the key stream.
- x0 += c0
- x1 += c1
- x2 += c2
- x3 += c3
- x4 += c4
- x5 += c5
- x6 += c6
- x7 += c7
- x8 += c8
- x9 += c9
- x10 += c10
- x11 += c11
- x12 += s.counter
- x13 += c13
- x14 += c14
- x15 += c15
+ // Add back the initial state to generate the key stream, then
+ // XOR the key stream with the source and write out the result.
+ addXor(dst[0:4], src[0:4], x0, c0)
+ addXor(dst[4:8], src[4:8], x1, c1)
+ addXor(dst[8:12], src[8:12], x2, c2)
+ addXor(dst[12:16], src[12:16], x3, c3)
+ addXor(dst[16:20], src[16:20], x4, c4)
+ addXor(dst[20:24], src[20:24], x5, c5)
+ addXor(dst[24:28], src[24:28], x6, c6)
+ addXor(dst[28:32], src[28:32], x7, c7)
+ addXor(dst[32:36], src[32:36], x8, c8)
+ addXor(dst[36:40], src[36:40], x9, c9)
+ addXor(dst[40:44], src[40:44], x10, c10)
+ addXor(dst[44:48], src[44:48], x11, c11)
+ addXor(dst[48:52], src[48:52], x12, s.counter)
+ addXor(dst[52:56], src[52:56], x13, c13)
+ addXor(dst[56:60], src[56:60], x14, c14)
+ addXor(dst[60:64], src[60:64], x15, c15)
s.counter += 1
- if s.counter == 0 {
- panic("chacha20: internal error: counter overflow")
- }
- in, out := src[i:], dst[i:]
- in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
-
- // XOR the key stream with the source and write out the result.
- xor(out[0:], in[0:], x0)
- xor(out[4:], in[4:], x1)
- xor(out[8:], in[8:], x2)
- xor(out[12:], in[12:], x3)
- xor(out[16:], in[16:], x4)
- xor(out[20:], in[20:], x5)
- xor(out[24:], in[24:], x6)
- xor(out[28:], in[28:], x7)
- xor(out[32:], in[32:], x8)
- xor(out[36:], in[36:], x9)
- xor(out[40:], in[40:], x10)
- xor(out[44:], in[44:], x11)
- xor(out[48:], in[48:], x12)
- xor(out[52:], in[52:], x13)
- xor(out[56:], in[56:], x14)
- xor(out[60:], in[60:], x15)
+ src, dst = src[blockSize:], dst[blockSize:]
}
}
diff --git a/vendor/golang.org/x/crypto/chacha20/xor.go b/vendor/golang.org/x/crypto/chacha20/xor.go
index 0110c9865..c2d04851e 100644
--- a/vendor/golang.org/x/crypto/chacha20/xor.go
+++ b/vendor/golang.org/x/crypto/chacha20/xor.go
@@ -13,10 +13,10 @@ const unaligned = runtime.GOARCH == "386" ||
runtime.GOARCH == "ppc64le" ||
runtime.GOARCH == "s390x"
-// xor reads a little endian uint32 from src, XORs it with u and
+// addXor reads a little endian uint32 from src, XORs it with (a + b) and
// places the result in little endian byte order in dst.
-func xor(dst, src []byte, u uint32) {
- _, _ = src[3], dst[3] // eliminate bounds checks
+func addXor(dst, src []byte, a, b uint32) {
+ _, _ = src[3], dst[3] // bounds check elimination hint
if unaligned {
// The compiler should optimize this code into
// 32-bit unaligned little endian loads and stores.
@@ -27,15 +27,16 @@ func xor(dst, src []byte, u uint32) {
v |= uint32(src[1]) << 8
v |= uint32(src[2]) << 16
v |= uint32(src[3]) << 24
- v ^= u
+ v ^= a + b
dst[0] = byte(v)
dst[1] = byte(v >> 8)
dst[2] = byte(v >> 16)
dst[3] = byte(v >> 24)
} else {
- dst[0] = src[0] ^ byte(u)
- dst[1] = src[1] ^ byte(u>>8)
- dst[2] = src[2] ^ byte(u>>16)
- dst[3] = src[3] ^ byte(u>>24)
+ a += b
+ dst[0] = src[0] ^ byte(a)
+ dst[1] = src[1] ^ byte(a>>8)
+ dst[2] = src[2] ^ byte(a>>16)
+ dst[3] = src[3] ^ byte(a>>24)
}
}
diff --git a/vendor/golang.org/x/crypto/poly1305/mac_noasm.go b/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
index b0c2cd056..d118f30ed 100644
--- a/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
+++ b/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
@@ -2,10 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!ppc64le gccgo purego
+// +build !amd64,!ppc64le,!s390x gccgo purego
package poly1305
type mac struct{ macGeneric }
-
-func newMAC(key *[32]byte) mac { return mac{newMACGeneric(key)} }
diff --git a/vendor/golang.org/x/crypto/poly1305/poly1305.go b/vendor/golang.org/x/crypto/poly1305/poly1305.go
index 066159b79..9d7a6af09 100644
--- a/vendor/golang.org/x/crypto/poly1305/poly1305.go
+++ b/vendor/golang.org/x/crypto/poly1305/poly1305.go
@@ -26,7 +26,9 @@ const TagSize = 16
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
- sum(out, m, key)
+ h := New(key)
+ h.Write(m)
+ h.Sum(out[:0])
}
// Verify returns true if mac is a valid authenticator for m with the given key.
@@ -46,10 +48,9 @@ func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
// two different messages with the same key allows an attacker
// to forge messages at will.
func New(key *[32]byte) *MAC {
- return &MAC{
- mac: newMAC(key),
- finalized: false,
- }
+ m := &MAC{}
+ initialize(key, &m.macState)
+ return m
}
// MAC is an io.Writer computing an authentication tag
@@ -58,7 +59,7 @@ func New(key *[32]byte) *MAC {
// MAC cannot be used like common hash.Hash implementations,
// because using a poly1305 key twice breaks its security.
// Therefore writing data to a running MAC after calling
-// Sum causes it to panic.
+// Sum or Verify causes it to panic.
type MAC struct {
mac // platform-dependent implementation
@@ -71,10 +72,10 @@ func (h *MAC) Size() int { return TagSize }
// Write adds more data to the running message authentication code.
// It never returns an error.
//
-// It must not be called after the first call of Sum.
+// It must not be called after the first call of Sum or Verify.
func (h *MAC) Write(p []byte) (n int, err error) {
if h.finalized {
- panic("poly1305: write to MAC after Sum")
+ panic("poly1305: write to MAC after Sum or Verify")
}
return h.mac.Write(p)
}
@@ -87,3 +88,12 @@ func (h *MAC) Sum(b []byte) []byte {
h.finalized = true
return append(b, mac[:]...)
}
+
+// Verify returns whether the authenticator of all data written to
+// the message authentication code matches the expected value.
+func (h *MAC) Verify(expected []byte) bool {
+ var mac [TagSize]byte
+ h.mac.Sum(&mac)
+ h.finalized = true
+ return subtle.ConstantTimeCompare(expected, mac[:]) == 1
+}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
index 35b9e38c9..99e5a1d50 100644
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
@@ -9,17 +9,6 @@ package poly1305
//go:noescape
func update(state *macState, msg []byte)
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(m)
- h.Sum(out)
-}
-
-func newMAC(key *[32]byte) (h mac) {
- initialize(key, &h.r, &h.s)
- return
-}
-
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_generic.go b/vendor/golang.org/x/crypto/poly1305/sum_generic.go
index 1187eab78..c942a6590 100644
--- a/vendor/golang.org/x/crypto/poly1305/sum_generic.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_generic.go
@@ -31,16 +31,18 @@ func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
h.Sum(out)
}
-func newMACGeneric(key *[32]byte) (h macGeneric) {
- initialize(key, &h.r, &h.s)
- return
+func newMACGeneric(key *[32]byte) macGeneric {
+ m := macGeneric{}
+ initialize(key, &m.macState)
+ return m
}
// macState holds numbers in saturated 64-bit little-endian limbs. That is,
// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
type macState struct {
// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
- // can grow larger during and after rounds.
+ // can grow larger during and after rounds. It must, however, remain below
+ // 2 * (2¹³⁰ - 5).
h [3]uint64
// r and s are the private key components.
r [2]uint64
@@ -97,11 +99,12 @@ const (
rMask1 = 0x0FFFFFFC0FFFFFFC
)
-func initialize(key *[32]byte, r, s *[2]uint64) {
- r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
- r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
- s[0] = binary.LittleEndian.Uint64(key[16:24])
- s[1] = binary.LittleEndian.Uint64(key[24:32])
+// initialize loads the 256-bit key into the two 128-bit secret values r and s.
+func initialize(key *[32]byte, m *macState) {
+ m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
+ m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
+ m.s[0] = binary.LittleEndian.Uint64(key[16:24])
+ m.s[1] = binary.LittleEndian.Uint64(key[24:32])
}
// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go b/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
deleted file mode 100644
index 2e3ae34c7..000000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,!go1.11 !amd64,!s390x,!ppc64le gccgo purego
-
-package poly1305
-
-func sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(msg)
- h.Sum(out)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
index 92597bb8c..2e7a120b1 100644
--- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
@@ -9,17 +9,6 @@ package poly1305
//go:noescape
func update(state *macState, msg []byte)
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(m)
- h.Sum(out)
-}
-
-func newMAC(key *[32]byte) (h mac) {
- initialize(key, &h.r, &h.s)
- return
-}
-
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
index 5f91ff84a..958fedc07 100644
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build go1.11,!gccgo,!purego
+// +build !gccgo,!purego
package poly1305
@@ -10,30 +10,66 @@ import (
"golang.org/x/sys/cpu"
)
-// poly1305vx is an assembly implementation of Poly1305 that uses vector
+// updateVX is an assembly implementation of Poly1305 that uses vector
// instructions. It must only be called if the vector facility (vx) is
// available.
//go:noescape
-func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+func updateVX(state *macState, msg []byte)
-// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
-// instructions, including VMSL. It must only be called if the vector facility (vx) is
-// available and if VMSL is supported.
-//go:noescape
-func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+// mac is a replacement for macGeneric that uses a larger buffer and redirects
+// calls that would have gone to updateGeneric to updateVX if the vector
+// facility is installed.
+//
+// A larger buffer is required for good performance because the vector
+// implementation has a higher fixed cost per call than the generic
+// implementation.
+type mac struct {
+ macState
+
+ buffer [16 * TagSize]byte // size must be a multiple of block size (16)
+ offset int
+}
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- if cpu.S390X.HasVX {
- var mPtr *byte
- if len(m) > 0 {
- mPtr = &m[0]
+func (h *mac) Write(p []byte) (int, error) {
+ nn := len(p)
+ if h.offset > 0 {
+ n := copy(h.buffer[h.offset:], p)
+ if h.offset+n < len(h.buffer) {
+ h.offset += n
+ return nn, nil
}
- if cpu.S390X.HasVXE && len(m) > 256 {
- poly1305vmsl(out, mPtr, uint64(len(m)), key)
+ p = p[n:]
+ h.offset = 0
+ if cpu.S390X.HasVX {
+ updateVX(&h.macState, h.buffer[:])
} else {
- poly1305vx(out, mPtr, uint64(len(m)), key)
+ updateGeneric(&h.macState, h.buffer[:])
}
- } else {
- sumGeneric(out, m, key)
}
+
+ tail := len(p) % len(h.buffer) // number of bytes to copy into buffer
+ body := len(p) - tail // number of bytes to process now
+ if body > 0 {
+ if cpu.S390X.HasVX {
+ updateVX(&h.macState, p[:body])
+ } else {
+ updateGeneric(&h.macState, p[:body])
+ }
+ }
+ h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0
+ return nn, nil
+}
+
+func (h *mac) Sum(out *[TagSize]byte) {
+ state := h.macState
+ remainder := h.buffer[:h.offset]
+
+ // Use the generic implementation if we have 2 or fewer blocks left
+ // to sum. The vector implementation has a higher startup time.
+ if cpu.S390X.HasVX && len(remainder) > 2*TagSize {
+ updateVX(&state, remainder)
+ } else if len(remainder) > 0 {
+ updateGeneric(&state, remainder)
+ }
+ finalize(out, &state.h, &state.s)
}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
index 806d1694b..0fa9ee6e0 100644
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
@@ -2,115 +2,187 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build go1.11,!gccgo,!purego
+// +build !gccgo,!purego
#include "textflag.h"
-// Implementation of Poly1305 using the vector facility (vx).
-
-// constants
-#define MOD26 V0
-#define EX0 V1
-#define EX1 V2
-#define EX2 V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-
-// key (r)
-#define R_0 V9
-#define R_1 V10
-#define R_2 V11
-#define R_3 V12
-#define R_4 V13
-#define R5_1 V14
-#define R5_2 V15
-#define R5_3 V16
-#define R5_4 V17
-#define RSAVE_0 R5
-#define RSAVE_1 R6
-#define RSAVE_2 R7
-#define RSAVE_3 R8
-#define RSAVE_4 R9
-#define R5SAVE_1 V28
-#define R5SAVE_2 V29
-#define R5SAVE_3 V30
-#define R5SAVE_4 V31
-
-// message block
-#define F_0 V18
-#define F_1 V19
-#define F_2 V20
-#define F_3 V21
-#define F_4 V22
-
-// accumulator
-#define H_0 V23
-#define H_1 V24
-#define H_2 V25
-#define H_3 V26
-#define H_4 V27
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $64
-// MOD26
-DATA ·constants<>+0(SB)/8, $0x3ffffff
-DATA ·constants<>+8(SB)/8, $0x3ffffff
+// This implementation of Poly1305 uses the vector facility (vx)
+// to process up to 2 blocks (32 bytes) per iteration using an
+// algorithm based on the one described in:
+//
+// NEON crypto, Daniel J. Bernstein & Peter Schwabe
+// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+//
+// This algorithm uses 5 26-bit limbs to represent a 130-bit
+// value. These limbs are, for the most part, zero extended and
+// placed into 64-bit vector register elements. Each vector
+// register is 128-bits wide and so holds 2 of these elements.
+// Using 26-bit limbs allows us plenty of headroom to accomodate
+// accumulations before and after multiplication without
+// overflowing either 32-bits (before multiplication) or 64-bits
+// (after multiplication).
+//
+// In order to parallelise the operations required to calculate
+// the sum we use two separate accumulators and then sum those
+// in an extra final step. For compatibility with the generic
+// implementation we perform this summation at the end of every
+// updateVX call.
+//
+// To use two accumulators we must multiply the message blocks
+// by r² rather than r. Only the final message block should be
+// multiplied by r.
+//
+// Example:
+//
+// We want to calculate the sum (h) for a 64 byte message (m):
+//
+// h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r
+//
+// To do this we split the calculation into the even indices
+// and odd indices of the message. These form our SIMD 'lanes':
+//
+// h = m[ 0:16]r⁴ + m[32:48]r² + <- lane 0
+// m[16:32]r³ + m[48:64]r <- lane 1
+//
+// To calculate this iteratively we refactor so that both lanes
+// are written in terms of r² and r:
+//
+// h = (m[ 0:16]r² + m[32:48])r² + <- lane 0
+// (m[16:32]r² + m[48:64])r <- lane 1
+// ^ ^
+// | coefficients for second iteration
+// coefficients for first iteration
+//
+// So in this case we would have two iterations. In the first
+// both lanes are multiplied by r². In the second only the
+// first lane is multiplied by r² and the second lane is
+// instead multiplied by r. This gives use the odd and even
+// powers of r that we need from the original equation.
+//
+// Notation:
+//
+// h - accumulator
+// r - key
+// m - message
+//
+// [a, b] - SIMD register holding two 64-bit values
+// [a, b, c, d] - SIMD register holding four 32-bit values
+// xᵢ[n] - limb n of variable x with bit width i
+//
+// Limbs are expressed in little endian order, so for 26-bit
+// limbs x₂₆[4] will be the most significant limb and x₂₆[0]
+// will be the least significant limb.
+
+// masking constants
+#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits
+#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits
+
+// expansion constants (see EXPAND macro)
+#define EX0 V2
+#define EX1 V3
+#define EX2 V4
+
+// key (r², r or 1 depending on context)
+#define R_0 V5
+#define R_1 V6
+#define R_2 V7
+#define R_3 V8
+#define R_4 V9
+
+// precalculated coefficients (5r², 5r or 0 depending on context)
+#define R5_1 V10
+#define R5_2 V11
+#define R5_3 V12
+#define R5_4 V13
+
+// message block (m)
+#define M_0 V14
+#define M_1 V15
+#define M_2 V16
+#define M_3 V17
+#define M_4 V18
+
+// accumulator (h)
+#define H_0 V19
+#define H_1 V20
+#define H_2 V21
+#define H_3 V22
+#define H_4 V23
+
+// temporary registers (for short-lived values)
+#define T_0 V24
+#define T_1 V25
+#define T_2 V26
+#define T_3 V27
+#define T_4 V28
+
+GLOBL ·constants<>(SB), RODATA, $0x30
// EX0
-DATA ·constants<>+16(SB)/8, $0x0006050403020100
-DATA ·constants<>+24(SB)/8, $0x1016151413121110
+DATA ·constants<>+0x00(SB)/8, $0x0006050403020100
+DATA ·constants<>+0x08(SB)/8, $0x1016151413121110
// EX1
-DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
-DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
+DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706
+DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716
// EX2
-DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
-DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
-
-// h = (f*g) % (2**130-5) [partial reduction]
+DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d
+DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d
+
+// MULTIPLY multiplies each lane of f and g, partially reduced
+// modulo 2¹³⁰ - 5. The result, h, consists of partial products
+// in each lane that need to be reduced further to produce the
+// final result.
+//
+// h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰
+//
+// Note that the multiplication by 5 of the high bits is
+// achieved by precalculating the multiplication of four of the
+// g coefficients by 5. These are g51-g54.
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
VMLOF f0, g0, h0 \
- VMLOF f0, g1, h1 \
- VMLOF f0, g2, h2 \
VMLOF f0, g3, h3 \
+ VMLOF f0, g1, h1 \
VMLOF f0, g4, h4 \
+ VMLOF f0, g2, h2 \
VMLOF f1, g54, T_0 \
- VMLOF f1, g0, T_1 \
- VMLOF f1, g1, T_2 \
VMLOF f1, g2, T_3 \
+ VMLOF f1, g0, T_1 \
VMLOF f1, g3, T_4 \
+ VMLOF f1, g1, T_2 \
VMALOF f2, g53, h0, h0 \
- VMALOF f2, g54, h1, h1 \
- VMALOF f2, g0, h2, h2 \
VMALOF f2, g1, h3, h3 \
+ VMALOF f2, g54, h1, h1 \
VMALOF f2, g2, h4, h4 \
+ VMALOF f2, g0, h2, h2 \
VMALOF f3, g52, T_0, T_0 \
- VMALOF f3, g53, T_1, T_1 \
- VMALOF f3, g54, T_2, T_2 \
VMALOF f3, g0, T_3, T_3 \
+ VMALOF f3, g53, T_1, T_1 \
VMALOF f3, g1, T_4, T_4 \
+ VMALOF f3, g54, T_2, T_2 \
VMALOF f4, g51, h0, h0 \
- VMALOF f4, g52, h1, h1 \
- VMALOF f4, g53, h2, h2 \
VMALOF f4, g54, h3, h3 \
+ VMALOF f4, g52, h1, h1 \
VMALOF f4, g0, h4, h4 \
+ VMALOF f4, g53, h2, h2 \
VAG T_0, h0, h0 \
- VAG T_1, h1, h1 \