summaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/aesni-intel_avx-x86_64.S
diff options
context:
space:
mode:
authorTim Chen <tim.c.chen@linux.intel.com>2014-01-09 08:57:42 -0800
committerHerbert Xu <herbert@gondor.apana.org.au>2014-01-15 11:36:34 +0800
commit79ba451d66ca8402c8d052ceb50e359ddc5e1161 (patch)
tree23953c81987d74a33f1ee95a2491fb1d889ba0cc /arch/x86/crypto/aesni-intel_avx-x86_64.S
parentfe70be5c8885d07a5c12f3ad97e6f226b0e8cae3 (diff)
crypto: aesni - fix build on x86 (32bit)
We rename aesni-intel_avx.S to aesni-intel_avx-x86_64.S to indicate that it is only used by x86_64 architecture. Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/aesni-intel_avx-x86_64.S')
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S2811
1 files changed, 2811 insertions, 0 deletions
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
new file mode 100644
index 000000000000..522ab68d1c88
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -0,0 +1,2811 @@
+########################################################################
+# Copyright (c) 2013, Intel Corporation
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
+# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+##
+## Authors:
+## Erdinc Ozturk <erdinc.ozturk@intel.com>
+## Vinodh Gopal <vinodh.gopal@intel.com>
+## James Guilford <james.guilford@intel.com>
+## Tim Chen <tim.c.chen@linux.intel.com>
+##
+## References:
+## This code was derived and highly optimized from the code described in paper:
+## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
+## on Intel Architecture Processors. August, 2010
+## The details of the implementation is explained in:
+## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
+## on Intel Architecture Processors. October, 2012.
+##
+## Assumptions:
+##
+##
+##
+## iv:
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | Salt (From the SA) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | Initialization Vector |
+## | (This is the sequence number from IPSec header) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x1 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+##
+##
+## AAD:
+## AAD padded to 128 bits with 0
+## for example, assume AAD is a u32 vector
+##
+## if AAD is 8 bytes:
+## AAD[3] = {A0, A1}#
+## padded AAD in xmm register = {A1 A0 0 0}
+##
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | SPI (A1) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 32-bit Sequence Number (A0) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x0 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+## AAD Format with 32-bit Sequence Number
+##
+## if AAD is 12 bytes:
+## AAD[3] = {A0, A1, A2}#
+## padded AAD in xmm register = {A2 A1 A0 0}
+##
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | SPI (A2) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 64-bit Extended Sequence Number {A1,A0} |
+## | |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x0 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+## AAD Format with 64-bit Extended Sequence Number
+##
+##
+## aadLen:
+## from the definition of the spec, aadLen can only be 8 or 12 bytes.
+## The code additionally supports aadLen of length 16 bytes.
+##
+## TLen:
+## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+##
+## poly = x^128 + x^127 + x^126 + x^121 + 1
+## throughout the code, one tab and two tab indentations are used. one tab is
+## for GHASH part, two tabs is for AES part.
+##
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+.data
+.align 16
+
+POLY: .octa 0xC2000000000000000000000000000001
+POLY2: .octa 0xC20000000000000000000001C2000000
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ZERO: .octa 0x00000000000000000000000000000000
+ONE: .octa 0x00000000000000000000000000000001
+ONEf: .octa 0x01000000000000000000000000000000
+
+.text
+
+
+##define the fields of the gcm aes context
+#{
+# u8 expanded_keys[16*11] store expanded keys
+# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
+# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
+# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
+# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
+# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
+# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
+# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
+# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
+# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+#} gcm_ctx#
+
+HashKey = 16*11 # store HashKey <<1 mod poly here
+HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
+HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
+HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
+HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
+HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
+HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
+HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
+HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+
+#define arg1 %rdi
+#define arg2 %rsi
+#define arg3 %rdx
+#define arg4 %rcx
+#define arg5 %r8
+#define arg6 %r9
+#define arg7 STACK_OFFSET+8*1(%r14)
+#define arg8 STACK_OFFSET+8*2(%r14)
+#define arg9 STACK_OFFSET+8*3(%r14)
+
+i = 0
+j = 0
+
+out_order = 0
+in_order = 1
+DEC = 0
+ENC = 1
+
+.macro define_reg r n
+reg_\r = %xmm\n
+.endm
+
+.macro setreg
+.altmacro
+define_reg i %i
+define_reg j %j
+.noaltmacro
+.endm
+
+# need to push 4 registers into stack to maintain
+STACK_OFFSET = 8*4
+
+TMP1 = 16*0 # Temporary storage for AAD
+TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+TMP3 = 16*2 # Temporary storage for AES State 3
+TMP4 = 16*3 # Temporary storage for AES State 4
+TMP5 = 16*4 # Temporary storage for AES State 5
+TMP6 = 16*5 # Temporary storage for AES State 6
+TMP7 = 16*6 # Temporary storage for AES State 7
+TMP8 = 16*7 # Temporary storage for AES State 8
+
+VARIABLE_OFFSET = 16*8
+
+################################
+# Utility Macros
+################################
+
+# Encryption of a single block
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+ vpxor (arg1), \XMM0, \XMM0
+ i = 1
+ setreg
+.rep 9
+ vaesenc 16*i(arg1), \XMM0, \XMM0
+ i = (i+1)
+ setreg
+.endr
+ vaesenclast 16*10(arg1), \XMM0, \XMM0
+.endm
+
+#ifdef CONFIG_AS_AVX
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
+
+ vpshufd $0b01001110, \GH, \T2
+ vpshufd $0b01001110, \HK, \T3
+ vpxor \GH , \T2, \T2 # T2 = (a1+a0)
+ vpxor \HK , \T3, \T3 # T3 = (b1+b0)
+
+ vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
+ vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
+ vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
+ vpxor \GH, \T2,\T2
+ vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
+
+ vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
+ vpxor \T3, \GH, \GH
+ vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
+
+ #first phase of the reduction
+ vpslld $31, \GH, \T2 # packed right shifting << 31
+ vpslld $30, \GH, \T3 # packed right shifting shift << 30
+ vpslld $25, \GH, \T4 # packed right shifting shift << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \GH, \GH # first phase of the reduction complete
+
+ #second phase of the reduction
+
+ vpsrld $1,\GH, \T2 # packed left shifting >> 1
+ vpsrld $2,\GH, \T3 # packed left shifting >> 2
+ vpsrld $7,\GH, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T5, \T2, \T2
+ vpxor \T2, \GH, \GH
+ vpxor \T1, \GH, \GH # the result is in GH
+
+
+.endm
+
+.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
+
+ # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vmovdqa \HK, \T5
+
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
+ vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_2_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
+ vmovdqa \T5, HashKey_3(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_3_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
+ vmovdqa \T5, HashKey_4(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_4_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
+ vmovdqa \T5, HashKey_5(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_5_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
+ vmovdqa \T5, HashKey_6(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_6_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
+ vmovdqa \T5, HashKey_7(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_7_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
+ vmovdqa \T5, HashKey_8(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_8_k(arg1)
+
+.endm
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 4#
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+
+.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+ i = (8-\num_initial_blocks)
+ setreg
+
+ mov arg6, %r10 # r10 = AAD
+ mov arg7, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor reg_i, reg_i, reg_i
+_get_AAD_loop\@:
+ vmovd (%r10), \T1
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+
+ add $4, %r10
+ sub $4, %r12
+ jg _get_AAD_loop\@
+
+
+ cmp $16, %r11
+ je _get_AAD_loop2_done\@
+ mov $16, %r12
+
+_get_AAD_loop2\@:
+ vpsrldq $4, reg_i, reg_i
+ sub $4, %r12
+ cmp %r11, %r12
+ jg _get_AAD_loop2\@
+
+_get_AAD_loop2_done\@:
+
+ #byte-reflect the AAD data
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+
+ # initialize the data pointer offset as zero
+ xor %r11, %r11
+
+ # start AES for num_initial_blocks blocks
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), \CTR # CTR = Y0
+ vpshufb SHUF_MASK(%rip), \CTR, \CTR
+
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
+ i = (i+1)
+ setreg
+.endr
+
+ vmovdqa (arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpxor \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = 1
+ setreg
+.rep 9
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenc \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = (j+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*10(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenclast \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, reg_i, reg_i
+ vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
+ add $16, %r11
+.if \ENC_DEC == DEC
+ vmovdqa \T1, reg_i
+.endif
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
+ i = (i+1)
+ setreg
+.endr
+
+
+ i = (8-\num_initial_blocks)
+ j = (9-\num_initial_blocks)
+ setreg
+ GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+.rep \num_initial_blocks
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ # XMM8 has the combined result here
+
+ vmovdqa \XMM8, TMP1(%rsp)
+ vmovdqa \XMM8, \T3
+
+ cmp $128, %r13
+ jl _initial_blocks_done\@ # no need for precomputed constants
+
+###############################################################################
+# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM1
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM2
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM3
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM4
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM5
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM6
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM7
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM8
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+ vmovdqa (arg1), \T_key
+ vpxor \T_key, \XMM1, \XMM1
+ vpxor \T_key, \XMM2, \XMM2
+ vpxor \T_key, \XMM3, \XMM3
+ vpxor \T_key, \XMM4, \XMM4
+ vpxor \T_key, \XMM5, \XMM5
+ vpxor \T_key, \XMM6, \XMM6
+ vpxor \T_key, \XMM7, \XMM7
+ vpxor \T_key, \XMM8, \XMM8
+
+ i = 1
+ setreg
+.rep 9 # do 9 rounds
+ vmovdqa 16*i(arg1), \T_key
+ vaesenc \T_key, \XMM1, \XMM1
+ vaesenc \T_key, \XMM2, \XMM2
+ vaesenc \T_key, \XMM3, \XMM3
+ vaesenc \T_key, \XMM4, \XMM4
+ vaesenc \T_key, \XMM5, \XMM5
+ vaesenc \T_key, \XMM6, \XMM6
+ vaesenc \T_key, \XMM7, \XMM7
+ vaesenc \T_key, \XMM8, \XMM8
+ i = (i+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*i(arg1), \T_key
+ vaesenclast \T_key, \XMM1, \XMM1
+ vaesenclast \T_key, \XMM2, \XMM2
+ vaesenclast \T_key, \XMM3, \XMM3
+ vaesenclast \T_key, \XMM4, \XMM4
+ vaesenclast \T_key, \XMM5, \XMM5
+ vaesenclast \T_key, \XMM6, \XMM6
+ vaesenclast \T_key, \XMM7, \XMM7
+ vaesenclast \T_key, \XMM8, \XMM8
+
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vmovdqu \XMM1, (arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM1
+ .endif
+
+ vmovdqu 16*1(arg3, %r11), \T1
+ vpxor \T1, \XMM2, \XMM2
+ vmovdqu \XMM2, 16*1(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM2
+ .endif
+
+ vmovdqu 16*2(arg3, %r11), \T1
+ vpxor \T1, \XMM3, \XMM3
+ vmovdqu \XMM3, 16*2(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM3
+ .endif
+
+ vmovdqu 16*3(arg3, %r11), \T1
+ vpxor \T1, \XMM4, \XMM4
+ vmovdqu \XMM4, 16*3(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM4
+ .endif
+
+ vmovdqu 16*4(arg3, %r11), \T1
+ vpxor \T1, \XMM5, \XMM5
+ vmovdqu \XMM5, 16*4(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM5
+ .endif
+
+ vmovdqu 16*5(arg3, %r11), \T1
+ vpxor \T1, \XMM6, \XMM6
+ vmovdqu \XMM6, 16*5(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM6
+ .endif
+
+ vmovdqu 16*6(arg3, %r11), \T1
+ vpxor \T1, \XMM7, \XMM7
+ vmovdqu \XMM7, 16*6(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM7
+ .endif
+
+ vmovdqu 16*7(arg3, %r11), \T1
+ vpxor \T1, \XMM8, \XMM8
+ vmovdqu \XMM8, 16*7(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM8
+ .endif
+
+ add $128, %r11
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+.endm
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg2, arg3 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+ vmovdqa \XMM1, \T2
+ vmovdqa \XMM2, TMP2(%rsp)
+ vmovdqa \XMM3, TMP3(%rsp)
+ vmovdqa \XMM4, TMP4(%rsp)
+ vmovdqa \XMM5, TMP5(%rsp)
+ vmovdqa \XMM6, TMP6(%rsp)
+ vmovdqa \XMM7, TMP7(%rsp)
+ vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+ vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONE(%rip), \XMM1, \XMM2
+ vpaddd ONE(%rip), \XMM2, \XMM3
+ vpaddd ONE(%rip), \XMM3, \XMM4
+ vpaddd ONE(%rip), \XMM4, \XMM5
+ vpaddd ONE(%rip), \XMM5, \XMM6
+ vpaddd ONE(%rip), \XMM6, \XMM7
+ vpaddd ONE(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+.else
+ vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONEf(%rip), \XMM1, \XMM2
+ vpaddd ONEf(%rip), \XMM2, \XMM3
+ vpaddd ONEf(%rip), \XMM3, \XMM4
+ vpaddd ONEf(%rip), \XMM4, \XMM5
+ vpaddd ONEf(%rip), \XMM5, \XMM6
+ vpaddd ONEf(%rip), \XMM6, \XMM7
+ vpaddd ONEf(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+.endif
+
+
+ #######################################################################
+
+ vmovdqu (arg1), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vpxor \T1, \XMM2, \XMM2
+ vpxor \T1, \XMM3, \XMM3
+ vpxor \T1, \XMM4, \XMM4
+ vpxor \T1, \XMM5, \XMM5
+ vpxor \T1, \XMM6, \XMM6
+ vpxor \T1, \XMM7, \XMM7
+ vpxor \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+
+
+
+
+ vmovdqu 16*1(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqu 16*2(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ #######################################################################
+
+ vmovdqa HashKey_8(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
+ vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
+
+ vpshufd $0b01001110, \T2, \T6
+ vpxor \T2, \T6, \T6
+
+ vmovdqa HashKey_8_k(arg1), \T5
+ vpclmulqdq $0x00, \T5, \T6, \T6
+
+ vmovdqu 16*3(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP2(%rsp), \T1
+ vmovdqa HashKey_7(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_7_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*4(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+ vmovdqa TMP3(%rsp), \T1
+ vmovdqa HashKey_6(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_6_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*5(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP4(%rsp), \T1
+ vmovdqa HashKey_5(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_5_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*6(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ vmovdqa TMP5(%rsp), \T1
+ vmovdqa HashKey_4(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_4_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*7(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP6(%rsp), \T1
+ vmovdqa HashKey_3(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_3_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+
+ vmovdqu 16*8(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP7(%rsp), \T1
+ vmovdqa HashKey_2(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_2_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ #######################################################################
+
+ vmovdqu 16*9(arg1), \T5
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqa TMP8(%rsp), \T1
+ vmovdqa HashKey(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vpxor \T4, \T6, \T6
+ vpxor \T7, \T6, \T6
+
+ vmovdqu 16*10(arg1), \T5
+
+ i = 0
+ j = 1
+ setreg
+.rep 8
+ vpxor 16*i(arg3, %r11), \T5, \T2
+ .if \ENC_DEC == ENC
+ vaesenclast \T2, reg_j, reg_j
+ .else
+ vaesenclast \T2, reg_j, \T3
+ vmovdqu 16*i(arg3, %r11), reg_j
+ vmovdqu \T3, 16*i(arg2, %r11)
+ .endif
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ #######################################################################
+
+
+ vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
+ vpxor \T3, \T7, \T7
+ vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
+
+
+
+ #######################################################################
+ #first phase of the reduction
+ #######################################################################
+ vpslld $31, \T7, \T2 # packed right shifting << 31
+ vpslld $30, \T7, \T3 # packed right shifting shift << 30
+ vpslld $25, \T7, \T4 # packed right shifting shift << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+ .if \ENC_DEC == ENC
+ vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Cipherte