/*
* x86_64/AVX2 assembler optimized version of Serpent
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*
* Based on AVX assembler implementation of Serpent by:
* Copyright © 2012 Johannes Goetzfried
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/
#include <linux/linkage.h>
#include "glue_helper-asm-avx2.S"
.file "serpent-avx2-asm_64.S"
.data
.align 16
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
.text
#define CTX %rdi
#define RNOT %ymm0
#define tp %ymm1
#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11
#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15
#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15
#define S0_1(x0, x1, x2, x3, x4) \
vpor x0, x3, tp; \
vpxor x3, x0, x0; \
vpxor x2, x3, x4; \
vpxor RNOT, x4, x4; \
vpxor x1, tp, x3; \
vpand x0, x1, x1; \
vpxor x4, x1, x1; \
vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
vpxor x3, x0, x0; \
vpor x0, x4, x4; \
vpxor x2, x0, x0; \
vpand x1, x2, x2; \
vpxor x2, x3, x3; \
vpxor RNOT, x1, x1; \
vpxor x4, x2, x2; \
vpxor x2, x1, x1;
#define S1_1(x0, x1, x2, x3, x4) \
vpxor x0, x1, tp; \
vpxor x3, x0, x0; \
vpxor RNOT, x3, x3; \
vpand tp, x1, x4; \
vpor tp, x0, x0; \
vpxor x2, x3, x3; \
vpxor x3, x0, x0; \
vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
vpxor x4, x3, x3; \
vpor x4, x1, x1; \
vpxor x2, x4, x4; \
vpand x0, x2, x2; \
vpxor x1, x2, x2; \
vpor x0, x1, x1; \
vpxor RNOT, x0, x0; \
vpxor x2, x0, x0; \
vpxor x1, x4, x4;
#define S2_1(x0, x1, x2, x3, x4) \
vpxor RNOT, x3, x3; \
vpxor x0, x1, x1; \
vpand x2, x0, tp; \
vpxor x3, tp, tp; \
vpor x0, x3, x3; \
vpxor x1, x2, x2; \
vpxor x1, x3, x3; \
vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
vpxor x2, tp, tp; \
vpand x3, x2, x2; \
vpor x1, x3, x3; \
vpxor RNOT, tp, tp; \
vpxor tp, x3, x3; \
vpxor tp, x0, x4; \
vpxor x2, tp, x0; \
vpor x2, x1, x1;
#define S3_1(x0, x1, x2, x3, x4) \
vpxor x3, x1, tp; \
vpor x0, x3, x3; \
vpand x0, x1, x4; \
vpxor x2, x0, x0; \
vpxor tp, x2, x2; \
vpand x3, tp, x1; \
vpxor x3, x2, x2; \
vpor x4, x0, x0; \
vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
vpxor x0, x1, x1; \
vpand x3, x0, x0; \
vpand x4, x3, x3; \
vpxor x2, x3, x3; \
vpor x1, x4, x4; \
vpand x1, x2, x2; \
vpxor x3, x4, x4; \
vpxor x3, x0, x0; \
vpxor x2, x3, x3;
#define S4_1(x0, x1, x2, x3, x4) \
vpand x0, x3, tp; \
vpxor x3, x0, x0; \
vpxor x2, tp, tp; \
vpor x3, x2, x2; \
vpxor x1, x0, x0; \
vpxor tp, x3, x4; \
vpor x0, x2, x2; \
vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
vpand x0, x1, x1; \
vpxor x4, x1, x1; \
vpand x2, x4, x4; \
vpxor tp, x2, x2; \
vpxor x0, x4, x4; \
vpor x1, tp, x3; \
vpxor RNOT, x1,