1 files changed, 0 insertions, 2916 deletions
diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
deleted file mode 100755
index d87e201147..0000000000
--- a/crypto/aes/asm/aes-x86_64.pl
+++ /dev/null
@@ -1,2916 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2005-2019 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Version 2.1.
-#
-# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
-# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
-# [you'll notice a lot of resemblance], such as compressed S-boxes
-# in little-endian byte order, prefetch of these tables in CBC mode,
-# as well as avoiding L1 cache aliasing between stack frame and key
-# schedule and already mentioned tables, compressed Td4...
-#
-# Performance in number of cycles per processed byte for 128-bit key:
-#
-#		ECB encrypt	ECB decrypt	CBC large chunk
-# AMD64		33		43		13.0
-# EM64T		38		56		18.6(*)
-# Core 2	30		42		14.5(*)
-# Atom		65		86		32.1(*)
-#
-# (*) with hyper-threading off
-
-# Command line: [flavour] output. A lone argument containing a dot is
-# taken to be the output file name, leaving the flavour undefined.
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-# Win64 conventions are selected by an nasm/masm/mingw64 flavour or by
-# an output file name ending in .asm.
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-# Look for the translator next to this script first, then in
-# ../../perlasm/ relative to it.
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-# Everything printed to STDOUT from here on is piped through the
-# translator. Abort if the pipe cannot be started rather than silently
-# producing no output at all.
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
-    or die "can't call $xlate: $!";
-*STDOUT=*OUT;
-
-$verticalspin=1;	# unlike 32-bit version $verticalspin performs
-			# ~15% better on both AMD and Intel cores
-$speed_limit=512;	# see aes-586.pl for details
-
-# Accumulator for the generated assembly text.
-$code=".text\n";
-
-# Register allocation. $s0-$s3 carry the four 32-bit state words.
-$s0="%eax";
-$s1="%ebx";
-$s2="%ecx";
-$s3="%edx";
-# Scratch registers; the 64-bit forms of the same registers double as
-# the mask registers used by dectransform().
-$acc0="%esi";	$mask80="%rsi";
-$acc1="%edi";	$maskfe="%rdi";
-$acc2="%ebp";	$mask1b="%rbp";
-$inp="%r8";
-$out="%r9";
-$t0="%r10d";
-$t1="%r11d";
-$t2="%r12d";
-$rnds="%r13d";	# round counter
-$sbox="%r14";	# base address of the lookup table in use
-$key="%r15";	# pointer into the key schedule
-
-# Map the 32- or 64-bit name of the a, b, c or d register (%eax, %rbx,
-# ...) to its high-byte form (%ah, %bh, ...). Any other name is
-# returned unchanged.
-sub hi()
-{ my ($reg) = @_;
-    $reg =~ s/%[er]([a-d])x/%${1}h/;
-    return $reg;
-}
-# Map a register name to its low-byte form: %eax/%rax -> %al,
-# %esi/%rdi style -> %sil/%dil, %r10/%r10d -> %r10b. Names matching
-# none of the three patterns are returned unchanged.
-sub lo()
-{ my ($reg) = @_;
-    $reg =~ s/%[er]([a-d])x/%${1}l/;
-    $reg =~ s/%[er]([sd]i)/%${1}l/;
-    $reg =~ s/%(r[0-9]+)[d]?/%${1}b/;
-    return $reg;
-}
-# Map a 64-bit register name to its 32-bit form: %rax -> %eax,
-# %r8 -> %r8d.
-sub LO()
-{ my ($reg) = @_;
-    $reg =~ s/%r([a-z]+)/%e${1}/;
-    $reg =~ s/%r([0-9]+)/%r${1}d/;
-    return $reg;
-}
-# Append one ".long" line per argument to $code, each carrying the
-# value twice as 8-digit hex. Processing stops at the first undefined
-# argument.
-sub _data_word()
-{
-    foreach my $word (@_) {
-	last unless defined($word);
-	$code.=sprintf(".long\t0x%08x,0x%08x\n",$word,$word);
-    }
-}
-# Append a single ".long" directive to $code listing every argument as
-# comma-separated 8-digit hex. An empty argument list emits nothing
-# (previously it produced a stray ".long 0x00000000").
-sub data_word()
-{
-    return unless @_;
-    $code.=".long\t".join(",",map { sprintf("0x%08x",$_) } @_)."\n";
-}
-
-# Append a single ".byte" directive to $code listing the low 8 bits of
-# every argument as comma-separated 2-digit hex. An empty argument list
-# emits nothing (previously it produced a stray ".byte 0x00").
-sub data_byte()
-{
-    return unless @_;
-    $code.=".byte\t".join(",",map { sprintf("0x%02x",$_&0xff) } @_)."\n";
-}
-
-# Emit one "vertical" encryption round: the lookups for all four state
-# words are interleaved instead of being done one word at a time.
-# Each state byte indexes the 8-byte entries at $sbox, which are read at
-# byte offsets 0-3; the results accumulate in $t0-$t2 and %r8d. $key is
-# advanced by 16 and the new state is the round key words at 0..12($key)
-# XORed with the accumulators. %r8d serves as the fourth temporary, so
-# $inp is destroyed.
-sub encvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	# favor 3-way issue Opteron pipeline...
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	mov	0($sbox,$acc0,8),$t0
-	mov	0($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	movzb	`&lo("$s3")`,$acc2
-	xor	3($sbox,$acc0,8),$t0
-	xor	3($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s3")`,$acc0
-	shr	\$16,$s2
-	movzb	`&hi("$s0")`,$acc2
-	xor	3($sbox,$acc0,8),$t2
-	shr	\$16,$s3
-	xor	3($sbox,$acc2,8),$t3
-
-	shr	\$16,$s1
-	lea	16($key),$key
-	shr	\$16,$s0
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	xor	2($sbox,$acc0,8),$t0
-	xor	2($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	movzb	`&lo("$s1")`,$acc2
-	xor	1($sbox,$acc0,8),$t0
-	xor	1($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t3
-
-	mov	12($key),$s3
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	mov	0($key),$s0
-	xor	1($sbox,$acc1,8),$t2
-	xor	1($sbox,$acc2,8),$t3
-
-	mov	4($key),$s1
-	mov	8($key),$s2
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# Emit the final encryption round in "vertical" form. Output bytes are
-# taken from the 8-byte entries at $sbox either by byte loads or by
-# 32-bit loads masked with 0x0000ff00, 0x00ff0000 or 0xff000000, and
-# the assembled words are XORed with the round key at 16($key). $key
-# itself is not advanced. %r8d serves as a temporary, so $inp is
-# destroyed.
-sub enclastvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	movzb	2($sbox,$acc0,8),$t0
-	movzb	2($sbox,$acc1,8),$t1
-	movzb	2($sbox,$acc2,8),$t2
-
-	movzb	`&lo("$s3")`,$acc0
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	movzb	2($sbox,$acc0,8),$t3
-	mov	0($sbox,$acc1,8),$acc1	#$t0
-	mov	0($sbox,$acc2,8),$acc2	#$t1
-
-	and	\$0x0000ff00,$acc1
-	and	\$0x0000ff00,$acc2
-
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-	shr	\$16,$s2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	shr	\$16,$s3
-	mov	0($sbox,$acc0,8),$acc0	#$t2
-	mov	0($sbox,$acc1,8),$acc1	#$t3
-
-	and	\$0x0000ff00,$acc0
-	and	\$0x0000ff00,$acc1
-	shr	\$16,$s1
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-	shr	\$16,$s0
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	mov	0($sbox,$acc0,8),$acc0	#$t0
-	mov	0($sbox,$acc1,8),$acc1	#$t1
-	mov	0($sbox,$acc2,8),$acc2	#$t2
-
-	and	\$0x00ff0000,$acc0
-	and	\$0x00ff0000,$acc1
-	and	\$0x00ff0000,$acc2
-
-	xor	$acc0,$t0
-	xor	$acc1,$t1
-	xor	$acc2,$t2
-
-	movzb	`&lo("$s1")`,$acc0
-	movzb	`&hi("$s3")`,$acc1
-	movzb	`&hi("$s0")`,$acc2
-	mov	0($sbox,$acc0,8),$acc0	#$t3
-	mov	2($sbox,$acc1,8),$acc1	#$t0
-	mov	2($sbox,$acc2,8),$acc2	#$t1
-
-	and	\$0x00ff0000,$acc0
-	and	\$0xff000000,$acc1
-	and	\$0xff000000,$acc2
-
-	xor	$acc0,$t3
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	mov	16+12($key),$s3
-	mov	2($sbox,$acc0,8),$acc0	#$t2
-	mov	2($sbox,$acc1,8),$acc1	#$t3
-	mov	16+0($key),$s0
-
-	and	\$0xff000000,$acc0
-	and	\$0xff000000,$acc1
-
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-
-	mov	16+4($key),$s1
-	mov	16+8($key),$s2
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# Emit the code computing output word $i (0..3) of an encryption round.
-# @s is the state rotated so that $s[0] supplies the low byte, $s[1]
-# the second byte, $s[2] the third byte and $s[3] the top byte.
-# Words 0-2 are produced in $t0-$t2 with $acc0-$acc2 as scratch. For
-# $i==3 the result is built in $s[0], the other state registers serve
-# as scratch, and $t0-$t2 are finally copied into $s[1..3].
-# The round key word at 4*$i($key) is XORed in; $key is advanced by 16
-# as part of the $i==0 step.
-sub encstep()
-{ my ($i,@s) = @_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	if ($i==3) {
-		$tmp0=$s[1];
-		$tmp1=$s[2];
-		$tmp2=$s[3];
-	}
-	$code.="	movzb	".&lo($s[0]).",$out\n";
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	lea	16($key),$key\n"	if ($i==0);
-
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	mov	0($sbox,$out,8),$out\n";
-
-	$code.="	shr	\$16,$tmp1\n";
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-	$code.="	xor	3($sbox,$tmp0,8),$out\n";
-
-	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-	$code.="	xor	4*$i($key),$out\n";
-
-	$code.="	xor	2($sbox,$tmp1,8),$out\n";
-	$code.="	xor	1($sbox,$tmp2,8),$out\n";
-
-	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-# Emit the code computing output word $i (0..3) of the final encryption
-# round. Each lookup is a 32-bit load from an 8-byte entry at $sbox
-# (byte offset 0 or 2) masked down to one byte lane; the four lanes are
-# XORed together. No round key is applied and $key is not touched here.
-# As in encstep(), words 0-2 land in $t0-$t2; for $i==3 the result is
-# built in $s[0] and $t0-$t2 are then copied into $s[1..3].
-sub enclast()
-{ my ($i,@s)=@_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	if ($i==3) {
-		$tmp0=$s[1];
-		$tmp1=$s[2];
-		$tmp2=$s[3];
-	}
-	$code.="	movzb	".&lo($s[0]).",$out\n";
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-
-	$code.="	mov	2($sbox,$out,8),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-	$code.="	and	\$0x000000ff,$out\n";
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
-	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
-	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";
-
-	$code.="	and	\$0x0000ff00,$tmp0\n";
-	$code.="	and	\$0x00ff0000,$tmp1\n";
-	$code.="	and	\$0xff000000,$tmp2\n";
-
-	$code.="	xor	$tmp0,$out\n";
-	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
-	$code.="	xor	$tmp1,$out\n";
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	xor	$tmp2,$out\n";
-	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-# Generate _x86_64_AES_encrypt: XOR the state with the first round key,
-# load the round count from 240($key), run rounds-1 iterations of
-# .Lenc_loop and finish with the last-round code. $verticalspin selects
-# the "vertical" generators; otherwise the per-word encstep()/enclast()
-# generators are used and the final round key is XORed in explicitly.
-$code.=<<___;
-.type	_x86_64_AES_encrypt,\@abi-omnipotent
-.align	16
-_x86_64_AES_encrypt:
-	xor	0($key),$s0			# xor with key
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-
-	mov	240($key),$rnds			# load key->rounds
-	sub	\$1,$rnds
-	jmp	.Lenc_loop
-.align	16
-.Lenc_loop:
-___
-	if ($verticalspin) { &encvert(); }
-	else {	&encstep(0,$s0,$s1,$s2,$s3);
-		&encstep(1,$s1,$s2,$s3,$s0);
-		&encstep(2,$s2,$s3,$s0,$s1);
-		&encstep(3,$s3,$s0,$s1,$s2);
-	}
-$code.=<<___;
-	sub	\$1,$rnds
-	jnz	.Lenc_loop
-___
-	if ($verticalspin) { &enclastvert(); }
-	else {	&enclast(0,$s0,$s1,$s2,$s3);
-		&enclast(1,$s1,$s2,$s3,$s0);
-		&enclast(2,$s2,$s3,$s0,$s1);
-		&enclast(3,$s3,$s0,$s1,$s2);
-		$code.=<<___;
-		xor	16+0($key),$s0		# xor with key
-		xor	16+4($key),$s1
-		xor	16+8($key),$s2
-		xor	16+12($key),$s3
-___
-	}
-$code.=<<___;
-	.byte	0xf3,0xc3			# rep ret
-.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
-___
-
-# it's possible to implement this by shifting tN by 8, filling least
-# significant byte with byte load and finally bswap-ing at the end,
-# but such partial register load kills Core 2...
-# Emit one round of byte substitution for the compact code path: every
-# state byte is looked up in the byte table at $sbox (scale 1), and the
-# results are shifted into their lanes and XORed together, leaving the
-# new state in $s0-$s3. Uses %r8d, %r9d and %r13d as extra temporaries
-# besides $t0-$t2 and $acc0-$acc2.
-sub enccompactvert()
-{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$t0
-	movzb	`&lo("$s1")`,$t1
-	movzb	`&lo("$s2")`,$t2
-	movzb	`&lo("$s3")`,$t3
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	shr	\$16,$s2
-	movzb	`&hi("$s3")`,$acc2
-	movzb	($sbox,$t0,1),$t0
-	movzb	($sbox,$t1,1),$t1
-	movzb	($sbox,$t2,1),$t2
-	movzb	($sbox,$t3,1),$t3
-
-	movzb	($sbox,$acc0,1),$t4	#$t0
-	movzb	`&hi("$s0")`,$acc0
-	movzb	($sbox,$acc1,1),$t5	#$t1
-	movzb	`&lo("$s2")`,$acc1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-
-	shl	\$8,$t4
-	shr	\$16,$s3
-	shl	\$8,$t5
-	xor	$t4,$t0
-	shr	\$16,$s0
-	movzb	`&lo("$s3")`,$t4
-	shr	\$16,$s1
-	xor	$t5,$t1
-	shl	\$8,$acc2
-	movzb	`&lo("$s0")`,$t5
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	xor	$acc2,$t2
-
-	shl	\$8,$acc0
-	movzb	`&lo("$s1")`,$acc2
-	shl	\$16,$acc1
-	xor	$acc0,$t3
-	movzb	($sbox,$t4,1),$t4	#$t1
-	movzb	`&hi("$s3")`,$acc0
-	movzb	($sbox,$t5,1),$t5	#$t2
-	xor	$acc1,$t0
-
-	shr	\$8,$s2
-	movzb	`&hi("$s0")`,$acc1
-	shl	\$16,$t4
-	shr	\$8,$s1
-	shl	\$16,$t5
-	xor	$t4,$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t3
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$acc1	#$t1
-	movzb	($sbox,$s2,1),$s3	#$t3
-	movzb	($sbox,$s1,1),$s2	#$t2
-
-	shl	\$16,$acc2
-	xor	$t5,$t2
-	shl	\$24,$acc0
-	xor	$acc2,$t3
-	shl	\$24,$acc1
-	xor	$acc0,$t0
-	shl	\$24,$s3
-	xor	$acc1,$t1
-	shl	\$24,$s2
-	mov	$t0,$s0
-	mov	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# Emit the inter-round transform for a single state register $sn, one
-# word at a time. $r2 receives every byte of $sn shifted left by one
-# bit, with 0x1b XORed into the lanes whose top bit was set; the result
-# is $sn combined with $r2 and with rotations of both (by 24, 16 and
-# 8 bits). Uses %r8d, %r9d and %r13d as temporaries.
-# NOTE(review): not called anywhere in the visible part of the file;
-# the name suggests it is the reference form of enctransform().
-sub enctransform_ref()
-{ my $sn = shift;
-  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	mov	$sn,$acc
-	and	\$0x80808080,$acc
-	mov	$acc,$tmp
-	shr	\$7,$tmp
-	lea	($sn,$sn),$r2
-	sub	$tmp,$acc
-	and	\$0xfefefefe,$r2
-	and	\$0x1b1b1b1b,$acc
-	mov	$sn,$tmp
-	xor	$acc,$r2
-
-	xor	$r2,$sn
-	rol	\$24,$sn
-	xor	$r2,$sn
-	ror	\$16,$tmp
-	xor	$tmp,$sn
-	ror	\$8,$tmp
-	xor	$tmp,$sn
-___
-}
-
-# unlike decrypt case it does not pay off to parallelize enctransform
-# Emit the inter-round transform for all four state words, handled as
-# two interleaved pairs ($s0/$s1 first, then $s2/$s3). For each word the
-# per-byte doubled value (0x1b XORed into lanes whose top bit was set)
-# is combined with the word and with rotations by 24, 16 and 8 bits.
-# Uses $t0-$t2, $acc0-$acc2, %r8d and %r9d as temporaries. The trailing
-# loads from 0, 64, 128 and 192($sbox) only prefetch the table; the
-# values they load are not used.
-sub enctransform()
-{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
-
-$code.=<<___;
-	mov	\$0x80808080,$t0
-	mov	\$0x80808080,$t1
-	and	$s0,$t0
-	and	$s1,$t1
-	mov	$t0,$acc0
-	mov	$t1,$acc1
-	shr	\$7,$t0
-	lea	($s0,$s0),$r20
-	shr	\$7,$t1
-	lea	($s1,$s1),$r21
-	sub	$t0,$acc0
-	sub	$t1,$acc1
-	and	\$0xfefefefe,$r20
-	and	\$0xfefefefe,$r21
-	and	\$0x1b1b1b1b,$acc0
-	and	\$0x1b1b1b1b,$acc1
-	mov	$s0,$t0
-	mov	$s1,$t1
-	xor	$acc0,$r20
-	xor	$acc1,$r21
-
-	xor	$r20,$s0
-	xor	$r21,$s1
-	 mov	\$0x80808080,$t2
-	rol	\$24,$s0
-	 mov	\$0x80808080,$t3
-	rol	\$24,$s1
-	 and	$s2,$t2
-	 and	$s3,$t3
-	xor	$r20,$s0
-	xor	$r21,$s1
-	 mov	$t2,$acc0
-	ror	\$16,$t0
-	 mov	$t3,$acc1
-	ror	\$16,$t1
-	 lea	($s2,$s2),$r20
-	 shr	\$7,$t2
-	xor	$t0,$s0
-	 shr	\$7,$t3
-	xor	$t1,$s1
-	ror	\$8,$t0
-	 lea	($s3,$s3),$r21
-	ror	\$8,$t1
-	 sub	$t2,$acc0
-	 sub	$t3,$acc1
-	xor	$t0,$s0
-	xor	$t1,$s1
-
-	and	\$0xfefefefe,$r20
-	and	\$0xfefefefe,$r21
-	and	\$0x1b1b1b1b,$acc0
-	and	\$0x1b1b1b1b,$acc1
-	mov	$s2,$t2
-	mov	$s3,$t3
-	xor	$acc0,$r20
-	xor	$acc1,$r21
-
-	ror	\$16,$t2
-	xor	$r20,$s2
-	ror	\$16,$t3
-	xor	$r21,$s3
-	rol	\$24,$s2
-	mov	0($sbox),$acc0			# prefetch Te4
-	rol	\$24,$s3
-	xor	$r20,$s2
-	mov	64($sbox),$acc1
-	xor	$r21,$s3
-	mov	128($sbox),$r20
-	xor	$t2,$s2
-	ror	\$8,$t2
-	xor	$t3,$s3
-	ror	\$8,$t3
-	xor	$t2,$s2
-	mov	192($sbox),$r21
-	xor	$t3,$s3
-___
-}
-
-# Generate _x86_64_AES_encrypt_compact. The table at $sbox is first
-# touched at 32-byte intervals to prefetch it. Each loop iteration XORs
-# in the round key, advances $key by 16 and substitutes the state bytes;
-# the loop exits once $key equals the value at 16(%rsp) (presumably the
-# end-of-key-schedule pointer the caller stored at 8(%rsp) before the
-# call), otherwise enctransform() is applied and the loop repeats. The
-# last round key is XORed in at .Lenc_compact_done.
-$code.=<<___;
-.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
-.align	16
-_x86_64_AES_encrypt_compact:
-.cfi_startproc
-	lea	128($sbox),$inp			# size optimization
-	mov	0-128($inp),$acc1		# prefetch Te4
-	mov	32-128($inp),$acc2
-	mov	64-128($inp),$t0
-	mov	96-128($inp),$t1
-	mov	128-128($inp),$acc1
-	mov	160-128($inp),$acc2
-	mov	192-128($inp),$t0
-	mov	224-128($inp),$t1
-	jmp	.Lenc_loop_compact
-.align	16
-.Lenc_loop_compact:
-		xor	0($key),$s0		# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-		lea	16($key),$key
-___
-		&enccompactvert();
-$code.=<<___;
-		cmp	16(%rsp),$key
-		je	.Lenc_compact_done
-___
-		&enctransform();
-$code.=<<___;
-	jmp	.Lenc_loop_compact
-.align	16
-.Lenc_compact_done:
-	xor	0($key),$s0
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-	.byte	0xf3,0xc3			# rep ret
-.cfi_endproc
-.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
-___
-
-# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
-# Generate the public entry point (also exported under the hidden alias
-# asm_AES_encrypt). It saves %rbx, %rbp and %r12-%r15, carves out a
-# stack frame positioned relative to the key schedule address, and
-# stores in it the key pointer, the end of the key schedule
-# (key + 16*rounds), the output pointer and the original %rsp. After
-# selecting one of the table copies located from .LAES_Te+2048 it calls
-# _x86_64_AES_encrypt_compact, writes the four state words to the
-# output buffer and restores the saved registers and stack pointer.
-$code.=<<___;
-.globl	AES_encrypt
-.type	AES_encrypt,\@function,3
-.align	16
-.globl	asm_AES_encrypt
-.hidden	asm_AES_encrypt
-asm_AES_encrypt:
-AES_encrypt:
-.cfi_startproc
-	mov	%rsp,%rax
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-
-	# allocate frame "above" key schedule
-	lea	-63(%rdx),%rcx	# %rdx is key argument
-	and	\$-64,%rsp
-	sub	%rsp,%rcx
-	neg	%rcx
-	and	\$0x3c0,%rcx
-	sub	%rcx,%rsp
-	sub	\$32,%rsp
-
-	mov	%rsi,16(%rsp)	# save out
-	mov	%rax,24(%rsp)	# save original stack pointer
-.cfi_cfa_expression	%rsp+24,deref,+8
-.Lenc_prologue:
-
-	mov	%rdx,$key
-	mov	240($key),$rnds	# load rounds
-
-	mov	0(%rdi),$s0	# load input vector
-	mov	4(%rdi),$s1
-	mov	8(%rdi),$s2
-	mov	12(%rdi),$s3
-
-	shl	\$4,$rnds
-	lea	($key,$rnds),%rbp
-	mov	$key,(%rsp)	# key schedule
-	mov	%rbp,8(%rsp)	# end of key schedule
-
-	# pick Te4 copy which can't "overlap" with stack frame or key schedule
-	lea	.LAES_Te+2048(%rip),$sbox
-	lea	768(%rsp),%rbp
-	sub	$sbox,%rbp
-	and	\$0x300,%rbp
-	lea	($sbox,%rbp),$sbox
-
-	call	_x86_64_AES_encrypt_compact
-
-	mov	16(%rsp),$out	# restore out
-	mov	24(%rsp),%rsi	# restore saved stack pointer
-.cfi_def_cfa	%rsi,8
-	mov	$s0,0($out)	# write output vector
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-	mov	-48(%rsi),%r15
-.cfi_restore	%r15
-	mov	-40(%rsi),%r14
-.cfi_restore	%r14
-	mov	-32(%rsi),%r13
-.cfi_restore	%r13
-	mov	-24(%rsi),%r12
-.cfi_restore	%r12
-	mov	-16(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	(%rsi),%rsp
-.cfi_def_cfa_register	%rsp
-.Lenc_epilogue:
-	ret
-.cfi_endproc
-.size	AES_encrypt,.-AES_encrypt
-___
-
-#------------------------------------------------------------------#
-
-# Emit one "vertical" decryption round: the lookups for all four state
-# words are interleaved. Each state byte indexes the 8-byte entries at
-# $sbox, read at byte offsets 0-3; the results accumulate in $t0-$t2
-# and %r8d. $key is advanced by 16 and the new state is the round key
-# words at 0..12($key) XORed with the accumulators. %r8d serves as the
-# fourth temporary, so $inp is destroyed.
-sub decvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	# favor 3-way issue Opteron pipeline...
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	mov	0($sbox,$acc0,8),$t0
-	mov	0($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	movzb	`&lo("$s3")`,$acc2
-	xor	3($sbox,$acc0,8),$t0
-	xor	3($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s1")`,$acc0
-	shr	\$16,$s0
-	movzb	`&hi("$s2")`,$acc2
-	xor	3($sbox,$acc0,8),$t2
-	shr	\$16,$s3
-	xor	3($sbox,$acc2,8),$t3
-
-	shr	\$16,$s1
-	lea	16($key),$key
-	shr	\$16,$s2
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	xor	2($sbox,$acc0,8),$t0
-	xor	2($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	movzb	`&lo("$s1")`,$acc2
-	xor	1($sbox,$acc0,8),$t0
-	xor	1($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s3")`,$acc0
-	mov	12($key),$s3
-	movzb	`&hi("$s0")`,$acc2
-	xor	1($sbox,$acc0,8),$t2
-	mov	0($key),$s0
-	xor	1($sbox,$acc2,8),$t3
-
-	xor	$t0,$s0
-	mov	4($key),$s1
-	mov	8($key),$s2
-	xor	$t2,$s2
-	xor	$t1,$s1
-	xor	$t3,$s3
-___
-}
-
-# Emit the final decryption round in "vertical" form. $sbox is moved
-# forward by 2048 for the duration of the round so that the byte table
-# located there can be addressed with scale 1, and is restored before
-# the end. The substituted bytes are shifted into their lanes, XORed
-# together and then XORed with the round key at 16($key); $key is not
-# advanced. %r8d serves as a temporary, so $inp is destroyed.
-sub declastvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	lea	2048($sbox),$sbox	# size optimization
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	movzb	($sbox,$acc0,1),$t0
-	movzb	($sbox,$acc1,1),$t1
-	movzb	($sbox,$acc2,1),$t2
-
-	movzb	`&lo("$s3")`,$acc0
-	movzb	`&hi("$s3")`,$acc1
-	movzb	`&hi("$s0")`,$acc2
-	movzb	($sbox,$acc0,1),$t3
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	movzb	($sbox,$acc2,1),$acc2	#$t1
-
-	shl	\$8,$acc1
-	shl	\$8,$acc2
-
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-	shr	\$16,$s3
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	shr	\$16,$s0
-	movzb	($sbox,$acc0,1),$acc0	#$t2
-	movzb	($sbox,$acc1,1),$acc1	#$t3
-
-	shl	\$8,$acc0
-	shl	\$8,$acc1
-	shr	\$16,$s1
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-	shr	\$16,$s2
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$acc1	#$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-
-	shl	\$16,$acc0
-	shl	\$16,$acc1
-	shl	\$16,$acc2
-
-	xor	$acc0,$t0
-	xor	$acc1,$t1
-	xor	$acc2,$t2
-
-	movzb	`&lo("$s1")`,$acc0
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	movzb	($sbox,$acc2,1),$acc2	#$t1
-
-	shl	\$16,$acc0
-	shl	\$24,$acc1
-	shl	\$24,$acc2
-
-	xor	$acc0,$t3
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	mov	16+12($key),$s3
-	movzb	($sbox,$acc0,1),$acc0	#$t2
-	movzb	($sbox,$acc1,1),$acc1	#$t3
-	mov	16+0($key),$s0
-
-	shl	\$24,$acc0
-	shl	\$24,$acc1
-
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-
-	mov	16+4($key),$s1
-	mov	16+8($key),$s2
-	lea	-2048($sbox),$sbox
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# Emit the code computing output word $i (0..3) of a decryption round.
-# @s is the state rotated so that $s[0] supplies the low byte, $s[1]
-# the second byte, $s[2] the third byte and $s[3] the top byte.
-# Words 0-2 are produced in $t0-$t2 with $acc0-$acc2 as scratch. For
-# $i==3 the result is built in $s[0], the other state registers serve
-# as scratch, and $t2, $t1, $t0 are finally copied into $s[1], $s[2],
-# $s[3]. No round key is applied and $key is not advanced here; the
-# caller emits that separately.
-sub decstep()
-{ my ($i,@s) = @_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	$code.="	mov	$s[0],$out\n"		if ($i!=3);
-			$tmp1=$s[2]			if ($i==3);
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	and	\$0xFF,$out\n";
-
-	$code.="	mov	0($sbox,$out,8),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-			$tmp2=$s[3]			if ($i==3);
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-			$tmp0=$s[1]			if ($i==3);
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	and	\$0xFF,$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	xor	3($sbox,$tmp0,8),$out\n";
-	$code.="	xor	2($sbox,$tmp1,8),$out\n";
-	$code.="	xor	1($sbox,$tmp2,8),$out\n";
-
-	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-# Emit the code computing output word $i (0..3) of the final decryption
-# round. Each of the four selected state bytes is substituted through
-# the byte table at 2048($sbox), shifted into its lane (0, 8, 16 or
-# 24 bits) and XORed into the result. No round key is applied here.
-# As in decstep(), words 0-2 land in $t0-$t2; for $i==3 the result is
-# built in $s[0] and $t2, $t1, $t0 are then copied into $s[1..3].
-sub declast()
-{ my ($i,@s)=@_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	$code.="	mov	$s[0],$out\n"		if ($i!=3);
-			$tmp1=$s[2]			if ($i==3);
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	and	\$0xFF,$out\n";
-
-	$code.="	movzb	2048($sbox,$out,1),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-			$tmp2=$s[3]			if ($i==3);
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-			$tmp0=$s[1]			if ($i==3);
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	and	\$0xFF,$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	movzb	2048($sbox,$tmp0,1),$tmp0\n";
-	$code.="	movzb	2048($sbox,$tmp1,1),$tmp1\n";
-	$code.="	movzb	2048($sbox,$tmp2,1),$tmp2\n";
-
-	$code.="	shl	\$8,$tmp0\n";
-	$code.="	shl	\$16,$tmp1\n";
-	$code.="	shl	\$24,$tmp2\n";
-
-	$code.="	xor	$tmp0,$out\n";
-	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
-	$code.="	xor	$tmp1,$out\n";
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	xor	$tmp2,$out\n";
-	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-# Generate _x86_64_AES_decrypt: XOR the state with the first round key,
-# load the round count from 240($key), run rounds-1 iterations of
-# .Ldec_loop and finish with the last-round code. $verticalspin selects
-# the "vertical" generators; otherwise the per-word decstep()/declast()
-# generators are used, and the key advance and round-key XOR are
-# emitted explicitly after them.
-$code.=<<___;
-.type	_x86_64_AES_decrypt,\@abi-omnipotent
-.align	16
-_x86_64_AES_decrypt:
-	xor	0($key),$s0			# xor with key
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-
-	mov	240($key),$rnds			# load key->rounds
-	sub	\$1,$rnds
-	jmp	.Ldec_loop
-.align	16
-.Ldec_loop:
-___
-	if ($verticalspin) { &decvert(); }
-	else {	&decstep(0,$s0,$s3,$s2,$s1);
-		&decstep(1,$s1,$s0,$s3,$s2);
-		&decstep(2,$s2,$s1,$s0,$s3);
-		&decstep(3,$s3,$s2,$s1,$s0);
-		$code.=<<___;
-		lea	16($key),$key
-		xor	0($key),$s0			# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-___
-	}
-$code.=<<___;
-	sub	\$1,$rnds
-	jnz	.Ldec_loop
-___
-	if ($verticalspin) { &declastvert(); }
-	else {	&declast(0,$s0,$s3,$s2,$s1);
-		&declast(1,$s1,$s0,$s3,$s2);
-		&declast(2,$s2,$s1,$s0,$s3);
-		&declast(3,$s3,$s2,$s1,$s0);
-		$code.=<<___;
-		xor	16+0($key),$s0			# xor with key
-		xor	16+4($key),$s1
-		xor	16+8($key),$s2
-		xor	16+12($key),$s3
-___
-	}
-$code.=<<___;
-	.byte	0xf3,0xc3			# rep ret
-.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
-___
-
-# Emit one round of byte substitution for the compact decryption path:
-# every state byte is looked up in the byte table at $sbox (scale 1),
-# and the results are shifted into their lanes and XORed together,
-# leaving the new state in $s0-$s3. Uses %r8d, %r9d and %r13d as extra
-# temporaries besides $t0-$t2 and $acc0-$acc2.
-sub deccompactvert()
-{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$t0
-	movzb	`&lo("$s1")`,$t1
-	movzb	`&lo("$s2")`,$t2
-	movzb	`&lo("$s3")`,$t3
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	shr	\$16,$s3
-	movzb	`&hi("$s1")`,$acc2
-	movzb	($sbox,$t0,1),$t0
-	movzb	($sbox,$t1,1),$t1
-	movzb	($sbox,$t2,1),$t2
-	movzb	($sbox,$t3,1),$t3
-
-	movzb	($sbox,$acc0,1),$t4	#$t0
-	movzb	`&hi("$s2")`,$acc0
-	movzb	($sbox,$acc1,1),$t5	#$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-
-	shr	\$16,$s2
-	shl	\$8,$t5
-	shl	\$8,$t4
-	movzb	`&lo("$s2")`,$acc1
-	shr	\$16,$s0
-	xor	$t4,$t0
-	shr	\$16,$s1
-	movzb	`&lo("$s3")`,$t4
-
-	shl	\$8,$acc2
-	xor	$t5,$t1
-	shl	\$8,$acc0
-	movzb	`&lo("$s0")`,$t5
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	xor	$acc2,$t2
-	movzb	`&lo("$s1")`,$acc2
-
-	shl	\$16,$acc1
-	xor	$acc0,$t3
-	movzb	($sbox,$t4,1),$t4	#$t1
-	movzb	`&hi("$s1")`,$acc0
-	movzb	($sbox,$acc2,1),$acc2	#$t3
-	xor	$acc1,$t0
-	movzb	($sbox,$t5,1),$t5	#$t2
-	movzb	`&hi("$s2")`,$acc1
-
-	shl	\$16,$acc2
-	shl	\$16,$t4
-	shl	\$16,$t5
-	xor	$acc2,$t3
-	movzb	`&hi("$s3")`,$acc2
-	xor	$t4,$t1
-	shr	\$8,$s0
-	xor	$t5,$t2
-
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$s1	#$t1
-	movzb	($sbox,$acc2,1),$s2	#$t2
-	movzb	($sbox,$s0,1),$s3	#$t3
-
-	mov	$t0,$s0
-	shl	\$24,$acc0
-	shl	\$24,$s1
-	shl	\$24,$s2
-	xor	$acc0,$s0
-	shl	\$24,$s3
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# parallelized version! input is pair of 64-bit values: %rax=s1.s0
-# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
-# %ecx=s2 and %edx=s3.
-# Emit the inter-round transform for decryption, working on two state
-# words per 64-bit register. The global $mask80, $maskfe and $mask1b
-# registers must hold the masks on entry. tp2, tp4 and tp8 are derived
-# from tp1 by three successive per-byte doublings (shift left by one
-# with $mask1b XORed into lanes whose top bit was set); they are then
-# combined with 8-, 24- and 16-bit rotations applied to each 32-bit
-# half, and the four result words end up in %eax, %ebx, %ecx and %edx.
-# Note that the local $acc0 ("%rbx") shadows the global $acc0 here.
-# Argument: $prefetch - when true, loads from 0, 64, 128, 192 and
-# 256($sbox) are interleaved into the tail; they overwrite the three
-# mask registers and the %r10/%r13 temporaries.
-sub dectransform()
-{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
-  my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
-  my $prefetch = shift;
-
-$code.=<<___;
-	mov	$mask80,$tp40
-	mov	$mask80,$tp48
-	and	$tp10,$tp40
-	and	$tp18,$tp48
-	mov	$tp40,$acc0
-	mov	$tp48,$acc8
-	shr	\$7,$tp40
-	lea	($tp10,$tp10),$tp20
-	shr	\$7,$tp48
-	lea	($tp18,$tp18),$tp28
-	sub	$tp40,$acc0
-	sub	$tp48,$acc8
-	and	$maskfe,$tp20
-	and	$maskfe,$tp28
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp20
-	xor	$acc8,$tp28
-	mov	$mask80,$tp80
-	mov	$mask80,$tp88
-
-	and	$tp20,$tp80
-	and	$tp28,$tp88
-	mov	$tp80,$acc0
-	mov	$tp88,$acc8
-	shr	\$7,$tp80
-	lea	($tp20,$tp20),$tp40
-	shr	\$7,$tp88
-	lea	($tp28,$tp28),$tp48
-	sub	$tp80,$acc0
-	sub	$tp88,$acc8
-	and	$maskfe,$tp40
-	and	$maskfe,$tp48
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp40
-	xor	$acc8,$tp48
-	mov	$mask80,$tp80
-	mov	$mask80,$tp88
-
-	and	$tp40,$tp80
-	and	$tp48,$tp88
-	mov	$tp80,$acc0
-	mov	$tp88,$acc8
-	shr	\$7,$tp80
-	 xor	$tp10,$tp20		# tp2^=tp1
-	shr	\$7,$tp88
-	 xor	$tp18,$tp28		# tp2^=tp1
-	sub	$tp80,$acc0
-	sub	$tp88,$acc8
-	lea	($tp40,$tp40),$tp80
-	lea	($tp48,$tp48),$tp88
-	 xor	$tp10,$tp40		# tp4^=tp1
-	 xor	$tp18,$tp48		# tp4^=tp1
-	and	$maskfe,$tp80
-	and	$maskfe,$tp88
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp80
-	xor	$acc8,$tp88
-
-	xor	$tp80,$tp10		# tp1^=tp8
-	xor	$tp88,$tp18		# tp1^=tp8
-	xor	$tp80,$tp20		# tp2^tp1^=tp8
-	xor	$tp88,$tp28		# tp2^tp1^=tp8
-	mov	$tp10,$acc0
-	mov	$tp18,$acc8
-	xor	$tp80,$tp40		# tp4^tp1^=tp8
-	shr	\$32,$acc0
-	xor	$tp88,$tp48		# tp4^tp1^=tp8
-	shr	\$32,$acc8
-	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
-	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
-	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
-	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
-	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
-	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
-	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
-
-	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
-	xor	`&LO("$tp80")`,`&LO("$tp10")`
-	shr	\$32,$tp80
-	xor	`&LO("$tp88")`,`&LO("$tp18")`
-	shr	\$32,$tp88
-	xor	`&LO("$tp80")`,`&LO("$acc0")`
-	xor	`&LO("$tp88")`,`&LO("$acc8")`
-
-	mov	$tp20,$tp80
-	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp28,$tp88
-	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
-	shr	\$32,$tp80
-	xor	`&LO("$tp20")`,`&LO("$tp10")`
-	shr	\$32,$tp88
-	xor	`&LO("$tp28")`,`&LO("$tp18")`
-	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp40,$tp20
-	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp48,$tp28
-	shr	\$32,$tp20
-	xor	`&LO("$tp80")`,`&LO("$acc0")`
-	shr	\$32,$tp28
-	xor	`&LO("$tp88")`,`&LO("$acc8")`
-
-	`"mov	0($sbox),$mask80"	if ($prefetch)`
-	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	64($sbox),$maskfe"	if ($prefetch)`
-	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	128($sbox),$mask1b"	if ($prefetch)`
-	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	192($sbox),$tp80"	if ($prefetch)`
-	xor	`&LO("$tp40")`,`&LO("$tp10")`
-	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
-	xor	`&LO("$tp48")`,`&LO("$tp18")`
-	`"mov	256($sbox),$tp88"	if ($prefetch)`
-	xor	`&LO("$tp20")`,`&LO("$acc0")`
-	xor	`&LO("$tp28")`,`&LO("$acc8")`
-___
-}
-
-# Generate _x86_64_AES_decrypt_compact. The table at $sbox is first
-# touched at 32-byte intervals to prefetch it. Each loop iteration XORs
-# in the round key, advances $key by 16 and substitutes the state bytes;
-# the loop exits once $key equals the value at 16(%rsp) (presumably the
-# end-of-key-schedule pointer the caller stored at 8(%rsp) before the
-# call). Otherwise the three masks are loaded from 256+0, 256+8 and
-# 256+16($sbox), the state is packed into %rax (s1:s0) and %rcx (s3:s2),
-# and dectransform() is applied with prefetching enabled. The last
-# round key is XORed in at .Ldec_compact_done.
-$code.=<<___;
-.type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
-.align	16
-_x86_64_AES_decrypt_compact:
-.cfi_startproc
-	lea	128($sbox),$inp			# size optimization
-	mov	0-128($inp),$acc1		# prefetch Td4
-	mov	32-128($inp),$acc2
-	mov	64-128($inp),$t0
-	mov	96-128($inp),$t1
-	mov	128-128($inp),$acc1
-	mov	160-128($inp),$acc2
-	mov	192-128($inp),$t0
-	mov	224-128($inp),$t1
-	jmp	.Ldec_loop_compact
-
-.align	16
-.Ldec_loop_compact:
-		xor	0($key),$s0		# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-		lea	16($key),$key
-___
-		&deccompactvert();
-$code.=<<___;
-		cmp	16(%rsp),$key
-		je	.Ldec_compact_done
-
-		mov	256+0($sbox),$mask80
-		shl	\$32,%rbx
-		shl	\$32,%rdx
-		mov	256+8($sbox),$maskfe
-		or	%rbx,%rax
-		or	%rdx,%rcx
-		mov	256+16($sbox),$mask1b
-___
-		&dectransform(1);
-$code.=<<___;
-	jmp	.Ldec_loop_compact
-.align	16
-.Ldec_compact_done:
-	xor	0($key),$s0
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-	.byte	0xf3,0xc3			# rep ret
-.cfi_endproc
-.size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
-___
-
-# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
-$code.=<<___;
-.globl	AES_decrypt
-.type	AES_decrypt,\@function,3
-.align	16
-.globl	asm_AES_decrypt
-.hidden	asm_AES_decrypt
-asm_AES_decrypt:
-AES_decrypt:
-.cfi_startproc
-	mov	%rsp,%rax
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-
-	# allocate frame "above" key schedule
-	lea	-63(%rdx),%rcx	# %rdx is key argument
-	and	\$-64,%rsp
-	sub	%rsp,%rcx
-	neg	%rcx
-	and	\$0x3c0,%rcx
-	sub	%rcx,%rsp
-	sub	\$32,%rsp
-
-	mov	%rsi,16(%rsp)	# save out
-	mov	%rax,24(%rsp)	# save original stack pointer
-.cfi_cfa_expression	%rsp+24,deref,+8
-.Ldec_prologue:
-
-	mov	%rdx,$key
-	mov	240($key),$rnds	# load rounds
-
-	mov	0(%rdi),$s0	# load input vector
-	mov	4(%rdi),$s1
-	mov	8(%rdi),$s2
-	mov	12(%rdi),$s3
-
-	shl	\$4,$rnds
-	lea	($key,$rnds),%rbp
-	mov	$key,(%rsp)	# key schedule
-	mov	%rbp,8(%rsp)	# end of key schedule
-
-	# pick Td4 copy which can't "overlap" with stack frame or key schedule
-	lea	.LAES_Td+2048(%rip),$sbox
-	lea	768(%rsp),%rbp
-	sub	$sbox,%rbp
-	and	\$0x300,%rbp
-	lea	($sbox,%rbp),$sbox
-	shr	\$3,%rbp	# recall "magic" constants!
-	add	%rbp,$sbox
-
-	call	_x86_64_AES_decrypt_compact
-
-	mov	16(%rsp),$out	# restore out
-	mov	24(%rsp),%rsi	# restore saved stack pointer
-.cfi_def_cfa	%rsi,8
-	mov	$s0,0($out)	# write output vector
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-	mov	-48(%rsi),%r15
-.cfi_restore	%r15
-	mov	-40(%rsi),%r14
-.cfi_restore	%r14
-	mov	-32(%rsi),%r13
-.cfi_restore	%r13
-	mov	-24(%rsi),%r12
-.cfi_restore	%r12
-	mov	-16(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	(%rsi),%rsp