summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--crypto/aes/asm/aesni-sha1-x86_64.pl33
-rw-r--r--crypto/aes/asm/aesni-sha256-x86_64.pl64
-rw-r--r--crypto/sha/asm/sha1-586.pl12
-rwxr-xr-xcrypto/sha/asm/sha1-x86_64.pl12
-rw-r--r--crypto/sha/asm/sha256-586.pl8
-rwxr-xr-xcrypto/sha/asm/sha512-x86_64.pl11
6 files changed, 109 insertions, 31 deletions
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
index ff0b068229..4899421f30 100644
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -95,6 +95,8 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10);
+$shaext=1; ### set to zero if compiling for 1.0.1
+
$stitched_decrypt=0;
open OUT,"| \"$^X\" $xlate $flavour $output";
@@ -119,6 +121,8 @@ aesni_cbc_sha1_enc:
# caller should check for SSSE3 and AES-NI bits
mov OPENSSL_ia32cap_P+0(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r11
+___
+$code.=<<___ if ($shaext);
bt \$61,%r11 # check SHA bit
jc aesni_cbc_sha1_enc_shaext
___
@@ -1657,7 +1661,7 @@ K_XX_XX:
.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
- {{{
+ if ($shaext) {{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$rounds="%r11d";
@@ -1676,7 +1680,7 @@ aesni_cbc_sha1_enc_shaext:
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
___
$code.=<<___ if ($win64);
- lea `-8-4*16`(%rsp),%rsp
+ lea `-8-10*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
@@ -1867,7 +1871,21 @@ ssse3_handler:
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
+___
+$code.=<<___ if ($shaext);
+ lea aesni_cbc_sha1_enc_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lseh_no_shaext
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+.Lseh_no_shaext:
+___
+$code.=<<___;
lea 96(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
@@ -1939,6 +1957,11 @@ $code.=<<___ if ($avx);
.rva .LSEH_end_aesni_cbc_sha1_enc_avx
.rva .LSEH_info_aesni_cbc_sha1_enc_avx
___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_end_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_info_aesni_cbc_sha1_enc_shaext
+___
$code.=<<___;
.section .xdata
.align 8
@@ -1953,6 +1976,12 @@ $code.=<<___ if ($avx);
.rva ssse3_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
+$code.=<<___ if ($shaext);
+.LSEH_info_aesni_cbc_sha1_enc_shaext:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
}
####################################################################
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index b6ad7b29ed..63e2747350 100644
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -59,6 +59,9 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
+$shaext=1; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
@@ -113,10 +116,12 @@ $code.=<<___ if ($avx);
je .Lprobe
mov 0(%r11),%eax
mov 4(%r11),%r10
-
+___
+$code.=<<___ if ($shaext);
bt \$61,%r10 # check for SHA
jc ${func}_shaext
-
+___
+$code.=<<___;
mov %r10,%r11
shr \$32,%r11
@@ -1259,16 +1264,17 @@ ___
$r++; unshift(@rndkey,pop(@rndkey));
};
+if ($shaext) {
+my $Tbl="%rax";
+
$code.=<<___;
.type ${func}_shaext,\@function,6
.align 32
${func}_shaext:
- mov %rsp,%rax
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
- push %rbx
___
$code.=<<___ if ($win64);
- lea `-4*16`(%rsp),%rsp
+ lea `-8-10*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
@@ -1465,24 +1471,24 @@ $code.=<<___;
movdqu $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -8-10*16(%rax),%xmm6
- movaps -8-9*16(%rax),%xmm7
- movaps -8-8*16(%rax),%xmm8
- movaps -8-7*16(%rax),%xmm9
- movaps -8-6*16(%rax),%xmm10
- movaps -8-5*16(%rax),%xmm11
- movaps -8-4*16(%rax),%xmm12
- movaps -8-3*16(%rax),%xmm13
- movaps -8-2*16(%rax),%xmm14
- movaps -8-1*16(%rax),%xmm15
+ movaps 0*16(%rsp),%xmm6
+ movaps 1*16(%rsp),%xmm7
+ movaps 2*16(%rsp),%xmm8
+ movaps 3*16(%rsp),%xmm9
+ movaps 4*16(%rsp),%xmm10
+ movaps 5*16(%rsp),%xmm11
+ movaps 6*16(%rsp),%xmm12
+ movaps 7*16(%rsp),%xmm13
+ movaps 8*16(%rsp),%xmm14
+ movaps 9*16(%rsp),%xmm15
+ lea 8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
- mov -8(%rax),%rbx
- mov %rax,%rsp
ret
.size ${func}_shaext,.-${func}_shaext
___
+}
}}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -1527,6 +1533,19 @@ se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
___
+$code.=<<___ if ($shaext);
+ lea aesni_cbc_sha256_enc_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lnot_in_shaext
+
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lin_prologue
+.Lnot_in_shaext:
+___
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
@@ -1613,6 +1632,11 @@ $code.=<<___ if ($avx>1);
.rva .LSEH_end_${func}_avx2
.rva .LSEH_info_${func}_avx2
___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_${func}_shaext
+ .rva .LSEH_end_${func}_shaext
+ .rva .LSEH_info_${func}_shaext
+___
$code.=<<___ if ($avx);
.section .xdata
.align 8
@@ -1632,6 +1656,12 @@ $code.=<<___ if ($avx>1);
.rva se_handler
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
___
+$code.=<<___ if ($shaext);
+.LSEH_info_${func}_shaext:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
}
####################################################################
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl
index acc4f639a7..81252a62e9 100644
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -128,6 +128,8 @@ $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
`ml 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10); # first version supporting AVX
+$shaext=$xmm; ### set to zero if compiling for 1.0.1
+
&external_label("OPENSSL_ia32cap_P") if ($xmm);
@@ -307,7 +309,7 @@ if ($alt) {
&function_begin("sha1_block_data_order");
if ($xmm) {
- &static_label("shaext_shortcut");
+ &static_label("shaext_shortcut") if ($shaext);
&static_label("ssse3_shortcut");
&static_label("avx_shortcut") if ($ymm);
&static_label("K_XX_XX");
@@ -325,8 +327,10 @@ if ($xmm) {
&mov ($C,&DWP(8,$T));
&test ($A,1<<24); # check FXSR bit
&jz (&label("x86"));
- &test ($C,1<<29); # check SHA bit
- &jnz (&label("shaext_shortcut"));
+ if ($shaext) {
+ &test ($C,1<<29); # check SHA bit
+ &jnz (&label("shaext_shortcut"));
+ }
if ($ymm) {
&and ($D,1<<28); # mask AVX bit
&and ($A,1<<30); # mask "Intel CPU" bit
@@ -405,7 +409,7 @@ if ($xmm) {
&function_end("sha1_block_data_order");
if ($xmm) {
-{
+if ($shaext) {
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index ea288c15d5..6dc64a2ead 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -107,6 +107,9 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
+$shaext=1; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
@@ -245,7 +248,8 @@ sha1_block_data_order:
mov OPENSSL_ia32cap_P+8(%rip),%r10d
test \$`1<<9`,%r8d # check SSSE3 bit
jz .Lialu
-
+___
+$code.=<<___ if ($shaext);
test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
@@ -321,7 +325,7 @@ $code.=<<___;
ret
.size sha1_block_data_order,.-sha1_block_data_order
___
-{{{
+if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
@@ -1956,9 +1960,13 @@ ssse3_handler:
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
+___
+$code.=<<___ if ($shaext);
.rva .LSEH_begin_sha1_block_data_order_shaext
.rva .LSEH_end_sha1_block_data_order_shaext
.rva .LSEH_info_sha1_block_data_order_shaext
+___
+$code.=<<___;
.rva .LSEH_begin_sha1_block_data_order_ssse3
.rva .LSEH_end_sha1_block_data_order_ssse3
.rva .LSEH_info_sha1_block_data_order_ssse3
diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl
index 09648a8207..ee094a9214 100644
--- a/crypto/sha/asm/sha256-586.pl
+++ b/crypto/sha/asm/sha256-586.pl
@@ -82,6 +82,8 @@ if ($xmm && !$avx && $ARGV[0] eq "win32" &&
$avx = ($1>=10) + ($1>=11);
}
+$shaext=$xmm; ### set to zero if compiling for 1.0.1
+
$unroll_after = 64*4; # If pre-evicted from L1P cache first spin of
# fully unrolled loop was measured to run about
# 3-4x slower. If slowdown coefficient is N and
@@ -205,8 +207,8 @@ sub BODY_00_15() {
&jz ($unroll_after?&label("no_xmm"):&label("loop"));
&and ("ecx",1<<30); # mask "Intel CPU" bit
&and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits
- &test ("edx",1<<29) if ($xmm); # check for SHA
- &jnz (&label("shaext")) if ($xmm);
+ &test ("edx",1<<29) if ($shaext); # check for SHA
+ &jnz (&label("shaext")) if ($shaext);
&or ("ecx","ebx");
&and ("ecx",1<<28|1<<30);
&cmp ("ecx",1<<28|1<<30);
@@ -505,7 +507,7 @@ my @AH=($A,$K256);
&function_end_A();
}
if (!$i386 && $xmm) {{{
-{
+if ($shaext) {
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index 01698c40cf..0556a8f36a 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -123,6 +123,9 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
+$shaext=1; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
@@ -259,7 +262,7 @@ $code.=<<___ if ($SZ==4 || $avx);
mov 4(%r11),%r10d
mov 8(%r11),%r11d
___
-$code.=<<___ if ($SZ==4);
+$code.=<<___ if ($SZ==4 && $shaext);
test \$`1<<29`,%r11d # check for SHA
jnz _shaext_shortcut
___
@@ -518,7 +521,7 @@ ___
######################################################################
# SIMD code paths
#
-if ($SZ==4) {{{
+if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
@@ -2295,10 +2298,12 @@ shaext_handler:
.rva .LSEH_end_$func
.rva .LSEH_info_$func
___
-$code.=<<___ if ($SZ==4);
+$code.=<<___ if ($SZ==4 && $shaext);
.rva .LSEH_begin_${func}_shaext
.rva .LSEH_end_${func}_shaext
.rva .LSEH_info_${func}_shaext
+___
+$code.=<<___ if ($SZ==4);
.rva .LSEH_begin_${func}_ssse3
.rva .LSEH_end_${func}_ssse3
.rva .LSEH_info_${func}_ssse3