From 624265c60e07f8e5f251d0f5b79e34cf0221af73 Mon Sep 17 00:00:00 2001
From: Rich Salz <rsalz@openssl.org>
Date: Thu, 15 Jun 2017 12:03:40 -0400
Subject: Cleanup some copyright stuff

Remove some incorrect copyright references.
Move copyright notices to the standard place (the file header comment).
Add OpenSSL copyright where missing.
Remove a copyrighted file that we don't use any more
(engines/vendor_defns/hwcryptohook.h).
Remove Itanium assembler for RC4 and MD5 (assembler versions of old and
weak algorithms for an old chip).
Standardize apps/rehash copyright comment; approved by Timo.
Put dual-copyright notice on mkcert.

Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/3691)
---
 Configurations/00-base-templates.conf |    3 +-
 apps/rehash.c                         |    7 +-
 apps/tsget.in                         |    2 +-
 apps/vms_term_sock.c                  |    1 +
 apps/vms_term_sock.h                  |    1 +
 crypto/asn1/x_spki.c                  |    5 -
 crypto/ec/asm/ecp_nistz256-ppc64.pl   |    7 +
 crypto/md5/asm/md5-ia64.S             | 1002 ---------------------------------
 crypto/md5/build.info                 |   12 -
 crypto/mem_sec.c                      |    6 +-
 crypto/rc4/asm/rc4-ia64.pl            |  767 -------------------------
 crypto/rc4/build.info                 |   16 -
 engines/vendor_defns/hwcryptohook.h   |  509 -----------------
 test/certs/mkcert.sh                  |   11 +-
 test/ossl_shim/include/openssl/base.h |   57 +-
 15 files changed, 28 insertions(+), 2378 deletions(-)
 delete mode 100644 crypto/md5/asm/md5-ia64.S
 delete mode 100644 crypto/rc4/asm/rc4-ia64.pl
 delete mode 100644 engines/vendor_defns/hwcryptohook.h

diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf
index 84f71442c1..c0162808b3 100644
--- a/Configurations/00-base-templates.conf
+++ b/Configurations/00-base-templates.conf
@@ -184,9 +184,8 @@
 	cpuid_asm_src   => "ia64cpuid.s",
 	bn_asm_src      => "bn-ia64.s ia64-mont.s",
 	aes_asm_src     => "aes_core.c aes_cbc.c aes-ia64.s",
-	md5_asm_src     => "md5-ia64.s",
 	sha1_asm_src    => "sha1-ia64.s sha256-ia64.s sha512-ia64.s",
-	rc4_asm_src     => "rc4-ia64.s rc4_skey.c",
+	rc4_asm_src     => "rc4_skey.c",
 	modes_asm_src   => "ghash-ia64.s",
 	perlasm_scheme	=> "void"
     },
diff --git a/apps/rehash.c b/apps/rehash.c
index e3c02448fb..ad7108aad9 100644
--- a/apps/rehash.c
+++ b/apps/rehash.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2015-2017 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2013-2014 Timo Teräs <timo.teras@gmail.com>
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -7,12 +8,6 @@
  * https://www.openssl.org/source/license.html
  */
 
-/*
- * C implementation based on the original Perl and shell versions
- *
- * Copyright (c) 2013-2014 Timo TerÃ¤s <timo.teras@iki.fi>
- */
-
 #include "apps.h"
 
 #if defined(OPENSSL_SYS_UNIX) || defined(__APPLE__) || \
diff --git a/apps/tsget.in b/apps/tsget.in
index c6193e57da..71bcc24525 100644
--- a/apps/tsget.in
+++ b/apps/tsget.in
@@ -1,6 +1,6 @@
 #!{- $config{hashbangperl} -}
-# Copyright (c) 2002 The OpenTSA Project. All rights reserved.
 # Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2002 The OpenTSA Project. All rights reserved.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
diff --git a/apps/vms_term_sock.c b/apps/vms_term_sock.c
index a7d87ff361..ff954ed283 100644
--- a/apps/vms_term_sock.c
+++ b/apps/vms_term_sock.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
  * Copyright 2016 VMS Software, Inc. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
diff --git a/apps/vms_term_sock.h b/apps/vms_term_sock.h
index 662fa0adaf..c4d1702d79 100644
--- a/apps/vms_term_sock.h
+++ b/apps/vms_term_sock.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
  * Copyright 2016 VMS Software, Inc. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
diff --git a/crypto/asn1/x_spki.c b/crypto/asn1/x_spki.c
index c45400b42f..0d72a3f3a9 100644
--- a/crypto/asn1/x_spki.c
+++ b/crypto/asn1/x_spki.c
@@ -7,11 +7,6 @@
  * https://www.openssl.org/source/license.html
  */
 
- /*
-  * This module was send to me my Pat Richards <patr@x509.com> who wrote it.
-  * It is under my Copyright with his permission
-  */
-
 #include <stdio.h>
 #include "internal/cryptlib.h"
 #include <openssl/x509.h>
diff --git a/crypto/ec/asm/ecp_nistz256-ppc64.pl b/crypto/ec/asm/ecp_nistz256-ppc64.pl
index 73d0746eb9..70af6b6f5e 100755
--- a/crypto/ec/asm/ecp_nistz256-ppc64.pl
+++ b/crypto/ec/asm/ecp_nistz256-ppc64.pl
@@ -1,4 +1,11 @@
 #! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
diff --git a/crypto/md5/asm/md5-ia64.S b/crypto/md5/asm/md5-ia64.S
deleted file mode 100644
index c20467b47b..0000000000
--- a/crypto/md5/asm/md5-ia64.S
+++ /dev/null
@@ -1,1002 +0,0 @@
-/*
- *
- * Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
- *
- * Licensed under the OpenSSL license (the "License").  You may not use
- * this file except in compliance with the License.  You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- */
-
-/* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
-
-//	Common registers are assigned as follows:
-//
-//	COMMON
-//
-//	t0		Const Tbl Ptr	TPtr
-//	t1		Round Constant	TRound
-//	t4		Block residual	LenResid
-//	t5		Residual Data	DTmp
-//
-//	{in,out}0	Block 0 Cycle	RotateM0
-//	{in,out}1	Block Value 12	M12
-//	{in,out}2	Block Value 8	M8
-//	{in,out}3	Block Value 4	M4
-//	{in,out}4	Block Value 0	M0
-//	{in,out}5	Block 1 Cycle	RotateM1
-//	{in,out}6	Block Value 13	M13
-//	{in,out}7	Block Value 9	M9
-//	{in,out}8	Block Value 5	M5
-//	{in,out}9	Block Value 1	M1
-//	{in,out}10	Block 2 Cycle	RotateM2
-//	{in,out}11	Block Value 14	M14
-//	{in,out}12	Block Value 10	M10
-//	{in,out}13	Block Value 6	M6
-//	{in,out}14	Block Value 2	M2
-//	{in,out}15	Block 3 Cycle	RotateM3
-//	{in,out}16	Block Value 15	M15
-//	{in,out}17	Block Value 11	M11
-//	{in,out}18	Block Value 7	M7
-//	{in,out}19	Block Value 3	M3
-//	{in,out}20	Scratch			Z
-//	{in,out}21	Scratch			Y
-//	{in,out}22	Scratch			X
-//	{in,out}23	Scratch			W
-//	{in,out}24	Digest A		A
-//	{in,out}25	Digest B		B
-//	{in,out}26	Digest C		C
-//	{in,out}27	Digest D		D
-//	{in,out}28	Active Data Ptr	DPtr
-//	in28		Dummy Value		-
-//	out28		Dummy Value		-
-//	bt0			Coroutine Link	QUICK_RTN
-//
-///	These predicates are used for computing the padding block(s) and
-///	are shared between the driver and digest co-routines
-//
-//	pt0			Extra Pad Block	pExtra
-//	pt1			Load next word	pLoad
-//	pt2			Skip next word	pSkip
-//	pt3			Search for Pad	pNoPad
-//	pt4			Pad Word 0		pPad0
-//	pt5			Pad Word 1		pPad1
-//	pt6			Pad Word 2		pPad2
-//	pt7			Pad Word 3		pPad3
-
-#define	DTmp		r19
-#define	LenResid	r18
-#define	QUICK_RTN	b6
-#define	TPtr		r14
-#define	TRound		r15
-#define	pExtra		p6
-#define	pLoad		p7
-#define	pNoPad		p9
-#define	pPad0		p10
-#define	pPad1		p11
-#define	pPad2		p12
-#define	pPad3		p13
-#define	pSkip		p8
-
-#define	A_		out24
-#define	B_		out25
-#define	C_		out26
-#define	D_		out27
-#define	DPtr_		out28
-#define	M0_		out4
-#define	M1_		out9
-#define	M10_		out12
-#define	M11_		out17
-#define	M12_		out1
-#define	M13_		out6
-#define	M14_		out11
-#define	M15_		out16
-#define	M2_		out14
-#define	M3_		out19
-#define	M4_		out3
-#define	M5_		out8
-#define	M6_		out13
-#define	M7_		out18
-#define	M8_		out2
-#define	M9_		out7
-#define	RotateM0_	out0
-#define	RotateM1_	out5
-#define	RotateM2_	out10
-#define	RotateM3_	out15
-#define	W_		out23
-#define	X_		out22
-#define	Y_		out21
-#define	Z_		out20
-
-#define	A		in24
-#define	B		in25
-#define	C		in26
-#define	D		in27
-#define	DPtr		in28
-#define	M0		in4
-#define	M1		in9
-#define	M10		in12
-#define	M11		in17
-#define	M12		in1
-#define	M13		in6
-#define	M14		in11
-#define	M15		in16
-#define	M2		in14
-#define	M3		in19
-#define	M4		in3
-#define	M5		in8
-#define	M6		in13
-#define	M7		in18
-#define	M8		in2
-#define	M9		in7
-#define	RotateM0	in0
-#define	RotateM1	in5
-#define	RotateM2	in10
-#define	RotateM3	in15
-#define	W		in23
-#define	X		in22
-#define	Y		in21
-#define	Z		in20
-
-/* register stack configuration for md5_block_asm_data_order(): */
-#define	MD5_NINP	3
-#define	MD5_NLOC	0
-#define MD5_NOUT	29
-#define MD5_NROT	0
-
-/* register stack configuration for helpers: */
-#define	_NINPUTS	MD5_NOUT
-#define	_NLOCALS	0
-#define _NOUTPUT	0
-#define	_NROTATE	24	/* this must be <= _NINPUTS */
-
-#if defined(_HPUX_SOURCE) && !defined(_LP64)
-#define	ADDP	addp4
-#else
-#define	ADDP	add
-#endif
-
-#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
-#define HOST_IS_BIG_ENDIAN
-#endif
-
-//	Macros for getting the left and right portions of little-endian words
-
-#define	GETLW(dst, src, align)	dep.z dst = src, 32 - 8 * align, 8 * align
-#define	GETRW(dst, src, align)	extr.u dst = src, 8 * align, 32 - 8 * align
-
-//	MD5 driver
-//
-//		Reads an input block, then calls the digest block
-//		subroutine and adds the results to the accumulated
-//		digest.  It allocates 32 outs which the subroutine
-//		uses as it's inputs and rotating
-//		registers. Initializes the round constant pointer and
-//		takes care of saving/restoring ar.lc
-//
-///	INPUT
-//
-//	in0		Context Ptr		CtxPtr0
-//	in1		Input Data Ptr		DPtrIn
-//	in2		Integral Blocks		BlockCount
-//	rp		Return Address		-
-//
-///	CODE
-//
-//	v2		Input Align		InAlign
-//	t0		Shared w/digest		-
-//	t1		Shared w/digest		-
-//	t2		Shared w/digest		-
-//	t3		Shared w/digest		-
-//	t4		Shared w/digest		-
-//	t5		Shared w/digest		-
-//	t6		PFS Save		PFSSave
-//	t7		ar.lc Save		LCSave
-//	t8		Saved PR		PRSave
-//	t9		2nd CtxPtr		CtxPtr1
-//	t10		Table Base		CTable
-//	t11		Table[0]		CTable0
-//	t13		Accumulator A		AccumA
-//	t14		Accumulator B		AccumB
-//	t15		Accumulator C		AccumC
-//	t16		Accumulator D		AccumD
-//	pt0		Shared w/digest		-
-//	pt1		Shared w/digest		-
-//	pt2		Shared w/digest		-
-//	pt3		Shared w/digest		-
-//	pt4		Shared w/digest		-
-//	pt5		Shared w/digest		-
-//	pt6		Shared w/digest		-
-//	pt7		Shared w/digest		-
-//	pt8		Not Aligned		pOff
-//	pt8		Blocks Left		pAgain
-
-#define	AccumA		r27
-#define	AccumB		r28
-#define	AccumC		r29
-#define	AccumD		r30
-#define	CTable		r24
-#define	CTable0		r25
-#define	CtxPtr0		in0
-#define	CtxPtr1		r23
-#define	DPtrIn		in1
-#define	BlockCount	in2
-#define	InAlign		r10
-#define	LCSave		r21
-#define	PFSSave		r20
-#define	PRSave		r22
-#define	pAgain		p63
-#define	pOff		p63
-
-	.text
-
-/* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
-
-     where:
-      c: a pointer to a structure of this type:
-
-	   typedef struct MD5state_st
-	     {
-	       MD5_LONG A,B,C,D;
-	       MD5_LONG Nl,Nh;
-	       MD5_LONG data[MD5_LBLOCK];
-	       unsigned int num;
-	     }
-	   MD5_CTX;
-
-      data: a pointer to the input data (may be misaligned)
-      num:  the number of 16-byte blocks to hash (i.e., the length
-            of DATA is 16*NUM.
-
-   */
-
-	.type	md5_block_asm_data_order, @function
-	.global	md5_block_asm_data_order
-	.align	32
-	.proc	md5_block_asm_data_order
-md5_block_asm_data_order:
-.md5_block:
-	.prologue
-{	.mmi
-	.save	ar.pfs, PFSSave
-	alloc	PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
-	ADDP	CtxPtr1 = 8, CtxPtr0
-	mov	CTable = ip
-}
-{	.mmi
-	ADDP	DPtrIn = 0, DPtrIn
-	ADDP	CtxPtr0 = 0, CtxPtr0
-	.save	ar.lc, LCSave
-	mov	LCSave = ar.lc
-}
-;;
-{	.mmi
-	add	CTable = .md5_tbl_data_order#-.md5_block#, CTable
-	and	InAlign = 0x3, DPtrIn
-}
-
-{	.mmi
-	ld4	AccumA = [CtxPtr0], 4
-	ld4	AccumC = [CtxPtr1], 4
-	.save pr, PRSave
-	mov	PRSave = pr
-	.body
-}
-;;
-{	.mmi
-	ld4	AccumB = [CtxPtr0]
-	ld4	AccumD = [CtxPtr1]
-	dep	DPtr_ = 0, DPtrIn, 0, 2
-} ;;
-#ifdef HOST_IS_BIG_ENDIAN
-	rum	psr.be;;	// switch to little-endian
-#endif
-{	.mmb
-	ld4	CTable0 = [CTable], 4
-	cmp.ne	pOff, p0 = 0, InAlign
-(pOff)	br.cond.spnt.many .md5_unaligned
-} ;;
-
-//	The FF load/compute loop rotates values three times, so that
-//	loading into M12 here produces the M0 value, M13 -> M1, etc.
-
-.md5_block_loop0:
-{	.mmi
-	ld4	M12_ = [DPtr_], 4
-	mov	TPtr = CTable
-	mov	TRound = CTable0
-} ;;
-{	.mmi
-	ld4	M13_ = [DPtr_], 4
-	mov	A_ = AccumA
-	mov	B_ = AccumB
-} ;;
-{	.mmi
-	ld4	M14_ = [DPtr_], 4
-	mov	C_ = AccumC
-	mov	D_ = AccumD
-} ;;
-{	.mmb
-	ld4	M15_ = [DPtr_], 4
-	add	BlockCount = -1, BlockCount
-	br.call.sptk.many QUICK_RTN = md5_digest_block0
-} ;;
-
-//	Now, we add the new digest values and do some clean-up
-//	before checking if there's another full block to process
-
-{	.mmi
-	add	AccumA = AccumA, A_
-	add	AccumB = AccumB, B_
-	cmp.ne	pAgain, p0 = 0, BlockCount
-}
-{	.mib
-	add	AccumC = AccumC, C_
-	add	AccumD = AccumD, D_
-(pAgain) br.cond.dptk.many .md5_block_loop0
-} ;;
-
-.md5_exit:
-#ifdef HOST_IS_BIG_ENDIAN
-	sum	psr.be;;	// switch back to big-endian mode
-#endif
-{	.mmi
-	st4	[CtxPtr0] = AccumB, -4
-	st4	[CtxPtr1] = AccumD, -4
-	mov	pr = PRSave, 0x1ffff ;;
-}
-{	.mmi
-	st4	[CtxPtr0] = AccumA
-	st4	[CtxPtr1] = AccumC
-	mov	ar.lc = LCSave
-} ;;
-{	.mib
-	mov	ar.pfs = PFSSave
-	br.ret.sptk.few	rp
-} ;;
-
-#define	MD5UNALIGNED(offset)						\
-.md5_process##offset:							\
-{	.mib ;								\
-	nop	0x0	;						\
-	GETRW(DTmp, DTmp, offset) ;					\
-} ;;									\
-.md5_block_loop##offset:						\
-{	.mmi ;								\
-	ld4	Y_ = [DPtr_], 4 ;					\
-	mov	TPtr = CTable ;						\
-	mov	TRound = CTable0 ;					\
-} ;;									\
-{	.mmi ;								\
-	ld4	M13_ = [DPtr_], 4 ;					\
-	mov	A_ = AccumA ;						\
-	mov	B_ = AccumB ;						\
-} ;;									\
-{	.mii ;								\
-	ld4	M14_ = [DPtr_], 4 ;					\
-	GETLW(W_, Y_, offset) ;						\
-	mov	C_ = AccumC ;						\
-}									\
-{	.mmi ;								\
-	mov	D_ = AccumD ;;						\
-	or	M12_ = W_, DTmp ;					\
-	GETRW(DTmp, Y_, offset) ;					\
-}									\
-{	.mib ;								\
-	ld4	M15_ = [DPtr_], 4 ;					\
-	add	BlockCount = -1, BlockCount ;				\
-	br.call.sptk.many QUICK_RTN = md5_digest_block##offset;		\
-} ;;									\
-{	.mmi ;								\
-	add	AccumA = AccumA, A_ ;					\
-	add	AccumB = AccumB, B_ ;					\
-	cmp.ne	pAgain, p0 = 0, BlockCount ;				\
-}									\
-{	.mib ;								\
-	add	AccumC = AccumC, C_ ;					\
-	add	AccumD = AccumD, D_ ;					\
-(pAgain) br.cond.dptk.many .md5_block_loop##offset ;			\
-} ;;									\
-{	.mib ;								\
-	nop	0x0 ;							\
-	nop	0x0 ;							\
-	br.cond.sptk.many .md5_exit ;					\
-} ;;
-
-	.align	32
-.md5_unaligned:
-//
-//	Because variable shifts are expensive, we special case each of
-//	the four alignements. In practice, this won't hurt too much
-//	since only one working set of code will be loaded.
-//
-{	.mib
-	ld4	DTmp = [DPtr_], 4
-	cmp.eq	pOff, p0 = 1, InAlign
-(pOff)	br.cond.dpnt.many .md5_process1
-} ;;
-{	.mib
-	cmp.eq	pOff, p0 = 2, InAlign
-	nop	0x0
-(pOff)	br.cond.dpnt.many .md5_process2
-} ;;
-	MD5UNALIGNED(3)
-	MD5UNALIGNED(1)
-	MD5UNALIGNED(2)
-
-	.endp md5_block_asm_data_order
-
-
-// MD5 Perform the F function and load
-//
-// Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
-// computes the FF() round of functions, then branches to the common
-// digest code to finish up with GG(), HH, and II().
-//
-// INPUT
-//
-// rp Return Address -
-//
-// CODE
-//
-// v0 PFS bit bucket PFS
-// v1 Loop Trip Count LTrip
-// pt0 Load next word pMore
-
-/* For F round: */
-#define LTrip	r9
-#define PFS	r8
-#define pMore	p6
-
-/* For GHI rounds: */
-#define T	r9
-#define U	r10
-#define V	r11
-
-#define COMPUTE(a, b, s, M, R)			\
-{						\
-	.mii ;					\
-	ld4 TRound = [TPtr], 4 ;		\
-	dep.z Y = Z, 32, 32 ;;			\
-	shrp Z = Z, Y, 64 - s ;			\
-} ;;						\
-{						\
-	.mmi ;					\
-	add a = Z, b ;				\
-	mov R = M ;				\
-	nop 0x0 ;				\
-} ;;
-
-#define LOOP(a, b, s, M, R, label)		\
-{	.mii ;					\
-	ld4 TRound = [TPtr], 4 ;		\
-	dep.z Y = Z, 32, 32 ;;			\
-	shrp Z = Z, Y, 64 - s ;			\
-} ;;						\
-{	.mib ;					\
-	add a = Z, b ;				\
-	mov R = M ;				\
-	br.ctop.sptk.many label ;		\
-} ;;
-
-// G(B, C, D) = (B & D) | (C & ~D)
-
-#define G(a, b, c, d, M)			\
-{	.mmi ;					\
-	add Z = M, TRound ;			\
-	and Y = b, d ;				\
-	andcm X = c, d ;			\
-} ;;						\
-{	.mii ;					\
-	add Z = Z, a ;				\
-	or Y = Y, X ;;				\
-	add Z = Z, Y ;				\
-} ;;
-
-// H(B, C, D) = B ^ C ^ D
-
-#define H(a, b, c, d, M)			\
-{	.mmi ;					\
-	add Z = M, TRound ;			\
-	xor Y = b, c ;				\
-	nop 0x0 ;				\
-} ;;						\
-{	.mii ;					\
-	add Z = Z, a ;				\
-	xor Y = Y, d ;;				\
-	add Z = Z, Y ;				\
-} ;;
-
-// I(B, C, D) = C ^ (B | ~D)
-//
-// However, since we have an andcm operator, we use the fact that
-//
-// Y ^ Z == ~Y ^ ~Z
-//
-// to rewrite the expression as
-//
-// I(B, C, D) = ~C ^ (~B & D)
-
-#define I(a, b, c, d, M)			\
-{	.mmi ;					\
-	add Z = M, TRound ;			\
-	andcm Y = d, b ;			\
-	andcm X = -1, c ;			\
-} ;;						\
-{	.mii ;					\
-	add Z = Z, a ;				\
-	xor Y = Y, X ;;				\
-	add Z = Z, Y ;				\
-} ;;
-
-#define GG4(label)				\
-	G(A, B, C, D, M0)			\
-	COMPUTE(A, B, 5, M0, RotateM0)		\
-	G(D, A, B, C, M1)			\
-	COMPUTE(D, A, 9, M1, RotateM1)		\
-	G(C, D, A, B, M2)			\
-	COMPUTE(C, D, 14, M2, RotateM2)		\
-	G(B, C, D, A, M3)			\
-	LOOP(B, C, 20, M3, RotateM3, label)
-
-#define HH4(label)				\
-	H(A, B, C, D, M0)			\
-	COMPUTE(A, B, 4, M0, RotateM0)		\
-	H(D, A, B, C, M1)			\
-	COMPUTE(D, A, 11, M1, RotateM1)		\
-	H(C, D, A, B, M2)			\
-	COMPUTE(C, D, 16, M2, RotateM2)		\
-	H(B, C, D, A, M3)			\
-	LOOP(B, C, 23, M3, RotateM3, label)
-
-#define II4(label)				\
-	I(A, B, C, D, M0)			\
-	COMPUTE(A, B, 6, M0, RotateM0)		\
-	I(D, A, B, C, M1)			\
-	COMPUTE(D, A, 10, M1, RotateM1)		\
-	I(C, D, A, B, M2)			\
-	COMPUTE(C, D, 15, M2, RotateM2)		\
-	I(B, C, D, A, M3)			\
-	LOOP(B, C, 21, M3, RotateM3, label)
-
-#define FFLOAD(a, b, c, d, M, N, s)		\
-{	.mii ;					\
-(pMore) ld4 N = [DPtr], 4 ;			\
-	add Z = M, TRound ;			\
-	and Y = c, b ;				\
-}						\
-{	.mmi ;					\
-	andcm X = d, b ;;			\
-	add Z = Z, a ;				\
-	or Y = Y, X ;				\
-} ;;						\
-{	.mii ;					\
-	ld4 TRound = [TPtr], 4 ;		\
-	add Z = Z, Y ;;				\
-	dep.z Y = Z, 32, 32 ;			\
-} ;;						\
-{	.mii ;					\
-	nop 0x0 ;				\
-	shrp Z = Z, Y, 64 - s ;;		\
-	add a = Z, b ;				\
-} ;;
-
-#define FFLOOP(a, b, c, d, M, N, s, dest)	\
-{	.mii ;					\
-(pMore)	ld4 N = [DPtr], 4 ;			\
-	add Z = M, TRound ;			\
-	and Y = c, b ;				\
-}						\
-{	.mmi ;					\
-	andcm X = d, b ;;			\
-	add Z = Z, a ;				\
-	or Y = Y, X ;				\
-} ;;						\
-{	.mii ;					\
-	ld4 TRound = [TPtr], 4 ;		\
-	add Z = Z, Y ;;				\
-	dep.z Y = Z, 32, 32 ;			\
-} ;;						\
-{	.mii ;					\
-	nop 0x0 ;				\
-	shrp Z = Z, Y, 64 - s ;;		\
-	add a = Z, b ;				\
-}						\
-{	.mib ;					\
-	cmp.ne pMore, p0 = 0, LTrip ;		\
-	add LTrip = -1, LTrip ;			\
-	br.ctop.dptk.many dest ;		\
-} ;;
-
-	.type md5_digest_block0, @function
-	.align 32
-
-	.proc md5_digest_block0
-	.prologue
-md5_digest_block0:
-	.altrp QUICK_RTN
-	.body
-{	.mmi
-	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
-	mov LTrip = 2
-	mov ar.lc = 3
-} ;;
-{	.mii
-	cmp.eq pMore, p0 = r0, r0
-	mov ar.ec = 0
-	nop 0x0
-} ;;
-
-.md5_FF_round0:
-	FFLOAD(A, B, C, D, M12, RotateM0, 7)
-	FFLOAD(D, A, B, C, M13, RotateM1, 12)
-	FFLOAD(C, D, A, B, M14, RotateM2, 17)
-	FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
-	//
-	// !!! Fall through to md5_digest_GHI
-	//
-	.endp md5_digest_block0
-
-	.type md5_digest_GHI, @function
-	.align 32
-
-	.proc md5_digest_GHI
-	.prologue
-	.regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
-md5_digest_GHI:
-	.altrp QUICK_RTN
-	.body
-//
-// The following sequence shuffles the block counstants round for the
-// next round:
-//
-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-// 1 6 11 0 5 10 14 4 9 14 3 8 13 2 7 12
-//
-{	.mmi
-	mov Z = M0
-	mov Y = M15
-	mov ar.lc = 3
-}
-{	.mmi
-	mov X = M2
-	mov W = M9
-	mov V = M4
-} ;;
-
-{	.mmi
-	mov M0 = M1
-	mov M15 = M12
-	mov ar.ec = 1
-}
-{	.mmi
-	mov M2 = M11
-	mov M9 = M14
-	mov M4 = M5
-} ;;
-
-{	.mmi
-	mov M1 = M6
-	mov M12 = M13
-	mov U = M3
-}
-{	.mmi
-	mov M11 = M8
-	mov M14 = M7
-	mov M5 = M10
-} ;;
-
-{	.mmi
-	mov M6 = Y
-	mov M13 = X
-	mov M3 = Z
-}
-{	.mmi
-	mov M8 = W
-	mov M7 = V
-	mov M10 = U
-} ;;
-
-.md5_GG_round:
-	GG4(.md5_GG_round)
-
-// The following sequence shuffles the block constants round for the
-// next round:
-//
-// 1 6 11 0 5 10 14 4 9 14 3 8 13 2 7 12
-// 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
-
-{	.mmi
-	mov Z = M0
-	mov Y = M1
-	mov ar.lc = 3
-}
-{	.mmi
-	mov X = M3
-	mov W = M5
-	mov V = M6
-} ;;
-
-{	.mmi
-	mov M0 = M4
-	mov M1 = M11
-	mov ar.ec = 1
-}
-{	.mmi
-	mov M3 = M9
-	mov U = M8
-	mov T = M13
-} ;;
-
-{	.mmi
-	mov M4 = Z
-	mov M11 = Y
-	mov M5 = M7
-}
-{	.mmi
-	mov M6 = M14
-	mov M8 = M12
-	mov M13 = M15
-} ;;
-
-{	.mmi
-	mov M7 = W
-	mov M14 = V
-	nop 0x0
-}
-{	.mmi
-	mov M9 = X
-	mov M12 = U
-	mov M15 = T
-} ;;
-
-.md5_HH_round:
-	HH4(.md5_HH_round)
-
-// The following sequence shuffles the block constants round for the
-// next round:
-//
-// 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
-// 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
-
-{	.mmi
-	mov Z = M0
-	mov Y = M15
-	mov ar.lc = 3
-}
-{	.mmi
-	mov X = M10
-	mov W = M1
-	mov V = M4
-} ;;
-
-{	.mmi
-	mov M0 = M9
-	mov M15 = M12
-	mov ar.ec = 1
-}
-{	.mmi
-	mov M10 = M11
-	mov M1 = M6
-	mov M4 = M13
-} ;;
-
-{	.mmi
-	mov M9 = M14
-	mov M12 = M5
-	mov U = M3
-}
-{	.mmi
-	mov M11 = M8
-	mov M6 = M7
-	mov M13 = M2
-} ;;
-
-{	.mmi
-	mov M14 = Y
-	mov M5 = X
-	mov M3 = Z
-}
-{	.mmi
-	mov M8 = W
-	mov M7 = V
-	mov M2 = U
-} ;;
-
-.md5_II_round:
-	II4(.md5_II_round)
-
-{	.mib
-	nop 0x0
-	nop 0x0
-	br.ret.sptk.many QUICK_RTN
-} ;;
-
-	.endp md5_digest_GHI
-
-#define FFLOADU(a, b, c, d, M, P, N, s, offset)	\
-{	.mii ;					\
-(pMore) ld4 N = [DPtr], 4 ;			\
-	add Z = M, TRound ;			\
-	and Y = c, b ;				\
-}						\
-{	.mmi ;					\
-	andcm X = d, b ;;			\
-	add Z = Z, a ;				\
-	or Y = Y, X ;				\
-} ;;						\
-{	.mii ;					\
-	ld4 TRound = [TPtr], 4 ;		\
-	GETLW(W, P, offset) ;			\
-	add Z = Z, Y ;				\
-} ;;						\
-{	.mii ;					\
-	or W = W, DTmp ;			\
-	dep.z Y = Z, 32, 32 ;;			\
-	shrp Z = Z, Y, 64 - s ;			\
-} ;;						\
-{	.mii ;					\
-	add a = Z, b ;				\
-	GETRW(DTmp, P, offset) ;		\
-	mov P = W ;				\
-} ;;
-
-#define FFLOOPU(a, b, c, d, M, P, N, s, offset)		\
-{	.mii ;						\
-(pMore) ld4 N = [DPtr], 4 ;				\
-	add Z = M, TRound ;				\
-	and Y = c, b ;					\
-}							\
-{	.mmi ;						\
-	andcm X = d, b ;;				\
-	add Z = Z, a ;					\
-	or Y = Y, X ;					\
-} ;;							\
-{	.mii ;						\
-	ld4 TRound = [TPtr], 4 ;			\
-(pMore) GETLW(W, P, offset) 	;			\
-	add Z = Z, Y ;					\
-} ;;							\
-{	.mii ;						\
-(pMore) or W = W, DTmp ;				\
-	dep.z Y = Z, 32, 32 ;;				\
-	shrp Z = Z, Y, 64 - s ;				\
-} ;;							\
-{	.mii ;						\
-	add a = Z, b ;					\
-(pMore) GETRW(DTmp, P, offset) 	;			\
-(pMore) mov P = W ;					\
-}							\
-{	.mib ;						\
-	cmp.ne pMore, p0 = 0, LTrip ;			\
-	add LTrip = -1, LTrip ;				\
-	br.ctop.sptk.many .md5_FF_round##offset ;	\
-} ;;
-
-#define MD5FBLOCK(offset)						\
-	.type md5_digest_block##offset, @function ;			\
-									\
-	.align 32 ;							\
-	.proc md5_digest_block##offset ;				\
-	.prologue ;							\
-	.altrp QUICK_RTN ;						\
-	.body ;								\
-md5_digest_block##offset:						\
-{	.mmi ;								\
-	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ;	\
-	mov LTrip = 2 ;							\
-	mov ar.lc = 3 ;							\
-} ;;									\
-{	.mii ;								\
-	cmp.eq pMore, p0 = r0, r0 ;					\
-	mov ar.ec = 0 ;							\
-	nop 0x0 ;							\
-} ;;									\
-									\
-	.pred.rel "mutex", pLoad, pSkip ;				\
-.md5_FF_round##offset:							\
-	FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset)		\
-	FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset)		\
-	FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset)		\
-	FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset)	\
-									\
-{	.mib ;								\
-	nop 0x0 ;							\
-	nop 0x0 ;							\
-	br.cond.sptk.many md5_digest_GHI ;				\
-} ;;									\
-	.endp md5_digest_block##offset
-
-MD5FBLOCK(1)
-MD5FBLOCK(2)
-MD5FBLOCK(3)
-
-	.align 64
-	.type md5_constants, @object
-md5_constants:
-.md5_tbl_data_order:			// To ensure little-endian data
-					// order, code as bytes.
-	data1 0x78, 0xa4, 0x6a, 0xd7	//     0
-	data1 0x56, 0xb7, 0xc7, 0xe8	//     1
-	data1 0xdb, 0x70, 0x20, 0x24	//     2
-	data1 0xee, 0xce, 0xbd, 0xc1	//     3
-	data1 0xaf, 0x0f, 0x7c, 0xf5	//     4
-	data1 0x2a, 0xc6, 0x87, 0x47	//     5
-	data1 0x13, 0x46, 0x30, 0xa8	//     6
-	data1 0x01, 0x95, 0x46, 0xfd	//     7
-	data1 0xd8, 0x98, 0x80, 0x69	//     8
-	data1 0xaf, 0xf7, 0x44, 0x8b	//     9
-	data1 0xb1, 0x5b, 0xff, 0xff	//    10
-	data1 0xbe, 0xd7, 0x5c, 0x89	//    11
-	data1 0x22, 0x11, 0x90, 0x6b	//    12
-	data1 0x93, 0x71, 0x98, 0xfd	//    13
-	data1 0x8e, 0x43, 0x79, 0xa6	//    14
-	data1 0x21, 0x08, 0xb4, 0x49	//    15
-	data1 0x62, 0x25, 0x1e, 0xf6	//    16
-	data1 0x40, 0xb3, 0x40, 0xc0	//    17
-	data1 0x51, 0x5a, 0x5e, 0x26	//    18
-	data1 0xaa, 0xc7, 0xb6, 0xe9	//    19
-	data1 0x5d, 0x10, 0x2f, 0xd6	//    20
-	data1 0x53, 0x14, 0x44, 0x02	//    21
-	data1 0x81, 0xe6, 0xa1, 0xd8	//    22
-	data1 0xc8, 0xfb, 0xd3, 0xe7	//    23
-	data1 0xe6, 0xcd, 0xe1, 0x21	//    24
-	data1 0xd6, 0x07, 0x37, 0xc3	//    25
-	data1 0x87, 0x0d, 0xd5, 0xf4	//    26
-	data1 0xed, 0x14, 0x5a, 0x45	//    27
-	data1 0x05, 0xe9, 0xe3, 0xa9	//    28
-	data1 0xf8, 0xa3, 0xef, 0xfc	//    29
-	data1 0xd9, 0x02, 0x6f, 0x67	//    30
-	data1 0x8a, 0x4c, 0x2a, 0x8d	//    31
-	data1 0x42, 0x39, 0xfa, 0xff	//    32
-	data1 0x81, 0xf6, 0x71, 0x87	//    33
-	data1 0x22, 0x61, 0x9d, 0x6d	//    34
-	data1 0x0c, 0x38, 0xe5, 0xfd	//    35
-	data1 0x44, 0xea, 0xbe, 0xa4	//    36
-	data1 0xa9, 0xcf, 0xde, 0x4b	//    37
-	data1 0x60, 0x4b, 0xbb, 0xf6	//    38
-	data1 0x70, 0xbc, 0xbf, 0xbe	//    39
-	data1 0xc6, 0x7e, 0x9b, 0x28	//    40
-	data1 0xfa, 0x27, 0xa1, 0xea	//    41
-	data1 0x85, 0x30, 0xef, 0xd4	//    42
-	data1 0x05, 0x1d, 0x88, 0x04	//    43
-	data1 0x39, 0xd0, 0xd4, 0xd9	//    44
-	data1 0xe5, 0x99, 0xdb, 0xe6	//    45
-	data1 0xf8, 0x7c, 0xa2, 0x1f	//    46
-	data1 0x65, 0x56, 0xac, 0xc4	//    47
-	data1 0x44, 0x22, 0x29, 0xf4	//    48
-	data1 0x97, 0xff, 0x2a, 0x43	//    49
-	data1 0xa7, 0x23, 0x94, 0xab	//    50
-	data1 0x39, 0xa0, 0x93, 0xfc	//    51
-	data1 0xc3, 0x59, 0x5b, 0x65	//    52
-	data1 0x92, 0xcc, 0x0c, 0x8f	//    53
-	data1 0x7d, 0xf4, 0xef, 0xff	//    54
-	data1 0xd1, 0x5d, 0x84, 0x85	//    55
-	data1 0x4f, 0x7e, 0xa8, 0x6f	//    56
-	data1 0xe0, 0xe6, 0x2c, 0xfe	//    57
-	data1 0x14, 0x43, 0x01, 0xa3	//    58
-	data1 0xa1, 0x11, 0x08, 0x4e	//    59
-	data1 0x82, 0x7e, 0x53, 0xf7	//    60
-	data1 0x35, 0xf2, 0x3a, 0xbd	//    61
-	data1 0xbb, 0xd2, 0xd7, 0x2a	//    62
-	data1 0x91, 0xd3, 0x86, 0xeb	//    63
-.size	md5_constants#,64*4
diff --git a/crypto/md5/build.info b/crypto/md5/build.info
index 38323a3fc2..95e2be880e 100644
--- a/crypto/md5/build.info
+++ b/crypto/md5/build.info
@@ -8,15 +8,3 @@ GENERATE[md5-x86_64.s]=asm/md5-x86_64.pl $(PERLASM_SCHEME)
 
 GENERATE[md5-sparcv9.S]=asm/md5-sparcv9.pl $(PERLASM_SCHEME)
 INCLUDE[md5-sparcv9.o]=..
-
-BEGINRAW[makefile(windows)]
-{- $builddir -}\md5-ia64.asm: {- $sourcedir -}\asm\md5-ia64.S
-	$(CC) $(CFLAGS) -EP {- $sourcedir -}\asm\md5-ia64.S > $@.i && move /Y $@.i $@
-ENDRAW[makefile(windows)]
-
-BEGINRAW[Makefile]
-{- $builddir -}/md5-ia64.s: {- $sourcedir -}/asm/md5-ia64.S
-	$(CC) $(CFLAGS) -E {- $sourcedir -}/asm/md5-ia64.S | \
-	$(PERL) -ne 's/;\s+/;\n/g; print;' > $@
-
-ENDRAW[Makefile]
diff --git a/crypto/mem_sec.c b/crypto/mem_sec.c
index 6fc1aca1e0..11e95c42b8 100644
--- a/crypto/mem_sec.c
+++ b/crypto/mem_sec.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2015-2017 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2004-2014, Akamai Technologies. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -7,11 +8,6 @@
  * https://www.openssl.org/source/license.html
  */
 
-/*
- * Copyright 2004-2014, Akamai Technologies. All Rights Reserved.
- * This file is distributed under the terms of the OpenSSL license.
- */
-
 /*
  * This file is in two halves. The first half implements the public API
  * to be used by external consumers, and to be used by OpenSSL to store
diff --git a/crypto/rc4/asm/rc4-ia64.pl b/crypto/rc4/asm/rc4-ia64.pl
deleted file mode 100644
index 5e8f5f55b2..0000000000
--- a/crypto/rc4/asm/rc4-ia64.pl
+++ /dev/null
@@ -1,767 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by David Mosberger <David.Mosberger@acm.org> based on the
-# Itanium optimized Crypto code which was released by HP Labs at
-# http://www.hpl.hp.com/research/linux/crypto/.
-#
-# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files (the
-# "Software"), to deal in the Software without restriction, including
-# without limitation the rights to use, copy, modify, merge, publish,
-# distribute, sublicense, and/or sell copies of the Software, and to
-# permit persons to whom the Software is furnished to do so, subject to
-# the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
-
-
-
-# This is a little helper program which generates a software-pipelined
-# for RC4 encryption.  The basic algorithm looks like this:
-#
-#   for (counter = 0; counter < len; ++counter)
-#     {
-#       in = inp[counter];
-#       SI = S[I];
-#       J = (SI + J) & 0xff;
-#       SJ = S[J];
-#       T = (SI + SJ) & 0xff;
-#       S[I] = SJ, S[J] = SI;
-#       ST = S[T];
-#       outp[counter] = in ^ ST;
-#       I = (I + 1) & 0xff;
-#     }
-#
-# Pipelining this loop isn't easy, because the stores to the S[] array
-# need to be observed in the right order.  The loop generated by the
-# code below has the following pipeline diagram:
-#
-#      cycle
-#     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
-# iter
-#   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
-#   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
-#   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
-#
-#   where:
-# 	LDI = load of S[I]
-# 	LDJ = load of S[J]
-# 	SWP = swap of S[I] and S[J]
-# 	LDT = load of S[T]
-#
-# Note that in the above diagram, the major trouble-spot is that LDI
-# of the 2nd iteration is performed BEFORE the SWP of the first
-# iteration.  Fortunately, this is easy to detect (I of the 1st
-# iteration will be equal to J of the 2nd iteration) and when this
-# happens, we simply forward the proper value from the 1st iteration
-# to the 2nd one.  The proper value in this case is simply the value
-# of S[I] from the first iteration (thanks to the fact that SWP
-# simply swaps the contents of S[I] and S[J]).
-#
-# Another potential trouble-spot is in cycle 7, where SWP of the 1st
-# iteration issues at the same time as the LDI of the 3rd iteration.
-# However, thanks to IA-64 execution semantics, this can be taken
-# care of simply by placing LDI later in the instruction-group than
-# SWP.  IA-64 CPUs will automatically forward the value if they
-# detect that the SWP and LDI are accessing the same memory-location.
-
-# The core-loop that can be pipelined then looks like this (annotated
-# with McKinley/Madison issue port & latency numbers, assuming L1
-# cache hits for the most part):
-
-# operation:	    instruction:		    issue-ports:  latency
-# ------------------  -----------------------------   ------------- -------
-
-# Data = *inp++       ld1 data = [inp], 1             M0-M1         1 cyc     c0
-#                     shladd Iptr = I, KeyTable, 3    M0-M3, I0, I1 1 cyc
-# I = (I + 1) & 0xff  padd1 nextI = I, one            M0-M3, I0, I1 3 cyc
-#                     ;;
-# SI = S[I]           ld8 SI = [Iptr]                 M0-M1         1 cyc     c1 * after SWAP!
-#                     ;;
-#                     cmp.eq.unc pBypass = I, J                                  * after J is valid!
-# J = SI + J          add J = J, SI                   M0-M3, I0, I1 1 cyc     c2
-#                     (pBypass) br.cond.spnt Bypass
-#                     ;;
-# ---------------------------------------------------------------------------------------
-# J = J & 0xff        zxt1 J = J                      I0, I1, 1 cyc           c3
-#                     ;;
-#                     shladd Jptr = J, KeyTable, 3    M0-M3, I0, I1 1 cyc     c4
-#                     ;;
-# SJ = S[J]           ld8 SJ = [Jptr]                 M0-M1         1 cyc     c5
-#                     ;;
-# ---------------------------------------------------------------------------------------
-# T = (SI + SJ)       add T = SI, SJ                  M0-M3, I0, I1 1 cyc     c6
-#                     ;;
-# T = T & 0xff        zxt1 T = T                      I0, I1        1 cyc
-# S[I] = SJ           st8 [Iptr] = SJ                 M2-M3                   c7
-# S[J] = SI           st8 [Jptr] = SI                 M2-M3
-#                     ;;
-#                     shladd Tptr = T, KeyTable, 3    M0-M3, I0, I1 1 cyc     c8
-#                     ;;
-# ---------------------------------------------------------------------------------------
-# T = S[T]            ld8 T = [Tptr]                  M0-M1         1 cyc     c9
-#                     ;;
-# data ^= T           xor data = data, T              M0-M3, I0, I1 1 cyc     c10
-#                     ;;
-# *out++ = Data ^ T   dep word = word, data, 8, POS   I0, I1        1 cyc     c11
-#                     ;;
-# ---------------------------------------------------------------------------------------
-
-# There are several points worth making here:
-
-#   - Note that due to the bypass/forwarding-path, the first two
-#     phases of the loop are strangly mingled together.  In
-#     particular, note that the first stage of the pipeline is
-#     using the value of "J", as calculated by the second stage.
-#   - Each bundle-pair will have exactly 6 instructions.
-#   - Pipelined, the loop can execute in 3 cycles/iteration and
-#     4 stages.  However, McKinley/Madison can issue "st1" to
-#     the same bank at a rate of at most one per 4 cycles.  Thus,
-#     instead of storing each byte, we accumulate them in a word
-#     and then write them back at once with a single "st8" (this
-#     implies that the setup code needs to ensure that the output
-#     buffer is properly aligned, if need be, by encoding the
-#     first few bytes separately).
-#   - There is no space for a "br.ctop" instruction.  For this
-#     reason we can't use module-loop support in IA-64 and have
-#     to do a traditional, purely software-pipelined loop.
-#   - We can't replace any of the remaining "add/zxt1" pairs with
-#     "padd1" because the latency for that instruction is too high
-#     and would push the loop to the point where more bypasses
-#     would be needed, which we don't have space for.
-#   - The above loop runs at around 3.26 cycles/byte, or roughly
-#     440 MByte/sec on a 1.5GHz Madison.  This is well below the
-#     system bus bandwidth and hence with judicious use of
-#     "lfetch" this loop can run at (almost) peak speed even when
-#     the input and output data reside in memory.  The
-#     max. latency that can be tolerated is (PREFETCH_DISTANCE *
-#     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
-#     least) 1-ahead prefetching of 128 byte cache-lines.  Note
-#     that we do NOT prefetch into L1, since that would only
-#     interfere with the S[] table values stored there.  This is
-#     acceptable because there is a 10 cycle latency between
-#     load and first use of the input data.
-#   - We use a branch to out-of-line bypass-code of cycle-pressure:
-#     we calculate the next J, check for the need to activate the
-#     bypass path, and activate the bypass path ALL IN THE SAME
-#     CYCLE.  If we didn't have these constraints, we could do
-#     the bypass with a simple conditional move instruction.
-#     Fortunately, the bypass paths get activated relatively
-#     infrequently, so the extra branches don't cost all that much
-#     (about 0.04 cycles/byte, measured on a 16396 byte file with
-#     random input data).
-#
-
-$output = pop;
-open STDOUT,">$output";
-
-$phases = 4;		# number of stages/phases in the pipelined-loop
-$unroll_count = 6;	# number of times we unrolled it
-$pComI = (1 << 0);
-$pComJ = (1 << 1);
-$pComT = (1 << 2);
-$pOut  = (1 << 3);
-
-$NData = 4;
-$NIP = 3;
-$NJP = 2;
-$NI = 2;
-$NSI = 3;
-$NSJ = 2;
-$NT = 2;
-$NOutWord = 2;
-
-#
-# $threshold is the minimum length before we attempt to use the
-# big software-pipelined loop.  It MUST be greater-or-equal
-# to:
-#  		PHASES * (UNROLL_COUNT + 1) + 7
-#
-# The "+ 7" comes from the fact we may have to encode up to
-#   7 bytes separately before the output pointer is aligned.
-#
-$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
-
-sub I {
-    local *code = shift;
-    local $format = shift;
-    $code .= sprintf ("\t\t".$format."\n", @_);
-}
-
-sub P {
-    local *code = shift;
-    local $format = shift;
-    $code .= sprintf ($format."\n", @_);
-}
-
-sub STOP {
-    local *code = shift;
-    $code .=<<___;
-		;;
-___
-}
-
-sub emit_body {
-    local *c = shift;
-    local *bypass = shift;
-    local ($iteration, $p) = @_;
-
-    local $i0 = $iteration;
-    local $i1 = $iteration - 1;
-    local $i2 = $iteration - 2;
-    local $i3 = $iteration - 3;
-    local $iw0 = ($iteration - 3) / 8;
-    local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
-    local $byte_num = ($iteration - 3) % 8;
-    local $label = $iteration + 1;
-    local $pAny = ($p & 0xf) == 0xf;
-    local $pByp = (($p & $pComI) && ($iteration > 0));
-
-    $c.=<<___;
-//////////////////////////////////////////////////
-___
-
-    if (($p & 0xf) == 0) {
-	$c.="#ifdef HOST_IS_BIG_ENDIAN\n";
-	&I(\$c,"shr.u	OutWord[%u] = OutWord[%u], 32;;",
-				$iw1 % $NOutWord, $iw1 % $NOutWord);
-	$c.="#endif\n";
-	&I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
-	return;
-    }
-
-    # Cycle 0
-    &I(\$c, "{ .mmi")					      if ($pAny);
-    &I(\$c, "ld1    Data[%u] = [InPtr], 1", $i0 % $NData)     if ($p & $pComI);
-    &I(\$c, "padd1  I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
-    &I(\$c, "zxt1   J = J")				      if ($p & $pComJ);
-    &I(\$c, "}")					      if ($pAny);
-    &I(\$c, "{ .mmi")					      if ($pAny);
-    &I(\$c, "LKEY   T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT)   if ($p & $pOut);
-    &I(\$c, "add    T[%u] = SI[%u], SJ[%u]",
-       $i0 % $NT, $i2 % $NSI, $i1 % $NSJ)		      if ($p & $pComT);
-    &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
-    &I(\$c, "}")					      if ($pAny);
-    &STOP(\$c);
-
-    # Cycle 1
-    &I(\$c, "{ .mmi")					      if ($pAny);
-    &I(\$c, "SKEY   [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
-    &I(\$c, "SKEY   [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
-    &I(\$c, "zxt1   T[%u] = T[%u]", $i0 % $NT, $i0 % $NT)     if ($p & $pComT);
-    &I(\$c, "}")					      if ($pAny);
-    &I(\$c, "{ .mmi")					      if ($pAny);
-    &I(\$c, "LKEY   SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
-    &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP)		      if ($p & $pComJ);
-    &I(\$c, "xor    Data[%u] = Data[%u], T[%u]",
-       $i3 % $NData, $i3 % $NData, $i1 % $NT)		      if ($p & $pOut);
-    &I(\$c, "}")					      if ($pAny);
-    &STOP(\$c);
-
-    # Cycle 2
-    &I(\$c, "{ .mmi")					      if ($pAny);
-    &I(\$c, "LKEY   SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
-    &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI)	      if ($pByp);
-    &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
-       $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
-    &I(\$c, "}")					      if ($pAny);
-    &I(\$c, "{ .mmb")					      if ($pAny);
-    &I(\$c, "add    J = J, SI[%u]", $i0 % $NSI)		      if ($p & $pComI);
-    &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT)    if ($p & $pComT);
-    &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
-    &I(\$c, "}") if ($pAny);
-    &STOP(\$c);
-
-    &P(\$c, ".rc4Resume%u:", $label)			      if ($pByp);
-    if ($byte_num == 0 && $iteration >= $phases) {
-	&I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
-	   $iw1 % $NOutWord)				      if ($p & $pOut);
-	if ($iteration == (1 + $unroll_count) * $phases - 1) {
-	    if ($unroll_count == 6) {
-		&I(\$c, "mov OutWord[%u] = OutWord[%u]",
-		   $iw1 % $NOutWord, $iw0 % $NOutWord);
-	    }
-	    &I(\$c, "lfetch.nt1 [InPrefetch], %u",
-	       $unroll_count * $phases);
-	    &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
-	       $unroll_count * $phases);
-	    &I(\$c, "br.cloop.sptk.few .rc4Loop");
-	}
-    }
-
-    if ($pByp) {
-	&P(\$bypass, ".rc4Bypass%u:", $label);
-	&I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
-	&I(\$bypass, "nop 0");
-	&I(\$bypass, "nop 0");
-	&I(\$bypass, ";;");
-	&I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
-	&I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
-	&I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
-	&I(\$bypass, ";;");
-    }
-}
-
-$code=<<___;
-.ident \"rc4-ia64.s, version 3.0\"
-.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
-
-#define LCSave		r8
-#define PRSave		r9
-
-/* Inputs become invalid once rotation begins!  */
-
-#define StateTable	in0
-#define DataLen		in1
-#define InputBuffer	in2
-#define OutputBuffer	in3
-
-#define KTable		r14
-#define J		r15
-#define InPtr		r16
-#define OutPtr		r17
-#define InPrefetch	r18
-#define OutPrefetch	r19
-#define One		r20
-#define LoopCount	r21
-#define Remainder	r22
-#define IFinal		r23
-#define EndPtr		r24
-
-#define tmp0		r25
-#define tmp1		r26
-
-#define pBypass		p6
-#define pDone		p7
-#define pSmall		p8
-#define pAligned	p9
-#define pUnaligned	p10
-
-#define pComputeI	pPhase[0]
-#define pComputeJ	pPhase[1]
-#define pComputeT	pPhase[2]
-#define pOutput		pPhase[3]
-
-#define RetVal		r8
-#define L_OK		p7
-#define L_NOK		p8
-
-#define	_NINPUTS	4
-#define	_NOUTPUT	0
-
-#define	_NROTATE	24
-#define	_NLOCALS	(_NROTATE - _NINPUTS - _NOUTPUT)
-
-#ifndef SZ
-# define SZ	4	// this must be set to sizeof(RC4_INT)
-#endif
-
-#if SZ == 1
-# define LKEY			ld1
-# define SKEY			st1
-# define KEYADDR(dst, i)	add dst = i, KTable
-#elif SZ == 2
-# define LKEY			ld2
-# define SKEY			st2
-# define KEYADDR(dst, i)	shladd dst = i, 1, KTable
-#elif SZ == 4
-# define LKEY			ld4
-# define SKEY			st4
-# define KEYADDR(dst, i)	shladd dst = i, 2, KTable
-#else
-# define LKEY			ld8
-# define SKEY			st8
-# define KEYADDR(dst, i)	shladd dst = i, 3, KTable
-#endif
-
-#if defined(_HPUX_SOURCE) && !defined(_LP64)
-# define ADDP	addp4
-#else
-# define ADDP	add
-#endif
-
-/* Define a macro for the bit number of the n-th byte: */
-
-#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
-# define HOST_IS_BIG_ENDIAN
-# define BYTE_POS(n)	(56 - (8 * (n)))
-#else
-# define BYTE_POS(n)	(8 * (n))
-#endif
-
-/*
-   We must perform the first phase of the pipeline explicitly since
-   we will always load from the stable the first time. The br.cexit
-   will never be taken since regardless of the number of bytes because
-   the epilogue count is 4.
-*/
-/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
-   assembler failed on original macro with syntax error. <appro> */
-#define MODSCHED_RC4_PROLOGUE						   \\
-	{								   \\
-				ld1		Data[0] = [InPtr], 1;	   \\
-				add		IFinal = 1, I[1];	   \\
-				KEYADDR(IPr[0], I[1]);			   \\
-	} ;;								   \\
-	{								   \\
-				LKEY		SI[0] = [IPr[0]];	   \\
-				mov		pr.rot = 0x10000;	   \\
-				mov		ar.ec = 4;		   \\
-	} ;;								   \\
-	{								   \\
-				add		J = J, SI[0];		   \\
-				zxt1		I[0] = IFinal;		   \\
-				br.cexit.spnt.few .+16; /* never taken */  \\
-	} ;;
-#define MODSCHED_RC4_LOOP(label)					   \\
-label:									   \\
-	{	.mmi;							   \\
-		(pComputeI)	ld1		Data[0] = [InPtr], 1;	   \\
-		(pComputeI)	add		IFinal = 1, I[1];	   \\
-		(pComputeJ)	zxt1		J = J;			   \\
-	}{	.mmi;							   \\
-		(pOutput)	LKEY		T[1] = [T[1]];		   \\
-		(pComputeT)	add		T[0] = SI[2], SJ[1];	   \\
-		(pComputeI)	KEYADDR(IPr[0], I[1]);			   \\
-	} ;;								   \\
-	{	.mmi;							   \\
-		(pComputeT)	SKEY		[IPr[2]] = SJ[1];	   \\
-		(pComputeT)	SKEY		[JP[1]] = SI[2];	   \\
-		(pComputeT)	zxt1		T[0] = T[0];		   \\
-	}{	.mmi;							   \\
-		(pComputeI)	LKEY		SI[0] = [IPr[0]];	   \\
-		(pComputeJ)	KEYADDR(JP[0], J);			   \\
-		(pComputeI)	cmp.eq.unc	pBypass, p0 = I[1], J;	   \\
-	} ;;								   \\
-	{	.mmi;							   \\
-		(pComputeJ)	LKEY		SJ[0] = [JP[0]];	   \\
-		(pOutput)	xor		Data[3] = Data[3], T[1];   \\
-				nop		0x0;			   \\
-	}{	.mmi;							   \\
-		(pComputeT)	KEYADDR(T[0], T[0]);			   \\
-		(pBypass)	mov		SI[0] = SI[1];		   \\
-		(pComputeI)	zxt1		I[0] = IFinal;		   \\
-	} ;;								   \\
-	{	.mmb;							   \\
-		(pOutput)	st1		[OutPtr] = Data[3], 1;	   \\
-		(pComputeI)	add		J = J, SI[0];		   \\
-				br.ctop.sptk.few label;			   \\
-	} ;;
-
-	.text
-
-	.align	32
-
-	.type	RC4, \@function
-	.global	RC4
-
-	.proc	RC4
-	.prologue
-
-RC4:
-	{
-	  	.mmi
-		alloc	r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
-
-		.rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
-		      OutWord[2]
-		.rotp pPhase[4]
-
-		ADDP		InPrefetch = 0, InputBuffer
-		ADDP		KTable = 0, StateTable
-	}
-	{
-		.mmi
-		ADDP		InPtr = 0, InputBuffer
-		ADDP		OutPtr = 0, OutputBuffer
-		mov		RetVal = r0
-	}
-	;;
-	{
-		.mmi
-		lfetch.nt1	[InPrefetch], 0x80
-		ADDP		OutPrefetch = 0, OutputBuffer
-	}
-	{               // Return 0 if the input length is nonsensical
-        	.mib
-		ADDP		StateTable = 0, StateTable
-        	cmp.ge.unc  	L_NOK, L_OK = r0, DataLen
-	(L_NOK) br.ret.sptk.few rp
-	}
-	;;
-	{
-        	.mib
-        	cmp.eq.or  	L_NOK, L_OK = r0, InPtr
-        	cmp.eq.or  	L_NOK, L_OK = r0, OutPtr
-		nop		0x0
-	}
-	{
-		.mib
-        	cmp.eq.or  	L_NOK, L_OK = r0, StateTable
-		nop		0x0
-	(L_NOK) br.ret.sptk.few rp
-	}
-	;;
-		LKEY		I[1] = [KTable], SZ
-/* Prefetch the state-table. It contains 256 elements of size SZ */
-
-#if SZ == 1
-		ADDP		tmp0 = 1*128, StateTable
-#elif SZ == 2
-		ADDP		tmp0 = 3*128, StateTable
-		ADDP		tmp1 = 2*128, StateTable
-#elif SZ == 4
-		ADDP		tmp0 = 7*128, StateTable
-		ADDP		tmp1 = 6*128, StateTable
-#elif SZ == 8
-		ADDP		tmp0 = 15*128, StateTable
-		ADDP		tmp1 = 14*128, StateTable
-#endif
-		;;
-#if SZ >= 8
-		lfetch.fault.nt1		[tmp0], -256	// 15
-		lfetch.fault.nt1		[tmp1], -256;;
-		lfetch.fault.nt1		[tmp0], -256	// 13
-		lfetch.fault.nt1		[tmp1], -256;;
-		lfetch.fault.nt1		[tmp0], -256	// 11
-		lfetch.fault.nt1		[tmp1], -256;;
-		lfetch.fault.nt1		[tmp0], -256	//  9
-		lfetch.fault.nt1		[tmp1], -256;;
-#endif
-#if SZ >= 4
-		lfetch.fault.nt1		[tmp0], -256	//  7
-		lfetch.fault.nt1		[tmp1], -256;;
-		lfetch.fault.nt1		[tmp0], -256	//  5
-		lfetch.fault.nt1		[tmp1], -256;;
-#endif
-#if SZ >= 2
-		lfetch.fault.nt1		[tmp0], -256	//  3
-		lfetch.fault.nt1		[tmp1], -256;;
-#endif
-	{
-		.mii
-		lfetch.fault.nt1		[tmp0]		//  1
-		add		I[1]=1,I[1];;
-		zxt1		I[1]=I[1]
-	}
-	{
-		.mmi
-		lfetch.nt1	[InPrefetch], 0x80
-		lfetch.excl.nt1	[OutPrefetch], 0x80
-		.save		pr, PRSave
-		mov		PRSave = pr
-	} ;;
-	{
-		.mmi
-		lfetch.excl.nt1	[OutPrefetch], 0x80
-		LKEY		J = [KTable], SZ
-		ADDP		EndPtr = DataLen, InPtr
-	}  ;;
-	{
-		.mmi
-		ADDP		EndPtr = -1, EndPtr	// Make it point to
-							// last data byte.
-		mov		One = 1
-		.save		ar.lc, LCSave
-		mov		LCSave = ar.lc
-		.body
-	} ;;
-	{
-		.mmb
-		sub		Remainder = 0, OutPtr
-		cmp.gtu		pSmall, p0 = $threshold, DataLen
-(pSmall)	br.cond.dpnt	.rc4Remainder		// Data too small for
-							// big loop.
-	} ;;
-	{
-		.mmi
-		and		Remainder = 0x7, Remainder
-		;;
-		cmp.eq		pAligned, pUnaligned = Remainder, r0
-		nop		0x0
-	} ;;
-	{
-		.mmb
-.pred.rel	"mutex",pUnaligned,pAligned
-(pUnaligned)	add		Remainder = -1, Remainder
-(pAligned)	sub		Remainder = EndPtr, InPtr
-(pAligned)	br.cond.dptk.many .rc4Aligned
-	} ;;
-	{
-		.mmi
-		nop		0x0
-		nop		0x0
-		mov.i		ar.lc = Remainder
-	}
-
-/* Do the initial few bytes via the compact, modulo-scheduled loop
-   until the output pointer is 8-byte-aligned.  */
-
-		MODSCHED_RC4_PROLOGUE
-		MODSCHED_RC4_LOOP(.RC4AlignLoop)
-
-	{
-		.mib
-		sub		Remainder = EndPtr, InPtr
-		zxt1		IFinal = IFinal
-		clrrrb				// Clear CFM.rrb.pr so
-		;;				// next "mov pr.rot = N"
-						// does the right thing.
-	}
-	{
-		.mmi
-		mov		I[1] = IFinal
-		nop		0x0
-		nop		0x0
-	} ;;
-
-
-.rc4Aligned:
-
-/*
-   Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
- */
-
-	{
-		.mlx
-		add	LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
-		movl		Remainder = 0xaaaaaaaaaaaaaaab
-	} ;;
-	{
-		.mmi
-		setf.sig	f6 = LoopCount		// M2, M3	6 cyc
-		setf.sig	f7 = Remainder		// M2, M3	6 cyc
-		nop		0x0
-	} ;;
-	{
-		.mfb
-		nop		0x0
-		xmpy.hu		f6 = f6, f7
-		nop		0x0
-	} ;;
-	{
-		.mmi
-		getf.sig	LoopCount = f6;;	// M2		5 cyc
-		nop		0x0
-		shr.u		LoopCount = LoopCount, 4
-	} ;;
-	{
-		.mmi
-		nop		0x0
-		nop		0x0
-		mov.i		ar.lc = LoopCount
-	} ;;
-
-/* Now comes the unrolled loop: */
-
-.rc4Prologue:
-___
-
-$iteration = 0;
-
-# Generate the prologue:
-$predicates = 1;
-for ($i = 0; $i < $phases; ++$i) {
-    &emit_body (\$code, \$bypass, $iteration++, $predicates);
-    $predicates = ($predicates << 1) | 1;
-}
-
-$code.=<<___;
-.rc4Loop:
-___
-
-# Generate the body:
-for ($i = 0; $i < $unroll_count*$phases; ++$i) {
-    &emit_body (\$code, \$bypass, $iteration++, $predicates);
-}
-
-$code.=<<___;
-.rc4Epilogue:
-___
-
-# Generate the epilogue:
-for ($i = 0; $i < $phases; ++$i) {
-    $predicates <<= 1;
-    &emit_body (\$code, \$bypass, $iteration++, $predicates);
-}
-
-$code.=<<___;
-	{
-		.mmi
-		lfetch.nt1	[EndPtr]	// fetch line with last byte
-		mov		IFinal = I[1]
-		nop		0x0
-	}
-
-.rc4Remainder:
-	{
-		.mmi
-		sub		Remainder = EndPtr, InPtr	// Calculate
-								// # of bytes
-								// left - 1
-		nop		0x0
-		nop		0x0
-	} ;;
-	{
-		.mib
-		cmp.eq		pDone, p0 = -1, Remainder // done already?
-		mov.i		ar.lc = Remainder
-(pDone)		br.cond.dptk.few .rc4Complete
-	}
-
-/* Do the remaining bytes via the compact, modulo-scheduled loop */
-
-		MODSCHED_RC4_PROLOGUE
-		MODSCHED_RC4_LOOP(.RC4RestLoop)
-
-.rc4Complete:
-	{
-		.mmi
-		add		KTable = -SZ, KTable
-		add		IFinal = -1, IFinal
-		mov		ar.lc = LCSave
-	} ;;
-	{
-		.mii
-		SKEY		[KTable] = J,-SZ
-		zxt1		IFinal = IFinal
-		mov		pr = PRSave, 0x1FFFF
-	} ;;
-	{
-		.mib
-		SKEY		[KTable] = IFinal
-		add		RetVal = 1, r0
-		br.ret.sptk.few	rp
-	} ;;
-___
-
-# Last but not least, emit the code for the bypass-code of the unrolled loop:
-
-$code.=$bypass;
-
-$code.=<<___;
-	.endp RC4
-___
-
-print $code;
-
-close STDOUT;
diff --git a/crypto/rc4/build.info b/crypto/rc4/build.info
index 6c488890f9..8b1ed0e14e 100644
--- a/crypto/rc4/build.info
+++ b/crypto/rc4/build.info
@@ -10,23 +10,7 @@ GENERATE[rc4-md5-x86_64.s]=asm/rc4-md5-x86_64.pl $(PERLASM_SCHEME)
 
 GENERATE[rc4-parisc.s]=asm/rc4-parisc.pl $(PERLASM_SCHEME)
 
-BEGINRAW[makefile(windows)]
-{- $builddir -}\rc4-ia64.asm: {- $sourcedir -}\asm\rc4-ia64.pl
-	$(PERL) {- $sourcedir -}\asm\rc4-ia64.pl $@.S
-	$(CC) -DSZ=4 -EP $@.S > $@.i && move /Y $@.i $@
-	del /Q $@.S
-ENDRAW[makefile(windows)]
-
 BEGINRAW[Makefile]
-{- $builddir -}/rc4-ia64.s: {- $sourcedir -}/asm/rc4-ia64.pl
-	@(trap "rm $@.*" INT 0; \
-	  $(PERL) $< $(CFLAGS) $(LIB_CFLAGS) $@.S; \
-	  case `awk '/^#define RC4_INT/{print$$NF}' $(BLDDIR)/include/openssl/opensslconf.h` in \
-	  int)	set -x; $(CC) $(CFLAGS) $(LIB_CFLAGS) -DSZ=4 -E $@.S > $@.i && mv -f $@.i $@;; \
-	  char)	set -x; $(CC) $(CFLAGS) $(LIB_CFLAGS) -DSZ=1 -E $@.S > $@.i && mv -f $@.i $@;; \
-	  *)	exit 1 ;; \
-	  esac )
-
 # GNU make "catch all"
 {- $builddir -}/rc4-%.s:	{- $sourcedir -}/asm/rc4-%.pl
 	CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
diff --git a/engines/vendor_defns/hwcryptohook.h b/engines/vendor_defns/hwcryptohook.h
deleted file mode 100644
index c3dcd56f4f..0000000000
--- a/engines/vendor_defns/hwcryptohook.h
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
- *
- * Licensed under the OpenSSL license (the "License").  You may not use
- * this file except in compliance with the License.  You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- */
-
-/*-
- * ModExp / RSA (with/without KM) plugin API
- *
- * The application will load a dynamic library which
- * exports entrypoint(s) defined in this file.
- *
- * This set of entrypoints provides only a multithreaded,
- * synchronous-within-each-thread, facility.
- *
- *
- * This file is Copyright 1998-2000 nCipher Corporation Limited.
- *
- * Redistribution and use in source and binary forms, with opr without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the copyright notice,
- *    this list of conditions, and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above
- *    copyright notice, this list of conditions, and the following
- *    disclaimer, in the documentation and/or other materials provided
- *    with the distribution
- *
- * IN NO EVENT SHALL NCIPHER CORPORATION LIMITED (`NCIPHER') AND/OR
- * ANY OTHER AUTHORS OR DISTRIBUTORS OF THIS FILE BE LIABLE for any
- * damages arising directly or indirectly from this file, its use or
- * this licence.  Without prejudice to the generality of the
- * foregoing: all liability shall be excluded for direct, indirect,
- * special, incidental, consequential or other damages or any loss of
- * profits, business, revenue goodwill or anticipated savings;
- * liability shall be excluded even if nCipher or anyone else has been
- * advised of the possibility of damage.  In any event, if the
- * exclusion of liability is not effective, the liability of nCipher
- * or any author or distributor shall be limited to the lesser of the
- * price paid and 1,000 pounds sterling. This licence only fails to
- * exclude or limit liability for death or personal injury arising out
- * of negligence, and only to the extent that such an exclusion or
- * limitation is not effective.
- *
- * NCIPHER AND THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ALL
- * AND ANY WARRANTIES (WHETHER EXPRESS OR IMPLIED), including, but not
- * limited to, any implied warranties of merchantability, fitness for
- * a particular purpose, satisfactory quality, and/or non-infringement
- * of any third party rights.
- *
- * US Government use: This software and documentation is Commercial
- * Computer Software and Computer Software Documentation, as defined in
- * sub-paragraphs (a)(1) and (a)(5) of DFAR 252.227-7014, "Rights in
- * Noncommercial Computer Software and Noncommercial Computer Software
- * Documentation."  Use, duplication or disclosure by the Government is
- * subject to the terms and conditions specified here.
- *
- * By using or distributing this file you will be accepting these
- * terms and conditions, including the limitation of liability and
- * lack of warranty.  If you do not wish to accept these terms and
- * conditions, DO NOT USE THE FILE.
- *
- *
- * The actual dynamically loadable plugin, and the library files for
- * static linking, which are also provided in some distributions, are
- * not covered by the licence described above.  You should have
- * received a separate licence with terms and conditions for these
- * library files; if you received the library files without a licence,
- * please contact nCipher.
- *
- */
-
-#ifndef HWCRYPTOHOOK_H
-# define HWCRYPTOHOOK_H
-
-# include <sys/types.h>
-# include <stdio.h>
-
-# ifndef HWCRYPTOHOOK_DECLARE_APPTYPES
-#  define HWCRYPTOHOOK_DECLARE_APPTYPES 1
-# endif
-
-# define HWCRYPTOHOOK_ERROR_FAILED   -1
-# define HWCRYPTOHOOK_ERROR_FALLBACK -2
-# define HWCRYPTOHOOK_ERROR_MPISIZE  -3
-
-# if HWCRYPTOHOOK_DECLARE_APPTYPES
-
-/*-
- * These structs are defined by the application and opaque to the
- * crypto plugin.  The application may define these as it sees fit.
- * Default declarations are provided here, but the application may
- *  #define HWCRYPTOHOOK_DECLARE_APPTYPES 0
- * to prevent these declarations, and instead provide its own
- * declarations of these types.  (Pointers to them must still be
- * ordinary pointers to structs or unions, or the resulting combined
- * program will have a type inconsistency.)
- */
-typedef struct HWCryptoHook_MutexValue HWCryptoHook_Mutex;
-typedef struct HWCryptoHook_CondVarValue HWCryptoHook_CondVar;
-typedef struct HWCryptoHook_PassphraseContextValue
- HWCryptoHook_PassphraseContext;
-typedef struct HWCryptoHook_CallerContextValue HWCryptoHook_CallerContext;
-
-# endif                         /* HWCRYPTOHOOK_DECLARE_APPTYPES */
-
-/*-
- * These next two structs are opaque to the application.  The crypto
- * plugin will return pointers to them; the caller simply manipulates
- * the pointers.
- */
-typedef struct HWCryptoHook_Context *HWCryptoHook_ContextHandle;
-typedef struct HWCryptoHook_RSAKey *HWCryptoHook_RSAKeyHandle;
-
-typedef struct {
-    char *buf;
-    size_t size;
-} HWCryptoHook_ErrMsgBuf;
-/*-
- * Used for error reporting.  When a HWCryptoHook function fails it
- * will return a sentinel value (0 for pointer-valued functions, or a
- * negative number, usually HWCRYPTOHOOK_ERROR_FAILED, for
- * integer-valued ones).  It will, if an ErrMsgBuf is passed, also put
- * an error message there.
- *
- * size is the size of the buffer, and will not be modified.  If you
- * pass 0 for size you must pass 0 for buf, and nothing will be
- * recorded (just as if you passed 0 for the struct pointer).
- * Messages written to the buffer will always be null-terminated, even
- * when truncated to fit within size bytes.
- *
- * The contents of the buffer are not defined if there is no error.
- */
-
-typedef struct HWCryptoHook_MPIStruct {
-    unsigned char *buf;
-    size_t size;
-} HWCryptoHook_MPI;
-/*-
- * When one of these is returned, a pointer is passed to the function.
- * At call, size is the space available.  Afterwards it is updated to
- * be set to the actual length (which may be more than the space available,
- * if there was not enough room and the result was truncated).
- * buf (the pointer) is not updated.
- *
- * size is in bytes and may be zero at call or return, but must be a
- * multiple of the limb size.  Zero limbs at the MS end are not
- * permitted.
- */
-
-# define HWCryptoHook_InitFlags_FallbackModExp    0x0002UL
-# define HWCryptoHook_InitFlags_FallbackRSAImmed  0x0004UL
-/*-
- * Enable requesting fallback to software in case of problems with the
- * hardware support.  This indicates to the crypto provider that the
- * application is prepared to fall back to software operation if the
- * ModExp* or RSAImmed* functions return HWCRYPTOHOOK_ERROR_FALLBACK.
- * Without this flag those calls will never return
- * HWCRYPTOHOOK_ERROR_FALLBACK.  The flag will also cause the crypto
- * provider to avoid repeatedly attempting to contact dead hardware
- * within a short interval, if appropriate.
- */
-
-# define HWCryptoHook_InitFlags_SimpleForkCheck   0x0010UL
-/*-
- * Without _SimpleForkCheck the library is allowed to assume that the
- * application will not fork and call the library in the child(ren).
- *
- * When it is specified, this is allowed.  However, after a fork
- * neither parent nor child may unload any loaded keys or call
- * _Finish.  Instead, they should call exit (or die with a signal)
- * without calling _Finish.  After all the children have died the
- * parent may unload keys or call _Finish.
- *
- * This flag only has any effect on UN*X platforms.
- */
-
-typedef struct {
-    unsigned long flags;
-    void *logstream;            /* usually a FILE*.  See below. */
-    size_t limbsize;            /* bignum format - size of radix type, must
-                                 * be power of 2 */
-    int mslimbfirst;            /* 0 or 1 */
-    int msbytefirst;            /* 0 or 1; -1 = native */
-    /*-
-    * All the callback functions should return 0 on success, or a
-    * nonzero integer (whose value will be visible in the error message
-    * put in the buffer passed to the call).
-    *
-    * If a callback is not available pass a null function pointer.
-    *
-    * The callbacks may not call down again into the crypto plugin.
-    */
-    /*-
-    * For thread-safety.  Set everything to 0 if you promise only to be
-    * singlethreaded.  maxsimultaneous is the number of calls to
-    * ModExp[Crt]/RSAImmed{Priv,Pub}/RSA.  If you don't know what to
-    * put there then say 0 and the hook library will use a default.
-    *
-    * maxmutexes is a small limit on the number of simultaneous mutexes
-    * which will be requested by the library.  If there is no small
-    * limit, set it to 0.  If the crypto plugin cannot create the
-    * advertised number of mutexes the calls to its functions may fail.
-    * If a low number of mutexes is advertised the plugin will try to
-    * do the best it can.  Making larger numbers of mutexes available
-    * may improve performance and parallelism by reducing contention
-    * over critical sections.  Unavailability of any mutexes, implying
-    * single-threaded operation, should be indicated by the setting
-    * mutex_init et al to 0.
-    */
-    int maxmutexes;
-    int maxsimultaneous;
-    size_t mutexsize;
-    int (*mutex_init) (HWCryptoHook_Mutex *,
-                       HWCryptoHook_CallerContext * cactx);
-    int (*mutex_acquire) (HWCryptoHook_Mutex *);
-    void (*mutex_release) (HWCryptoHook_Mutex *);
-    void (*mutex_destroy) (HWCryptoHook_Mutex *);
-    /*-
-    * For greater efficiency, can use condition vars internally for
-    * synchronisation.  In this case maxsimultaneous is ignored, but
-    * the other mutex stuff must be available.  In singlethreaded
-    * programs, set everything to 0.
-    */
-    size_t condvarsize;
-    int (*condvar_init) (HWCryptoHook_CondVar *,
-                         HWCryptoHook_CallerContext * cactx);
-    int (*condvar_wait) (HWCryptoHook_CondVar *, HWCryptoHook_Mutex *);
-    void (*condvar_signal) (HWCryptoHook_CondVar *);
-    void (*condvar_broadcast)