Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/lib
26 files changed, 5093 insertions, 0 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
new file mode 100644
index 000000000000..1902c3c2ef92
--- /dev/null
+++ b/arch/ia64/lib/Makefile
@@ -0,0 +1,52 @@
+#
+# Makefile for ia64-specific library routines..
+#
+
+obj-y := io.o
+
+lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
+	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
+	bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
+	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
+	flush.o ip_fast_csum.o do_csum.o				\
+	memset.o strlen.o swiotlb.o
+
+lib-$(CONFIG_ITANIUM)	+= copy_page.o copy_user.o memcpy.o
+lib-$(CONFIG_MCKINLEY)	+= copy_page_mck.o memcpy_mck.o
+lib-$(CONFIG_PERFMON)	+= carta_random.o
+lib-$(CONFIG_MD_RAID5)	+= xor.o
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
+
+AFLAGS___divdi3.o	=
+AFLAGS___udivdi3.o	= -DUNSIGNED
+AFLAGS___moddi3.o	= 	     -DMODULO
+AFLAGS___umoddi3.o	= -DUNSIGNED -DMODULO
+
+AFLAGS___divsi3.o	=
+AFLAGS___udivsi3.o	= -DUNSIGNED
+AFLAGS___modsi3.o	=	     -DMODULO
+AFLAGS___umodsi3.o	= -DUNSIGNED -DMODULO
+
+$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
+	$(call if_changed_dep,as_o_S)
diff --git a/arch/ia64/lib/bitop.c b/arch/ia64/lib/bitop.c
new file mode 100644
index 000000000000..82e299c8464e
--- /dev/null
+++ b/arch/ia64/lib/bitop.c
@@ -0,0 +1,88 @@
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/intrinsics.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+/*
+ * Find next zero bit in a bitmap reasonably efficiently..
+ */
+
+int __find_next_zero_bit (const void *addr, unsigned long size, unsigned long offset)
+{
+	unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
+	unsigned long result = offset & ~63UL;
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset &= 63UL;
+	if (offset) {
+		tmp = *(p++);
+		tmp |= ~0UL >> (64-offset);
+		if (size < 64)
+			goto found_first;
+		if (~tmp)
+			goto found_middle;
+		size -= 64;
+		result += 64;
+	}
+	while (size & ~63UL) {
+		if (~(tmp = *(p++)))
+			goto found_middle;
+		result += 64;
+		size -= 64;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+found_first:
+	tmp |= ~0UL << size;
+	if (tmp == ~0UL)		/* any bits zero? */
+		return result + size;	/* nope */
+found_middle:
+	return result + ffz(tmp);
+}
+EXPORT_SYMBOL(__find_next_zero_bit);
+
+/*
+ * Find next bit in a bitmap reasonably efficiently..
+ */
+int __find_next_bit(const void *addr, unsigned long size, unsigned long offset)
+{
+	unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
+	unsigned long result = offset & ~63UL;
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset &= 63UL;
+	if (offset) {
+		tmp = *(p++);
+		tmp &= ~0UL << offset;
+		if (size < 64)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= 64;
+		result += 64;
+	}
+	while (size & ~63UL) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += 64;
+		size -= 64;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+  found_first:
+	tmp &= ~0UL >> (64-size);
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size; /* Nope. */
+  found_middle:
+	return result + __ffs(tmp);
+}
+EXPORT_SYMBOL(__find_next_bit);
diff --git a/arch/ia64/lib/carta_random.S b/arch/ia64/lib/carta_random.S
new file mode 100644
index 000000000000..d0674c360364
--- /dev/null
+++ b/arch/ia64/lib/carta_random.S
@@ -0,0 +1,54 @@
+/*
+ * Fast, simple, yet decent quality random number generator based on
+ * a paper by David G. Carta ("Two Fast Implementations of the
+ * `Minimal Standard' Random Number Generator," Communications of the
+ * ACM, January, 1990).
+ *
+ * Copyright (C) 2002 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+#define a	r2
+#define m	r3
+#define lo	r8
+#define hi	r9
+#define t0	r16
+#define t1	r17
+#define	seed	r32
+
+GLOBAL_ENTRY(carta_random32)
+	movl	a = (16807 << 16) | 16807
+	;;
+	pmpyshr2.u t0 = a, seed, 0
+	pmpyshr2.u t1 = a, seed, 16
+	;;
+	unpack2.l t0 = t1, t0
+	dep	m = -1, r0, 0, 31
+	;;
+	zxt4	lo = t0
+	shr.u	hi = t0, 32
+	;;
+	dep	t0 = 0, hi, 15, 49	// t0 = (hi & 0x7fff)
+	;;
+	shl	t0 = t0, 16		// t0 = (hi & 0x7fff) << 16
+	shr	t1 = hi, 15		// t1 = (hi >> 15)
+	;;
+	add	lo = lo, t0
+	;;
+	cmp.gtu	p6, p0 = lo, m
+	;;
+(p6)	and	lo = lo, m
+	;;
+(p6)	add	lo = 1, lo
+	;;
+	add	lo = lo, t1
+	;;
+	cmp.gtu p6, p0 = lo, m
+	;;
+(p6)	and	lo = lo, m
+	;;
+(p6)	add	lo = 1, lo
+	br.ret.sptk.many rp
+END(carta_random32)
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
new file mode 100644
index 000000000000..beb11721d9f5
--- /dev/null
+++ b/arch/ia64/lib/checksum.c
@@ -0,0 +1,102 @@
+/*
+ * Network checksum routines
+ *
+ * Copyright (C) 1999, 2003 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Most of the code coming from arch/alpha/lib/checksum.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed..
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/byteorder.h>
+
+static inline unsigned short
+from64to16 (unsigned long x)
+{
+	/* add up 32-bit words for 33 bits */
+	x = (x & 0xffffffff) + (x >> 32);
+	/* add up 16-bit and 17-bit words for 17+c bits */
+	x = (x & 0xffff) + (x >> 16);
+	/* add up 16-bit and 2-bit for 16+c bit */
+	x = (x & 0xffff) + (x >> 16);
+	/* add up carry.. */
+	x = (x & 0xffff) + (x >> 16);
+	return x;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented.
+ */
+unsigned short int
+csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
+		   unsigned short proto, unsigned int sum)
+{
+	return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
+			   ((unsigned long) proto << 8));
+}
+
+EXPORT_SYMBOL(csum_tcpudp_magic);
+
+unsigned int
+csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
+		    unsigned short proto, unsigned int sum)
+{
+	unsigned long result;
+
+	result = (saddr + daddr + sum +
+		  ((unsigned long) ntohs(len) << 16) +
+		  ((unsigned long) proto << 8));
+
+	/* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
+	/* 64 to 33 */
+	result = (result & 0xffffffff) + (result >> 32);
+	/* 33 to 32 */
+	result = (result & 0xffffffff) + (result >> 32);
+	return result;
+}
+
+extern unsigned long do_csum (const unsigned char *, long);
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+unsigned int
+csum_partial (const unsigned char * buff, int len, unsigned int sum)
+{
+	unsigned long result = do_csum(buff, len);
+
+	/* add in old sum, and carry.. */
+	result += sum;
+	/* 32+c bits -> 32 bits */
+	result = (result & 0xffffffff) + (result >> 32);
+	return result;
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+unsigned short
+ip_compute_csum (unsigned char * buff, int len)
+{
+	return ~do_csum(buff,len);
+}
+
+EXPORT_SYMBOL(ip_compute_csum);
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
new file mode 100644
index 000000000000..d4987061dda7
--- /dev/null
+++ b/arch/ia64/lib/clear_page.S
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 1999-2002 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
+ *
+ * 1/06/01 davidm	Tuned for Itanium.
+ * 2/12/02 kchen	Tuned for both Itanium and McKinley
+ * 3/08/02 davidm	Some more tweaking
+ */
+#include <linux/config.h>
+
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#ifdef CONFIG_ITANIUM
+# define L3_LINE_SIZE	64	// Itanium L3 line size
+# define PREFETCH_LINES	9	// magic number
+#else
+# define L3_LINE_SIZE	128	// McKinley L3 line size
+# define PREFETCH_LINES	12	// magic number
+#endif
+
+#define saved_lc	r2
+#define dst_fetch	r3
+#define dst1		r8
+#define dst2		r9
+#define dst3		r10
+#define dst4		r11
+
+#define dst_last	r31
+
+GLOBAL_ENTRY(clear_page)
+	.prologue
+	.regstk 1,0,0,0
+	mov r16 = PAGE_SIZE/L3_LINE_SIZE-1	// main loop count, -1=repeat/until
+	.save ar.lc, saved_lc
+	mov saved_lc = ar.lc
+
+	.body
+	mov ar.lc = (PREFETCH_LINES - 1)
+	mov dst_fetch = in0
+	adds dst1 = 16, in0
+	adds dst2 = 32, in0
+	;;
+.fetch:	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+	adds dst3 = 48, in0		// executing this multiple times is harmless
+	br.cloop.sptk.few .fetch
+	;;
+	addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
+	mov ar.lc = r16			// one L3 line per iteration
+	adds dst4 = 64, in0
+	;;
+#ifdef CONFIG_ITANIUM
+	// Optimized for Itanium
+1:	stf.spill.nta [dst1] = f0, 64
+	stf.spill.nta [dst2] = f0, 64
+	cmp.lt p8,p0=dst_fetch, dst_last
+	;;
+#else
+	// Optimized for McKinley
+1:	stf.spill.nta [dst1] = f0, 64
+	stf.spill.nta [dst2] = f0, 64
+	stf.spill.nta [dst3] = f0, 64
+	stf.spill.nta [dst4] = f0, 128
+	cmp.lt p8,p0=dst_fetch, dst_last
+	;;
+	stf.spill.nta [dst1] = f0, 64
+	stf.spill.nta [dst2] = f0, 64
+#endif
+	stf.spill.nta [dst3] = f0, 64
+(p8)	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+	br.cloop.sptk.few 1b
+	;;
+	mov ar.lc = saved_lc		// restore lc
+	br.ret.sptk.many rp
+END(clear_page)
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
new file mode 100644
index 000000000000..eecd8577b209
--- /dev/null
+++ b/arch/ia64/lib/clear_user.S
@@ -0,0 +1,209 @@
+/*
+ * This routine clears to zero a linear memory buffer in user space.
+ *
+ * Inputs:
+ *	in0:	address of buffer
+ *	in1:	length of buffer in bytes
+ * Outputs:
+ *	r8:	number of bytes that didn't get cleared due to a fault
+ *
+ * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// arguments
+//
+#define buf		r32
+#define len		r33
+
+//
+// local registers
+//
+#define cnt		r16
+#define buf2		r17
+#define saved_lc	r18
+#define saved_pfs	r19
+#define tmp		r20
+#define len2		r21
+#define len3		r22
+
+//
+// Theory of operations:
+//	- we check whether or not the buffer is small, i.e., less than 17
+//	  in which case we do the byte by byte loop.
+//
+//	- Otherwise we go progressively from 1 byte store to 8byte store in
+//	  the head part, the body is a 16byte store loop and we finish we the
+//	  tail for the last 15 bytes.
+//	  The good point about this breakdown is that the long buffer handling
+//	  contains only 2 branches.
+//
+//	The reason for not using shifting & masking for both the head and the
+//	tail is to stay semantically correct. This routine is not supposed
+//	to write bytes outside of the buffer. While most of the time this would
+//	be ok, we can't tolerate a mistake. A classical example is the case
+//	of multithreaded code were to the extra bytes touched is actually owned
+//	by another thread which runs concurrently to ours. Another, less likely,
+//	example is with device drivers where reading an I/O mapped location may
+//	have side effects (same thing for writing).
+//
+
+GLOBAL_ENTRY(__do_clear_user)
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc	saved_pfs=ar.pfs,2,0,0,0
+	cmp.eq p6,p0=r0,len		// check for zero length
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc		// preserve ar.lc (slow)
+	.body
+	;;				// avoid WAW on CFM
+	adds tmp=-1,len			// br.ctop is repeat/until
+	mov ret0=len			// return value is length at this point
+(p6)	br.ret.spnt.many rp
+	;;
+	cmp.lt p6,p0=16,len		// if len > 16 then long memset
+	mov ar.lc=tmp			// initialize lc for small count
+(p6)	br.cond.dptk .long_do_clear
+	;;				// WAR on ar.lc
+	//
+	// worst case 16 iterations, avg 8 iterations
+	//
+	// We could have played with the predicates to use the extra
+	// M slot for 2 stores/iteration but the cost the initialization
+	// the various counters compared to how long the loop is supposed
+	// to last on average does not make this solution viable.
+	//
+1:
+	EX( .Lexit1, st1 [buf]=r0,1 )
+	adds len=-1,len			// countdown length using len
+	br.cloop.dptk 1b
+	;;				// avoid RAW on ar.lc
+	//
+	// .Lexit4: comes from byte by byte loop
+	//	    len contains bytes left
+.Lexit1:
+	mov ret0=len			// faster than using ar.lc
+	mov ar.lc=saved_lc
+	br.ret.sptk.many rp		// end of short clear_user
+
+
+	//
+	// At this point we know we have more than 16 bytes to copy
+	// so we focus on alignment (no branches required)
+	//
+	// The use of len/len2 for countdown of the number of bytes left
+	// instead of ret0 is due to the fact that the exception code
+	// changes the values of r8.
+	//
+.long_do_clear:
+	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
+	;;
+	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
+(p6)	adds len=-1,len;;		// sync because buf is modified
+	tbit.nz p6,p0=buf,1
+	;;
+	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
+(p6)	adds len=-2,len;;
+	tbit.nz p6,p0=buf,2
+	;;
+	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
+(p6)	adds len=-4,len;;
+	tbit.nz p6,p0=buf,3
+	;;
+	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
+(p6)	adds len=-8,len;;
+	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
+	;;
+	cmp.eq p6,p0=r0,cnt
+	adds tmp=-1,cnt
+(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
+	;;
+	adds buf2=8,buf			// setup second base pointer
+	mov ar.lc=tmp
+	;;
+
+	//
+	// 16bytes/iteration core loop
+	//
+	// The second store can never generate a fault because
+	// we come into the loop only when we are 16-byte aligned.
+	// This means that if we cross a page then it will always be
+	// in the first store and never in the second.
+	//
+	//
+	// We need to keep track of the remaining length. A possible (optimistic)
+	// way would be to use ar.lc and derive how many byte were left by
+	// doing : left= 16*ar.lc + 16.  this would avoid the addition at
+	// every iteration.
+	// However we need to keep the synchronization point. A template
+	// M;;MB does not exist and thus we can keep the addition at no
+	// extra cycle cost (use a nop slot anyway). It also simplifies the
+	// (unlikely)  error recovery code
+	//
+
+2:	EX(.Lexit3, st8 [buf]=r0,16 )
+	;;				// needed to get len correct when error
+	st8 [buf2]=r0,16
+	adds len=-16,len
+	br.cloop.dptk 2b
+	;;
+	mov ar.lc=saved_lc
+	//
+	// tail correction based on len only
+	//
+	// We alternate the use of len3,len2 to allow parallelism and correct
+	// error handling. We also reuse p6/p7 to return correct value.
+	// The addition of len2/len3 does not cost anything more compared to
+	// the regular memset as we had empty slots.
+	//
+.dotail:
+	mov len2=len			// for parallelization of error handling
+	mov len3=len
+	tbit.nz p6,p0=len,3
+	;;
+	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
+(p6)	adds len3=-8,len2
+	tbit.nz p7,p6=len,2
+	;;
+	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
+(p7)	adds len2=-4,len3
+	tbit.nz p6,p7=len,1
+	;;
+	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
+(p6)	adds len3=-2,len2
+	tbit.nz p7,p6=len,0
+	;;
+	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
+	mov ret0=r0				// success
+	br.ret.sptk.many rp			// end of most likely path
+
+	//
+	// Outlined error handling code
+	//
+
+	//
+	// .Lexit3: comes from core loop, need restore pr/lc
+	//	    len contains bytes left
+	//
+	//
+	// .Lexit2:
+	//	if p6 -> coming from st8 or st2 : len2 contains what's left
+	//	if p7 -> coming from st4 or st1 : len3 contains what's left
+	// We must restore lc/pr even though might not have been used.
+.Lexit2:
+	.pred.rel "mutex", p6, p7
+(p6)	mov len=len2
+(p7)	mov len=len3
+	;;
+	//
+	// .Lexit4: comes from head, need not restore pr/lc
+	//	    len contains bytes left
+	//
+.Lexit3:
+	mov ret0=len
+	mov ar.lc=saved_lc
+	br.ret.sptk.many rp
+END(__do_clear_user)
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
new file mode 100644
index 000000000000..127d1d050d78
--- /dev/null
+++ b/arch/ia64/lib/copy_page.S
@@ -0,0 +1,98 @@
+/*
+ *
+ * Optimized version of the standard copy_page() function
+ *
+ * Inputs:
+ *	in0:	address of target page
+ *	in1:	address of source page
+ * Output:
+ *	no return value
+ *
+ * Copyright (C) 1999, 2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *	David Mosberger <davidm@hpl.hp.com>
+ *
+ * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#define PIPE_DEPTH	3
+#define EPI		p[PIPE_DEPTH-1]
+
+#define lcount		r16
+#define saved_pr	r17
+#define saved_lc	r18
+#define saved_pfs	r19
+#define src1		r20
+#define src2		r21
+#define tgt1		r22
+#define tgt2		r23
+#define srcf		r24
+#define tgtf		r25
+#define tgt_last	r26
+
+#define Nrot		((8*PIPE_DEPTH+7)&~7)
+
+GLOBAL_ENTRY(copy_page)
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
+
+	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
+	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
+	.rotp p[PIPE_DEPTH]
+
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc
+	mov ar.ec=PIPE_DEPTH
+
+	mov lcount=PAGE_SIZE/64-1
+	.save pr, saved_pr
+	mov saved_pr=pr
+	mov pr.rot=1<<16
+
+	.body
+
+	mov src1=in1
+	adds src2=8,in1
+	mov tgt_last = PAGE_SIZE
+	;;
+	adds tgt2=8,in0
+	add srcf=512,in1
+	mov ar.lc=lcount
+	mov tgt1=in0
+	add tgtf=512,in0
+	add tgt_last = tgt_last, in0
+	;;
+1:
+(p[0])	ld8 t1[0]=[src1],16
+(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16
+(p[0])	ld8 t2[0]=[src2],16
+(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
+	cmp.ltu p6,p0 = tgtf, tgt_last
+	;;
+(p[0])	ld8 t3[0]=[src1],16
+(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
+(p[0])	ld8 t4[0]=[src2],16
+(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
+	;;
+(p[0])	ld8 t5[0]=[src1],16
+(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
+(p[0])	ld8 t6[0]=[src2],16
+(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
+	;;
+(p[0])	ld8 t7[0]=[src1],16
+(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
+(p[0])	ld8 t8[0]=[src2],16
+(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
+
+(p6)	lfetch [srcf], 64
+(p6)	lfetch [tgtf], 64
+	br.ctop.sptk.few 1b
+	;;
+	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
+	mov ar.pfs=saved_pfs
+	mov ar.lc=saved_lc
+	br.ret.sptk.many rp
+END(copy_page)
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
new file mode 100644
index 000000000000..3c45d60a81b4
--- /dev/null
+++ b/arch/ia64/lib/copy_page_mck.S
@@ -0,0 +1,185 @@
+/*
+ * McKinley-optimized version of copy_page().
+ *
+ * Copyright (C) 2002 Hewlett-Packard Co
+ *	David Mosberger <davidm@hpl.hp.com>
+ *
+ * Inputs:
+ *	in0:	address of target page
+ *	in1:	address of source page
+ * Output:
+ *	no return value
+ *
+ * General idea:
+ *	- use regular loads and stores to prefetch data to avoid consuming M-slot just for
+ *	  lfetches => good for in-cache performance
+ *	- avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
+ *	  cycle
+ *
+ * Principle of operation:
+ *	First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
+ *	To avoid secondary misses in L2, we prefetch both source and destination with a line-size
+ *	of 128 bytes.  When both of these lines are in the L2 and the first half of the
+ *	source line is in L1, we start copying the remaining words.  The second half of the
+ *	source line is prefetched in an earlier iteration, so that by the time we start
+ *	accessing it, it's also present in the L1.
+ *
+ *	We use a software-pipelined loop to control the overall operation.  The pipeline
+ *	has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
+ *	source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
+ *	cache-lines, the last K stages are used to copy the cache-line words not copied by
+ *	the prefetches.  The four relevant points in the pipelined are called A, B, C, D:
+ *	p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
+ *	should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
+ *	into L1D and p[D] is TRUE if a cacheline needs to be copied.
+ *
+ *	This all sounds very complicated, but thanks to the modulo-scheduled loop support,
+ *	the resulting code is very regular and quite easy to follow (once you get the idea).
+ *
+ *	As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
+ *	as the separate .prefetch_loop.  Logically, this loop performs exactly like the
+ *	main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
+ *	so that each loop iteration is faster (again, good for cached case).
+ *
+ *	When reading the code, it helps to keep the following picture in mind:
+ *
+ *	       word 0 word 1
+ *            +------+------+---
+ *	      |	v[x] | 	t1  | ^
+ *	      |	t2   |	t3  | |
+ *	      |	t4   |	t5  | |
+ *	      |	t6   |	t7  | | 128 bytes
+ *     	      |	n[y] | 	t9  | |	(L2 cache line)
+ *	      |	t10  | 	t11 | |
+ *	      |	t12  | 	t13 | |
+ *	      |	t14  | 	t15 | v
+ *	      +------+------+---
+ *
+ *	Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
+ *	to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
+ *	an order that avoids bank conflicts.
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#define PREFETCH_DIST	8		// McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
+
+#define src0		r2
+#define src1		r3
+#define dst0		r9
+#define dst1		r10
+#define src_pre_mem	r11
+#define dst_pre_mem	r14
+#define src_pre_l2	r15
+#define dst_pre_l2	r16
+#define t1		r17
+#define t2		r18
+#define t3		r19
+#define t4		r20
+#define t5		t1	// alias!
+#define t6		t2	// alias!
+#define t7		t3	// alias!
+#define t9		t5	// alias!
+#define t10		t4	// alias!
+#define t11		t7	// alias!
+#define t12		t6	// alias!
+#define t14		t10	// alias!
+#define t13		r21
+#define t15		r22
+
+#define saved_lc	r23
+#define saved_pr	r24
+
+#define	A	0
+#define B	(PREFETCH_DIST)
+#define C	(B + PREFETCH_DIST)
+#define D	(C + 3)
+#define N	(D + 1)
+#define Nrot	((N + 7) & ~7)
+
+GLOBAL_ENTRY(copy_page)
+	.prologue
+	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
+
+	.rotr v[2*PREFETCH_DIST], n[D-C+1]
+	.rotp p[N]
+
+	.save ar.lc, saved_lc
+	mov saved_lc = ar.lc
+	.save pr, saved_pr
+	mov saved_pr = pr
+	.body
+
+	mov src_pre_mem = in1
+	mov pr.rot = 0x10000
+	mov ar.ec = 1				// special unrolled loop
+
+	mov dst_pre_mem = in0
+	mov ar.lc = 2*PREFETCH_DIST - 1
+
+	add src_pre_l2 = 8*8, in1
+	add dst_pre_l2 = 8*8, in0
+	add src0 = 8, in1			// first t1 src
+	add src1 = 3*8, in1			// first t3 src
+	add dst0 = 8, in0			// first t1 dst
+	add dst1 = 3*8, in0			// first t3 dst
+	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
+	nop.m 0
+	nop.i 0
+	;;
+	// same as .line_copy loop, but with all predicated-off instructions removed:
+.prefetch_loop:
+(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
+(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
+	br.ctop.sptk .prefetch_loop
+	;;
+	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
+	mov ar.lc = t1				// with 64KB pages, t1 is too big to fit in 8 bits!
+	mov ar.ec = N				// # of stages in pipeline
+	;;
+.line_copy:
+(p[D])	ld8 t2 = [src0], 3*8			// M0
+(p[D])	ld8 t4 = [src1], 3*8			// M1
+(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
+(p[D])	st8 [dst_pre_l2] = n[D-C], 128		// M3 prefetch dst from L2
+	;;
+(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
+(p[C])	ld8 n[0] = [src_pre_l2], 128		// M1 prefetch src from L2
+(p[D])	st8 [dst0] =  t1, 8			// M2
+(p[D])	st8 [dst1] =  t3, 8			// M3
+	;;
+(p[D])	ld8  t5 = [src0], 8
+(p[D])	ld8  t7 = [src1], 3*8
+(p[D])	st8 [dst0] =  t2, 3*8
+(p[D])	st8 [dst1] =  t4, 3*8
+	;;
+(p[D])	ld8  t6 = [src0], 3*8
+(p[D])	ld8 t10 = [src1], 8
+(p[D])	st8 [dst0] =  t5, 8
+(p[D])	st8 [dst1] =  t7, 3*8
+	;;
+(p[D])	ld8  t9 = [src0], 3*8
+(p[D])	ld8 t11 = [src1], 3*8
+(p[D])	st8 [dst0] =  t6, 3*8
+(p[D])	st8 [dst1] = t10, 8
+	;;
+(p[D])	ld8 t12 = [src0], 8
+(p[D])	ld8 t14 = [src1], 8
+(p[D])	st8 [dst0] =  t9, 3*8
+(p[D])	st8 [dst1] = t11, 3*8
+	;;
+(p[D])	ld8 t13 = [src0], 4*8
+(p[D])	ld8 t15 = [src1], 4*8
+(p[D])	st8 [dst0] = t12, 8
+(p[D])	st8 [dst1] = t14, 8
+	;;
+(p[D-1])ld8  t1 = [src0], 8
+(p[D-1])ld8  t3 = [src1], 8
+(p[D])	st8 [dst0] = t13, 4*8
+(p[D])	st8 [dst1] = t15, 4*8
+	br.ctop.sptk .line_copy
+	;;
+	mov ar.lc = saved_lc
+	mov pr = saved_pr, -1
+	br.ret.sptk.many rp
+END(copy_page)
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
new file mode 100644
index 000000000000..c952bdc6a093
--- /dev/null
+++ b/arch/ia64/lib/copy_user.S
@@ -0,0 +1,610 @@
+/*
+ *
+ * Optimized version of the copy_user() routine.
+ * It is used to copy date across the kernel/user boundary.
+ *
+ * The source and destination are always on opposite side of
+ * the boundary. When reading from user space we must catch
+ * faults on loads. When writing to user space we must catch
+ * errors on stores. Note that because of the nature of the copy
+ * we don't need to worry about overlapping regions.
+ *
+ *
+ * Inputs:
+ *	in0	address of source buffer
+ *	in1	address of destination buffer
+ *	in2	number of bytes to copy
+ *
+ * Outputs:
+ *	ret0	0 in case of success. The number of bytes NOT copied in
+ *		case of error.
+ *
+ * Copyright (C) 2000-2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Fixme:
+ *	- handle the case where we have more than 16 bytes and the alignment
+ *	  are different.
+ *	- more benchmarking
+ *	- fix extraneous stop bit introduced by the EX() macro.
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// Tuneable parameters
+//
+#define COPY_BREAK	16	// we do byte copy below (must be >=16)
+#define PIPE_DEPTH	21	// pipe depth
+
+#define EPI		p[PIPE_DEPTH-1]
+
+//
+// arguments
+//
+#define dst		in0
+#define src		in1
+#define len		in2
+
+//
+// local registers
+//
+#define t1		r2	// rshift in bytes
+#define t2		r3	// lshift in bytes
+#define rshift		r14	// right shift in bits
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/lib