summaryrefslogtreecommitdiffstats
path: root/arch/i386/lib
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/i386/lib
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/i386/lib')
-rw-r--r--arch/i386/lib/Makefile10
-rw-r--r--arch/i386/lib/bitops.c70
-rw-r--r--arch/i386/lib/checksum.S496
-rw-r--r--arch/i386/lib/dec_and_lock.c40
-rw-r--r--arch/i386/lib/delay.c49
-rw-r--r--arch/i386/lib/getuser.S70
-rw-r--r--arch/i386/lib/memcpy.c44
-rw-r--r--arch/i386/lib/mmx.c399
-rw-r--r--arch/i386/lib/putuser.S87
-rw-r--r--arch/i386/lib/strstr.c31
-rw-r--r--arch/i386/lib/usercopy.c636
11 files changed, 1932 insertions, 0 deletions
diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile
new file mode 100644
index 000000000000..7b1932d20f96
--- /dev/null
+++ b/arch/i386/lib/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for i386-specific library files..
+#
+
+
+lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \
+ bitops.o
+
+lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff --git a/arch/i386/lib/bitops.c b/arch/i386/lib/bitops.c
new file mode 100644
index 000000000000..97db3853dc82
--- /dev/null
+++ b/arch/i386/lib/bitops.c
@@ -0,0 +1,70 @@
+#include <linux/bitops.h>
+#include <linux/module.h>
+
+/**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+int find_next_bit(const unsigned long *addr, int size, int offset)
+{
+ const unsigned long *p = addr + (offset >> 5);
+ int set = 0, bit = offset & 31, res;
+
+ if (bit) {
+ /*
+ * Look for nonzero in the first 32 bits:
+ */
+ __asm__("bsfl %1,%0\n\t"
+ "jne 1f\n\t"
+ "movl $32, %0\n"
+ "1:"
+ : "=r" (set)
+ : "r" (*p >> bit));
+ if (set < (32 - bit))
+ return set + offset;
+ set = 32 - bit;
+ p++;
+ }
+ /*
+ * No set bit yet, search remaining full words for a bit
+ */
+ res = find_first_bit (p, size - 32 * (p - addr));
+ return (offset + set + res);
+}
+EXPORT_SYMBOL(find_next_bit);
+
+/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+int find_next_zero_bit(const unsigned long *addr, int size, int offset)
+{
+ unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
+ int set = 0, bit = offset & 31, res;
+
+ if (bit) {
+ /*
+ * Look for zero in the first 32 bits.
+ */
+ __asm__("bsfl %1,%0\n\t"
+ "jne 1f\n\t"
+ "movl $32, %0\n"
+ "1:"
+ : "=r" (set)
+ : "r" (~(*p >> bit)));
+ if (set < (32 - bit))
+ return set + offset;
+ set = 32 - bit;
+ p++;
+ }
+ /*
+ * No zero yet, search remaining full bytes for a zero
+ */
+ res = find_first_zero_bit (p, size - 32 * (p - (unsigned long *) addr));
+ return (offset + set + res);
+}
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/i386/lib/checksum.S b/arch/i386/lib/checksum.S
new file mode 100644
index 000000000000..94c7867ddc33
--- /dev/null
+++ b/arch/i386/lib/checksum.S
@@ -0,0 +1,496 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IP/TCP/UDP checksumming routines
+ *
+ * Authors: Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Tom May, <ftom@netcom.com>
+ * Pentium Pro/II routines:
+ * Alexander Kjeldaas <astor@guardian.no>
+ * Finn Arne Gangstad <finnag@guardian.no>
+ * Lots of code moved from tcp.c and ip.c; see those files
+ * for more names.
+ *
+ * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ * handling.
+ * Andi Kleen, add zeroing on error
+ * converted to pure assembler
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/errno.h>
+
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+ */
+
+.text
+.align 4
+.globl csum_partial
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+ /*
+ * Experiments with Ethernet and SLIP connections show that buff
+ * is aligned on either a 2-byte or 4-byte boundary. We get at
+ * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+ * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+ * alignment for the unrolled loop.
+ */
+csum_partial:
+ pushl %esi
+ pushl %ebx
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: unsigned char *buff
+ testl $3, %esi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ testl $1, %esi # Check alignment.
+ jz 10f # Jump if alignment is boundary of 2bytes.
+
+ # buf is odd
+ dec %ecx
+ jl 8f
+ movzbl (%esi), %ebx
+ adcl %ebx, %eax
+ roll $8, %eax
+ inc %esi
+ testl $2, %esi
+ jz 2f
+10:
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+1: movw (%esi), %bx
+ addl $2, %esi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, %edx
+ shrl $5, %ecx
+ jz 2f
+ testl %esi, %esi
+1: movl (%esi), %ebx
+ adcl %ebx, %eax
+ movl 4(%esi), %ebx
+ adcl %ebx, %eax
+ movl 8(%esi), %ebx
+ adcl %ebx, %eax
+ movl 12(%esi), %ebx
+ adcl %ebx, %eax
+ movl 16(%esi), %ebx
+ adcl %ebx, %eax
+ movl 20(%esi), %ebx
+ adcl %ebx, %eax
+ movl 24(%esi), %ebx
+ adcl %ebx, %eax
+ movl 28(%esi), %ebx
+ adcl %ebx, %eax
+ lea 32(%esi), %esi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl %edx, %ecx
+ andl $0x1c, %edx
+ je 4f
+ shrl $2, %edx # This clears CF
+3: adcl (%esi), %eax
+ lea 4(%esi), %esi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+ movw (%esi),%cx
+ leal 2(%esi),%esi
+ je 6f
+ shll $16,%ecx
+5: movb (%esi),%cl
+6: addl %ecx,%eax
+ adcl $0, %eax
+7:
+ testl $1, 12(%esp)
+ jz 8f
+ roll $8, %eax
+8:
+ popl %ebx
+ popl %esi
+ ret
+
+#else
+
+/* Version for PentiumII/PPro */
+
+csum_partial:
+ pushl %esi
+ pushl %ebx
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: const unsigned char *buf
+
+ testl $3, %esi
+ jnz 25f
+10:
+ movl %ecx, %edx
+ movl %ecx, %ebx
+ andl $0x7c, %ebx
+ shrl $7, %ecx
+ addl %ebx,%esi
+ shrl $2, %ebx
+ negl %ebx
+ lea 45f(%ebx,%ebx,2), %ebx
+ testl %esi, %esi
+ jmp *%ebx
+
+ # Handle 2-byte-aligned regions
+20: addw (%esi), %ax
+ lea 2(%esi), %esi
+ adcl $0, %eax
+ jmp 10b
+25:
+ testl $1, %esi
+ jz 30f
+ # buf is odd
+ dec %ecx
+ jl 90f
+ movzbl (%esi), %ebx
+ addl %ebx, %eax
+ adcl $0, %eax
+ roll $8, %eax
+ inc %esi
+ testl $2, %esi
+ jz 10b
+
+30: subl $2, %ecx
+ ja 20b
+ je 32f
+ addl $2, %ecx
+ jz 80f
+ movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
+ addl %ebx, %eax
+ adcl $0, %eax
+ jmp 80f
+32:
+ addw (%esi), %ax # csumming 2 bytes, 2-aligned
+ adcl $0, %eax
+ jmp 80f
+
+40:
+ addl -128(%esi), %eax
+ adcl -124(%esi), %eax
+ adcl -120(%esi), %eax
+ adcl -116(%esi), %eax
+ adcl -112(%esi), %eax
+ adcl -108(%esi), %eax
+ adcl -104(%esi), %eax
+ adcl -100(%esi), %eax
+ adcl -96(%esi), %eax
+ adcl -92(%esi), %eax
+ adcl -88(%esi), %eax
+ adcl -84(%esi), %eax
+ adcl -80(%esi), %eax
+ adcl -76(%esi), %eax
+ adcl -72(%esi), %eax
+ adcl -68(%esi), %eax
+ adcl -64(%esi), %eax
+ adcl -60(%esi), %eax
+ adcl -56(%esi), %eax
+ adcl -52(%esi), %eax
+ adcl -48(%esi), %eax
+ adcl -44(%esi), %eax
+ adcl -40(%esi), %eax
+ adcl -36(%esi), %eax
+ adcl -32(%esi), %eax
+ adcl -28(%esi), %eax
+ adcl -24(%esi), %eax
+ adcl -20(%esi), %eax
+ adcl -16(%esi), %eax
+ adcl -12(%esi), %eax
+ adcl -8(%esi), %eax
+ adcl -4(%esi), %eax
+45:
+ lea 128(%esi), %esi
+ adcl $0, %eax
+ dec %ecx
+ jge 40b
+ movl %edx, %ecx
+50: andl $3, %ecx
+ jz 80f
+
+ # Handle the last 1-3 bytes without jumping
+ notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
+ movl $0xffffff,%ebx # by the shll and shrl instructions
+ shll $3,%ecx
+ shrl %cl,%ebx
+ andl -128(%esi),%ebx # esi is 4-aligned so should be ok
+ addl %ebx,%eax
+ adcl $0,%eax
+80:
+ testl $1, 12(%esp)
+ jz 90f
+ roll $8, %eax
+90:
+ popl %ebx
+ popl %esi
+ ret
+
+#endif
+
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst,
+ int len, int sum, int *src_err_ptr, int *dst_err_ptr)
+ */
+
+/*
+ * Copy from ds while checksumming, otherwise like csum_partial
+ *
+ * The macros SRC and DST specify the type of access for the instruction.
+ * thus we can call a custom exception handler for all access types.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ * DST definitions? It's damn hard to trigger all cases. I hope I got
+ * them all but there's no guarantee.
+ */
+
+#define SRC(y...) \
+ 9999: y; \
+ .section __ex_table, "a"; \
+ .long 9999b, 6001f ; \
+ .previous
+
+#define DST(y...) \
+ 9999: y; \
+ .section __ex_table, "a"; \
+ .long 9999b, 6002f ; \
+ .previous
+
+.align 4
+.globl csum_partial_copy_generic
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+#define ARGBASE 16
+#define FP 12
+
+csum_partial_copy_generic:
+ subl $4,%esp
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ movl ARGBASE+16(%esp),%eax # sum
+ movl ARGBASE+12(%esp),%ecx # len
+ movl ARGBASE+4(%esp),%esi # src
+ movl ARGBASE+8(%esp),%edi # dst
+
+ testl $2, %edi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+SRC(1: movw (%esi), %bx )
+ addl $2, %esi
+DST( movw %bx, (%edi) )
+ addl $2, %edi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, FP(%esp)
+ shrl $5, %ecx
+ jz 2f
+ testl %esi, %esi
+SRC(1: movl (%esi), %ebx )
+SRC( movl 4(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, (%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 4(%edi) )
+
+SRC( movl 8(%esi), %ebx )
+SRC( movl 12(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 8(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 12(%edi) )
+
+SRC( movl 16(%esi), %ebx )
+SRC( movl 20(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 16(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 20(%edi) )
+
+SRC( movl 24(%esi), %ebx )
+SRC( movl 28(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 24(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 28(%edi) )
+
+ lea 32(%esi), %esi
+ lea 32(%edi), %edi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl FP(%esp), %edx
+ movl %edx, %ecx
+ andl $0x1c, %edx
+ je 4f
+ shrl $2, %edx # This clears CF
+SRC(3: movl (%esi), %ebx )
+ adcl %ebx, %eax
+DST( movl %ebx, (%edi) )
+ lea 4(%esi), %esi
+ lea 4(%edi), %edi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+SRC( movw (%esi), %cx )
+ leal 2(%esi), %esi
+DST( movw %cx, (%edi) )
+ leal 2(%edi), %edi
+ je 6f
+ shll $16,%ecx
+SRC(5: movb (%esi), %cl )
+DST( movb %cl, (%edi) )
+6: addl %ecx, %eax
+ adcl $0, %eax
+7:
+5000:
+
+# Exception handler:
+.section .fixup, "ax"
+
+6001:
+ movl ARGBASE+20(%esp), %ebx # src_err_ptr
+ movl $-EFAULT, (%ebx)
+
+ # zero the complete destination - computing the rest
+ # is too much work
+ movl ARGBASE+8(%esp), %edi # dst
+ movl ARGBASE+12(%esp), %ecx # len
+ xorl %eax,%eax
+ rep ; stosb
+
+ jmp 5000b
+
+6002:
+ movl ARGBASE+24(%esp), %ebx # dst_err_ptr
+ movl $-EFAULT,(%ebx)
+ jmp 5000b
+
+.previous
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ecx # equivalent to addl $4,%esp
+ ret
+
+#else
+
+/* Version for PentiumII/PPro */
+
+#define ROUND1(x) \
+ SRC(movl x(%esi), %ebx ) ; \
+ addl %ebx, %eax ; \
+ DST(movl %ebx, x(%edi) ) ;
+
+#define ROUND(x) \
+ SRC(movl x(%esi), %ebx ) ; \
+ adcl %ebx, %eax ; \
+ DST(movl %ebx, x(%edi) ) ;
+
+#define ARGBASE 12
+
+csum_partial_copy_generic:
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ movl ARGBASE+4(%esp),%esi #src
+ movl ARGBASE+8(%esp),%edi #dst
+ movl ARGBASE+12(%esp),%ecx #len
+ movl ARGBASE+16(%esp),%eax #sum
+# movl %ecx, %edx
+ movl %ecx, %ebx
+ movl %esi, %edx
+ shrl $6, %ecx
+ andl $0x3c, %ebx
+ negl %ebx
+ subl %ebx, %esi
+ subl %ebx, %edi
+ lea -1(%esi),%edx
+ andl $-32,%edx
+ lea 3f(%ebx,%ebx), %ebx
+ testl %esi, %esi
+ jmp *%ebx
+1: addl $64,%esi
+ addl $64,%edi
+ SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
+ ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
+ ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
+ ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
+ ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
+3: adcl $0,%eax
+ addl $64, %edx
+ dec %ecx
+ jge 1b
+4: movl ARGBASE+12(%esp),%edx #len
+ andl $3, %edx
+ jz 7f
+ cmpl $2, %edx
+ jb 5f
+SRC( movw (%esi), %dx )
+ leal 2(%esi), %esi
+DST( movw %dx, (%edi) )
+ leal 2(%edi), %edi
+ je 6f
+ shll $16,%edx
+5:
+SRC( movb (%esi), %dl )
+DST( movb %dl, (%edi) )
+6: addl %edx, %eax
+ adcl $0, %eax
+7:
+.section .fixup, "ax"
+6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
+ movl $-EFAULT, (%ebx)
+ # zero the complete destination (computing the rest is too much work)
+ movl ARGBASE+8(%esp),%edi # dst
+ movl ARGBASE+12(%esp),%ecx # len
+ xorl %eax,%eax
+ rep; stosb
+ jmp 7b
+6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
+ movl $-EFAULT, (%ebx)
+ jmp 7b
+.previous
+
+ popl %esi
+ popl %edi
+ popl %ebx
+ ret
+
+#undef ROUND
+#undef ROUND1
+
+#endif
diff --git a/arch/i386/lib/dec_and_lock.c b/arch/i386/lib/dec_and_lock.c
new file mode 100644
index 000000000000..ab43394dc775
--- /dev/null
+++ b/arch/i386/lib/dec_and_lock.c
@@ -0,0 +1,40 @@
+/*
+ * x86 version of "atomic_dec_and_lock()" using
+ * the atomic "cmpxchg" instruction.
+ *
+ * (For CPU's lacking cmpxchg, we use the slow
+ * generic version, and this one never even gets
+ * compiled).
+ */
+
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+{
+ int counter;
+ int newcount;
+
+repeat:
+ counter = atomic_read(atomic);
+ newcount = counter-1;
+
+ if (!newcount)
+ goto slow_path;
+
+ asm volatile("lock; cmpxchgl %1,%2"
+ :"=a" (newcount)
+ :"r" (newcount), "m" (atomic->counter), "0" (counter));
+
+ /* If the above failed, "eax" will have changed */
+ if (newcount != counter)
+ goto repeat;
+ return 0;
+
+slow_path:
+ spin_lock(lock);
+ if (atomic_dec_and_test(atomic))
+ return 1;
+ spin_unlock(lock);
+ return 0;
+}
diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c
new file mode 100644
index 000000000000..080639f262b1
--- /dev/null
+++ b/arch/i386/lib/delay.c
@@ -0,0 +1,49 @@
+/*
+ * Precise Delay Loops for i386
+ *
+ * Copyright (C) 1993 Linus Torvalds
+ * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * The __delay function must _NOT_ be inlined as its execution time
+ * depends wildly on alignment on many x86 processors. The additional
+ * jump magic is needed to get the timing stable on all the CPU's
+ * we have to worry about.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <asm/processor.h>
+#include <asm/delay.h>
+#include <asm/timer.h>
+
+#ifdef CONFIG_SMP
+#include <asm/smp.h>
+#endif
+
+extern struct timer_opts* timer;
+
+void __delay(unsigned long loops)
+{
+ cur_timer->delay(loops);
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+ int d0;
+ xloops *= 4;
+ __asm__("mull %0"
+ :"=d" (xloops), "=&a" (d0)
+ :"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4)));
+ __delay(++xloops);
+}
+
+void __udelay(unsigned long usecs)
+{
+ __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+ __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
+}
diff --git a/arch/i386/lib/getuser.S b/arch/i386/lib/getuser.S
new file mode 100644
index 000000000000..62d7f178a326
--- /dev/null
+++ b/arch/i386/lib/getuser.S
@@ -0,0 +1,70 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+#include <asm/thread_info.h>
+
+
+/*
+ * __get_user_X
+ *
+ * Inputs: %eax contains the address
+ *
+ * Outputs: %eax is error code (0 or -EFAULT)
+ * %edx contains zero-extended value
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+.text
+.align 4
+.globl __get_user_1
+__get_user_1:
+ GET_THREAD_INFO(%edx)
+ cmpl TI_addr_limit(%edx),%eax
+ jae bad_get_user
+1: movzbl (%eax),%edx
+ xorl %eax,%eax
+ ret
+
+.align 4
+.globl __get_user_2
+__get_user_2:
+ addl $1,%eax
+ jc bad_get_user
+ GET_THREAD_INFO(%edx)
+ cmpl TI_addr_limit(%edx),%eax
+ jae bad_get_user
+2: movzwl -1(%eax),%edx
+ xorl %eax,%eax
+ ret
+
+.align 4
+.globl __get_user_4
+__get_user_4:
+ addl $3,%eax
+ jc bad_get_user
+ GET_THREAD_INFO(%edx)
+ cmpl TI_addr_limit(%edx),%eax
+ jae bad_get_user
+3: movl -3(%eax),%edx
+ xorl %eax,%eax
+ ret
+
+bad_get_user:
+ xorl %edx,%edx
+ movl $-14,%eax
+ ret
+
+.section __ex_table,"a"
+ .long 1b,bad_get_user
+ .long 2b,bad_get_user
+ .long 3b,bad_get_user
+.previous
diff --git a/arch/i386/lib/memcpy.c b/arch/i386/lib/memcpy.c
new file mode 100644
index 000000000000..891b2359d18a
--- /dev/null
+++ b/arch/i386/lib/memcpy.c
@@ -0,0 +1,44 @@
+#include <linux/config.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memcpy
+#undef memset
+
+void *memcpy(void *to, const void *from, size_t n)
+{
+#ifdef CONFIG_X86_USE_3DNOW
+ return __memcpy3d(to, from, n);
+#else
+ return __memcpy(to, from, n);
+#endif
+}
+EXPORT_SYMBOL(memcpy);
+
+void *memset(void *s, int c, size_t count)
+{
+ return __memset(s, c, count);
+}
+EXPORT_SYMBOL(memset);
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+ int d0, d1, d2;
+
+ if (dest < src) {
+ memcpy(dest,src,n);
+ } else {
+ __asm__ __volatile__(
+ "std\n\t"
+ "rep\n\t"
+ "movsb\n\t"
+ "cld"
+ : "=&c" (d0), "=&S" (d1), "=&D" (d2)
+ :"0" (n),
+ "1" (n-1+(const char *)src),
+ "2" (n-1+(char *)dest)
+ :"memory");
+ }
+ return dest;
+}
+EXPORT_SYMBOL(memmove);
diff --git a/arch/i386/lib/mmx.c b/arch/i386/lib/mmx.c
new file mode 100644
index 000000000000..01f8b1a2cc84
--- /dev/null
+++ b/arch/i386/lib/mmx.c
@@ -0,0 +1,399 @@
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+
+#include <asm/i387.h>
+
+
+/*
+ * MMX 3DNow! library helper functions
+ *
+ * To do:
+ * We can use MMX just for prefetch in IRQ's. This may be a win.
+ * (reported so on K6-III)
+ * We should use a better code neutral filler for the short jump
+ * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
+ * We also want to clobber the filler register so we don't get any
+ * register forwarding stalls on the filler.
+ *
+ * Add *user handling. Checksums are not a win with MMX on any CPU
+ * tested so far for any MMX solution figured.
+ *
+ * 22/09/2000 - Arjan van de Ven
+ * Improved for non-egineering-sample Athlons
+ *
+ */
+
+void *_mmx_memcpy(void *to, const void *from, size_t len)
+{
+ void *p;
+ int i;
+
+ if (unlikely(in_interrupt()))
+ return __memcpy(to, from, len);
+
+ p = to;
+ i = len >> 6; /* len/64 */
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ "1: prefetch (%0)\n" /* This set is 28 bytes */
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from) );
+
+
+ for(; i>5; i--)
+ {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n"
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n"
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+ " movq (%0), %%mm0\n"
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n"
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ /*
+ * Now do the tail of the block
+ */
+ __memcpy(to, from, len&63);
+ kernel_fpu_end();
+ return p;
+}
+
+#ifdef CONFIG_MK7
+
+/*
+ * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
+ * other MMX using processors do not.
+ */
+
+static void fast_clear_page(void *page)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ " pxor %%mm0, %%mm0\n" : :
+ );
+
+ for(i=0;i<4096/64;i++)
+ {
+ __asm__ __volatile__ (
+ " movntq %%mm0, (%0)\n"
+ " movntq %%mm0, 8(%0)\n"
+ " movntq %%mm0, 16(%0)\n"
+ " movntq %%mm0, 24(%0)\n"
+ " movntq %%mm0, 32(%0)\n"
+ " movntq %%mm0, 40(%0)\n"
+ " movntq %%mm0, 48(%0)\n"
+ " movntq %%mm0, 56(%0)\n"
+ : : "r" (page) : "memory");
+ page+=64;
+ }
+ /* since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again.
+ */
+ __asm__ __volatile__ (
+ " sfence \n" : :
+ );
+ kernel_fpu_end();
+}
+
+static void fast_copy_page(void *to, void *from)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ /* maybe the prefetch stuff can go before the expensive fnsave...
+ * but that is for later. -AV
+ */
+ __asm__ __volatile__ (
+ "1: prefetch (%0)\n"
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from) );
+
+ for(i=0; i<(4096-320)/64; i++)
+ {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n"
+ " movntq %%mm0, (%1)\n"
+ " movq 8(%0), %%mm1\n"
+ " movntq %%mm1, 8(%1)\n"
+ " movq 16(%0), %%mm2\n"
+ " movntq %%mm2, 16(%1)\n"
+ " movq 24(%0), %%mm3\n"
+ " movntq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm4\n"
+ " movntq %%mm4, 32(%1)\n"
+ " movq 40(%0), %%mm5\n"
+ " movntq %%mm5, 40(%1)\n"
+ " movq 48(%0), %%mm6\n"
+ " movntq %%mm6, 48(%1)\n"
+ " movq 56(%0), %%mm7\n"
+ " movntq %%mm7, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ for(i=(4096-320)/64; i<4096/64; i++)
+ {
+ __asm__ __volatile__ (
+ "2: movq (%0), %%mm0\n"
+ " movntq %%mm0, (%1)\n"
+ " movq 8(%0), %%mm1\n"
+ " movntq %%mm1, 8(%1)\n"
+ " movq 16(%0), %%mm2\n"
+ " movntq %%mm2, 16(%1)\n"
+ " movq 24(%0), %%mm3\n"
+ " movntq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm4\n"
+ " movntq %%mm4, 32(%1)\n"
+ " movq 40(%0), %%mm5\n"
+ " movntq %%mm5, 40(%1)\n"
+ " movq 48(%0), %%mm6\n"
+ " movntq %%mm6, 48(%1)\n"
+ " movq 56(%0), %%mm7\n"
+ " movntq %%mm7, 56(%1)\n"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ /* since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again.
+ */
+ __asm__ __volatile__ (
+ " sfence \n" : :
+ );
+ kernel_fpu_end();
+}
+
+#else
+
+/*
+ * Generic MMX implementation without K7 specific streaming
+ */
+
+static void fast_clear_page(void *page)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ " pxor %%mm0, %%mm0\n" : :
+ );
+
+ for(i=0;i<4096/128;i++)
+ {
+ __asm__ __volatile__ (
+ " movq %%mm0, (%0)\n"
+ " movq %%mm0, 8(%0)\n"
+ " movq %%mm0, 16(%0)\n"
+ " movq %%mm0, 24(%0)\n"
+ " movq %%mm0, 32(%0)\n"
+ " movq %%mm0, 40(%0)\n"
+ " movq %%mm0, 48(%0)\n"
+ " movq %%mm0, 56(%0)\n"
+ " movq %%mm0, 64(%0)\n"
+ " movq %%mm0, 72(%0)\n"
+ " movq %%mm0, 80(%0)\n"
+ " movq %%mm0, 88(%0)\n"
+ " movq %%mm0, 96(%0)\n"
+ " movq %%mm0, 104(%0)\n"
+ " movq %%mm0, 112(%0)\n"
+ " movq %%mm0, 120(%0)\n"
+ : : "r" (page) : "memory");
+ page+=128;
+ }
+
+ kernel_fpu_end();
+}
+
+static void fast_copy_page(void *to, void *from)
+{
+ int i;
+
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ "1: prefetch (%0)\n"
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from) );
+
+ for(i=0; i<4096/64; i++)
+ {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n"
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n"
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ kernel_fpu_end();
+}
+
+
+#endif
+
+/*
+ * Favour MMX for page clear and copy.
+ */
+
+static void slow_zero_page(void * page)
+{
+ int d0, d1;
+ __asm__ __volatile__( \
+ "cld\n\t" \
+ "rep ; stosl" \
+ : "=&c" (d0), "=&D" (d1)
+ :"a" (0),"1" (page),"0" (1024)
+ :"memory");
+}
+
+void mmx_clear_page(void * page)
+{
+ if(unlikely(in_interrupt()))
+ slow_zero_page(page);
+ else
+ fast_clear_page(page);
+}
+
+static void slow_copy_page(void *to, void *from)
+{
+ int d0, d1, d2;
+ __asm__ __volatile__( \
+ "cld\n\t" \
+ "rep ; movsl" \
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
+ : "0" (1024),"1" ((long) to),"2" ((long) from) \
+ : "memory");
+}
+