add idl4k kernel firmware version 1.13.0.105

2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions
--- a/kernel/arch/ia64/lib/Makefile
+++ b/kernel/arch/ia64/lib/Makefile
@@ -0,0 +1,50 @@
+#
+# Makefile for ia64-specific library routines.
+#
+
+# Object linked in unconditionally via obj-y.
+obj-y := io.o
+
+# Routines placed on the lib-y list.  The eight __{,u}{div,mod}{si,di}3
+# objects have no source file of their own; they are produced by the
+# explicit rules at the bottom of this file.
+lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
+	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
+	checksum.o clear_page.o csum_partial_copy.o			\
+	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
+	flush.o ip_fast_csum.o do_csum.o				\
+	memset.o strlen.o xor.o
+
+# CPU-specific copy routines: one set per CONFIG_ITANIUM / CONFIG_MCKINLEY.
+obj-$(CONFIG_ITANIUM)	+= copy_page.o copy_user.o memcpy.o
+obj-$(CONFIG_MCKINLEY)	+= copy_page_mck.o memcpy_mck.o
+# Only added when CONFIG_PERFMON is set.
+lib-$(CONFIG_PERFMON)	+= carta_random.o
+
+# Per-object assembler flags for the 64-bit division helpers.  All four
+# objects come from idiv64.S; -DUNSIGNED and -DMODULO pick the variant.
+AFLAGS___divdi3.o	=
+AFLAGS___udivdi3.o	= -DUNSIGNED
+AFLAGS___moddi3.o	= 	     -DMODULO
+AFLAGS___umoddi3.o	= -DUNSIGNED -DMODULO
+
+# Same scheme for the 32-bit helpers, all of which come from idiv32.S.
+AFLAGS___divsi3.o	=
+AFLAGS___udivsi3.o	= -DUNSIGNED
+AFLAGS___modsi3.o	=	     -DMODULO
+AFLAGS___umodsi3.o	= -DUNSIGNED -DMODULO
+
+# Explicit rules: the object name differs from the source name, so each
+# object lists its shared source as a prerequisite and is assembled with
+# the as_o_S command.
+$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
+	$(call if_changed_dep,as_o_S)
+
+$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
+	$(call if_changed_dep,as_o_S)
--- a/kernel/arch/ia64/lib/carta_random.S
+++ b/kernel/arch/ia64/lib/carta_random.S
@@ -0,0 +1,54 @@
+/*
+ * Fast, simple, yet decent quality random number generator based on
+ * a paper by David G. Carta ("Two Fast Implementations of the
+ * `Minimal Standard' Random Number Generator," Communications of the
+ * ACM, January, 1990).
+ *
+ * Copyright (C) 2002 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+#define a	r2
+#define m	r3
+#define lo	r8
+#define hi	r9
+#define t0	r16
+#define t1	r17
+#define	seed	r32
+
+/*
+ * carta_random32 - advance the generator by one step.
+ *
+ * Input:   seed (r32)
+ * Result:  left in lo (r8) when the routine returns
+ * Scratch: a (r2), m (r3), hi (r9), t0 (r16), t1 (r17), p6
+ *
+ * Once the product has been split into lo and hi, the code computes
+ *	lo = lo + ((hi & 0x7fff) << 16)
+ *	if (lo > m) lo = (lo & m) + 1
+ *	lo = lo + (hi >> 15)
+ *	if (lo > m) lo = (lo & m) + 1
+ * with m = 0x7fffffff and unsigned comparisons.
+ */
+GLOBAL_ENTRY(carta_random32)
+	movl	a = (16807 << 16) | 16807	// 16807 in bits 0-15 and in bits 16-31
+	;;
+	// NOTE(review): the two pmpyshr2.u results presumably hold the low
+	// (shift 0) and high (shift 16) halves of the per-16-bit-lane
+	// products, which unpack2.l then interleaves -- confirm against the
+	// instruction set reference.
+	pmpyshr2.u t0 = a, seed, 0
+	pmpyshr2.u t1 = a, seed, 16
+	;;
+	unpack2.l t0 = t1, t0
+	dep	m = -1, r0, 0, 31		// m = 0x7fffffff (31 one bits deposited into zero)
+	;;
+	zxt4	lo = t0				// lo = low 32 bits of t0
+	shr.u	hi = t0, 32			// hi = high 32 bits of t0
+	;;
+	dep	t0 = 0, hi, 15, 49	// t0 = (hi & 0x7fff)
+	;;
+	shl	t0 = t0, 16		// t0 = (hi & 0x7fff) << 16
+	shr	t1 = hi, 15		// t1 = (hi >> 15)
+	;;
+	add	lo = lo, t0
+	;;
+	cmp.gtu	p6, p0 = lo, m		// first reduction: only if lo > m
+	;;
+(p6)	and	lo = lo, m
+	;;
+(p6)	add	lo = 1, lo
+	;;
+	add	lo = lo, t1
+	;;
+	cmp.gtu p6, p0 = lo, m		// second reduction, same form as the first
+	;;
+(p6)	and	lo = lo, m
+	;;
+(p6)	add	lo = 1, lo
+	br.ret.sptk.many rp
+END(carta_random32)
--- a/kernel/arch/ia64/lib/checksum.c
+++ b/kernel/arch/ia64/lib/checksum.c
@@ -0,0 +1,101 @@
+/*
+ * Network checksum routines
+ *
+ * Copyright (C) 1999, 2003 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Most of the code coming from arch/alpha/lib/checksum.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed..
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/byteorder.h>
+
+/*
+ * Fold a 64-bit running sum down to 16 bits by repeatedly adding the
+ * upper part of the value onto the lower part.
+ */
+static inline unsigned short
+from64to16 (unsigned long x)
+{
+	int pass;
+
+	/* 64 -> 33 bits: add the two 32-bit halves together */
+	x = (x >> 32) + (x & 0xffffffff);
+
+	/*
+	 * Three 16-bit folds: 33 -> 17+carry bits, then 16+carry bits,
+	 * and a last one to absorb the remaining carry.
+	 */
+	for (pass = 0; pass < 3; pass++)
+		x = (x >> 16) + (x & 0xffff);
+
+	return x;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented.
+ */
+__sum16
+csum_tcpudp_magic (__be32 saddr, __be32 daddr, unsigned short len,
+		   unsigned short proto, __wsum sum)
+{
+	u64 acc;
+
+	/* accumulate every pseudo-header field in 64-bit arithmetic */
+	acc  = (__force u64)saddr;
+	acc += (__force u64)daddr;
+	acc += (__force u64)sum;
+	acc += (len + proto) << 8;
+
+	/* fold to 16 bits and hand back the complement */
+	return (__force __sum16)~from64to16(acc);
+}
+
+EXPORT_SYMBOL(csum_tcpudp_magic);
+
+/*
+ * Same pseudo-header sum as csum_tcpudp_magic(), but the result is only
+ * reduced to 32 bits and is not complemented.
+ */
+__wsum
+csum_tcpudp_nofold (__be32 saddr, __be32 daddr, unsigned short len,
+		    unsigned short proto, __wsum sum)
+{
+	unsigned long acc;
+	int fold;
+
+	acc  = (__force u64)saddr;
+	acc += (__force u64)daddr;
+	acc += (__force u64)sum;
+	acc += (len + proto) << 8;
+
+	/* Two folds of the upper half onto the lower: 64 -> 33 -> 32 bits. */
+	for (fold = 0; fold < 2; fold++)
+		acc = (acc >> 32) + (acc & 0xffffffff);
+
+	return (__force __wsum)acc;
+}
+EXPORT_SYMBOL(csum_tcpudp_nofold);
+
+extern unsigned long do_csum (const unsigned char *, long);
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+	/* checksum of the block plus the caller's running sum, kept in 64 bits */
+	u64 acc = do_csum(buff, len) + (u64)(__force u32)sum;
+
+	/* bring the carry above bit 31 back into the low 32 bits */
+	acc = (acc >> 32) + (acc & 0xffffffff);
+
+	return (__force __wsum)acc;
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+__sum16 ip_compute_csum (const void *buff, int len)
+{
+	unsigned long raw = do_csum(buff, len);
+
+	/* the caller gets the complement of what do_csum() produced */
+	return (__force __sum16)~raw;
+}
+
+EXPORT_SYMBOL(ip_compute_csum);
--- a/kernel/arch/ia64/lib/clear_page.S
+++ b/kernel/arch/ia64/lib/clear_page.S
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 1999-2002 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
+ *
+ * 1/06/01 davidm	Tuned for Itanium.
+ * 2/12/02 kchen	Tuned for both Itanium and McKinley
+ * 3/08/02 davidm	Some more tweaking
+ */
+
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#ifdef CONFIG_ITANIUM
+# define L3_LINE_SIZE	64	// Itanium L3 line size
+# define PREFETCH_LINES	9	// magic number
+#else
+# define L3_LINE_SIZE	128	// McKinley L3 line size
+# define PREFETCH_LINES	12	// magic number
+#endif
+
+#define saved_lc	r2
+#define dst_fetch	r3
+#define dst1		r8
+#define dst2		r9
+#define dst3		r10
+#define dst4		r11
+
+#define dst_last	r31
+
+/*
+ * clear_page - zero one page.
+ *
+ * Input: in0 = address of the page; PAGE_SIZE bytes are written.
+ *
+ * Every store is a 16-byte stf.spill.nta of f0.  The work is split in two:
+ *
+ *   .fetch loop: PREFETCH_LINES stores through dst_fetch, one at offset 0
+ *		  of each of the first PREFETCH_LINES L3 lines.
+ *   main loop:   PAGE_SIZE/L3_LINE_SIZE iterations.  Each iteration writes
+ *		  the remaining 16-byte slots of one line through dst1..dst4
+ *		  and, while dst_fetch is still below dst_last (in0 +
+ *		  PAGE_SIZE), writes offset 0 of a line further ahead.
+ *
+ * ar.lc is saved in saved_lc and restored before returning.
+ */
+GLOBAL_ENTRY(clear_page)
+	.prologue
+	.regstk 1,0,0,0
+	mov r16 = PAGE_SIZE/L3_LINE_SIZE-1	// main loop count, -1=repeat/until
+	.save ar.lc, saved_lc
+	mov saved_lc = ar.lc
+
+	.body
+	mov ar.lc = (PREFETCH_LINES - 1)	// .fetch loop count, also repeat/until
+	mov dst_fetch = in0
+	adds dst1 = 16, in0
+	adds dst2 = 32, in0
+	;;
+.fetch:	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+	adds dst3 = 48, in0		// executing this multiple times is harmless
+	br.cloop.sptk.few .fetch
+	;;
+	// dst_fetch is now in0 + PREFETCH_LINES*L3_LINE_SIZE, so dst_last
+	// becomes in0 + PAGE_SIZE.
+	addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
+	mov ar.lc = r16			// one L3 line per iteration
+	adds dst4 = 64, in0		// only stored through in the McKinley variant
+	;;
+#ifdef CONFIG_ITANIUM
+	// Optimized for Itanium
+	// 64-byte line: offsets 16, 32 and 48 through dst1/dst2/dst3,
+	// offset 0 through dst_fetch.
+1:	stf.spill.nta [dst1] = f0, 64
+	stf.spill.nta [dst2] = f0, 64
+	cmp.lt p8,p0=dst_fetch, dst_last
+	;;
+#else
+	// Optimized for McKinley
+	// 128-byte line: dst1/dst2/dst3 are each used twice per iteration
+	// (offsets 16/32/48, then 80/96/112), dst4 covers offset 64 and
+	// dst_fetch covers offset 0.
+1:	stf.spill.nta [dst1] = f0, 64
+	stf.spill.nta [dst2] = f0, 64
+	stf.spill.nta [dst3] = f0, 64
+	stf.spill.nta [dst4] = f0, 128
+	cmp.lt p8,p0=dst_fetch, dst_last
+	;;
+	stf.spill.nta [dst1] = f0, 64
+	stf.spill.nta [dst2] = f0, 64
+#endif
+	stf.spill.nta [dst3] = f0, 64
+(p8)	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE	// skipped once dst_fetch reaches dst_last
+	br.cloop.sptk.few 1b
+	;;
+	mov ar.lc = saved_lc		// restore lc
+	br.ret.sptk.many rp
+END(clear_page)
--- a/kernel/arch/ia64/lib/clear_user.S
+++ b/kernel/arch/ia64/lib/clear_user.S
@@ -0,0 +1,209 @@
+/*
+ * This routine clears to zero a linear memory buffer in user space.
+ *
+ * Inputs:
+ *	in0:	address of buffer
+ *	in1:	length of buffer in bytes
+ * Outputs:
+ *	r8:	number of bytes that didn't get cleared due to a fault
+ *
+ * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// arguments
+//
+#define buf		r32
+#define len		r33
+
+//
+// local registers
+//
+#define cnt		r16
+#define buf2		r17
+#define saved_lc	r18
+#define saved_pfs	r19
+#define tmp		r20
+#define len2		r21
+#define len3		r22
+
+//
+// Theory of operations:
+//	- we check whether or not the buffer is small, i.e., less than 17
+//	  in which case we do the byte by byte loop.
+//
+//	- Otherwise we go progressively from 1 byte store to 8byte store in
+//	  the head part, the body is a 16byte store loop and we finish with the
+//	  tail for the last 15 bytes.
+//	  The good point about this breakdown is that the long buffer handling
+//	  contains only 2 branches.
+//
+//	The reason for not using shifting & masking for both the head and the
+//	tail is to stay semantically correct. This routine is not supposed
+//	to write bytes outside of the buffer. While most of the time this would
+//	be ok, we can't tolerate a mistake. A classical example is the case
+//	of multithreaded code where the extra bytes touched are actually owned
+//	by another thread which runs concurrently to ours. Another, less likely,
+//	example is with device drivers where reading an I/O mapped location may
+//	have side effects (same thing for writing).
+//
+
+//
+// __do_clear_user(buf, len)
+//
+//	in0 (buf):	address of the user buffer
+//	in1 (len):	number of bytes to clear
+//	ret0:		0 on success, otherwise the number of bytes that were
+//			not cleared because a store faulted
+//
+// ar.lc is saved in saved_lc on entry.  It is overwritten before the
+// short/long decision is taken, so every exit path other than the
+// zero-length return must write saved_lc back.
+//
+GLOBAL_ENTRY(__do_clear_user)
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc	saved_pfs=ar.pfs,2,0,0,0
+	cmp.eq p6,p0=r0,len		// check for zero length
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc		// preserve ar.lc (slow)
+	.body
+	;;				// avoid WAW on CFM
+	adds tmp=-1,len			// br.ctop is repeat/until
+	mov ret0=len			// return value is length at this point
+(p6)	br.ret.spnt.many rp		// len == 0: ar.lc has not been touched yet
+	;;
+	cmp.lt p6,p0=16,len		// if len > 16 then long memset
+	mov ar.lc=tmp			// initialize lc for small count
+					// (executed on the long path as well)
+(p6)	br.cond.dptk .long_do_clear
+	;;				// WAR on ar.lc
+	//
+	// worst case 16 iterations, avg 8 iterations
+	//
+	// We could have played with the predicates to use the extra
+	// M slot for 2 stores/iteration but the cost the initialization
+	// the various counters compared to how long the loop is supposed
+	// to last on average does not make this solution viable.
+	//
+1:
+	EX( .Lexit1, st1 [buf]=r0,1 )
+	adds len=-1,len			// countdown length using len
+	br.cloop.dptk 1b
+	;;				// avoid RAW on ar.lc
+	//
+	// .Lexit1: reached by falling out of the byte by byte loop, or
+	//	    from a fault inside it.
+	//	    len contains bytes left
+.Lexit1:
+	mov ret0=len			// faster than using ar.lc
+	mov ar.lc=saved_lc
+	br.ret.sptk.many rp		// end of short clear_user
+
+
+	//
+	// At this point we know we have more than 16 bytes to copy
+	// so we focus on alignment (no branches required)
+	//
+	// The use of len/len2 for countdown of the number of bytes left
+	// instead of ret0 is due to the fact that the exception code
+	// changes the values of r8.
+	//
+.long_do_clear:
+	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
+	;;
+	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
+(p6)	adds len=-1,len;;		// sync because buf is modified
+	tbit.nz p6,p0=buf,1
+	;;
+	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
+(p6)	adds len=-2,len;;
+	tbit.nz p6,p0=buf,2
+	;;
+	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
+(p6)	adds len=-4,len;;
+	tbit.nz p6,p0=buf,3
+	;;
+	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
+(p6)	adds len=-8,len;;
+	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
+	;;
+	cmp.eq p6,p0=r0,cnt
+	adds tmp=-1,cnt
+(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
+	;;
+	adds buf2=8,buf			// setup second base pointer
+	mov ar.lc=tmp
+	;;
+
+	//
+	// 16bytes/iteration core loop
+	//
+	// The second store can never generate a fault because
+	// we come into the loop only when we are 16-byte aligned.
+	// This means that if we cross a page then it will always be
+	// in the first store and never in the second.
+	//
+	//
+	// We need to keep track of the remaining length. A possible (optimistic)
+	// way would be to use ar.lc and derive how many byte were left by
+	// doing : left= 16*ar.lc + 16.  this would avoid the addition at
+	// every iteration.
+	// However we need to keep the synchronization point. A template
+	// M;;MB does not exist and thus we can keep the addition at no
+	// extra cycle cost (use a nop slot anyway). It also simplifies the
+	// (unlikely)  error recovery code
+	//
+
+2:	EX(.Lexit3, st8 [buf]=r0,16 )
+	;;				// needed to get len correct when error
+	st8 [buf2]=r0,16
+	adds len=-16,len
+	br.cloop.dptk 2b
+	;;
+	//
+	// tail correction based on len only
+	//
+	// We alternate the use of len3,len2 to allow parallelism and correct
+	// error handling. We also reuse p6/p7 to return correct value.
+	// The addition of len2/len3 does not cost anything more compared to
+	// the regular memset as we had empty slots.
+	//
+	// ar.lc is restored after the label rather than before it: the tail
+	// is also entered by the branch taken when cnt == 0 (core loop
+	// skipped), and ar.lc was already overwritten before .long_do_clear.
+	// Restoring it only on the fall-through path would let the success
+	// return below leave the caller's ar.lc clobbered.
+	//
+.dotail:
+	mov ar.lc=saved_lc
+	mov len2=len			// for parallelization of error handling
+	mov len3=len
+	tbit.nz p6,p0=len,3
+	;;
+	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
+(p6)	adds len3=-8,len2
+	tbit.nz p7,p6=len,2
+	;;
+	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
+(p7)	adds len2=-4,len3
+	tbit.nz p6,p7=len,1
+	;;
+	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
+(p6)	adds len3=-2,len2
+	tbit.nz p7,p6=len,0
+	;;
+	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
+	mov ret0=r0				// success
+	br.ret.sptk.many rp			// end of most likely path
+
+	//
+	// Outlined error handling code
+	//
+
+	//
+	// .Lexit2: comes from the tail
+	//	if p6 -> coming from st8 or st2 : len2 contains what's left
+	//	if p7 -> coming from st4 or st1 : len3 contains what's left
+	// Falls through to .Lexit3 once len has been fixed up.
+.Lexit2:
+	.pred.rel "mutex", p6, p7
+(p6)	mov len=len2
+(p7)	mov len=len3
+	;;
+	//
+	// .Lexit3: comes from the head or from the core loop
+	//	    len contains bytes left
+	//	    ar.lc is written back here; it was modified before the
+	//	    head was entered.
+	//
+.Lexit3:
+	mov ret0=len
+	mov ar.lc=saved_lc
+	br.ret.sptk.many rp
+END(__do_clear_user)
--- a/kernel/arch/ia64/lib/copy_page.S
+++ b/kernel/arch/ia64/lib/copy_page.S
@@ -0,0 +1,98 @@
+/*
+ *
+ * Optimized version of the standard copy_page() function
+ *
+ * Inputs:
+ *	in0:	address of target page
+ *	in1:	address of source page
+ * Output:
+ *	no return value
+ *
+ * Copyright (C) 1999, 2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *	David Mosberger <davidm@hpl.hp.com>
+ *
+ * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#define PIPE_DEPTH	3
+#define EPI		p[PIPE_DEPTH-1]
+
+#define lcount		r16
+#define saved_pr	r17
+#define saved_lc	r18
+#define saved_pfs	r19
+#define src1		r20
+#define src2		r21
+#define tgt1		r22
+#define tgt2		r23
+#define srcf		r24
+#define tgtf		r25
+#define tgt_last	r26
+
+#define Nrot		((8*PIPE_DEPTH+7)&~7)
+
+/*
+ * copy_page (Itanium variant)
+ *
+ * in0 = target page, in1 = source page.
+ *
+ * The loop runs PAGE_SIZE/64 times plus the pipeline drain (ar.ec =
+ * PIPE_DEPTH).  One iteration issues eight ld8/st8 pairs through two
+ * interleaved pointer pairs, src1/tgt1 and src2/tgt2 (8 bytes apart, each
+ * stepping by 16).  Loads are predicated on p[0]; the matching stores are
+ * predicated on EPI (p[PIPE_DEPTH-1]) and use the rotated copy of the
+ * register the load wrote.
+ *
+ * srcf/tgtf start 512 bytes into the pages and are advanced by lfetch, 64
+ * bytes at a time, for as long as tgtf is below tgt_last (in0 + PAGE_SIZE).
+ *
+ * pr, ar.pfs and ar.lc are saved on entry and written back before return.
+ */
+GLOBAL_ENTRY(copy_page)
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
+
+	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
+	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
+	.rotp p[PIPE_DEPTH]
+
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc
+	mov ar.ec=PIPE_DEPTH
+
+	mov lcount=PAGE_SIZE/64-1		// 64 bytes per iteration
+	.save pr, saved_pr
+	mov saved_pr=pr
+	mov pr.rot=1<<16			// p16 set, other rotating predicates clear
+
+	.body
+
+	mov src1=in1
+	adds src2=8,in1
+	mov tgt_last = PAGE_SIZE
+	;;
+	adds tgt2=8,in0
+	add srcf=512,in1			// prefetch pointers start 512 bytes ahead
+	mov ar.lc=lcount
+	mov tgt1=in0
+	add tgtf=512,in0
+	add tgt_last = tgt_last, in0		// tgt_last = in0 + PAGE_SIZE
+	;;
+1:
+(p[0])	ld8 t1[0]=[src1],16
+(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16
+(p[0])	ld8 t2[0]=[src2],16
+(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
+	cmp.ltu p6,p0 = tgtf, tgt_last		// p6: prefetch target still inside the page
+	;;
+(p[0])	ld8 t3[0]=[src1],16
+(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
+(p[0])	ld8 t4[0]=[src2],16
+(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
+	;;
+(p[0])	ld8 t5[0]=[src1],16
+(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
+(p[0])	ld8 t6[0]=[src2],16
+(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
+	;;
+(p[0])	ld8 t7[0]=[src1],16
+(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
+(p[0])	ld8 t8[0]=[src2],16
+(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
+
+(p6)	lfetch [srcf], 64
+(p6)	lfetch [tgtf], 64
+	br.ctop.sptk.few 1b
+	;;
+	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
+	mov ar.pfs=saved_pfs
+	mov ar.lc=saved_lc
+	br.ret.sptk.many rp
+END(copy_page)
--- a/kernel/arch/ia64/lib/copy_page_mck.S
+++ b/kernel/arch/ia64/lib/copy_page_mck.S
@@ -0,0 +1,185 @@
+/*
+ * McKinley-optimized version of copy_page().
+ *
+ * Copyright (C) 2002 Hewlett-Packard Co
+ *	David Mosberger <davidm@hpl.hp.com>
+ *
+ * Inputs:
+ *	in0:	address of target page
+ *	in1:	address of source page
+ * Output:
+ *	no return value
+ *
+ * General idea:
+ *	- use regular loads and stores to prefetch data to avoid consuming M-slot just for
+ *	  lfetches => good for in-cache performance
+ *	- avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
+ *	  cycle
+ *
+ * Principle of operation:
+ *	First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
+ *	To avoid secondary misses in L2, we prefetch both source and destination with a line-size
+ *	of 128 bytes.  When both of these lines are in the L2 and the first half of the
+ *	source line is in L1, we start copying the remaining words.  The second half of the
+ *	source line is prefetched in an earlier iteration, so that by the time we start
+ *	accessing it, it's also present in the L1.
+ *
+ *	We use a software-pipelined loop to control the overall operation.  The pipeline
+ *	has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
+ *	source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
+ *	cache-lines, the last K stages are used to copy the cache-line words not copied by
+ *	the prefetches.  The four relevant points in the pipelined are called A, B, C, D:
+ *	p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
+ *	should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
+ *	into L1D and p[D] is TRUE if a cacheline needs to be copied.
+ *
+ *	This all sounds very complicated, but thanks to the modulo-scheduled loop support,
+ *	the resulting code is very regular and quite easy to follow (once you get the idea).
+ *
+ *	As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
+ *	as the separate .prefetch_loop.  Logically, this loop performs exactly like the
+ *	main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
+ *	so that each loop iteration is faster (again, good for cached case).
+ *
+ *	When reading the code, it helps to keep the following picture in mind:
+ *
+ *	       word 0 word 1
+ *            +------+------+---
+ *	      |	v[x] | 	t1  | ^
+ *	      |	t2   |	t3  | |
+ *	      |	t4   |	t5  | |
+ *	      |	t6   |	t7  | | 128 bytes
+ *     	      |	n[y] | 	t9  | |	(L2 cache line)
+ *	      |	t10  | 	t11 | |
+ *	      |	t12  | 	t13 | |
+ *	      |	t14  | 	t15 | v
+ *	      +------+------+---
+ *
+ *	Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
+ *	to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
+ *	an order that avoids bank conflicts.
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#define PREFETCH_DIST	8		// McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
+
+#define src0		r2
+#define src1		r3
+#define dst0		r9
+#define dst1		r10
+#define src_pre_mem	r11
+#define dst_pre_mem	r14
+#define src_pre_l2	r15
+#define dst_pre_l2	r16
+#define t1		r17
+#define t2		r18
+#define t3		r19
+#define t4		r20
+#define t5		t1	// alias!
+#define t6		t2	// alias!
+#define t7		t3	// alias!
+#define t9		t5	// alias!
+#define t10		t4	// alias!
+#define t11		t7	// alias!
+#define t12		t6	// alias!
+#define t14		t10	// alias!
+#define t13		r21
+#define t15		r22
+
+#define saved_lc	r23
+#define saved_pr	r24
+
+#define	A	0
+#define B	(PREFETCH_DIST)
+#define C	(B + PREFETCH_DIST)
+#define D	(C + 3)
+#define N	(D + 1)
+#define Nrot	((N + 7) & ~7)
+
+/*
+ * copy_page (McKinley variant)
+ *
+ * in0 = target page, in1 = source page.
+ *
+ * Two loops, both driven by br.ctop:
+ *
+ *   .prefetch_loop: 2*PREFETCH_DIST iterations with ar.ec = 1.  Only the
+ *		     p[A] load and p[B] store are present, each stepping
+ *		     128 bytes through src_pre_mem/dst_pre_mem.
+ *   .line_copy:     ar.lc = (PAGE_SIZE/128) - 2*PREFETCH_DIST - 1 and
+ *		     ar.ec = N.  Each iteration handles one 128-byte line:
+ *		     the p[A]/p[B]/p[C] accesses run ahead, the p[D] loads
+ *		     and stores move the remaining words through
+ *		     src0/src1/dst0/dst1.
+ *
+ * Several tN names are aliases of one another (see the defines above), so
+ * the order of the loads and stores in .line_copy must not be changed.
+ *
+ * ar.lc and pr are saved and written back before return.  alloc leaves the
+ * previous ar.pfs in r8; this routine does not write ar.pfs itself.
+ */
+GLOBAL_ENTRY(copy_page)
+	.prologue
+	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
+
+	.rotr v[2*PREFETCH_DIST], n[D-C+1]
+	.rotp p[N]
+
+	.save ar.lc, saved_lc
+	mov saved_lc = ar.lc
+	.save pr, saved_pr
+	mov saved_pr = pr
+	.body
+
+	mov src_pre_mem = in1
+	mov pr.rot = 0x10000
+	mov ar.ec = 1				// special unrolled loop
+
+	mov dst_pre_mem = in0
+	mov ar.lc = 2*PREFETCH_DIST - 1
+
+	add src_pre_l2 = 8*8, in1		// word 8 of the first line (n[y] in the picture)
+	add dst_pre_l2 = 8*8, in0
+	add src0 = 8, in1			// first t1 src
+	add src1 = 3*8, in1			// first t3 src
+	add dst0 = 8, in0			// first t1 dst
+	add dst1 = 3*8, in0			// first t3 dst
+	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1	// .line_copy loop count
+	nop.m 0
+	nop.i 0
+	;;
+	// same as .line_copy loop, but with all predicated-off instructions removed:
+.prefetch_loop:
+(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
+(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
+	br.ctop.sptk .prefetch_loop
+	;;
+	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
+	mov ar.lc = t1				// with 64KB pages, t1 is too big to fit in 8 bits!
+	mov ar.ec = N				// # of stages in pipeline
+	;;
+.line_copy:
+(p[D])	ld8 t2 = [src0], 3*8			// M0
+(p[D])	ld8 t4 = [src1], 3*8			// M1
+(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
+(p[D])	st8 [dst_pre_l2] = n[D-C], 128		// M3 prefetch dst from L2
+	;;
+(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
+(p[C])	ld8 n[0] = [src_pre_l2], 128		// M1 prefetch src from L2
+(p[D])	st8 [dst0] =  t1, 8			// M2
+(p[D])	st8 [dst1] =  t3, 8			// M3
+	;;
+(p[D])	ld8  t5 = [src0], 8
+(p[D])	ld8  t7 = [src1], 3*8
+(p[D])	st8 [dst0] =  t2, 3*8
+(p[D])	st8 [dst1] =  t4, 3*8
+	;;
+(p[D])	ld8  t6 = [src0], 3*8
+(p[D])	ld8 t10 = [src1], 8
+(p[D])	st8 [dst0] =  t5, 8
+(p[D])	st8 [dst1] =  t7, 3*8
+	;;
+(p[D])	ld8  t9 = [src0], 3*8
+(p[D])	ld8 t11 = [src1], 3*8
+(p[D])	st8 [dst0] =  t6, 3*8
+(p[D])	st8 [dst1] = t10, 8
+	;;
+(p[D])	ld8 t12 = [src0], 8
+(p[D])	ld8 t14 = [src1], 8
+(p[D])	st8 [dst0] =  t9, 3*8
+(p[D])	st8 [dst1] = t11, 3*8
+	;;
+(p[D])	ld8 t13 = [src0], 4*8
+(p[D])	ld8 t15 = [src1], 4*8
+(p[D])	st8 [dst0] = t12, 8
+(p[D])	st8 [dst1] = t14, 8
+	;;
+	// t1/t3 are loaded one stage early (p[D-1]); they are the first
+	// words stored by the p[D] stage at the top of the loop.
+(p[D-1])ld8  t1 = [src0], 8
+(p[D-1])ld8  t3 = [src1], 8
+(p[D])	st8 [dst0] = t13, 4*8
+(p[D])	st8 [dst1] = t15, 4*8
+	br.ctop.sptk .line_copy
+	;;
+	mov ar.lc = saved_lc
+	mov pr = saved_pr, -1
+	br.ret.sptk.many rp
+END(copy_page)
--- a/kernel/arch/ia64/lib/copy_user.S
+++ b/kernel/arch/ia64/lib/copy_user.S
@@ -0,0 +1,610 @@
+/*
+ *
+ * Optimized version of the copy_user() routine.
+ * It is used to copy data across the kernel/user boundary.
+ *
+ * The source and destination are always on opposite side of
+ * the boundary. When reading from user space we must catch
+ * faults on loads. When writing to user space we must catch
+ * errors on stores. Note that because of the nature of the copy
+ * we don't need to worry about overlapping regions.
+ *
+ *
+ * Inputs:
+ *	in0	address of source buffer
+ *	in1	address of destination buffer
+ *	in2	number of bytes to copy
+ *
+ * Outputs:
+ *	ret0	0 in case of success. The number of bytes NOT copied in
+ *		case of error.
+ *
+ * Copyright (C) 2000-2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Fixme:
+ *	- handle the case where we have more than 16 bytes and the alignment
+ *	  are different.
+ *	- more benchmarking
+ *	- fix extraneous stop bit introduced by the EX() macro.
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// Tuneable parameters
+//
+#define COPY_BREAK	16	// we do byte copy below (must be >=16)
+#define PIPE_DEPTH	21	// pipe depth
+
+#define EPI		p[PIPE_DEPTH-1]
+
+//
+// arguments
+//
+#define dst		in0
+#define src		in1
+#define len		in2
+
+//
+// local registers
+//
+#define t1		r2	// rshift in bytes
+#define t2		r3	// lshift in bytes
+#define rshift		r14	// right shift in bits
+#define lshift		r15	// left shift in bits
+#define word1		r16
+#define word2		r17
+#define cnt		r18
+#define len2		r19
+#define saved_lc	r20
+#define saved_pr	r21
+#define tmp		r22
+#define val		r23
+#define src1		r24
+#define dst1		r25
+#define src2		r26
+#define dst2		r27
+#define len1		r28
+#define enddst		r29
+#define endsrc		r30
+#define saved_pfs	r31
+
+GLOBAL_ENTRY(__copy_user)
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
+
+	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
+	.rotp p[PIPE_DEPTH]
+
+	adds len2=-1,len	// br.ctop is repeat/until
+	mov ret0=r0
+
+	;;			// RAW of cfm when len=0
+	cmp.eq p8,p0=r0,len	// check for zero length
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc	// preserve ar.lc (slow)
+(p8)	br.ret.spnt.many rp	// empty mempcy()
+	;;
+	add enddst=dst,len	// first byte after end of source
+	add endsrc=src,len	// first byte after end of destination
+	.save pr, saved_pr
+	mov saved_pr=pr		// preserve predicates
+
+	.body
+
+	mov dst1=dst		// copy because of rotation
+	mov ar.ec=PIPE_DEPTH
+	mov pr.rot=1<<16	// p16=true all others are false
+
+	mov src1=src		// copy because of rotation
+	mov ar.lc=len2		// initialize lc for small count
+	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
+
+	xor tmp=src,dst		// same alignment test prepare
+(p10)	br.cond.dptk .long_copy_user
+	;;			// RAW pr.rot/p16 ?
+	//
+	// Now we do the byte by byte loop with software pipeline
+	//
+	// p7 is necessarily false by now
+1:
+	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
+	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+	br.ctop.dptk.few 1b
+	;;
+	mov ar.lc=saved_lc
+	mov pr=saved_pr,0xffffffffffff0000
+	mov ar.pfs=saved_pfs		// restore ar.ec
+	br.ret.sptk.many rp		// end of short memcpy
+
+	//
+	// Not 8-byte aligned
+	//
+.diff_align_copy_user:
+	// At this point we know we have more than 16 bytes to copy
+	// and also that src and dest do _not_ have the same alignment.
+	and src2=0x7,src1				// src offset
+	and dst2=0x7,dst1				// dst offset
+	;;
+	// The basic idea is that we copy byte-by-byte at the head so
+	// that we can reach 8-byte alignment for both src1 and dst1.
+	// Then copy the body using software pipelined 8-byte copy,
+	// shifting the two back-to-back words right and left, then copy
+	// the tail by copying byte-by-byte.
+	//
+	// Fault handling. If the byte-by-byte at the head fails on the
+	// load, then restart and finish the pipleline by copying zeros
+	// to the dst1. Then copy zeros for the rest of dst1.
+	// If 8-byte software pipeline fails on the load, do the same as
+	// failure_in3 does. If the byte-by-byte at the tail fails, it is
+	// handled simply by failure_in_pipe1.
+	//
+	// The case p14 represents the source has more bytes in the
+	// the first word (by the shifted part), whereas the p15 needs to
+	// copy some bytes from the 2nd word of the source that has the
+	// tail of the 1st of the destination.
+	//
+
+	//
+	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
+	// to copy the head to dst1, to start 8-byte copy software pipeline.
+	// We know src1 is not 8-byte aligned in this case.
+	//
+	cmp.eq p14,p15=r0,dst2
+(p15)	br.cond.spnt 1f
+	;;
+	sub t1=8,src2
+	mov t2=src2
+	;;
+	shl rshift=t2,3
+	sub len1=len,t1					// set len1
+	;;
+	sub lshift=64,rshift
+	;;
+	br.cond.spnt .word_copy_user
+	;;
+1:
+	cmp.leu	p14,p15=src2,dst2
+	sub t1=dst2,src2
+	;;
+	.pred.rel "mutex", p14, p15
+(p14)	sub word1=8,src2				// (8 - src offset)
+(p15)	sub t1=r0,t1					// absolute value
+(p15)	sub word1=8,dst2				// (8 - dst offset)
+	;;
+	// For the case p14, we don't need to copy the shifted part to
+	// the 1st word of destination.
+	sub t2=8,t1
+(p14)	sub word1=word1,t1
+	;;
+	sub len1=len,word1				// resulting len
+(p15)	shl rshift=t1,3					// in bits
+(p14)	shl rshift=t2,3
+	;;
+(p14)	sub len1=len1,t1
+	adds cnt=-1,word1
+	;;
+	sub lshift=64,rshift
+	mov ar.ec=PIPE_DEPTH
+	mov pr.rot=1<<16	// p16=true all others are false
+	mov ar.lc=cnt
+	;;
+2:
+	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
+	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+	br.ctop.dptk.few 2b
+	;;
+	clrrrb
+	;;
+.word_copy_user:
+	cmp.gtu p9,p0=16,len1
+(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
+	;;
+	shr.u cnt=len1,3		// number of 64-bit words
+	;;
+	adds cnt=-1,cnt
+	;;
+	.pred.rel "mutex", p14, p15
+(p14)	sub src1=src1,t2
+(p15)	sub src1=src1,t1
+	//
+	// Now both src1 and dst1 point to an 8-byte aligned address. And
+	// we have more than 8 bytes to copy.
+	//
+	mov ar.lc=cnt
+	mov ar.ec=PIPE_DEPTH
+	mov pr.rot=1<<16	// p16=true all others are false
+	;;
+3:
+	//
+	// The pipleline consists of 3 stages:
+	// 1 (p16):	Load a word from src1
+	// 2 (EPI_1):	Shift right pair, saving to tmp
+	// 3 (EPI):	Store tmp to dst1
+	//
+	// To make it simple, use at least 2 (p16) loops to set up val1[n]
+	// because we need 2 back-to-back val1[] to get tmp.
+	// Note that this implies EPI_2 must be p18 or greater.
+	//
+
+#define EPI_1		p[PIPE_DEPTH-2]
+#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
+#define CASE(pred, shift)	\
+	(pred)	br.cond.spnt .copy_user_bit##shift
+#define BODY(rshift)						\
+.copy_user_bit##rshift:						\
+1:								\
+	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
+(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
+	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
+(p16)	mov val1[0]=r0;						\
+	br.ctop.dptk 1b;					\
+	;;							\
+	br.cond.sptk.many .diff_align_do_tail;			\
+2:								\
+(EPI)	st8 [dst1]=tmp,8;					\
+(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
+3:								\
+(p16)	mov val1[1]=r0;						\
+(p16)	mov val1[0]=r0;						\
+	br.ctop.dptk 2b;					\
+	;;							\
+	br.cond.sptk.many .failure_in2
+
+	//
+	// Since the instruction 'shrp' requires a fixed 128-bit value
+	// specifying the bits to shift, we need to provide 7 cases
+	// below.
+	//
+	SWITCH(p6, 8)
+	SWITCH(p7, 16)
+	SWITCH(p8, 24)
+	SWITCH(p9, 32)
+	SWITCH(p10, 40)
+	SWITCH(p11, 48)
+	SWITCH(p12, 56)
+	;;
+	CASE(p6, 8)
+	CASE(p7, 16)
+	CASE(p8, 24)
+	CASE(p9, 32)
+	CASE(p10, 40)
+	CASE(p11, 48)
+	CASE(p12, 56)
+	;;
+	BODY(8)
+	BODY(16)
+	BODY(24)
+	BODY(32)
+	BODY(40)
+	BODY(48)
+	BODY(56)
+	;;
+.diff_align_do_tail:
+	.pred.rel "mutex", p14, p15
+(p14)	sub src1=src1,t1
+(p14)	adds dst1=-8,dst1
+(p15)	sub dst1=dst1,t1
+	;;
+4:
+	// Tail correction.
+	//
+	// The problem with this piplelined loop is that the last word is not
+	// loaded and thus parf of the last word written is not correct.
+	// To fix that, we simply copy the tail byte by byte.
+
+	sub len1=endsrc,src1,1
+	clrrrb
+	;;
+	mov ar.ec=PIPE_DEPTH
+	mov pr.rot=1<<16	// p16=true all others are false
+	mov ar.lc=len1
+	;;
+5:
+	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
+	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+	br.ctop.dptk.few 5b
+	;;
+	mov ar.lc=saved_lc
+	mov pr=saved_pr,0xffffffffffff0000
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+	//
+	// Beginning of long mempcy (i.e. > 16 bytes)
+	//
+.long_copy_user:
+	tbit.nz p6,p7=src1,0	// odd alignment
+	and tmp=7,tmp
+	;;
+	cmp.eq p10,p8=r0,tmp
+	mov len1=len		// copy because of rotation
+(p8)	br.cond.dpnt .diff_align_copy_user
+	;;
+	// At this point we know we have more than 16 bytes to copy
+	// and also that both src and dest have the same alignment
+	// which may not be the one we want. So for now we must move
+	// forward slowly until we reach 16byte alignment: no need to
+	// worry about reaching the end of buffer.
+	//
+	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
+(p6)	adds len1=-1,len1;;
+	tbit.nz p7,p0=src1,1
+	;;
+	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
+(p7)	adds len1=-2,len1;;
+	tbit.nz p8,p0=src1,2
+	;;
+	//
+	// Stop bit not required after ld4 because if we fail on ld4
+	// we have never executed the ld1, therefore st1 is not executed.
+	//
+	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
+	;;
+	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
+	tbit.nz p9,p0=src1,3
+	;;
+	//
+	// Stop bit not required after ld8 because if we fail on ld8
+	// we have never executed the ld2, therefore st2 is not executed.
+	//
+	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
+	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
+(p8)	adds len1=-4,len1
+	;;
+	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
+(p9)	adds len1=-8,len1;;
+	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
+	;;
+	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
+	tbit.nz p6,p0=len1,3
+	cmp.eq p7,p0=r0,cnt
+	adds tmp=-1,cnt			// br.ctop is repeat/until
+(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
+	;;
+	adds src2=8,src1
+	adds dst2=8,dst1
+	mov ar.lc=tmp
+	;;
+	//
+	// 16bytes/iteration
+	//
+2:
+	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
+(p16)	ld8 val2[0]=[src2],16
+
+	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
+(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
+	br.ctop.dptk 2b
+	;;			// RAW on src1 when fall through from loop
+	//
+	// Tail correction based on len only
+	//
+	// No matter where we come from (loop or test) the src1 pointer
+	// is 16 byte aligned AND we have less than 16 bytes to copy.
+	//
+.dotail:
+	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
+	tbit.nz p7,p0=len1,2
+	;;
+	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
+	tbit.nz p8,p0=len1,1
+	;;
+	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
+	tbit.nz p9,p0=len1,0
+	;;
+	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
+	;;
+	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
+	mov ar.lc=saved_lc
+	;;
+	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
+	mov pr=saved_pr,0xffffffffffff0000
+	;;
+	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
+	mov ar.pfs=saved_pfs
+	;;
+	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
+	br.ret.sptk.many rp
+
+
+	//
+	// Here we handle the case where the byte by byte copy fails
+	// on the load.
+	// Several factors make the zeroing of the rest of the buffer kind of
+	// tricky:
+	//	- the pipeline: loads/stores are not in sync (pipeline)
+	//
+	//	  In the same loop iteration, the dst1 pointer does not directly
+	//	  reflect where the faulty load was.
+	//
+	//	- pipeline effect
+	//	  When you get a fault on load, you may have valid data from
+	//	  previous loads still in transit, i.e. not yet stored. Such data
+	//	  must be stored normally before moving on to zeroing the rest.
+	//
+	//	- single/multi dispersal independence.
+	//
+	// solution:
+	//	- we don't disrupt the pipeline, i.e. data in transit in
+	//	  the software pipeline will eventually be moved to memory.
+	//	  We simply replace the load with a simple mov and keep the
+	//	  pipeline going. We can't really do this inline because
+	//	  p16 is always reset to 1 when lc > 0.
+	//
+.failure_in_pipe1:
+	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
+1:
+(p16)	mov val1[0]=r0
+(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
+	br.ctop.dptk 1b
+	;;
+	mov pr=saved_pr,0xffffffffffff0000
+	mov ar.lc=saved_lc
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+	//
+	// This is the case where the byte by byte copy fails on the load
+	// when we copy the head. We need to finish the pipeline and copy
+	// zeros for the rest of the destination. Since this happens
+	// at the top we still need to fill the body and tail.
+.failure_in_pipe2:
+	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
+2:
+(p16)	mov val1[0]=r0
+(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
+	br.ctop.dptk 2b
+	;;
+	sub len=enddst,dst1,1		// precompute len
+	br.cond.dptk.many .failure_in1bis
+	;;
+
+	//
+	// Here we handle the head & tail part when we check for alignment.
+	// The following code handles only the load failures. The
+	// main difficulty comes from the fact that loads/stores are
+	// scheduled. So when you fail on a load, the stores corresponding
+	// to previous successful loads must be executed.
+	//
+	// However some simplifications are possible given the way
+	// things work.
+	//
+	// 1) HEAD
+	// Theory of operation:
+	//
+	//  Page A   | Page B
+	//  ---------|-----
+	//          1|8 x
+	//	  1 2|8 x
+	//	    4|8 x
+	//	  1 4|8 x
+	//        2 4|8 x
+	//      1 2 4|8 x
+	//	     |1
+	//	     |2 x
+	//	     |4 x
+	//
+	// page_size >= 4k (2^12).  (x means 4, 2, 1)
+	// Here we suppose Page A exists and Page B does not.
+	//
+	// As we move towards eight byte alignment we may encounter faults.
+	// The numbers on each page show the size of the load (current alignment).
+	//
+	// Key point:
+	//	- if you fail on 1, 2, 4 then you have never executed any smaller
+	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
+	//	  before.
+	//
+	// This allows us to simplify the cleanup code, because basically you
+	// only have to worry about "pending" stores in the case of a failing
+	// ld8(). Given the way the code is written today, this means only
+	// worry about st2, st4. There we can use the information encapsulated
+	// into the predicates.
+	//
+	// Other key point:
+	//	- if you fail on the ld8 in the head, it means you went straight
+	//	  to it, i.e. 8byte alignment within an unexisting page.
+	// Again this comes from the fact that if you crossed just for the ld8 then
+	// you are 8byte aligned but also 16byte align, therefore you would
+	// either go for the 16byte copy loop OR the ld8 in the tail part.
+	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
+	// because it would mean you had 15bytes to copy in which case you
+	// would have defaulted to the byte by byte copy.
+	//
+	//
+	// 2) TAIL
+	// Here we know we have less than 16 bytes AND we are either 8 or 16 byte
+	// aligned.
+	//
+	// Key point:
+	// This means that we either:
+	//		- are right on a page boundary
+	//	OR
+	//		- are at more than 16 bytes from a page boundary with
+	//		  at most 15 bytes to copy: no chance of crossing.
+	//
+	// This allows us to assume that if we fail on a load we haven't possibly
+	// executed any of the previous (tail) ones, so we don't need to do
+	// any stores. For instance, if we fail on ld2, this means we had
+	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
+	//
+	// This means that we are in a situation similar to a fault in the
+	// head part. That's nice!
+	//
+.failure_in1:
+	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
+	sub len=endsrc,src1,1
+	//
+	// we know that ret0 can never be zero at this point
+	// because we failed while trying to do a load, i.e. there is still
+	// some work to do.
+	// The failure_in1bis and length problem is taken care of at the
+	// calling side.
+	//
+	;;
+.failure_in1bis:		// from (.failure_in3)
+	mov ar.lc=len		// Continue with a stupid byte store.
+	;;
+5:
+	st1 [dst1]=r0,1
+	br.cloop.dptk 5b
+	;;
+	mov pr=saved_pr,0xffffffffffff0000
+	mov ar.lc=saved_lc
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+	//
+	// Here we simply restart the loop but instead
+	// of doing loads we fill the pipeline with zeroes
+	// We can't simply store r0 because we may have valid
+	// data in transit in the pipeline.
+	// ar.lc and ar.ec are setup correctly at this point
+	//
+	// we MUST use src1/endsrc here and not dst1/enddst because
+	// of the pipeline effect.
+	//
+.failure_in3:
+	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
+	;;
+2:
+(p16)	mov val1[0]=r0
+(p16)	mov val2[0]=r0
+(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
+(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
+	br.ctop.dptk 2b
+	;;
+	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
+	sub len=enddst,dst1,1		// precompute len
+(p6)	br.cond.dptk .failure_in1bis
+	;;
+	mov pr=saved_pr,0xffffffffffff0000
+	mov ar.lc=saved_lc
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+.failure_in2:
+	sub ret0=endsrc,src1
+	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
+	sub len=enddst,dst1,1		// precompute len
+(p6)	br.cond.dptk .failure_in1bis
+	;;
+	mov pr=saved_pr,0xffffffffffff0000
+	mov ar.lc=saved_lc
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+	//
+	// handling of failures on stores: that's the easy part
+	//
+.failure_out:
+	sub ret0=enddst,dst1
+	mov pr=saved_pr,0xffffffffffff0000
+	mov ar.lc=saved_lc
+
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+END(__copy_user)
--- a/kernel/arch/ia64/lib/csum_partial_copy.c
+++ b/kernel/arch/ia64/lib/csum_partial_copy.c
@@ -0,0 +1,140 @@
+/*
+ * Network Checksum & Copy routine
+ *
+ * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Most of the code has been imported from Linux/Alpha
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * XXX Fixme: those 2 inlines are meant for debugging and will go away
+ */
+/*
+ * Fold a 64-bit one's-complement accumulator down to 16 bits by repeatedly
+ * adding the high part onto the low part until no carry remains.
+ */
+static inline unsigned short
+from64to16(unsigned long x)
+{
+	unsigned long folded;
+
+	/* 64 -> 33 bits: sum the two 32-bit halves */
+	folded = (x >> 32) + (x & 0xffffffff);
+	/* 33 -> 17+c bits: sum the low 16 bits with everything above them */
+	folded = (folded >> 16) + (folded & 0xffff);
+	/* 17+c -> 16+c bits */
+	folded = (folded >> 16) + (folded & 0xffff);
+	/* absorb the last possible carry */
+	folded = (folded >> 16) + (folded & 0xffff);
+	return folded;
+}
+
+/*
+ * Reference C implementation of the 16-bit Internet checksum (debug aid).
+ *
+ * buff: start of the data to sum (any alignment)
+ * len:  number of bytes; len <= 0 returns psum unchanged (not folded)
+ * psum: partial sum to be combined with the checksum of the buffer
+ *
+ * Returns the one's-complement sum of the buffer and psum folded to 16 bits.
+ *
+ * The buffer is consumed with progressively wider naturally aligned loads
+ * (1, 2, 4, then 8 bytes) and the tail is handled in the reverse order.
+ * When buff starts on an odd address the data is summed byte-swapped and
+ * the folded result is swapped back at the end.
+ */
+static inline
+unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
+{
+	int odd, count;
+	unsigned long result = (unsigned long)psum;
+
+	if (len <= 0)
+		goto out;
+	odd = 1 & (unsigned long) buff;
+	if (odd) {
+		/*
+		 * Restart the accumulator from the first byte only: the data
+		 * is summed byte-swapped from here on, so psum (which is in
+		 * normal byte order) must not be mixed in until the result
+		 * has been swapped back below.
+		 */
+		result = *buff << 8;
+		len--;
+		buff++;
+	}
+	count = len >> 1;		/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *) buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;		/* nr of 32-bit words.. */
+		if (count) {
+			if (4 & (unsigned long) buff) {
+				result += *(unsigned int *) buff;
+				count--;
+				len -= 4;
+				buff += 4;
+			}
+			count >>= 1;	/* nr of 64-bit words.. */
+			if (count) {
+				unsigned long carry = 0;
+				do {
+					unsigned long w = *(unsigned long *) buff;
+					count--;
+					buff += 8;
+					result += carry;
+					result += w;
+					/* unsigned wrap-around means a carry out */
+					carry = (w > result);
+				} while (count);
+				result += carry;
+				result = (result & 0xffffffff) + (result >> 32);
+			}
+			if (len & 4) {
+				result += *(unsigned int *) buff;
+				buff += 4;
+			}
+		}
+		if (len & 2) {
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)
+		result += *buff;
+
+	result = from64to16(result);
+
+	if (odd) {
+		/* undo the byte swap introduced by the odd start address */
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+		/*
+		 * The odd-address path discarded psum when it seeded the
+		 * accumulator; add it back now that the byte order is
+		 * normal again, and fold the carry.
+		 */
+		result = from64to16(result + psum);
+	}
+
+out:
+	return result;
+}
+
+/*
+ * XXX Fixme
+ *
+ * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
+ * But it's very tricky to get right even in C.
+ */
+extern unsigned long do_csum(const unsigned char *, long);
+
+/*
+ * Copy len bytes from user space (src) to dst, then checksum the copy.
+ *
+ * A failing copy is reported through *errp (-EFAULT) when errp is non-NULL;
+ * the checksum of dst is computed and returned in either case.  The result
+ * is the 32-bit one's-complement sum of do_csum(dst, len) and psum.
+ */
+__wsum
+csum_partial_copy_from_user(const void __user *src, void *dst,
+				int len, __wsum psum, int *errp)
+{
+	unsigned long sum;
+
+	/*
+	 * The copy and the checksum are deliberately two separate passes:
+	 * this sidesteps the alignment handling a fused version would need.
+	 */
+	if (__copy_from_user(dst, src, len) != 0) {
+		if (errp)
+			*errp = -EFAULT;
+	}
+
+	/* checksum of the copied data plus the incoming partial sum */
+	sum = do_csum(dst, len) + (__force u32)psum;
+
+	/* fold the carry (32+c bits) back into 32 bits */
+	sum = (sum >> 32) + (sum & 0xffffffff);
+	return (__force __wsum)sum;
+}
+
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+/*
+ * Kernel-to-kernel copy-and-checksum: reuses the user-space variant with
+ * the source pointer force-cast to __user and no error reporting.
+ */
+__wsum
+csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
+{
+	const void __user *usrc = (__force const void __user *)src;
+
+	return csum_partial_copy_from_user(usrc, dst, len, sum, NULL);
+}
+
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
--- a/kernel/arch/ia64/lib/do_csum.S
+++ b/kernel/arch/ia64/lib/do_csum.S
@@ -0,0 +1,323 @@
+/*
+ *
+ * Optimized version of the standard do_csum() function
+ *
+ * Return: a 64bit quantity containing the 16bit Internet checksum
+ *
+ * Inputs:
+ *	in0: address of buffer to checksum (char *)
+ *	in1: length of the buffer (int)
+ *
+ * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
+ *		Data locality study on the checksum buffer.
+ *		More optimization cleanup - remove excessive stop bits.
+ * 02/04/08	David Mosberger <davidm@hpl.hp.com>
+ *		More cleanup and tuning.
+ * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
+ *		Clean up and optimize and the software pipeline, loading two
+ *		back-to-back 8-byte words per loop. Clean up the initialization
+ *		for the loop. Support the cases where load latency = 1 or 2.
+ *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// Theory of operations:
+//	The goal is to go as quickly as possible to the point where
+//	we can checksum 16 bytes/loop. Before reaching that point we must
+//	take care of incorrect alignment of first byte.
+//
+//	The code hereafter also takes care of the "tail" part of the buffer
+//	before entering the core loop, if any. The checksum is a sum so it
+//	allows us to commute operations. So we do the "head" and "tail"
+//	first to finish at full speed in the body. Once we get the head and
+//	tail values, we feed them into the pipeline, very handy initialization.
+//
+//	Of course we deal with the special case where the whole buffer fits
+//	into one 8 byte word. In this case we have only one entry in the pipeline.
+//
+//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
+//	possible load latency and also to accommodate for head and tail.
+//
+//	The end of the function deals with folding the checksum from 64bits
+//	down to 16bits taking care of the carry.
+//
+//	This version avoids synchronization in the core loop by also using a
+//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
+//
+//	 wordx[] (x=1,2)
+//	|---|
+//      |   | 0			: new value loaded in pipeline
+//	|---|
+//      |   | -			: in transit data
+//	|---|
+//      |   | LOAD_LATENCY	: current value to add to checksum
+//	|---|
+//      |   | LOAD_LATENCY+1	: previous value added to checksum
+//      |---|			(previous iteration)
+//
+//	resultx[] (x=1,2)
+//	|---|
+//      |   | 0			: initial value
+//	|---|
+//      |   | LOAD_LATENCY-1	: new checksum
+//	|---|
+//      |   | LOAD_LATENCY	: previous value of checksum
+//	|---|
+//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
+//      |---|
+//
+//
+//	See RFC1071 "Computing the Internet Checksum" for various techniques for
+//	calculating the Internet checksum.
+//
+// NOT YET DONE:
+//	- Maybe another algorithm which would take care of the folding at the
+//	  end in a different manner
+//	- Work with people more knowledgeable than me on the network stack
+//	  to figure out if we could not split the function depending on the
+//	  type of packet or alignment we get. Like the ip_fast_csum() routine
+//	  where we know we have at least 20bytes worth of data to checksum.
+//	- Do a better job of handling small packets.
+//	- Note on prefetching: it was found that under various load, i.e. ftp read/write,
+//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
+//	  on the data that buffer points to (partly because the checksum is often preceded by
+//	  a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
+//	  the data is already in the cache.
+//
+
+//
+// Symbolic names for the static (non-rotating) registers used by do_csum.
+//
+//	saved_pfs/saved_lc/saved_pr	copies of ar.pfs, ar.lc and pr restored on exit
+//	hmask/tmask			byte masks applied to the first/last 8-byte word
+//	first1/first2			load pointers (first2 is first1+8 in the main loop)
+//	firstval/lastval		contents of the first/last 8-byte word
+//	firstoff/lastoff		byte offsets of buffer start/end within a word
+//	last				address of the 8-byte word holding the last byte
+//	tmp1..tmp3			scratch
+//	carry1/carry2			carry counters for the two accumulators
+//
+#define saved_pfs	r11
+#define hmask		r16
+#define tmask		r17
+#define first1		r18
+#define firstval	r19
+#define firstoff	r20
+#define last		r21
+#define lastval		r22
+#define lastoff		r23
+#define saved_lc	r24
+#define saved_pr	r25
+#define tmp1		r26
+#define tmp2		r27
+#define tmp3		r28
+#define carry1		r29
+#define carry2		r30
+#define first2		r31
+
+// Function arguments.
+#define buf		in0
+#define len		in1
+
+// Number of loop iterations between issuing a load and consuming its value.
+#define LOAD_LATENCY	2	// XXX fix me
+
+#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
+# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
+#endif
+
+// Pipeline depth and the stage predicates used inside the main loop
+// (p[] is the rotating predicate array declared with .rotp in do_csum).
+#define PIPE_DEPTH			(LOAD_LATENCY+2)
+#define ELD	p[LOAD_LATENCY]		// end of load
+#define ELD_1	p[LOAD_LATENCY+1]	// and next stage
+
+// unsigned long do_csum(unsigned char *buf,long len)
+
+//
+// do_csum(buf, len)
+//
+//	in0 (buf): address of the first byte to checksum
+//	in1 (len): number of bytes; a zero or negative value returns 0
+//	ret0:      16-bit folded one's-complement sum (byte-swapped back when
+//		   buf is odd, see p15)
+//
+// ar.pfs, ar.lc and the rotating predicates are saved on entry and restored
+// before returning.
+//
+GLOBAL_ENTRY(do_csum)
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc saved_pfs=ar.pfs,2,16,0,16
+	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
+	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
+	mov ret0=r0		// in case we have zero length
+	cmp.lt p0,p6=r0,len	// p6 = (len <= 0): zero length or negative (32bit len)
+	;;
+	add tmp1=buf,len	// end address (one past the last byte)
+	.save pr, saved_pr
+	mov saved_pr=pr		// preserve predicates (rotation)
+(p6)	br.ret.spnt.many rp	// return if zero or negative length
+
+	mov hmask=-1		// initialize head mask
+	tbit.nz p15,p0=buf,0	// is buf an odd address? (p15 is tested again at exit)
+	and first1=-8,buf	// 8-byte align down address of first1 element
+
+	and firstoff=7,buf	// how many bytes off for first1 element
+	mov tmask=-1		// initialize tail mask
+
+	;;
+	adds tmp2=-1,tmp1	// address of the last byte
+	and lastoff=7,tmp1	// how many bytes off for last element
+	;;
+	sub tmp1=8,lastoff	// complement to lastoff
+	and last=-8,tmp2	// address of word containing last byte
+	;;
+	sub tmp3=last,first1	// tmp3=distance from first1 to last
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc	// save lc
+	cmp.eq p8,p9=last,first1	// everything fits in one word ?
+
+	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
+	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
+	shl tmp2=firstoff,3	// number of bits
+	;;
+(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
+	shl tmp1=tmp1,3		// number of bits
+(p9)	adds tmp3=-8,tmp3	// effectively loaded
+	;;
+(p8)	mov lastval=r0		// we don't need lastval if first1==last
+	shl hmask=hmask,tmp2	// build head mask, mask off bytes [0,firstoff[
+	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
+	;;
+	.body
+#define count tmp3
+
+(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
+(p9)	and word2[0]=lastval,tmask	// mask the last word as appropriate
+	shr.u count=count,3	// how many 8-byte?
+	;;
+	// If count is odd, finish this 8-byte word so that we can
+	// load two back-to-back 8-byte words per loop thereafter.
+	and word1[0]=firstval,hmask	// and mask it as appropriate
+	tbit.nz p10,p11=count,0		// if (count is odd)
+	;;
+	// Seed result1[0] with the masked head word (plus the masked tail
+	// word when the buffer spans more than one 8-byte word).
+(p8)	mov result1[0]=word1[0]
+(p9)	add result1[0]=word1[0],word2[0]
+	;;
+	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
+	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
+	;;
+(p6)	adds result1[0]=1,result1[0]	// end-around carry
+(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
+(p11)	br.cond.dptk .do_csum16		// if (count is even)
+
+	// Here count is odd.
+	ld8 word1[1]=[first1],8		// load an 8-byte word
+	cmp.eq p9,p10=1,count		// if (count == 1)
+	adds count=-1,count		// loaded an 8-byte word
+	;;
+	add result1[0]=result1[0],word1[1]
+	;;
+	cmp.ltu p6,p0=result1[0],word1[1]	// carry out of the add above?
+	;;
+(p6)	adds result1[0]=1,result1[0]
+(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
+	// Fall through to calculate the checksum, feeding result1[0] as
+	// the initial value in result1[0].
+	//
+	// Calculate the checksum loading two 8-byte words per loop.
+	//
+.do_csum16:
+	add first2=8,first1
+	shr.u count=count,1	// we do 16 bytes per loop
+	;;
+	adds count=-1,count	// ar.lc is loaded with (iterations - 1)
+	mov carry1=r0
+	mov carry2=r0
+	brp.loop.imp 1f,2f
+	;;
+	mov ar.ec=PIPE_DEPTH
+	mov ar.lc=count	// set lc
+	mov pr.rot=1<<16
+	// result1[0] must be initialized in advance.
+	mov result2[0]=r0
+	;;
+	// Software-pipelined main loop, 16 bytes per iteration, with two
+	// independent accumulation chains (word1/result1/carry1 and
+	// word2/result2/carry2):
+	//   p[0]   stage: issue the two loads
+	//   ELD    stage: add the loaded words into the running sums
+	//   ELD_1  stage: unsigned compare to detect a carry out of that add
+	//   pC1[1]/pC2[1]: count the carry detected in the previous iteration
+	.align 32
+1:
+(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
+(pC1[1])adds carry1=1,carry1
+(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
+(pC2[1])adds carry2=1,carry2
+(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
+(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
+2:
+(p[0])	ld8 word1[0]=[first1],16
+(p[0])	ld8 word2[0]=[first2],16
+	br.ctop.sptk 1b
+	;;
+	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
+(pC1[1])adds carry1=1,carry1	// since we miss the last one
+(pC2[1])adds carry2=1,carry2
+	;;
+	// Add the accumulated carry counts back into each partial sum ...
+	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
+	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
+	;;
+	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
+	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
+	;;
+(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
+(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
+	;;
+	// ... then merge the two partial sums, again with end-around carry.
+	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
+	;;
+	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
+	;;
+(p6)	adds result1[0]=1,result1[0]
+	;;
+.do_csum_exit:
+	//
+	// now fold 64 into 16 bits taking care of carry
+	// that's not very good because it has lots of sequentiality
+	//
+	mov tmp3=0xffff
+	zxt4 tmp1=result1[0]
+	shr.u tmp2=result1[0],32
+	;;
+	add result1[0]=tmp1,tmp2	// 64 -> 33 bits
+	;;
+	and tmp1=result1[0],tmp3
+	shr.u tmp2=result1[0],16
+	;;
+	add result1[0]=tmp1,tmp2
+	;;
+	and tmp1=result1[0],tmp3
+	shr.u tmp2=result1[0],16
+	;;
+	add result1[0]=tmp1,tmp2
+	;;
+	and tmp1=result1[0],tmp3
+	shr.u tmp2=result1[0],16
+	;;
+	add ret0=tmp1,tmp2
+	mov pr=saved_pr,0xffffffffffff0000	// restore the rotating predicates only (p15 stays live)
+	;;
+	// if buf was odd then swap bytes
+	mov ar.pfs=saved_pfs		// restore ar.ec
+(p15)	mux1 ret0=ret0,@rev		// reverse word
+	;;
+	mov ar.lc=saved_lc
+(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
+	br.ret.sptk.many rp
+
+//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
+//	not much better than the original. So keep the original there so that
+//	someone else can challenge.
+//
+//	shr.u word1[0]=result1[0],32
+//	zxt4 result1[0]=result1[0]
+//	;;
+//	add result1[0]=result1[0],word1[0]
+//	;;
+//	zxt2 result2[0]=result1[0]
+//	extr.u word1[0]=result1[0],16,16
+//	shr.u carry1=result1[0],32
+//	;;
+//	add result2[0]=result2[0],word1[0]
+//	;;
+//	add result2[0]=result2[0],carry1
+//	;;
+//	extr.u ret0=result2[0],16,16
+//	;;
+//	add ret0=ret0,result2[0]
+//	;;
+//	zxt2 ret0=ret0
+//	mov ar.pfs=saved_pfs		 // restore ar.ec
+//	mov pr=saved_pr,0xffffffffffff0000
+//	;;
+//	// if buf was odd then swap bytes
+//	mov ar.lc=saved_lc
+//(p15)	mux1 ret0=ret0,@rev		// reverse word
+//	;;
+//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
+//	br.ret.sptk.many rp
+
+END(do_csum)
--- a/kernel/arch/ia64/lib/flush.S
+++ b/kernel/arch/ia64/lib/flush.S
@@ -0,0 +1,117 @@
+/*
+ * Cache flushing routines.
+ *
+ * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 05/28/05 Zoltan Menyhart	Dynamic stride size
+ */
+
+#include <asm/asmmacro.h>
+
+
+	/*
+	 * flush_icache_range(start,end)
+	 *
+	 *	Make i-cache(s) coherent with d-caches.
+	 *
+	 *	Must deal with range from start to end-1 but nothing else (need to
+	 *	be careful not to touch addresses that may be unmapped).
+	 *
+	 *	Note: "in0" and "in1" are preserved for debugging purposes.
+	 */
+	.section .kprobes.text,"ax"
+GLOBAL_ENTRY(flush_icache_range)
+
+	// in0 = start, in1 = end (exclusive); neither is modified.
+	// Scratch: r2, r3 (saved ar.lc), r8, r20-r24.
+	.prologue
+	alloc	r2=ar.pfs,2,0,0,0	// 2 inputs, no locals/outputs/rotating regs
+	movl	r3=ia64_i_cache_stride_shift	// address of the stride-shift variable
+ 	mov	r21=1			// shifted into the stride size below
+	;;
+	ld8	r20=[r3]		// r20: stride shift
+	sub	r22=in1,r0,1		// last byte address
+	;;
+	shr.u	r23=in0,r20		// start / (stride size)
+	shr.u	r22=r22,r20		// (last byte address) / (stride size)
+	shl	r21=r21,r20		// r21: stride size of the i-cache(s)
+	;;
+	sub	r8=r22,r23		// number of strides - 1
+	shl	r24=r23,r20		// r24: addresses for "fc.i" =
+					//	"start" rounded down to stride boundary
+	.save	ar.lc,r3
+	mov	r3=ar.lc		// save ar.lc
+	;;
+
+	.body
+	mov	ar.lc=r8		// br.cloop then runs the body r8+1 times
+	;;
+	/*
+	 * 32 byte aligned loop, even number of (actually 2) bundles
+	 */
+.Loop:	fc.i	r24			// issuable on M0 only
+	add	r24=r21,r24		// we flush "stride size" bytes per iteration
+	nop.i	0
+	br.cloop.sptk.few .Loop
+	;;
+	sync.i				// order the preceding fc.i operations
+	;;
+	srlz.i				// serialize the instruction stream
+	;;
+	mov	ar.lc=r3		// restore ar.lc
+	br.ret.sptk.many rp
+END(flush_icache_range)
+
+	/*
+	 * clflush_cache_range(start,size)
+	 *
+	 *	Flush cache lines from start to start+size-1.
+	 *
+	 *	Must deal with range from start to start+size-1 but nothing else
+	 *	(need to be careful not to touch addresses that may be
+	 *	unmapped).
+	 *
+	 *	Note: "in0" and "in1" are preserved for debugging purposes.
+	 */
+	.section .kprobes.text,"ax"
+GLOBAL_ENTRY(clflush_cache_range)
+
+	// in0 = start, in1 = size in bytes; neither is modified.
+	// Scratch: r2, r3 (saved ar.lc), r8, r20-r24.
+	.prologue
+	alloc	r2=ar.pfs,2,0,0,0	// 2 inputs, no locals/outputs/rotating regs
+	movl	r3=ia64_cache_stride_shift	// address of the stride-shift variable
+	mov	r21=1			// shifted into the stride size below
+	add     r22=in1,in0		// r22 = start + size (one past the end)
+	;;
+	ld8	r20=[r3]		// r20: stride shift
+	sub	r22=r22,r0,1		// last byte address
+	;;
+	shr.u	r23=in0,r20		// start / (stride size)
+	shr.u	r22=r22,r20		// (last byte address) / (stride size)
+	shl	r21=r21,r20		// r21: stride size of the cache(s)
+	;;
+	sub	r8=r22,r23		// number of strides - 1
+	shl	r24=r23,r20		// r24: addresses for "fc" =
+					//	"start" rounded down to stride
+					//	boundary
+	.save	ar.lc,r3
+	mov	r3=ar.lc		// save ar.lc
+	;;
+
+	.body
+	mov	ar.lc=r8		// br.cloop then runs the body r8+1 times
+	;;
+	/*
+	 * 32 byte aligned loop, even number of (actually 2) bundles
+	 */
+.Loop_fc:
+	fc	r24		// issuable on M0 only
+	add	r24=r21,r24	// we flush "stride size" bytes per iteration
+	nop.i	0
+	br.cloop.sptk.few .Loop_fc
+	;;
+	sync.i			// order the preceding fc operations
+	;;
+	srlz.i			// serialize the instruction stream
+	;;
+	mov	ar.lc=r3		// restore ar.lc
+	br.ret.sptk.many rp
+END(clflush_cache_range)
--- a/kernel/arch/ia64/lib/idiv32.S
+++ b/kernel/arch/ia64/lib/idiv32.S
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2000 Hewlett-Packard Co
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 32-bit integer division.
+ *
+ * This code is based on the application note entitled "Divide, Square Root
+ * and Remainder Algorithms for the IA-64 Architecture".  This document
+ * is available as Intel document number 248725-002 or via the web at
+ * http://developer.intel.com/software/opensource/numerics/
+ *
+ * For more details on the theory behind these algorithms, see "IA-64
+ * and Elementary Functions" by Peter Markstein; HP Professional Books
+ * (http://www.hp.com/go/retailbooks/)
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// This one source file is assembled four times with different -D flags.
+// MODULO selects remainder instead of quotient; UNSIGNED selects the
+// unsigned variant.  NAME therefore expands to one of
+// __divsi3, __udivsi3, __modsi3 or __umodsi3.
+//
+#ifdef MODULO
+# define OP	mod
+#else
+# define OP	div
+#endif
+
+// Signedness-dependent pieces: the 32->64 bit extension of the inputs and
+// the integer<->floating-point conversions.
+#ifdef UNSIGNED
+# define SGN	u
+# define EXTEND	zxt4
+# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
+# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
+#else
+# define SGN
+# define EXTEND	sxt4
+# define INT_TO_FP(a,b)	fcvt.xf a=b
+# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
+#endif
+
+// Two-level paste so that SGN and OP are macro-expanded before ## is applied.
+#define PASTE1(a,b)	a##b
+#define PASTE(a,b)	PASTE1(a,b)
+#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,si3))
+
+//
+// in0 = a (dividend), in1 = b (divisor); result (a/b or a%b) in r8.
+// Uses r2, f6-f9 and p6 as scratch; in0/in1 are overwritten.
+// The refinement steps are predicated on p6, which is written by frcpa.
+//
+GLOBAL_ENTRY(NAME)
+	.regstk 2,0,0,0
+	// Transfer inputs to FP registers.
+	mov r2 = 0xffdd			// r2 = -34 + 65535 (fp reg format bias)
+	EXTEND in0 = in0		// in0 = a
+	EXTEND in1 = in1		// in1 = b
+	;;
+	setf.sig f8 = in0
+	setf.sig f9 = in1
+#ifdef MODULO
+	sub in1 = r0, in1		// in1 = -b
+#endif
+	;;
+	// Convert the inputs to FP, to avoid FP software-assist faults.
+	INT_TO_FP(f8, f8)
+	INT_TO_FP(f9, f9)
+	;;
+	setf.exp f7 = r2		// f7 = 2^-34
+	frcpa.s1 f6, p6 = f8, f9	// y0 = frcpa(b)
+	;;
+(p6)	fmpy.s1 f8 = f8, f6		// q0 = a*y0
+(p6)	fnma.s1 f6 = f9, f6, f1		// e0 = -b*y0 + 1 
+	;;
+#ifdef MODULO
+	setf.sig f9 = in1		// f9 = -b
+#endif
+(p6)	fma.s1 f8 = f6, f8, f8		// q1 = e0*q0 + q0
+(p6)	fma.s1 f6 = f6, f6, f7		// e1 = e0*e0 + 2^-34
+	;;
+#ifdef MODULO
+	setf.sig f7 = in0		// f7 = a (integer form, for the xma below)
+#endif
+(p6)	fma.s1 f6 = f6, f8, f8		// q2 = e1*q1 + q1
+	;;
+	FP_TO_INT(f6, f6)		// q = trunc(q2)
+	;;
+#ifdef MODULO
+	xma.l f6 = f6, f9, f7		// r = q*(-b) + a
+	;;
+#endif
+	getf.sig r8 = f6		// transfer result to result register
+	br.ret.sptk.many rp
+END(NAME)
--- a/kernel/arch/ia64/lib/idiv64.S
+++ b/kernel/arch/ia64/lib/idiv64.S
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 1999-2000 Hewlett-Packard Co
+ * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 64-bit integer division.
+ *
+ * This code is based on the application note entitled "Divide, Square Root
+ * and Remainder Algorithms for the IA-64 Architecture".  This document
+ * is available as Intel document number 248725-002 or via the web at
+ * http://developer.intel.com/software/opensource/numerics/
+ *
+ * For more details on the theory behind these algorithms, see "IA-64
+ * and Elementary Functions" by Peter Markstein; HP Professional Books
+ * (http://www.hp.com/go/retailbooks/)
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// This one source file is assembled four times with different -D flags.
+// MODULO selects remainder instead of quotient; UNSIGNED selects the
+// unsigned variant.  NAME therefore expands to one of
+// __divdi3, __udivdi3, __moddi3 or __umoddi3.
+//
+#ifdef MODULO
+# define OP	mod
+#else
+# define OP	div
+#endif
+
+// Signedness-dependent integer<->floating-point conversions.
+#ifdef UNSIGNED
+# define SGN	u
+# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
+# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
+#else
+# define SGN
+# define INT_TO_FP(a,b)	fcvt.xf a=b
+# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
+#endif
+
+// Two-level paste so that SGN and OP are macro-expanded before ## is applied.
+#define PASTE1(a,b)	a##b
+#define PASTE(a,b)	PASTE1(a,b)
+#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,di3))
+
+//
+// in0 = a (dividend), in1 = b (divisor); result (a/b or a%b) in r8.
+// Uses f6-f11 and p6 as scratch; in1 is overwritten in the MODULO build.
+// The refinement steps are predicated on p6, which is written by frcpa.
+//
+GLOBAL_ENTRY(NAME)
+	.regstk 2,0,0,0
+	// Transfer inputs to FP registers.
+	setf.sig f8 = in0
+	setf.sig f9 = in1
+	;;
+	// Convert the inputs to FP, to avoid FP software-assist faults.
+	INT_TO_FP(f8, f8)
+	INT_TO_FP(f9, f9)
+	;;
+	frcpa.s1 f11, p6 = f8, f9	// y0 = frcpa(b)
+	;;
+(p6)	fmpy.s1 f7 = f8, f11		// q0 = a*y0
+(p6)	fnma.s1 f6 = f9, f11, f1	// e0 = -b*y0 + 1
+	;;
+(p6)	fma.s1 f10 = f7, f6, f7		// q1 = q0*e0 + q0
+(p6)	fmpy.s1 f7 = f6, f6		// e1 = e0*e0
+	;;
+#ifdef MODULO
+	sub in1 = r0, in1		// in1 = -b
+#endif
+(p6)	fma.s1 f10 = f10, f7, f10	// q2 = q1*e1 + q1
+(p6)	fma.s1 f6 = f11, f6, f11	// y1 = y0*e0 + y0
+	;;
+(p6)	fma.s1 f6 = f6, f7, f6		// y2 = y1*e1 + y1
+(p6)	fnma.s1 f7 = f9, f10, f8	// r = -b*q2 + a
+	;;
+#ifdef MODULO
+	// Reload the integer operands for the final integer multiply-add.
+	setf.sig f8 = in0		// f8 = a
+	setf.sig f9 = in1		// f9 = -b
+#endif
+(p6)	fma.s1 f11 = f7, f6, f10	// q3 = r*y2 + q2
+	;;
+	FP_TO_INT(f11, f11)		// q = trunc(q3)
+	;;
+#ifdef MODULO
+	xma.l f11 = f11, f9, f8		// r = q*(-b) + a
+	;;
+#endif
+	getf.sig r8 = f11		// transfer result to result register
+	br.ret.sptk.many rp
+END(NAME)
--- a/kernel/arch/ia64/lib/io.c
+++ b/kernel/arch/ia64/lib/io.c
@@ -0,0 +1,164 @@
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <asm/io.h>
+
+/*
+ * Copy data from IO memory space to "real" memory space.
+ * This needs to be optimized.
+ */
+void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
+{
+	char *out = to;
+
+	/* one readb() per byte, walking both pointers forward */
+	for (; count != 0; count--) {
+		*out = readb(from);
+		out++;
+		from++;
+	}
+}
+EXPORT_SYMBOL(memcpy_fromio);
+
+/*
+ * Copy data from "real" memory space to IO memory space.
+ * This needs to be optimized.
+ */
+void memcpy_toio(volatile void __iomem *to, const void *from, long count)
+{
+	const char *in = from;
+
+	/* one writeb() per byte, walking both pointers forward */
+	for (; count != 0; count--) {
+		writeb(*in, to);
+		in++;
+		to++;
+	}
+}
+EXPORT_SYMBOL(memcpy_toio);
+
+/*
+ * "memset" on IO memory space.
+ * This needs to be optimized.
+ */
+void memset_io(volatile void __iomem *dst, int c, long count)
+{
+	/* only the low 8 bits of c are written */
+	unsigned char fill = (char)(c & 0xff);
+
+	for (; count != 0; --count, ++dst)
+		writeb(fill, dst);
+}
+EXPORT_SYMBOL(memset_io);
+
+#ifdef CONFIG_IA64_GENERIC
+
+/*
+ * Out-of-line versions of the low-level I/O accessors.
+ *
+ * The __ia64_* names are first #undef'd (they may be defined as macros by
+ * the headers included above) and then defined as real functions that simply
+ * forward to the corresponding ___ia64_* (three underscores) implementation.
+ *
+ * NOTE(review): presumably these exist so a generic kernel can take the
+ * address of the accessors; confirm against the machine-vector code.
+ *
+ * Note that the write accessors (__ia64_writeb/w/l/q) are #undef'd here but
+ * no function with those names is defined in this section.
+ */
+#undef __ia64_inb
+#undef __ia64_inw
+#undef __ia64_inl
+#undef __ia64_outb
+#undef __ia64_outw
+#undef __ia64_outl
+#undef __ia64_readb
+#undef __ia64_readw
+#undef __ia64_readl
+#undef __ia64_readq
+#undef __ia64_readb_relaxed
+#undef __ia64_readw_relaxed
+#undef __ia64_readl_relaxed
+#undef __ia64_readq_relaxed
+#undef __ia64_writeb
+#undef __ia64_writew
+#undef __ia64_writel
+#undef __ia64_writeq
+#undef __ia64_mmiowb
+
+/* Port input: 8-, 16- and 32-bit reads from an I/O port. */
+unsigned int
+__ia64_inb (unsigned long port)
+{
+	return ___ia64_inb(port);
+}
+
+unsigned int
+__ia64_inw (unsigned long port)
+{
+	return ___ia64_inw(port);
+}
+
+unsigned int
+__ia64_inl (unsigned long port)
+{
+	return ___ia64_inl(port);
+}
+
+/* Port output: 8-, 16- and 32-bit writes to an I/O port. */
+void
+__ia64_outb (unsigned char val, unsigned long port)
+{
+	___ia64_outb(val, port);
+}
+
+void
+__ia64_outw (unsigned short val, unsigned long port)
+{
+	___ia64_outw(val, port);
+}
+
+void
+__ia64_outl (unsigned int val, unsigned long port)
+{
+	___ia64_outl(val, port);
+}
+
+/* Memory-mapped reads: 8, 16, 32 and 64 bits. */
+unsigned char
+__ia64_readb (void __iomem *addr)
+{
+	return ___ia64_readb (addr);
+}
+
+unsigned short
+__ia64_readw (void __iomem *addr)
+{
+	return ___ia64_readw (addr);
+}
+
+unsigned int
+__ia64_readl (void __iomem *addr)
+{
+	return ___ia64_readl (addr);
+}
+
+unsigned long
+__ia64_readq (void __iomem *addr)
+{
+	return ___ia64_readq (addr);
+}
+
+/*
+ * "Relaxed" reads: these forward to the same ___ia64_read* routines as the
+ * non-relaxed variants above.
+ */
+unsigned char
+__ia64_readb_relaxed (void __iomem *addr)
+{
+	return ___ia64_readb (addr);
+}
+
+unsigned short
+__ia64_readw_relaxed (void __iomem *addr)
+{
+	return ___ia64_readw (addr);
+}
+
+unsigned int
+__ia64_readl_relaxed (void __iomem *addr)
+{
+	return ___ia64_readl (addr);
+}
+
+unsigned long
+__ia64_readq_relaxed (void __iomem *addr)
+{
+	return ___ia64_readq (addr);
+}
+
+/* Forwards to ___ia64_mmiowb(). */
+void
+__ia64_mmiowb(void)
+{
+	___ia64_mmiowb();
+}
+
+#endif /* CONFIG_IA64_GENERIC */
--- a/kernel/arch/ia64/lib/ip_fast_csum.S
+++ b/kernel/arch/ia64/lib/ip_fast_csum.S
@@ -0,0 +1,144 @@
+/*
+ * Optimized version of the ip_fast_csum() function
+ * Used for calculating IP header checksum
+ *
+ * Return: 16bit checksum, complemented
+ *
+ * Inputs:
+ *      in0: address of buffer to checksum (char *)
+ *      in1: length of the buffer (int)
+ *
+ * Copyright (C) 2002, 2006 Intel Corp.
+ * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
+ */
+
+#include <asm/asmmacro.h>
+
+/*
+ * Since we know that this function is most likely called with buf aligned
+ * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
+ * versus calling the generic version of do_csum, which has lots of overhead
+ * in handling various alignments and sizes.  However, due to the lack of
+ * constraints on the function's input arguments, cases where the alignment is
+ * not 4-byte or the size is not 20 bytes are handled by the generic do_csum.
+ */
+
+#define in0	r32
+#define in1	r33
+#define in2	r34
+#define in3	r35
+#define in4	r36
+#define ret0	r8
+
+/*
+ * ip_fast_csum(in0 = buffer address, in1 = length)
+ *
+ * Fast path: taken when in1 == 5 and in0 is 4-byte aligned.  Five 32-bit
+ * words are loaded through two interleaved pointers (in0 and r15 = in0 + 4),
+ * added up in 64-bit registers, folded down with three shift/add rounds and
+ * complemented under a 0xffff mask.
+ *
+ * Every other case branches to .generic, which calls do_csum(in0, in1 << 2)
+ * and complements the value it returns.  The "<< 2" shows that in1 is
+ * counted in 32-bit words (5 words == 20 bytes).
+ *
+ * The result is returned in ret0 (r8).
+ */
+GLOBAL_ENTRY(ip_fast_csum)
+	.prologue
+	.body
+	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
+	and	r14=3,in0	// is it aligned on 4-byte?
+	add	r15=4,in0	// second source pointer
+	;;
+	cmp.ne.or.andcm p6,p7=r14,r0	// misaligned: force p6 = 1, p7 = 0
+	;;
+(p7)	ld4	r20=[in0],8	// word 0 (p7 = fast path)
+(p7)	ld4	r21=[r15],8	// word 1
+(p6)	br.spnt	.generic	// p6 = wrong size or misaligned
+	;;
+	ld4	r22=[in0],8	// word 2
+	ld4	r23=[r15],8	// word 3
+	;;
+	ld4	r24=[in0]	// word 4
+	add	r20=r20,r21
+	add	r22=r22,r23
+	;;
+	add	r20=r20,r22
+	;;
+	add	r20=r20,r24	// r20 = sum of all five words
+	;;
+	shr.u	ret0=r20,16	// now need to add the carry
+	zxt2	r20=r20
+	;;
+	add	r20=ret0,r20
+	;;
+	shr.u	ret0=r20,16	// add carry again
+	zxt2	r20=r20
+	;;
+	add	r20=ret0,r20
+	;;
+	shr.u	ret0=r20,16	// third and last fold
+	zxt2	r20=r20
+	;;
+	add	r20=ret0,r20
+	mov	r9=0xffff
+	;;
+	andcm	ret0=r9,r20	// ret0 = 0xffff & ~r20
+	.restore sp		// reset frame state
+	br.ret.sptk.many b0
+	;;
+
+	// Slow path: set up a frame (2 inputs, 2 locals, 2 outputs), keep
+	// ar.pfs in r35 and the return address in r34 across the call.
+.generic:
+	.prologue
+	.save ar.pfs, r35
+	alloc	r35=ar.pfs,2,2,2,0
+	.save rp, r34
+	mov	r34=b0
+	.body
+	dep.z	out1=in1,2,30	// out1 = in1 << 2 (low 30 bits of in1)
+	mov	out0=in0
+	;;
+	br.call.sptk.many b0=do_csum
+	;;
+	andcm	ret0=-1,ret0	// ret0 = ~ret0 (all 64 bits, no 0xffff mask here)
+	mov	ar.pfs=r35
+	mov	b0=r34
+	br.ret.sptk.many b0
+END(ip_fast_csum)
+
+/*
+ * csum_ipv6_magic(in0, in1, in2, in3, in4)
+ *
+ * What the code does:
+ *   - reads four 32-bit words (16 bytes) from each of the buffers at in0
+ *     and in1 and adds all eight of them up;
+ *   - builds r15 from the low 32 bits of in2 with the low 16 bits of in3
+ *     deposited at bit 32, byte-reverses that 64-bit value and shifts it
+ *     right by 16;
+ *   - adds r15 and the low 32 bits of in4 to the sum;
+ *   - folds the 64-bit sum to 32 bits, then three times to 16 bits, and
+ *     returns its complement under a 0xffff mask in r8.
+ *
+ * NOTE(review): presumably in0/in1 are the source and destination IPv6
+ * addresses, in2 the length, in3 the protocol and in4 the initial sum;
+ * confirm against the C prototype, which is not visible here.
+ */
+GLOBAL_ENTRY(csum_ipv6_magic)
+	ld4	r20=[in0],4
+	ld4	r21=[in1],4
+	zxt4	in2=in2		// keep only the low 32 bits of in2
+	;;
+	ld4	r22=[in0],4
+	ld4	r23=[in1],4
+	dep	r15=in3,in2,32,16	// r15 = in2 with in3[15:0] placed at bits 32-47
+	;;
+	ld4	r24=[in0],4
+	ld4	r25=[in1],4
+	mux1	r15=r15,@rev	// reverse the byte order of r15
+	add	r16=r20,r21
+	add	r17=r22,r23
+	zxt4	in4=in4		// keep only the low 32 bits of in4
+	;;
+	ld4	r26=[in0],4
+	ld4	r27=[in1],4
+	shr.u	r15=r15,16
+	add	r18=r24,r25
+	add	r8=r16,r17
+	;;
+	add	r19=r26,r27
+	add	r8=r8,r18
+	;;
+	add	r8=r8,r19	// r8 = sum of the eight loaded words
+	add	r15=r15,in4
+	;;
+	add	r8=r8,r15
+	;;
+	shr.u	r10=r8,32	// now fold sum into short
+	zxt4	r11=r8
+	;;
+	add	r8=r10,r11
+	;;
+	shr.u	r10=r8,16	// yeah, keep it rolling
+	zxt2	r11=r8
+	;;
+	add	r8=r10,r11
+	;;
+	shr.u	r10=r8,16	// three times lucky
+	zxt2	r11=r8
+	;;
+	add	r8=r10,r11
+	mov	r9=0xffff
+	;;
+	andcm	r8=r9,r8	// r8 = 0xffff & ~r8
+	br.ret.sptk.many b0
+END(csum_ipv6_magic)
--- a/kernel/arch/ia64/lib/memcpy.S
+++ b/kernel/arch/ia64/lib/memcpy.S
@@ -0,0 +1,301 @@
+/*
+ *
+ * Optimized version of the standard memcpy() function
+ *
+ * Inputs:
+ * 	in0:	destination address
+ *	in1:	source address
+ *	in2:	number of bytes to copy
+ * Output:
+ * 	r8:	destination address (in0 is returned unchanged)
+ *
+ * Copyright (C) 2000-2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <asm/asmmacro.h>
+
+/*
+ * memcpy entry and fast path.
+ *
+ * in0 is returned in retval (r8).  A zero length returns immediately.
+ * Lengths below 16 go to .memcpy_short (that branch is tested first, so it
+ * wins regardless of alignment).  Otherwise, if any of dst, src or len has
+ * one of its low three bits set, control goes to .memcpy_long.  What remains
+ * (everything a multiple of 8, len >= 16) is copied by the software-pipelined
+ * ld8/st8 loop below, with N stages between a load and the matching store.
+ *
+ * ar.pfs, ar.lc and pr are kept in saved_pfs/saved_lc/saved_pr and restored
+ * before returning.
+ */
+GLOBAL_ENTRY(memcpy)
+
+#	define MEM_LAT	21		/* latency to memory */
+
+#	define dst	r2
+#	define src	r3
+#	define retval	r8
+#	define saved_pfs r9
+#	define saved_lc	r10
+#	define saved_pr	r11
+#	define cnt	r16
+#	define src2	r17
+#	define t0	r18
+#	define t1	r19
+#	define t2	r20
+#	define t3	r21
+#	define t4	r22
+#	define src_end	r23
+
+#	define N	(MEM_LAT + 4)
+#	define Nrot	((N + 7) & ~7)
+
+	/*
+	 * First, check if everything (src, dst, len) is a multiple of eight.  If
+	 * so, we handle everything with no taken branches (other than the loop
+	 * itself) and a small icache footprint.  Otherwise, we jump off to
+	 * the more general copy routine handling arbitrary
+	 * sizes/alignment etc.
+	 */
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
+	.save ar.lc, saved_lc
+	mov saved_lc=ar.lc
+	or t0=in0,in1
+	;;
+
+	or t0=t0,in2		// t0 = dst | src | len
+	.save pr, saved_pr
+	mov saved_pr=pr
+
+	.body
+
+	cmp.eq p6,p0=in2,r0	// zero length?
+	mov retval=in0		// return dst
+(p6)	br.ret.spnt.many rp	// zero length, return immediately
+	;;
+
+	mov dst=in0		// copy because of rotation
+	shr.u cnt=in2,3		// number of 8-byte words to copy
+	mov pr.rot=1<<16	// p16 = 1, all other rotating predicates = 0
+	;;
+
+	adds cnt=-1,cnt		// br.ctop is repeat/until
+	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
+	mov ar.ec=N
+	;;
+
+	and t0=0x7,t0		// any low bit set in dst, src or len?
+	mov ar.lc=cnt
+	;;
+	cmp.ne p6,p0=t0,r0	// p6: something is not a multiple of 8
+
+	mov src=in1		// copy because of rotation
+(p7)	br.cond.spnt.few .memcpy_short
+(p6)	br.cond.spnt.few .memcpy_long
+	;;
+	nop.m	0
+	;;
+	nop.m	0
+	nop.i	0
+	;;
+	nop.m	0
+	;;
+	.rotr val[N]
+	.rotp p[N]
+	.align 32
+	// Pipelined copy loop: the word loaded in stage 0 is stored in stage N-1.
+1: { .mib
+(p[0])	ld8 val[0]=[src],8
+	nop.i 0
+	brp.loop.imp 1b, 2f
+}
+2: { .mfb
+(p[N-1])st8 [dst]=val[N-1],8
+	nop.f 0
+	br.ctop.dptk.few 1b
+}
+	;;
+	mov ar.lc=saved_lc
+	mov pr=saved_pr,-1
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+	/*
+	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
+	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
+	 * get used very often (gcc inlines small copies) and due to atomicity
+	 * issues, we want to avoid read-modify-write of entire words.
+	 */
+	// Byte-at-a-time copy of in2 bytes, reached from the entry code for
+	// lengths below 16.  Runs a pipelined ld1/st1 loop with MEM_LAT
+	// stages (loop count in2-1), then restores ar.lc, pr and ar.pfs and
+	// returns.  The rotating predicates were initialised by the entry
+	// code (mov pr.rot) before branching here.
+	.align 32
+.memcpy_short:
+	adds cnt=-1,in2		// br.ctop is repeat/until
+	mov ar.ec=MEM_LAT
+	brp.loop.imp 1f, 2f
+	;;
+	mov ar.lc=cnt
+	;;
+	nop.m	0
+	;;
+	nop.m	0
+	nop.i	0
+	;;
+	nop.m	0
+	;;
+	nop.m	0
+	;;
+	/*
+	 * It is faster to put a stop bit in the loop here because it makes
+	 * the pipeline shorter (and latency is what matters on short copies).
+	 */
+	.align 32
+1: { .mib
+(p[0])	ld1 val[0]=[src],1
+	nop.i 0
+	brp.loop.imp 1b, 2f
+} ;;
+2: { .mfb
+(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
+	nop.f 0
+	br.ctop.dptk.few 1b
+} ;;
+	mov ar.lc=saved_lc
+	mov pr=saved_pr,-1
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+	/*
+	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
+	 * an overriding concern here, but throughput is.  We first do
+	 * sub-word copying until the destination is aligned, then we check
+	 * if the source is also aligned.  If so, we do a simple load/store-loop
+	 * until there are less than 8 bytes left over and then we do the tail,
+	 * by storing the last few bytes using sub-word copying.  If the source
+	 * is not aligned, we branch off to the non-congruent loop.
+	 *
+	 *   stage:   op:
+	 *         0  ld
+	 *	   :
+	 * MEM_LAT+3  shrp
+	 * MEM_LAT+4  st
+	 *
+	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
+	 * seems to introduce an unavoidable bubble in the pipeline so the overall
+	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
+	 * of 4 byte/cycle.  Still not bad.
+	 */
+#	undef N
+#	undef Nrot
+#	define N	(MEM_LAT + 5)		/* number of stages */
+#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */
+
+#define LOG_LOOP_SIZE	6
+
+	// General copy for len >= 16 when dst, src or len is not a multiple
+	// of 8.  Steps:
+	//   1. assemble the first 8 source bytes in t0 from the two aligned
+	//      words that straddle src;
+	//   2. store 1, 2 and/or 4 of those bytes so that dst becomes 8-byte
+	//      aligned (cnt counts the bytes stored);
+	//   3. advance src, compute the remaining length and branch through b6
+	//      into the COPY() expansion at .memcpy_loops selected by src & 7.
+.memcpy_long:
+	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
+	and t0=-8,src		// t0 = src & ~7
+	and t2=7,src		// t2 = src & 7
+	;;
+	ld8 t0=[t0]		// t0 = 1st source word
+	adds src2=7,src		// src2 = (src + 7)
+	sub t4=r0,dst		// t4 = -dst
+	;;
+	and src2=-8,src2	// src2 = (src + 7) & ~7
+	shl t2=t2,3		// t2 = 8*(src & 7)
+	shl t4=t4,3		// t4 = (-dst) << 3, i.e. bits 3-5 hold (-dst) & 7
+	;;
+	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
+	sub t3=64,t2		// t3 = 64-8*(src & 7)
+	shr.u t0=t0,t2
+	;;
+	add src_end=src,in2
+	shl t1=t1,t3
+	mov pr=t4,0x38		// (p5,p4,p3) = (-dst) & 7 = bytes needed to align dst
+	;;
+	or t0=t0,t1		// t0 = first 8 bytes starting at src
+	mov cnt=r0
+	adds src_end=-1,src_end
+	;;
+(p3)	st1 [dst]=t0,1
+(p3)	shr.u t0=t0,8
+(p3)	adds cnt=1,cnt
+	;;
+(p4)	st2 [dst]=t0,2
+(p4)	shr.u t0=t0,16
+(p4)	adds cnt=2,cnt
+	;;
+(p5)	st4 [dst]=t0,4
+(p5)	adds cnt=4,cnt
+	and src_end=-8,src_end	// src_end = last word of source buffer
+	;;
+
+	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
+
+1:{	add src=cnt,src			// make src point to remainder of source buffer
+	sub cnt=in2,cnt			// cnt = number of bytes left to copy
+	mov t4=ip			// t4 = address of this bundle (label 1)
+  }	;;
+	and src2=-8,src			// align source pointer
+	adds t4=.memcpy_loops-1b,t4	// t4 = run-time address of .memcpy_loops
+	mov ar.ec=N
+
+	and t0=7,src			// t0 = src & 7
+	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
+	shl cnt=cnt,3			// move bits 0-2 to 3-5
+	;;
+
+	.rotr val[N+1], w[2]
+	.rotp p[N]
+
+	cmp.ne p6,p0=t0,r0		// is src aligned, too?
+	shl t0=t0,LOG_LOOP_SIZE		// t0 = (src & 7) << LOG_LOOP_SIZE; assumes each COPY() entry is 1<<LOG_LOOP_SIZE bytes
+	adds t2=-1,t2			// br.ctop is repeat/until
+	;;
+	add t4=t0,t4			// t4 = address of the COPY() entry for this src alignment
+	mov pr=cnt,0x38			// set (p5,p4,p3) to the number of bytes to copy from the last word
+	mov ar.lc=t2
+	;;
+	nop.m	0
+	;;
+	nop.m	0
+	nop.i	0
+	;;
+	nop.m	0
+	;;
+(p6)	ld8 val[1]=[src2],8		// prime the pump...
+	mov b6=t4
+	br.sptk.few b6
+	;;
+
+	// Common exit of the COPY() loops: stores the final 0-7 bytes held in
+	// t0 as 4 + 2 + 1 byte pieces selected by p5/p4/p3, restores ar.lc,
+	// ar.pfs and pr from their saved copies, and returns.
+.memcpy_tail:
+	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
+	// less than 8) and t0 contains the last few bytes of the src buffer:
+(p5)	st4 [dst]=t0,4
+(p5)	shr.u t0=t0,32
+	mov ar.lc=saved_lc
+	;;
+(p4)	st2 [dst]=t0,2
+(p4)	shr.u t0=t0,16
+	mov ar.pfs=saved_pfs
+	;;
+(p3)	st1 [dst]=t0
+	mov pr=saved_pr,-1
+	br.ret.sptk.many rp
+
+///////////////////////////////////////////////////////
+	.align 64
+
+/*
+ * COPY(shift, index): one entry of the table at .memcpy_loops.
+ *
+ * Each expansion is a software-pipelined loop that loads an aligned source
+ * word (stage 0), merges two neighbouring words with shrp by "shift" bits
+ * (stage MEM_LAT+3) and stores the merged word to dst (stage MEM_LAT+4).
+ * After the loop it loads the word at src_end, builds the trailing bytes
+ * in t0 with one more shrp and branches to .memcpy_tail.
+ *
+ * .memcpy_long jumps to .memcpy_loops + ((src & 7) << LOG_LOOP_SIZE), so the
+ * eight expansions below must stay in shift order and each must occupy
+ * exactly 1 << LOG_LOOP_SIZE bytes.
+ */
+#define COPY(shift,index)									\
+ 1: { .mib											\
+	(p[0])		ld8 val[0]=[src2],8;							\
+	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
+			brp.loop.imp 1b, 2f							\
+    };												\
+ 2: { .mfb											\
+	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
+			nop.f 0;								\
+			br.ctop.dptk.few 1b;							\
+    };												\
+			;;									\
+			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
+			;;									\
+			shrp t0=val[N-1],val[N-index],shift;					\
+			br .memcpy_tail
+.memcpy_loops:
+	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
+	COPY(8, 0)
+	COPY(16, 0)
+	COPY(24, 0)
+	COPY(32, 0)
+	COPY(40, 0)
+	COPY(48, 0)
+	COPY(56, 0)
+
+END(memcpy)
--- a/kernel/arch/ia64/lib/memcpy_mck.S
+++ b/kernel/arch/ia64/lib/memcpy_mck.S
@@ -0,0 +1,666 @@
+/*
+ * Itanium 2-optimized version of memcpy and copy_user function
+ *
+ * Inputs:
+ * 	in0:	destination address
+ *	in1:	source address
+ *	in2:	number of bytes to copy
+ * Output:
+ *	for memcpy:    return dest
+ * 	for copy_user: return 0 if success,
+ *		       or number of byte NOT copied if error occurred.
+ *
+ * Copyright (C) 2002 Intel Corp.
+ * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#define EK(y...) EX(y)
+
+/* McKinley specific optimization */
+
+#define retval		r8
+#define saved_pfs	r31
+#define saved_lc	r10
+#define saved_pr	r11
+#define saved_in0	r14
+#define saved_in1	r15
+#define saved_in2	r16
+
+#define src0		r2
+#define src1		r3
+#define dst0		r17
+#define dst1		r18
+#define cnt		r9
+
+/* r19-r30 are temp for each code section */
+#define PREFETCH_DIST	8
+#define src_pre_mem	r19
+#define dst_pre_mem	r20
+#define src_pre_l2	r21
+#define dst_pre_l2	r22
+#define t1		r23
+#define t2		r24
+#define t3		r25
+#define t4		r26
+#define t5		t1	// alias!
+#define t6		t2	// alias!
+#define t7		t3	// alias!
+#define n8		r27
+#define t9		t5	// alias!
+#define t10		t4	// alias!
+#define t11		t7	// alias!
+#define t12		t6	// alias!
+#define t14		t10	// alias!
+#define t13		r28
+#define t15		r29
+#define tmp		r30
+
+/* defines for long_copy block */
+#define	A	0
+#define B	(PREFETCH_DIST)
+#define C	(B + PREFETCH_DIST)
+#define D	(C + 1)
+#define N	(D + 1)
+#define Nrot	((N + 7) & ~7)
+
+/* alias */
+#define in0		r32
+#define in1		r33
+#define in2		r34
+
+/*
+ * memcpy entry stub: records the low three bits of dst and src in r28/r29,
+ * sets f6 = f0 (the fault handler later compares f6 with f0 to recognise
+ * the memcpy case), puts dst in retval and joins the code shared with
+ * __copy_user at .common_code.
+ */
+GLOBAL_ENTRY(memcpy)
+	and	r28=0x7,in0	// r28 = dst & 7
+	and	r29=0x7,in1	// r29 = src & 7
+	mov	f6=f0		// f6 == f0 marks "called as memcpy"
+	mov	retval=in0	// memcpy returns dst
+	br.cond.sptk .common_code
+	;;
+END(memcpy)
+/*
+ * __copy_user entry, followed by the dispatch code shared with memcpy.
+ *
+ * Entry: r28/r29 = low three bits of dst/src, f6 = f1 (i.e. not memcpy),
+ * the original dst/src pointers are kept in saved_in0/saved_in1 and the
+ * return value starts out as 0.
+ *
+ * .common_code then selects a strategy, tested in this order:
+ *   len < 8          -> .memcpy_short
+ *   dst unaligned    -> .align_dest   (r30 = 8 - (dst & 7) bytes to crawl)
+ *   src unaligned    -> .unaligned_src
+ *   otherwise falls through to .aligned_src
+ */
+GLOBAL_ENTRY(__copy_user)
+	.prologue
+// check dest alignment
+	and	r28=0x7,in0
+	and	r29=0x7,in1
+	mov	f6=f1
+	mov	saved_in0=in0	// save dest pointer
+	mov	saved_in1=in1	// save src pointer
+	mov	retval=r0	// initialize return value
+	;;
+.common_code:
+	cmp.gt	p15,p0=8,in2	// check for small size
+	cmp.ne	p13,p0=0,r28	// check dest alignment
+	cmp.ne	p14,p0=0,r29	// check src alignment
+	add	src0=0,in1
+	sub	r30=8,r28	// for .align_dest
+	mov	saved_in2=in2	// save len
+	;;
+	add	dst0=0,in0
+	add	dst1=1,in0	// dest odd index
+	cmp.le	p6,p0 = 1,r30	// for .align_dest
+(p15)	br.cond.dpnt .memcpy_short
+(p13)	br.cond.dpnt .align_dest
+(p14)	br.cond.dpnt .unaligned_src
+	;;
+
+// both dest and src are aligned on 8-byte boundary
+//
+// Sets up the rotating register frame, saves ar.pfs, pr and ar.lc, and
+// counts the 128-byte lines in the request (cnt = in2 >> 7).  More than
+// 2*PREFETCH_DIST lines go to .long_copy; otherwise ar.lc is loaded so
+// that the .prefetch loop that follows runs once per line (and once when
+// there are fewer than two lines).
+.aligned_src:
+	.save ar.pfs, saved_pfs
+	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
+	.save pr, saved_pr
+	mov	saved_pr=pr
+
+	shr.u	cnt=in2,7	// this much cache line
+	;;
+	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt	// p6: more than 2*PREFETCH_DIST lines
+	cmp.lt	p7,p8=1,cnt	// p7: at least two lines, p8: otherwise
+	.save ar.lc, saved_lc
+	mov	saved_lc=ar.lc
+	.body
+	add	cnt=-1,cnt
+	add	src_pre_mem=0,in1	// prefetch src pointer
+	add	dst_pre_mem=0,in0	// prefetch dest pointer
+	;;
+(p7)	mov	ar.lc=cnt	// prefetch count
+(p8)	mov	ar.lc=r0
+(p6)	br.cond.dpnt .long_copy
+	;;
+
+// Issue one lfetch per 128 bytes on both the source and the destination
+// (the latter with the .excl hint), ar.lc + 1 times, before the copy at
+// .medium_copy starts.
+.prefetch:
+	lfetch.fault	  [src_pre_mem], 128
+	lfetch.fault.excl [dst_pre_mem], 128
+	br.cloop.dptk.few .prefetch
+	;;
+
+// Aligned copy in 32-byte iterations using two source and two destination
+// pointers that are 8 bytes apart and each advance by 16.  The loop is a
+// two-stage pipeline (ar.ec = 2, predicates p16/p17).  tmp = in2 & 31 is
+// the number of bytes left for .aligned_src_tail; p6/p7 record whether tmp
+// is at least 8 / at least 16.  When in2 < 32 the loop is skipped.
+.medium_copy:
+	and	tmp=31,in2	// copy length after iteration
+	shr.u	r29=in2,5	// number of 32-byte iteration
+	add	dst1=8,dst0	// 2nd dest pointer
+	;;
+	add	cnt=-1,r29	// ctop iteration adjustment
+	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
+	add	src1=8,src0	// 2nd src pointer
+	cmp.le	p6,p0=8,tmp
+	;;
+	cmp.le	p7,p0=16,tmp
+	mov	ar.lc=cnt	// loop setup
+	cmp.eq	p16,p17 = r0,r0	// p16 = 1, p17 = 0
+	mov	ar.ec=2
+(p10)	br.dpnt.few .aligned_src_tail
+	;;
+	TEXT_ALIGN(32)
+1:
+EX(.ex_handler, (p16)	ld8	r34=[src0],16)
+EK(.ex_handler, (p16)	ld8	r38=[src1],16)
+EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
+EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
+	;;
+EX(.ex_handler, (p16)	ld8	r32=[src0],16)
+EK(.ex_handler, (p16)	ld8	r36=[src1],16)
+EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
+EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
+	br.ctop.dptk.few 1b
+	;;
+
+// Copies the 0-3 whole 8-byte words that remain after .medium_copy
+// (tmp = bytes left, 0..31; p6: tmp >= 8, p7: tmp >= 16, p8: tmp >= 24),
+// restores ar.lc, ar.pfs and pr, advances src0/dst0 past the words just
+// copied and hands the last tmp & 7 bytes to .memcpy_short.
+.aligned_src_tail:
+EX(.ex_handler, (p6)	ld8	t1=[src0])
+	mov	ar.lc=saved_lc
+	mov	ar.pfs=saved_pfs
+EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
+	cmp.le	p8,p0=24,tmp
+	and	r21=-8,tmp	// r21 = tmp rounded down to a multiple of 8
+	;;
+EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
+EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store 1st tail word (8 bytes)
+	and	in2=7,tmp	// remaining length
+EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store 2nd tail word (8 bytes)
+	add	src0=src0,r21	// setting up src pointer
+	add	dst0=dst0,r21	// setting up dest pointer
+	;;
+EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store 3rd tail word (8 bytes)
+	mov	pr=saved_pr,-1
+	br.dptk.many .memcpy_short
+	;;
+
+/* code taken from copy_page_mck */
+/*
+ * Aligned copy of more than 2*PREFETCH_DIST lines of 128 bytes.
+ *
+ * .prefetch_loop runs 2*PREFETCH_DIST times; each pass loads the first
+ * word of a source line and, B stages later, stores it to the matching
+ * destination line, so it acts as a prefetch of both streams while already
+ * copying one word per line.
+ *
+ * .line_copy then runs for the remaining lines as an N-stage pipeline:
+ * every pass keeps the "prefetch" load/store streams going (src_pre_mem/
+ * dst_pre_mem, src_pre_l2/dst_pre_l2) and copies the other fourteen words
+ * of a line through the src0/dst0 and src1/dst1 pointer pairs.
+ *
+ * On exit src0/dst0 are stepped back by 8, in2 is set to the bytes left
+ * (tmp = in2 & 127) and .medium_copy finishes the job.
+ */
+.long_copy:
+	.rotr v[2*PREFETCH_DIST]
+	.rotp p[N]
+
+	mov src_pre_mem = src0
+	mov pr.rot = 0x10000
+	mov ar.ec = 1				// special unrolled loop
+
+	mov dst_pre_mem = dst0
+
+	add src_pre_l2 = 8*8, src0
+	add dst_pre_l2 = 8*8, dst0
+	;;
+	add src0 = 8, src_pre_mem		// first t1 src
+	mov ar.lc = 2*PREFETCH_DIST - 1
+	shr.u cnt=in2,7				// number of lines
+	add src1 = 3*8, src_pre_mem		// first t3 src
+	add dst0 = 8, dst_pre_mem		// first t1 dst
+	add dst1 = 3*8, dst_pre_mem		// first t3 dst
+	;;
+	and tmp=127,in2				// remaining bytes after this block
+	add cnt = -(2*PREFETCH_DIST) - 1, cnt
+	// same as .line_copy loop, but with all predicated-off instructions removed:
+.prefetch_loop:
+EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
+EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
+	br.ctop.sptk .prefetch_loop
+	;;
+	cmp.eq p16, p0 = r0, r0			// reset p16 to 1
+	mov ar.lc = cnt
+	mov ar.ec = N				// # of stages in pipeline
+	;;
+.line_copy:
+EX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
+EK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
+EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
+EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
+	;;
+EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
+EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
+EX(.ex_handler,	(p[D])	st8 [dst0] =  t1, 8)			// M2
+EK(.ex_handler,	(p[D])	st8 [dst1] =  t3, 8)			// M3
+	;;
+EX(.ex_handler,	(p[D])	ld8  t5 = [src0], 8)
+EK(.ex_handler,	(p[D])	ld8  t7 = [src1], 3*8)
+EX(.ex_handler,	(p[D])	st8 [dst0] =  t2, 3*8)
+EK(.ex_handler,	(p[D])	st8 [dst1] =  t4, 3*8)
+	;;
+EX(.ex_handler,	(p[D])	ld8  t6 = [src0], 3*8)
+EK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
+EX(.ex_handler,	(p[D])	st8 [dst0] =  t5, 8)
+EK(.ex_handler,	(p[D])	st8 [dst1] =  t7, 3*8)
+	;;
+EX(.ex_handler,	(p[D])	ld8  t9 = [src0], 3*8)
+EK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
+EX(.ex_handler,	(p[D])	st8 [dst0] =  t6, 3*8)
+EK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
+	;;
+EX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
+EK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
+EX(.ex_handler,	(p[D])	st8 [dst0] =  t9, 3*8)
+EK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
+	;;
+EX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
+EK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
+EX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
+EK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
+	;;
+EX(.ex_handler,	(p[C])	ld8  t1 = [src0], 8)
+EK(.ex_handler,	(p[C])	ld8  t3 = [src1], 8)
+EX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
+EK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
+	br.ctop.sptk .line_copy
+	;;
+
+	add dst0=-8,dst0			// undo the initial +8 bias of dst0
+	add src0=-8,src0			// undo the initial +8 bias of src0
+	mov in2=tmp				// bytes still to copy (< 128)
+	.restore sp
+	br.sptk.many .medium_copy
+	;;
+
+// Largest number of bytes handled per pass through .4k_block
+// (128*32 = 4096).  blocksize and curlen reuse r23/r24, which are also
+// t1/t2 in other sections.
+#define BLOCK_SIZE	128*32
+#define blocksize	r23
+#define curlen		r24
+
+// dest is on 8-byte boundary, src is not. We need to do
+// ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
+//
+// The request is processed in chunks of at most BLOCK_SIZE bytes.  For each
+// chunk: remember the chunk's dst/src/len in saved_in0/saved_in1/saved_in2,
+// prefetch both streams line by line, then split the chunk's 16-byte units
+// into two halves handled in parallel: src0/dst0 walk the first half,
+// src1/dst1 start r22*8 bytes further on.  r30 keeps the source
+// misalignment (src & 7) and curlen the bytes beyond the 16-byte units.
+.unaligned_src:
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc	saved_pfs=ar.pfs,3,5,0,8
+	.save ar.lc, saved_lc
+	mov	saved_lc=ar.lc
+	.save pr, saved_pr
+	mov	saved_pr=pr
+	.body
+.4k_block:
+	mov	saved_in0=dst0	// need to save all input arguments
+	mov	saved_in2=in2
+	mov	blocksize=BLOCK_SIZE
+	;;
+	cmp.lt	p6,p7=blocksize,in2	// more than one block left?
+	mov	saved_in1=src0
+	;;
+(p6)	mov	in2=blocksize	// then copy just one block in this pass
+	;;
+	shr.u	r21=in2,7	// this much cache line
+	shr.u	r22=in2,4	// number of 16-byte iteration
+	and	curlen=15,in2	// copy length after iteration
+	and	r30=7,src0	// source alignment
+	;;
+	cmp.lt	p7,p8=1,r21
+	add	cnt=-1,r21
+	;;
+
+	add	src_pre_mem=0,src0	// prefetch src pointer
+	add	dst_pre_mem=0,dst0	// prefetch dest pointer
+	and	src0=-8,src0		// 1st src pointer
+(p7)	mov	ar.lc = cnt
+(p8)	mov	ar.lc = r0
+	;;
+	TEXT_ALIGN(32)
+1:	lfetch.fault	  [src_pre_mem], 128
+	lfetch.fault.excl [dst_pre_mem], 128
+	br.cloop.dptk.few 1b
+	;;
+
+	shladd	dst1=r22,3,dst0	// 2nd dest pointer
+	shladd	src1=r22,3,src0	// 2nd src pointer
+	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
+	cmp.le	p6,p7=8,curlen;	// have at least 8 byte remaining?
+	add	cnt=-1,r22	// ctop iteration adjustment
+	;;
+EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
+EK(.ex_handler, (p9)	ld8	r37=[src1],8)
+(p8)	br.dpnt.few .noloop
+	;;
+
+// The jump address is calculated based on src alignment. The COPYU
+// macro below needs to confine its size to a power of two, so an entry
+// can be calculated using shl instead of an expensive multiply. The
+// size is then hard coded by the following #define to match the
+// actual size.  This makes it somewhat tedious when the COPYU macro gets
+// changed, as this needs to be adjusted to match.
+//
+// The target is (.jump_table - one entry) + (r30 << LOOP_SIZE), where
+// r30 = src & 7, so an alignment of 1 selects the first entry, COPYU(8).
+// The size of one entry is taken from (.jmp1 - .jump_table).
+#define LOOP_SIZE 6
+1:
+	mov	r29=ip		// jmp_table thread
+	mov	ar.lc=cnt
+	;;
+	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
+	shl	r28=r30, LOOP_SIZE	// jmp_table thread
+	mov	ar.ec=2		// loop setup
+	;;
+	add	r29=r29,r28		// jmp_table thread
+	cmp.eq	p16,p17=r0,r0	// p16 = 1, p17 = 0
+	;;
+	mov	b6=r29			// jmp_table thread
+	;;
+	br.cond.sptk.few b6
+
+// for 8-15 byte case
+// We will skip the loop, but need to replicate the side effect
+// that the loop produces.
+//
+// When at least 8 bytes remain (p6), r21 is built from two consecutive
+// aligned source words: the first shifted right by 8*r30 bits, the second
+// shifted left by 64 - 8*r30 bits, OR-ed together (r30 = src & 7).
+// Execution then falls through to .unaligned_src_tail.
+.noloop:
+EX(.ex_handler, (p6)	ld8	r37=[src1],8)
+	add	src0=8,src0
+(p6)	shl	r25=r30,3	// r25 = 8 * (src & 7), shift count in bits
+	;;
+EX(.ex_handler, (p6)	ld8	r27=[src1])
+(p6)	shr.u	r28=r37,r25
+(p6)	sub	r26=64,r25
+	;;
+(p6)	shl	r27=r27,r26
+	;;
+(p6)	or	r21=r28,r27	// r21 = 8 source bytes starting at the unaligned address
+
+// End of one .4k_block pass.  If the length saved for this pass exceeded
+// blocksize, advance dst0/src0 by one block, reduce in2 accordingly and go
+// round again.  Otherwise store the pending word in r21 (when p6 says at
+// least 8 bytes remained), restore ar.pfs, ar.lc and pr, and fall through
+// to .memcpy_short with in2 = bytes still to copy.
+.unaligned_src_tail:
+/* check if we have more than blocksize to copy, if so go back */
+	cmp.gt	p8,p0=saved_in2,blocksize
+	;;
+(p8)	add	dst0=saved_in0,blocksize
+(p8)	add	src0=saved_in1,blocksize
+(p8)	sub	in2=saved_in2,blocksize
+(p8)	br.dpnt	.4k_block
+	;;
+
+/* we have up to 15 byte to copy in the tail.
+ * part of work is already done in the jump table code
+ * we are at the following state.
+ * src side:
+ * 
+ *   xxxxxx xx                   <----- r21 has xxxxxxxx already
+ * -------- -------- --------
+ * 0        8        16
+ *          ^
+ *          |
+ *          src1
+ * 
+ * dst
+ * -------- -------- --------
+ * ^
+ * |
+ * dst1
+ */
+EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
+(p6)	add	curlen=-8,curlen	// update length
+	mov	ar.pfs=saved_pfs
+	;;
+	mov	ar.lc=saved_lc
+	mov	pr=saved_pr,-1
+	mov	in2=curlen	// remaining length
+	mov	dst0=dst1	// dest pointer
+	add	src0=src1,r30	// forward by src alignment
+	;;
+
+// 7 byte or smaller.
+//
+// Copies in2 (0..7) bytes one at a time from src0 to dst0.  Two pointer
+// pairs are used, src0/dst0 for even and src1/dst1 for odd byte indexes,
+// each stepping by 2.  Predicates computed from in2 enable exactly the
+// loads/stores that are needed, and a return is taken as soon as the
+// requested number of bytes has been stored.
+.memcpy_short:
+	cmp.le	p8,p9   = 1,in2
+	cmp.le	p10,p11 = 2,in2
+	cmp.le	p12,p13 = 3,in2
+	cmp.le	p14,p15 = 4,in2
+	add	src1=1,src0	// second src pointer
+	add	dst1=1,dst0	// second dest pointer
+	;;
+
+EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
+EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
+(p9)	br.ret.dpnt rp		// 0 byte copy
+	;;
+
+EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
+EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
+(p11)	br.ret.dpnt rp		// 1 byte copy
+
+EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
+EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
+(p13)	br.ret.dpnt rp		// 2 byte copy
+	;;
+
+	cmp.le	p6,p7   = 5,in2
+	cmp.le	p8,p9   = 6,in2
+	cmp.le	p10,p11 = 7,in2
+
+EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
+EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
+(p15)	br.ret.dpnt rp		// 3 byte copy
+	;;
+
+EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
+EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
+(p7)	br.ret.dpnt rp		// 4 byte copy
+	;;
+
+EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
+EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
+(p9)	br.ret.dptk rp		// 5 byte copy
+
+EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
+(p11)	br.ret.dptk rp		// 6 byte copy
+	;;
+
+EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
+	br.ret.dptk rp		// done all cases
+
+
+/* Align dest to nearest 8-byte boundary. We know we have at
+ * least 7 bytes to copy, enough to crawl to 8-byte boundary.
+ * Actual number of byte to crawl depend on the dest alignment.
+ * 7 byte or less is taken care at .memcpy_short
+
+ * src0 - source even index
+ * src1 - source  odd index
+ * dst0 - dest even index
+ * dst1 - dest  odd index
+ * r30  - distance to 8-byte boundary
+ */
+
+// Copies r30 = 8 - (dst & 7) bytes one at a time so that dst becomes
+// 8-byte aligned.  p6 (set in .common_code) and p7..p12 enable the 1st to
+// 7th byte respectively.  Afterwards in2 is reduced by r30, dst0/src0 are
+// advanced by r30, and the copy continues at .aligned_src when dst and src
+// had the same misalignment (r28 == r29), else at .unaligned_src.
+.align_dest:
+	add	src1=1,in1	// source odd index
+	cmp.le	p7,p0 = 2,r30	// for .align_dest
+	cmp.le	p8,p0 = 3,r30	// for .align_dest
+EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
+	cmp.le	p9,p0 = 4,r30	// for .align_dest
+	cmp.le	p10,p0 = 5,r30
+	;;
+EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
+EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
+	cmp.le	p11,p0 = 6,r30
+EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
+	cmp.le	p12,p0 = 7,r30
+	;;
+EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
+EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
+EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
+EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
+	;;
+EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
+EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
+	cmp.eq	p6,p7=r28,r29	// same misalignment on dst and src?
+EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
+EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
+	sub	in2=in2,r30	// bytes left once dst is aligned
+	;;
+EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
+EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
+	add	dst0=in0,r30	// setup arguments
+	add	src0=in1,r30
+(p6)	br.cond.dptk .aligned_src
+(p7)	br.cond.dpnt .unaligned_src
+	;;
+
+/* main loop body in jump table format */
+/*
+ * COPYU(shift): one jump-table entry for a source that is misaligned by
+ * shift/8 bytes.  The loop runs two independent streams (src0/dst0 and
+ * src1/dst1): each loads aligned 8-byte words, merges neighbouring words
+ * with shrp by "shift" bits and stores the result.  After the loop it
+ * prepares r21 for the tail (speculatively, from the word loaded under p6)
+ * and branches to .unaligned_src_tail.
+ *
+ * Entries are reached by a computed branch, so they must stay in shift
+ * order (8, 16, ... 56) and every expansion must have the same size,
+ * 1 << LOOP_SIZE bytes.
+ */
+#define COPYU(shift)									\
+1:											\
+EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
+EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
+		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
+EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
+		 nop.m	0;								\
+		 (p16)	shrp	r38=r36,r37,shift;					\
+EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
+EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
+		 br.ctop.dptk.few 1b;;							\
+		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
+		 shrp	r21=r22,r38,shift;	/* speculative work */			\
+		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
+		 ;;
+	TEXT_ALIGN(32)
+.jump_table:
+	COPYU(8)	// unaligned cases
+.jmp1:
+	COPYU(16)
+	COPYU(24)
+	COPYU(32)
+	COPYU(40)
+	COPYU(48)
+	COPYU(56)
+
+#undef A
+#undef B
+#undef C
+#undef D
+
+/*
+ * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
+ * instruction failed in the bundle.  The exception algorithm is that we
+ * first figure out the faulting address, then detect if there is any
+ * progress made on the copy, if so, redo the copy from last known copied
+ * location up to the faulting address (exclusive). In the copy_from_user
+ * case, remaining byte in kernel buffer will be zeroed.
+ *
+ * Take copy_from_user as an example, in the code there are multiple loads
+ * in a bundle and those multiple loads could span over two pages, the
+ * faulting address is calculated as page_round_down(max(src0, src1)).
+ * This is based on knowledge that if we can access one byte in a page, we
+ * can access any byte in that page.
+ *
+ * predicate used in the exception handler:
+ * p6-p7: direction
+ * p10-p11: src faulting addr calculation
+ * p12-p13: dst faulting addr calculation
+ */
+
+#define A	r19
+#define B	r20
+#define C	r21
+#define D	r22
+#define F	r28
+
+#define memset_arg0	r32
+#define memset_arg2	r33
+
+#define saved_retval	loc0
+#define saved_rtlink	loc1
+#define saved_pfs_stack	loc2
+
+// Fix-up stubs entered from the exception table before the common handler.
+// Each adjusts the pointers that .ex_handler inspects so that they describe
+// the state of the section in which the fault was taken.
+
+// Fault on a load in .aligned_src_tail that is tagged with this handler:
+// step src0 forward by 8 first.
+.ex_hndlr_s:
+	add	src0=8,src0
+	br.sptk .ex_handler
+	;;
+// Fault on the store in .aligned_src_tail that is tagged with this handler:
+// step dst0 forward by 8 first.
+.ex_hndlr_d:
+	add	dst0=8,dst0
+	br.sptk .ex_handler
+	;;
+// Fault inside .prefetch_loop: src1/dst1 take the prefetch pointers, and
+// src0/dst0 are rebuilt from the saved originals (+8 if the corresponding
+// prefetch pointer has already moved past its starting value).
+.ex_hndlr_lcpy_1:
+	mov	src1=src_pre_mem
+	mov	dst1=dst_pre_mem
+	cmp.gtu	p10,p11=src_pre_mem,saved_in1
+	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
+	;;
+(p10)	add	src0=8,saved_in1
+(p11)	mov	src0=saved_in1
+(p12)	add	dst0=8,saved_in0
+(p13)	mov	dst0=saved_in0
+	br.sptk	.ex_handler
+// Fault on one of the prefetch-stream accesses in .line_copy; falls
+// through into .ex_handler.
+.ex_handler_lcpy:
+	// in line_copy block, the preload addresses should always ahead
+	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
+	// always ahead of src0/dst0.
+	mov	src1=src_pre_mem
+	mov	dst1=dst_pre_mem
+// Common fault handler for the copy routines above.
+//
+// .ex_handler first restores pr, ar.lc and ar.pfs; .ex_handler_short is the
+// entry for sections that never changed them.  The handler then
+//   - picks the copy direction from saved_in0 < saved_in1 (p6/p7),
+//   - takes the larger of src0/src1 (or dst0/dst1) and rounds it down to a
+//     page boundary to get the faulting address F,
+//   - returns saved_in2 (the saved length) if no progress was made at all,
+//   - deliberately touches src1/dst1 when f6 == f0 (the memcpy case) to
+//     force an oops,
+//   - otherwise re-copies the bytes between the last known good position
+//     and F with a recursive __copy_user call, zero-fills the rest of the
+//     destination with memset in the p7 direction, and returns the number
+//     of bytes that were not copied.
+.ex_handler:
+	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
+	mov	ar.lc=saved_lc
+	mov	ar.pfs=saved_pfs
+	;;
+.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
+	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
+	cmp.ltu	p10,p11=src0,src1
+	cmp.ltu	p12,p13=dst0,dst1
+	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
+	mov	tmp = dst0
+	;;
+(p11)	mov	src1 = src0		// pick the larger of the two
+(p13)	mov	dst0 = dst1		// make dst0 the smaller one
+(p13)	mov	dst1 = tmp		// and dst1 the larger one
+	;;
+(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
+(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
+	;;
+(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
+(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
+	mov	retval=saved_in2
+(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
+(p8)	st1	[dst1]=r0		// force an oops for memcpy call
+(p14)	br.ret.sptk.many rp
+
+/*
+ * The remaining byte to copy is calculated as:
+ *
+ * A =	(faulting_addr - orig_src)	-> len to faulting ld address
+ *	or 
+ * 	(faulting_addr - orig_dst)	-> len to faulting st address
+ * B =	(cur_dst - orig_dst)		-> len copied so far
+ * C =	A - B				-> len need to be copied
+ * D =	orig_len - A			-> len need to be zeroed
+ */
+(p6)	sub	A = F, saved_in0
+(p7)	sub	A = F, saved_in1
+	clrrrb
+	;;
+	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
+	cmp.lt	p8,p0=A,r0
+	sub	B = dst0, saved_in0	// how many byte copied so far
+	;;
+(p8)	mov	A = 0;			// A shouldn't be negative, cap it
+	;;
+	sub	C = A, B
+	sub	D = saved_in2, A
+	;;
+	cmp.gt	p8,p0=C,r0		// more than 1 byte?
+	add	memset_arg0=saved_in0, A
+(p6)	mov	memset_arg2=0		// copy_to_user should not call memset
+(p7)	mov	memset_arg2=D		// copy_from_user need to have kbuf zeroed
+	mov	r8=0
+	mov	saved_retval = D
+	mov	saved_rtlink = b0
+
+	add	out0=saved_in0, B
+	add	out1=saved_in1, B
+	mov	out2=C
+(p8)	br.call.sptk.few b0=__copy_user	// recursive call
+	;;
+
+	add	saved_retval=saved_retval,r8	// above might return non-zero value
+	cmp.gt	p8,p0=memset_arg2,r0	// more than 1 byte?
+	mov	out0=memset_arg0	// *s
+	mov	out1=r0			// c
+	mov	out2=memset_arg2	// n
+(p8)	br.call.sptk.few b0=memset
+	;;
+
+	mov	retval=saved_retval
+	mov	ar.pfs=saved_pfs_stack
+	mov	b0=saved_rtlink
+	br.ret.sptk.many rp
+
+/* end of McKinley specific optimization */
+END(__copy_user)
--- a/kernel/arch/ia64/lib/memset.S
+++ b/kernel/arch/ia64/lib/memset.S
@@ -0,0 +1,362 @@
+/* Optimized version of the standard memset() function.
+
+   Copyright (c) 2002 Hewlett-Packard Co/CERN
+	Sverre Jarp <Sverre.Jarp@cern.ch>
+
+   Return: dest
+
+   Inputs:
+        in0:    dest
+        in1:    value
+        in2:    count
+
+   The algorithm is fairly straightforward: set byte by byte until
+   we get to a 16B-aligned address, then loop on 128 B chunks using an
+   early store as prefetching, then loop on 32B chunks, then clear remaining
+   words, finally clear remaining bytes.
+   Since a stf.spill f0 can store 16B in one go, we use this instruction
+   to get peak speed when value = 0.  */
+
+#include <asm/asmmacro.h>
+#undef ret
+
+#define dest		in0	// arg 0: start of the area to fill
+#define value		in1	// arg 1: fill value (low byte is replicated with mux1)
+#define	cnt		in2	// arg 2: byte count; decremented as bytes get stored
+
+#define tmp		r31	// scratch
+#define save_lc		r30	// caller's ar.lc, restored before returning
+#define ptr0		r29	// second store pointer of the 128B loops
+#define ptr1		r28	// main store pointer
+#define ptr2		r27	// secondary store pointer
+#define ptr3		r26	// address of the last byte (unaligned small case)
+#define ptr9 		r24	// store-ahead ("prefetch") pointer
+#define	loopcnt		r23	// value loaded into ar.lc
+#define linecnt		r22	// cnt >> LSIZE_SH: number of whole lines
+#define bytecnt		r21	// (MIN1+1) - (dest & MIN1): bytes up to the next 16B boundary
+
+#define fvalue		f6	// replicated value on the FP side, stored with stf8
+
+// This routine uses only scratch predicate registers (p6 - p15)
+#define p_scr		p6			// default register for same-cycle branches
+#define p_nz		p7	// value != 0
+#define p_zr		p8	// value == 0 (stf.spill f0 path)
+#define p_unalgn	p9	// dest is not 16B-aligned
+#define p_y		p11	// "yes": do the store currently being considered
+#define p_n		p12	// complement of p_y
+#define p_yy		p13	// "yes" for the following store
+#define p_nn		p14	// complement of p_yy
+
+#define MIN1		15	// mask of the low 4 address bits (16B alignment)
+#define MIN1P1HALF	8	// (MIN1+1)/2
+#define LINE_SIZE	128	// bytes handled per iteration of the main loops
+#define LSIZE_SH        7			// shift amount
+#define PREF_AHEAD	8	// maximum number of lines stored ahead
+
+GLOBAL_ENTRY(memset)	// in: dest, value, cnt; returns dest in ret0
+{ .mmi
+	.prologue
+	alloc	tmp = ar.pfs, 3, 0, 0, 0	// 3 inputs, no locals, no outputs, no rotating regs
+	lfetch.nt1 [dest]			// start fetching the first destination line
+	.save   ar.lc, save_lc
+	mov.i	save_lc = ar.lc	// ar.lc is used by the loops below
+	.body
+} { .mmi
+	mov	ret0 = dest			// return value
+	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
+	cmp.eq	p_scr, p0 = cnt, r0	// cnt == 0 ? (early return in the next group)
+;; }
+{ .mmi
+	and	ptr2 = -(MIN1+1), dest		// aligned address
+	and	tmp = MIN1, dest		// prepare to check for correct alignment
+	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (used by .move_bytes_unaligned)
+} { .mib
+	mov	ptr1 = dest
+	mux1	value = value, @brcst		// create 8 identical bytes in word
+(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
+;; }
+{ .mib
+	cmp.ne	p_unalgn, p0 = tmp, r0		// low 4 address bits non-zero: dest not 16B-aligned
+} { .mib
+	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
+	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
+(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (fewer than 16 bytes)
+;; }
+{ .mmi
+(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment: ptr1 = next 16B boundary
+(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment: ptr2 = middle of the partial 16B block
+(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
+;; }
+{ .mib
+(p_y)	add	cnt = -8, cnt			// account for the st8 below
+(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
+} { .mib
+(p_y)	st8	[ptr2] = value,-4		// fill 8 bytes, then step down to the st4 slot
+(p_n)	add	ptr2 = 4, ptr2			// no st8: st4 slot is 4 bytes further up
+;; }
+{ .mib
+(p_yy)	add	cnt = -4, cnt			// account for the st4 below
+(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
+} { .mib
+(p_yy)	st4	[ptr2] = value,-2		// fill 4 bytes, then step down to the st2 slot
+(p_nn)	add	ptr2 = 2, ptr2			// no st4: st2 slot is 2 bytes further up
+;; }
+{ .mmi
+	mov	tmp = LINE_SIZE+1		// for compare
+(p_y)	add	cnt = -2, cnt			// account for the st2 below
+(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
+} { .mmi
+	setf.sig fvalue=value			// transfer value to FLP side
+(p_y)	st2	[ptr2] = value,-1		// fill 2 bytes, then step down to the st1 slot
+(p_n)	add	ptr2 = 1, ptr2			// no st2: st1 slot is 1 byte further up
+;; }
+
+{ .mmi
+(p_yy)	st1	[ptr2] = value 			// fill the single odd byte
+  	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task? (cnt <= LINE_SIZE)
+} { .mbb
+(p_yy)	add	cnt = -1, cnt			// account for the st1 above
+(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
+;; }
+
+{ .mib
+	nop.m 0
+	shr.u	linecnt = cnt, LSIZE_SH	// number of whole LINE_SIZE chunks
+(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
+;; }
+
+	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
+{ .mmi
+	and	tmp = -(LINE_SIZE), cnt		// compute end of range
+	mov	ptr9 = ptr1			// used for prefetching
+	and	cnt = (LINE_SIZE-1), cnt	// remainder
+} { .mmi
+	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
+	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
+;; }
+{ .mmi
+(p_scr)	add	loopcnt = -1, linecnt		// fewer than PREF_AHEAD lines: store ahead linecnt lines only
+	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
+	add	ptr1 = tmp, ptr1		// first address beyond total range
+;; }
+{ .mmi
+	add	tmp = -1, linecnt		// next loop count
+	mov.i	ar.lc = loopcnt			// trip count of the store-ahead loop
+;; }
+.pref_l1a:
+{ .mib
+	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
+	nop.i	0
+	br.cloop.dptk.few .pref_l1a
+;; }
+{ .mmi
+	add	ptr0 = 16, ptr2			// Two stores in parallel
+	mov.i	ar.lc = tmp			// one iteration of .l1ax per line
+;; }
+.l1ax:	// ptr2/ptr0 fill offsets 8..127 of a line; offset 0 is written through ptr9
+ { .mmi
+	stf8 [ptr2] = fvalue, 8
+	stf8 [ptr0] = fvalue, 8
+ ;; }
+ { .mmi
+	stf8 [ptr2] = fvalue, 24
+	stf8 [ptr0] = fvalue, 24
+ ;; }
+ { .mmi
+	stf8 [ptr2] = fvalue, 8
+	stf8 [ptr0] = fvalue, 8
+ ;; }
+ { .mmi
+	stf8 [ptr2] = fvalue, 24
+	stf8 [ptr0] = fvalue, 24
+ ;; }
+ { .mmi
+	stf8 [ptr2] = fvalue, 8
+	stf8 [ptr0] = fvalue, 8
+ ;; }
+ { .mmi
+	stf8 [ptr2] = fvalue, 24
+	stf8 [ptr0] = fvalue, 24
+ ;; }
+ { .mmi
+	stf8 [ptr2] = fvalue, 8
+	stf8 [ptr0] = fvalue, 32
+ 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
+ ;; }
+{ .mmb
+	stf8 [ptr2] = fvalue, 24
+(p_scr)	stf8 [ptr9] = fvalue, 128
+	br.cloop.dptk.few .l1ax
+;; }
+{ .mbb
+	cmp.le  p_scr, p0 = 8, cnt		// at least 8 bytes left ?
+(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
+	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
+;; }
+
+	TEXT_ALIGN(32)
+.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
+{ .mmi
+	and	tmp = -(LINE_SIZE), cnt		// compute end of range
+	mov	ptr9 = ptr1			// used for prefetching
+	and	cnt = (LINE_SIZE-1), cnt	// remainder
+} { .mmi
+	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
+	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
+;; }
+{ .mmi
+(p_scr)	add	loopcnt = -1, linecnt	// fewer than PREF_AHEAD lines: store ahead linecnt lines only
+	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
+	add	ptr1 = tmp, ptr1		// first address beyond total range
+;; }
+{ .mmi
+	add	tmp = -1, linecnt		// next loop count
+	mov.i	ar.lc = loopcnt	// trip count of the store-ahead loop
+;; }
+.pref_l1b:
+{ .mib
+	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
+	nop.i   0
+	br.cloop.dptk.few .pref_l1b
+;; }
+{ .mmi
+	add	ptr0 = 16, ptr2			// Two stores in parallel
+	mov.i	ar.lc = tmp	// one iteration of .l1bx per line
+;; }
+.l1bx:	// ptr2/ptr0 zero offsets 16..127 of a line; offset 0 is written through ptr9
+ { .mmi
+	stf.spill [ptr2] = f0, 32
+	stf.spill [ptr0] = f0, 32
+ ;; }
+ { .mmi
+	stf.spill [ptr2] = f0, 32
+	stf.spill [ptr0] = f0, 32
+ ;; }
+ { .mmi
+	stf.spill [ptr2] = f0, 32
+	stf.spill [ptr0] = f0, 64
+ 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
+ ;; }
+{ .mmb
+	stf.spill [ptr2] = f0, 32
+(p_scr)	stf.spill [ptr9] = f0, 128
+	br.cloop.dptk.few .l1bx
+;; }
+{ .mib
+	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
+(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	// fewer than 8: finish with st4/st2/st1
+;; }
+
+.fraction_of_line:
+{ .mib
+	add	ptr2 = 16, ptr1	// second pointer, 16 bytes ahead of ptr1
+	shr.u	loopcnt = cnt, 5   		// loopcnt = cnt / 32
+;; }
+{ .mib
+	cmp.eq	p_scr, p0 = loopcnt, r0	// no whole 32B chunk ?
+	add	loopcnt = -1, loopcnt	// ar.lc wants iterations - 1
+(p_scr)	br.cond.dpnt.many .store_words
+;; }
+{ .mib
+	and	cnt = 0x1f, cnt			// compute the remaining cnt
+	mov.i   ar.lc = loopcnt
+;; }
+	TEXT_ALIGN(32)
+.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
+{ .mmb
+	stf8	[ptr1] = fvalue, 8
+	stf8	[ptr2] = fvalue, 8
+;; } { .mmb
+	stf8	[ptr1] = fvalue, 24
+	stf8	[ptr2] = fvalue, 24
+	br.cloop.dptk.many .l2
+;; }
+.store_words:	// up to three 8-byte stores
+{ .mib
+	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
+(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
+;; }
+
+{ .mmi
+	stf8	[ptr1] = fvalue, 8		// store
+	cmp.le	p_y, p_n = 16, cnt	// room for a second word ? (cnt before the subtract)
+	add	cnt = -8, cnt			// subtract
+;; }
+{ .mmi
+(p_y)	stf8	[ptr1] = fvalue, 8		// store
+(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt	// room for a third word ?
+(p_y)	add	cnt = -8, cnt			// subtract
+;; }
+{ .mmi						// store
+(p_yy)	stf8	[ptr1] = fvalue, 8
+(p_yy)	add	cnt = -8, cnt			// subtract
+;; }
+
+.move_bytes_from_alignment:	// finish the tail using bits 2, 1 and 0 of cnt
+{ .mib
+	cmp.eq	p_scr, p0 = cnt, r0	// nothing left ?
+	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
+(p_scr)	br.cond.dpnt.few .restore_and_exit
+;; }
+{ .mib
+(p_y)	st4	[ptr1] = value,4
+	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
+;; }
+{ .mib
+(p_yy)	st2	[ptr1] = value,2
+	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
+;; }
+
+{ .mib
+(p_y)	st1	[ptr1] = value
+;; }
+.restore_and_exit:
+{ .mib
+	nop.m	0
+	mov.i	ar.lc = save_lc	// restore the caller's loop counter
+	br.ret.sptk.many rp
+;; }
+
+.move_bytes_unaligned:	// cnt < 16; p_y/p_n say whether dest is odd/even
+{ .mmi
+       .pred.rel "mutex",p_y, p_n
+       .pred.rel "mutex",p_yy, p_nn
+(p_n)	cmp.le  p_yy, p_nn = 4, cnt	// even start: at least 4 bytes to fill ?
+(p_y)	cmp.le  p_yy, p_nn = 5, cnt	// odd start: 1 leading byte plus at least 4 ?
+(p_n)	add	ptr2 = 2, ptr1	// ptr2 runs 2 bytes ahead of ptr1
+} { .mmi
+(p_y)	add	ptr2 = 3, ptr1	// same, allowing for the leading odd byte
+(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
+(p_y)	add	cnt = -1, cnt
+;; }
+{ .mmi
+(p_yy)	cmp.le.unc p_y, p0 = 8, cnt	// another 4 bytes after this round ?
+	add	ptr3 = ptr1, cnt		// prepare last store
+	mov.i	ar.lc = save_lc
+} { .mmi
+(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
+(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (or less) left]
+(p_yy)	add	cnt = -4, cnt
+;; }
+{ .mmi
+(p_y)	cmp.le.unc p_yy, p0 = 8, cnt	// and yet another 4 bytes ?
+	add	ptr3 = -1, ptr3			// last store
+	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
+} { .mmi
+(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
+(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
+(p_y)	add	cnt = -4, cnt
+;; }
+{ .mmi
+(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
+(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
+	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
+} { .mmi
+(p_yy)	add	cnt = -4, cnt
+;; }
+{ .mmb
+(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
+(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
+	br.ret.sptk.many rp
+}
+END(memset)
--- a/kernel/arch/ia64/lib/strlen.S
+++ b/kernel/arch/ia64/lib/strlen.S
@@ -0,0 +1,192 @@
+/*
+ *
+ * Optimized version of the standard strlen() function
+ *
+ *
+ * Inputs:
+ *	in0	address of string
+ *
+ * Outputs:
+ *	ret0	the number of characters in the string (0 if empty string)
+ *	does not count the \0
+ *
+ * Copyright (C) 1999, 2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 09/24/99 S.Eranian add speculation recovery code
+ */
+
+#include <asm/asmmacro.h>
+
+//
+//
+// This is an enhanced version of the basic strlen. it includes a combination
+// of compute zero index (czx), parallel comparisons, speculative loads and
+// loop unroll using rotating registers.
+//
+// General Ideas about the algorithm:
+//	  The goal is to look at the string in chunks of 8 bytes.
+//	  so we need to do a few extra checks at the beginning because the
+//	  string may not be 8-byte aligned. In this case we load the 8byte
+//	  quantity which includes the start of the string and mask the unused
+//	  bytes with 0xff to avoid confusing czx.
+//	  We use speculative loads and software pipelining to hide memory
+//	  latency and do read ahead safely. This way we defer any exception.
+//
+//	  Because we don't want the kernel to be relying on particular
+//	  settings of the DCR register, we provide recovery code in case
+//	  speculation fails. The recovery code is going to "redo" the work using
+//	  only normal loads. If we still get a fault then we generate a
+//	  kernel panic. Otherwise we return the strlen as usual.
+//
+//	  The fact that speculation may fail can be caused, for instance, by
+//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
+//	  a NaT bit will be set if the translation is not present. The normal
+//	  load, on the other hand, will cause the translation to be inserted
+//	  if the mapping exists.
+//
+//	  It should be noted that we execute recovery code only when we need
+//	  to use the data that has been speculatively loaded: we don't execute
+//	  recovery code on pure read ahead data.
+//
+// Remarks:
+//	- the cmp r0,r0 is used as a fast way to initialize a predicate
+//	  register to 1. This is required to make sure that we get the parallel
+//	  compare correct.
+//
+//	- we don't use the epilogue counter to exit the loop but we need to set
+//	  it to zero beforehand.
+//
+//	- after the loop we must test for Nat values because neither the
+//	  czx nor cmp instruction raise a NaT consumption fault. We must be
+//	  careful not to look too far for a Nat for which we don't care.
+//	  For instance we don't need to look at a NaT in val2 if the zero byte
+//	  was in val1.
+//
+//	- Clearly performance tuning is required.
+//
+//
+//
+#define saved_pfs	r11	// ar.pfs at entry
+#define	tmp		r10	// scratch
+#define base		r16	// 8-byte-aligned start of the string (used by .recover)
+#define orig		r17	// original, possibly unaligned, string address
+#define saved_pr	r18	// predicates at entry
+#define src		r19	// read-ahead load pointer
+#define mask		r20	// ones over the bytes that precede the string in the first word
+#define val		r21	// word loaded by the recovery loop
+#define val1		r22	// czx1.r result for the first word of a pair
+#define val2		r23	// czx1.r result for the second word of a pair
+
+GLOBAL_ENTRY(strlen)	// in0 = string address; returns its length in ret0
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
+
+	.rotr v[2], w[2]	// declares our 4 aliases
+
+	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
+	mov orig=in0		// keep track of initial byte address
+	dep src=0,in0,0,3	// src=8byte-aligned in0 address
+	.save pr, saved_pr
+	mov saved_pr=pr		// preserve predicates (rotation)
+	;;
+
+	.body
+
+	ld8 v[1]=[src],8	// must not speculate: can fail here
+	shl tmp=tmp,3		// multiply by 8bits/byte
+	mov mask=-1		// our mask
+	;;
+	ld8.s w[1]=[src],8	// speculatively load next
+	cmp.eq p6,p0=r0,r0	// sets p6 to true for cmp.and
+	sub tmp=64,tmp		// how many bits to shift our mask on the right
+	;;
+	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
+	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
+	;;
+	add base=-16,src	// keep track of aligned base
+	or v[1]=v[1],mask	// now we have a safe initial byte pattern
+	;;
+1:
+	ld8.s v[0]=[src],8	// speculatively load next
+	czx1.r val1=v[1]	// search 0 byte from right
+	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
+	;;
+	ld8.s w[0]=[src],8	// speculatively load next to next
+	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
+	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
+(p6)	br.wtop.dptk 1b		// loop until p6 == 0
+	;;
+	//
+	// We must try the recovery code iff
+	// val1_is_nat || (val1==8 && val2_is_nat)
+	//
+	// XXX Fixme
+	//	- there must be a better way of doing the test
+	//
+	cmp.eq  p8,p9=8,val1	// p8 = val1 had no zero byte, p9 = it had one (disambiguate)
+	tnat.nz p6,p7=val1	// test NaT on val1
+(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
+	;;
+	//
+	// if we come here p7 is true, i.e., initialized for // cmp
+	//
+	cmp.eq.and  p7,p0=8,val1// val1==8?
+	tnat.nz.and p7,p0=val2	// test NaT if val2
+(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
+	;;
+(p8)	mov val1=val2		// the other test got us out of the loop
+(p8)	adds src=-16,src	// correct position when 3 ahead
+(p9)	adds src=-24,src	// correct position when 4 ahead
+	;;
+	sub ret0=src,orig	// distance from base
+	sub tmp=8,val1		// which byte in word
+	mov pr=saved_pr,0xffffffffffff0000	// restore only the upper (rotating) predicates
+	;;
+	sub ret0=ret0,tmp	// adjust
+	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
+	br.ret.sptk.many rp	// end of normal execution
+
+	//
+	// Outlined recovery code when speculation failed
+	//
+	// This time we don't use speculation and rely on the normal exception
+	// mechanism. That's why the loop is not as good as the previous one
+	// because read ahead is not possible
+	//
+	// IMPORTANT:
+	// Please note that in the case of strlen() as opposed to strlen_user()
+	// we don't use the exception mechanism, as this function is not
+	// supposed to fail. If that happens it means we have a bug and the
+	// code will cause a kernel fault.
+	//
+	// XXX Fixme
+	//	- today we restart from the beginning of the string instead
+	//	  of trying to continue where we left off.
+	//
+.recover:
+	ld8 val=[base],8	// will fail if unrecoverable fault
+	;;
+	or val=val,mask		// remask first bytes
+	cmp.eq p0,p6=r0,r0	// nullify first ld8 in loop
+	;;
+	//
+	// ar.ec is still zero here
+	//
+2:
+(p6)	ld8 val=[base],8	// will fail if unrecoverable fault
+	;;
+	czx1.r val1=val		// search 0 byte from right
+	;;
+	cmp.eq p6,p0=8,val1	// val1==8 ?
+(p6)	br.wtop.dptk 2b		// loop until p6 == 0
+	;;			// (avoid WAW on p63)
+	sub ret0=base,orig	// distance from base
+	sub tmp=8,val1	// bytes of the last word that lie at or beyond the NUL
+	mov pr=saved_pr,0xffffffffffff0000	// restore only the upper (rotating) predicates
+	;;
+	sub ret0=ret0,tmp	// length=now - back -1
+	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
+	br.ret.sptk.many rp	// end of successful recovery code
+END(strlen)
--- a/kernel/arch/ia64/lib/strlen_user.S
+++ b/kernel/arch/ia64/lib/strlen_user.S
@@ -0,0 +1,198 @@
+/*
+ * Optimized version of the strlen_user() function
+ *
+ * Inputs:
+ *	in0	address of buffer
+ *
+ * Outputs:
+ *	ret0	0 in case of fault, strlen(buffer)+1 otherwise
+ *
+ * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 01/19/99 S.Eranian heavily enhanced version (see details below)
+ * 09/24/99 S.Eranian added speculation recovery code
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// int strlen_user(char *)
+// ------------------------
+// Returns:
+//	- length of string + 1
+//	- 0 in case an exception is raised
+//
+// This is an enhanced version of the basic strlen_user. it includes a
+// combination of compute zero index (czx), parallel comparisons, speculative
+// loads and loop unroll using rotating registers.
+//
+// General Ideas about the algorithm:
+//	  The goal is to look at the string in chunks of 8 bytes.
+//	  so we need to do a few extra checks at the beginning because the
+//	  string may not be 8-byte aligned. In this case we load the 8byte
+//	  quantity which includes the start of the string and mask the unused
+//	  bytes with 0xff to avoid confusing czx.
+//	  We use speculative loads and software pipelining to hide memory
+//	  latency and do read ahead safely. This way we defer any exception.
+//
+//	  Because we don't want the kernel to be relying on particular
+//	  settings of the DCR register, we provide recovery code in case
+//	  speculation fails. The recovery code is going to "redo" the work using
+//	  only normal loads. If we still get a fault then we return an
+//	  error (ret0=0). Otherwise we return the strlen+1 as usual.
+//	  The fact that speculation may fail can be caused, for instance, by
+//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
+//	  a NaT bit will be set if the translation is not present. The normal
+//	  load, on the other hand, will cause the translation to be inserted
+//	  if the mapping exists.
+//
+//	  It should be noted that we execute recovery code only when we need
+//	  to use the data that has been speculatively loaded: we don't execute
+//	  recovery code on pure read ahead data.
+//
+// Remarks:
+//	- the cmp r0,r0 is used as a fast way to initialize a predicate
+//	  register to 1. This is required to make sure that we get the parallel
+//	  compare correct.
+//
+//	- we don't use the epilogue counter to exit the loop but we need to set
+//	  it to zero beforehand.
+//
+//	- after the loop we must test for Nat values because neither the
+//	  czx nor cmp instruction raise a NaT consumption fault. We must be
+//	  careful not to look too far for a Nat for which we don't care.
+//	  For instance we don't need to look at a NaT in val2 if the zero byte
+//	  was in val1.
+//
+//	- Clearly performance tuning is required.
+//
+
+#define saved_pfs	r11	// ar.pfs at entry
+#define	tmp		r10	// scratch
+#define base		r16	// 8-byte-aligned start of the buffer (used by .recover)
+#define orig		r17	// original, possibly unaligned, buffer address
+#define saved_pr	r18	// predicates at entry
+#define src		r19	// read-ahead load pointer
+#define mask		r20	// ones over the bytes that precede the string in the first word
+#define val		r21	// word loaded by the recovery loop
+#define val1		r22	// czx1.r result for the first word of a pair
+#define val2		r23	// czx1.r result for the second word of a pair
+
+GLOBAL_ENTRY(__strlen_user)	// in0 = buffer address; ret0 = strlen+1, or 0 on fault
+	.prologue
+	.save ar.pfs, saved_pfs
+	alloc saved_pfs=ar.pfs,11,0,0,8	// 8 rotating registers for v[] and w[]
+
+	.rotr v[2], w[2]	// declares our 4 aliases
+
+	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
+	mov orig=in0		// keep track of initial byte address
+	dep src=0,in0,0,3	// src=8byte-aligned in0 address
+	.save pr, saved_pr
+	mov saved_pr=pr		// preserve predicates (rotation)
+	;;
+
+	.body
+
+	ld8.s v[1]=[src],8	// load the initial 8bytes (must speculate)
+	shl tmp=tmp,3		// multiply by 8bits/byte
+	mov mask=-1		// our mask
+	;;
+	ld8.s w[1]=[src],8	// load next 8 bytes in 2nd pipeline
+	cmp.eq p6,p0=r0,r0	// sets p6 (required because of // cmp.and)
+	sub tmp=64,tmp		// how many bits to shift our mask on the right
+	;;
+	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
+	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
+	;;
+	add base=-16,src	// keep track of aligned base
+	chk.s v[1], .recover	// if already NaT, then directly skip to recover
+	or v[1]=v[1],mask	// now we have a safe initial byte pattern
+	;;
+1:
+	ld8.s v[0]=[src],8	// speculatively load next
+	czx1.r val1=v[1]	// search 0 byte from right
+	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
+	;;
+	ld8.s w[0]=[src],8	// speculatively load next to next
+	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
+	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
+(p6)	br.wtop.dptk.few 1b	// loop until p6 == 0
+	;;
+	//
+	// We must try the recovery code iff
+	// val1_is_nat || (val1==8 && val2_is_nat)
+	//
+	// XXX Fixme
+	//	- there must be a better way of doing the test
+	//
+	cmp.eq  p8,p9=8,val1	// p8 = val1 had no zero byte, p9 = it had one (disambiguate)
+	tnat.nz p6,p7=val1	// test NaT on val1
+(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
+	;;
+	//
+	// if we come here p7 is true, i.e., initialized for // cmp
+	//
+	cmp.eq.and  p7,p0=8,val1// val1==8?
+	tnat.nz.and p7,p0=val2	// test NaT if val2
+(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
+	;;
+(p8)	mov val1=val2		// val2 contains the value
+(p8)	adds src=-16,src	// correct position when 3 ahead
+(p9)	adds src=-24,src	// correct position when 4 ahead
+	;;
+	sub ret0=src,orig	// distance from origin
+	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
+	mov pr=saved_pr,0xffffffffffff0000	// restore only the upper (rotating) predicates
+	;;
+	sub ret0=ret0,tmp	// length=now - back -1
+	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
+	br.ret.sptk.many rp	// end of normal execution
+
+	//
+	// Outlined recovery code when speculation failed
+	//
+	// This time we don't use speculation and rely on the normal exception
+	// mechanism. That's why the loop is not as good as the previous one
+	// because read ahead is not possible
+	//
+	// XXX Fixme
+	//	- today we restart from the beginning of the string instead
+	//	  of trying to continue where we left off.
+	//
+.recover:
+	EX(.Lexit1, ld8 val=[base],8)	// load the initial bytes
+	;;
+	or val=val,mask			// remask first bytes
+	cmp.eq p0,p6=r0,r0		// nullify first ld8 in loop
+	;;
+	//
+	// ar.ec is still zero here
+	//
+2:
+	EX(.Lexit1, (p6) ld8 val=[base],8)	// next word; .Lexit1 is the label handed to EX for this load
+	;;
+	czx1.r val1=val		// search 0 byte from right
+	;;
+	cmp.eq p6,p0=8,val1	// val1==8 ?
+(p6)	br.wtop.dptk.few 2b	// loop until p6 == 0
+	;;
+	sub ret0=base,orig	// distance from base
+	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
+	mov pr=saved_pr,0xffffffffffff0000	// restore only the upper (rotating) predicates
+	;;
+	sub ret0=ret0,tmp	// length=now - back -1
+	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
+	br.ret.sptk.many rp	// end of successful recovery code
+
+	//
+	// We failed even on the normal load (called from exception handler)
+	//
+.Lexit1:
+	mov ret0=0	// 0 reports the fault to the caller
+	mov pr=saved_pr,0xffffffffffff0000	// restore only the upper (rotating) predicates
+	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
+	br.ret.sptk.many rp
+END(__strlen_user)
--- a/kernel/arch/ia64/lib/strncpy_from_user.S
+++ b/kernel/arch/ia64/lib/strncpy_from_user.S
@@ -0,0 +1,44 @@
+/*
+ * Just like strncpy() except that if a fault occurs during copying,
+ * -EFAULT is returned.
+ *
+ * Inputs:
+ *	in0:	address of destination buffer
+ *	in1:	address of string to be copied
+ *	in2:	length of buffer in bytes
+ * Outputs:
+ *	r8:	-EFAULT in case of fault or number of bytes copied if no fault
+ *
+ * Copyright (C) 1998-2001 Hewlett-Packard Co
+ * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 00/03/06 D. Mosberger Fixed to return proper return value (bug found
+ *			 by Andreas Schwab <schwab@suse.de>).
+ */
+
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(__strncpy_from_user)	// in0 = dst, in1 = src, in2 = buffer length; result in r8
+	alloc r2=ar.pfs,3,0,0,0	// 3 inputs, nothing else
+	mov r8=0	// return 0 when the buffer length is 0
+	mov r9=in1	// remember the start of the source string
+	;;
+	add r10=in1,in2	// r10 = first source address past the limit
+	cmp.eq p6,p0=r0,in2	// zero-length buffer ?
+(p6)	br.ret.spnt.many rp
+
+	// XXX braindead copy loop---this needs to be optimized
+.Loop1:
+	EX(.Lexit, ld1 r8=[in1],1)	// load one source byte; .Lexit is the label handed to EX
+	;;
+	EX(.Lexit, st1 [in0]=r8,1)	// store it to the destination
+	cmp.ne p6,p7=r8,r0	// p6 = byte was not NUL, p7 = NUL copied
+	;;
+(p6)	cmp.ne.unc p8,p0=in1,r10	// not NUL: continue unless the limit is reached
+(p8)	br.cond.dpnt.few .Loop1
+	;;
+(p6)	mov r8=in2		// buffer filled up---return buffer length
+(p7)	sub r8=in1,r9,1		// return string length (excluding NUL character)
+[.Lexit:]
+	br.ret.sptk.many rp	// on the .Lexit path r8 is not written here (presumably set by the EX fault handling -- confirm)
+END(__strncpy_from_user)
--- a/kernel/arch/ia64/lib/strnlen_user.S
+++ b/kernel/arch/ia64/lib/strnlen_user.S
@@ -0,0 +1,45 @@
+/*
+ * Returns 0 if exception before NUL or reaching the supplied limit (N),
+ * a value greater than N if the string is longer than the limit, else
+ * strlen + 1 (the length including the terminating NUL).
+ *
+ * Inputs:
+ *	in0:	address of buffer
+ *	in1:	string length limit N
+ * Outputs:
+ *	r8:	0 in case of fault, strlen(buffer)+1 otherwise
+ *
+ * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(__strnlen_user)	// in0 = buffer address, in1 = limit N; result in r8
+	.prologue
+	alloc r2=ar.pfs,2,0,0,0	// 2 inputs, nothing else
+	.save ar.lc, r16
+	mov r16=ar.lc			// preserve ar.lc
+
+	.body
+
+	add r3=-1,in1	// loop count register wants N-1
+	;;
+	mov ar.lc=r3	// at most N iterations of .Loop1
+	mov r9=0	// r9 = number of bytes examined so far
+	;;
+	// XXX braindead strlen loop---this needs to be optimized
+.Loop1:
+	EXCLR(.Lexit, ld1 r8=[in0],1)	// load next byte; .Lexit is the label handed to EXCLR
+	add r9=1,r9	// count it (so a NUL is included in the count)
+	;;
+	cmp.eq p6,p0=r8,r0	// NUL found ?
+(p6)	br.cond.dpnt .Lexit	// yes: r9 = strlen+1
+	br.cloop.dptk.few .Loop1
+
+	add r9=1,in1			// NUL not found---return N+1
+	;;
+.Lexit:
+	mov r8=r9	// return the count
+	mov ar.lc=r16			// restore ar.lc
+	br.ret.sptk.many rp
+END(__strnlen_user)
--- a/kernel/arch/ia64/lib/xor.S
+++ b/kernel/arch/ia64/lib/xor.S
@@ -0,0 +1,184 @@
+/*
+ * arch/ia64/lib/xor.S
+ *
+ * Optimized RAID-5 checksumming functions for IA-64.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(xor_ia64_2)	// [in1] ^= [in2], in0 >> 3 words of 8 bytes
+	.prologue
+	.fframe 0
+	.save ar.pfs, r31
+	alloc r31 = ar.pfs, 3, 0, 13, 16	// 3 in, 0 local, 13 out, 16 rotating
+	.save ar.lc, r30
+	mov r30 = ar.lc	// preserve the caller's loop counter
+	.save pr, r29
+	mov r29 = pr	// preserve predicates (the loop rotates them)
+	;;
+	.body
+	mov r8 = in1	// r8 = store pointer: results overwrite the first buffer
+	mov ar.ec = 6 + 2	// epilogue count = number of pipeline stages (p[6+2])
+	shr in0 = in0, 3	// in0 / 8: one iteration per 8-byte word (in0 presumably a byte count -- confirm)
+	;;
+	adds in0 = -1, in0	// ar.lc wants iterations - 1
+	mov r16 = in1	// r16 = load pointer into the first buffer
+	mov r17 = in2	// r17 = load pointer into the second buffer
+	;;
+	mov ar.lc = in0
+	mov pr.rot = 1 << 16	// p16 (stage 0) set, other rotating predicates clear
+	;;
+	.rotr s1[6+1], s2[6+1], d[2]
+	.rotp p[6+2]
+0:
+(p[0])	ld8.nta s1[0] = [r16], 8	// stage 0: load one word from each source
+(p[0])	ld8.nta s2[0] = [r17], 8
+(p[6])	xor d[0] = s1[6], s2[6]	// stage 6: combine the words loaded 6 rotations ago
+(p[6+1])st8.nta [r8] = d[1], 8	// stage 7: store the result of the previous rotation
+	nop.f 0
+	br.ctop.dptk.few 0b
+	;;
+	mov ar.lc = r30	// restore the loop counter
+	mov pr = r29, -1	// restore all predicates
+	br.ret.sptk.few rp
+END(xor_ia64_2)
+
+GLOBAL_ENTRY(xor_ia64_3)	// [in1] ^= [in2] ^ [in3], in0 >> 3 words of 8 bytes
+	.prologue
+	.fframe 0
+	.save ar.pfs, r31
+	alloc r31 = ar.pfs, 4, 0, 20, 24	// 4 in, 0 local, 20 out, 24 rotating
+	.save ar.lc, r30
+	mov r30 = ar.lc	// preserve the caller's loop counter
+	.save pr, r29
+	mov r29 = pr	// preserve predicates (the loop rotates them)
+	;;
+	.body
+	mov r8 = in1	// r8 = store pointer: results overwrite the first buffer
+	mov ar.ec = 6 + 2	// epilogue count = number of pipeline stages (p[6+2])
+	shr in0 = in0, 3	// in0 / 8: one iteration per 8-byte word (in0 presumably a byte count -- confirm)
+	;;
+	adds in0 = -1, in0	// ar.lc wants iterations - 1
+	mov r16 = in1	// r16..r18 = load pointers into the three buffers
+	mov r17 = in2
+	;;
+	mov r18 = in3
+	mov ar.lc = in0
+	mov pr.rot = 1 << 16	// p16 (stage 0) set, other rotating predicates clear
+	;;
+	.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
+	.rotp p[6+2]
+0:
+(p[0])	ld8.nta s1[0] = [r16], 8	// stage 0: load one word from each source
+(p[0])	ld8.nta s2[0] = [r17], 8
+(p[6])	xor d[0] = s1[6], s2[6]	// stage 6: start combining the words loaded 6 rotations ago
+	;;
+(p[0])	ld8.nta s3[0] = [r18], 8
+(p[6+1])st8.nta [r8] = d[1], 8	// stage 7: store the result of the previous rotation
+(p[6])	xor d[0] = d[0], s3[6]	// stage 6: fold in the third source
+	br.ctop.dptk.few 0b
+	;;
+	mov ar.lc = r30	// restore the loop counter
+	mov pr = r29, -1	// restore all predicates
+	br.ret.sptk.few rp
+END(xor_ia64_3)
+
+GLOBAL_ENTRY(xor_ia64_4)	// [in1] ^= [in2] ^ [in3] ^ [in4], in0 >> 3 words of 8 bytes
+	.prologue
+	.fframe 0
+	.save ar.pfs, r31
+	alloc r31 = ar.pfs, 5, 0, 27, 32	// 5 in, 0 local, 27 out, 32 rotating
+	.save ar.lc, r30
+	mov r30 = ar.lc	// preserve the caller's loop counter
+	.save pr, r29
+	mov r29 = pr	// preserve predicates (the loop rotates them)
+	;;
+	.body
+	mov r8 = in1	// r8 = store pointer: results overwrite the first buffer
+	mov ar.ec = 6 + 2	// epilogue count = number of pipeline stages (p[6+2])
+	shr in0 = in0, 3	// in0 / 8: one iteration per 8-byte word (in0 presumably a byte count -- confirm)
+	;;
+	adds in0 = -1, in0	// ar.lc wants iterations - 1
+	mov r16 = in1	// r16..r19 = load pointers into the four buffers
+	mov r17 = in2
+	;;
+	mov r18 = in3
+	mov ar.lc = in0
+	mov pr.rot = 1 << 16	// p16 (stage 0) set, other rotating predicates clear
+	mov r19 = in4
+	;;
+	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
+	.rotp p[6+2]
+0:
+(p[0])	ld8.nta s1[0] = [r16], 8	// stage 0: load one word from each source
+(p[0])	ld8.nta s2[0] = [r17], 8
+(p[6])	xor d[0] = s1[6], s2[6]	// stage 6: first pair
+(p[0])	ld8.nta s3[0] = [r18], 8
+(p[0])	ld8.nta s4[0] = [r19], 8
+(p[6])	xor r20 = s3[6], s4[6]	// stage 6: second pair, in temporary r20
+	;;
+(p[6+1])st8.nta [r8] = d[1], 8	// stage 7: store the result of the previous rotation
+(p[6])	xor d[0] = d[0], r20	// stage 6: merge both pairs
+	br.ctop.dptk.few 0b
+	;;
+	mov ar.lc = r30	// restore the loop counter
+	mov pr = r29, -1	// restore all predicates
+	br.ret.sptk.few rp
+END(xor_ia64_4)
+
+GLOBAL_ENTRY(xor_ia64_5)	// [in1] ^= [in2] ^ [in3] ^ [in4] ^ [in5], in0 >> 3 words of 8 bytes
+	.prologue
+	.fframe 0
+	.save ar.pfs, r31
+	alloc r31 = ar.pfs, 6, 0, 34, 40	// 6 in, 0 local, 34 out, 40 rotating
+	.save ar.lc, r30
+	mov r30 = ar.lc	// preserve the caller's loop counter
+	.save pr, r29
+	mov r29 = pr	// preserve predicates (the loop rotates them)
+	;;
+	.body
+	mov r8 = in1	// r8 = store pointer: results overwrite the first buffer
+	mov ar.ec = 6 + 2	// epilogue count = number of pipeline stages (p[6+2])
+	shr in0 = in0, 3	// in0 / 8: one iteration per 8-byte word (in0 presumably a byte count -- confirm)
+	;;
+	adds in0 = -1, in0	// ar.lc wants iterations - 1
+	mov r16 = in1	// r16..r20 = load pointers into the five buffers
+	mov r17 = in2
+	;;
+	mov r18 = in3
+	mov ar.lc = in0
+	mov pr.rot = 1 << 16	// p16 (stage 0) set, other rotating predicates clear
+	mov r19 = in4
+	mov r20 = in5
+	;;
+	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
+	.rotp p[6+2]
+0:
+(p[0])	ld8.nta s1[0] = [r16], 8	// stage 0: load one word from each source
+(p[0])	ld8.nta s2[0] = [r17], 8
+(p[6])	xor d[0] = s1[6], s2[6]	// stage 6: first pair
+(p[0])	ld8.nta s3[0] = [r18], 8
+(p[0])	ld8.nta s4[0] = [r19], 8
+(p[6])	xor r21 = s3[6], s4[6]	// stage 6: second pair, in temporary r21
+	;;
+(p[0])	ld8.nta s5[0] = [r20], 8
+(p[6+1])st8.nta [r8] = d[1], 8	// stage 7: store the result of the previous rotation
+(p[6])	xor d[0] = d[0], r21	// stage 6: merge both pairs
+	;;
+(p[6])	  xor d[0] = d[0], s5[6]	// stage 6: fold in the fifth source
+	nop.f 0
+	br.ctop.dptk.few 0b
+	;;
+	mov ar.lc = r30	// restore the loop counter
+	mov pr = r29, -1	// restore all predicates
+	br.ret.sptk.few rp
+END(xor_ia64_5)