add idl4k kernel firmware version 1.13.0.105

This commit is contained in:
Jaroslav Kysela
2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions

View File

@@ -0,0 +1,50 @@
#
# Makefile for ia64-specific library routines..
#
# Object listed unconditionally in obj-y.
obj-y := io.o
# Objects listed unconditionally in lib-y.  The first eight (__div*/__mod*)
# are not assembled from sources of the same name; they are generated from
# idiv32.S / idiv64.S by the explicit rules at the bottom of this file.
lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
checksum.o clear_page.o csum_partial_copy.o \
clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
flush.o ip_fast_csum.o do_csum.o \
memset.o strlen.o xor.o
# CPU-model specific copy routines: which set is added depends on whether
# CONFIG_ITANIUM or CONFIG_MCKINLEY is enabled.
obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
# carta_random.o is only added when CONFIG_PERFMON is enabled.
lib-$(CONFIG_PERFMON) += carta_random.o
# Per-object assembler flags.  Each idiv source is assembled four times;
# the UNSIGNED and MODULO defines select which of the four variants
# (signed/unsigned x divide/modulo) a given object contains.  The plain
# signed-divide objects need no extra define.
AFLAGS___divdi3.o =
AFLAGS___udivdi3.o = -DUNSIGNED
AFLAGS___moddi3.o = -DMODULO
AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO
AFLAGS___divsi3.o =
AFLAGS___udivsi3.o = -DUNSIGNED
AFLAGS___modsi3.o = -DMODULO
AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO
# 64-bit ("di3") variants: all four objects come from idiv64.S.
$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
$(call if_changed_dep,as_o_S)
$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
$(call if_changed_dep,as_o_S)
$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
$(call if_changed_dep,as_o_S)
$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
$(call if_changed_dep,as_o_S)
# 32-bit ("si3") variants: all four objects come from idiv32.S.
$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
$(call if_changed_dep,as_o_S)
$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
$(call if_changed_dep,as_o_S)
$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
$(call if_changed_dep,as_o_S)
$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
$(call if_changed_dep,as_o_S)

View File

@@ -0,0 +1,54 @@
/*
* Fast, simple, yet decent quality random number generator based on
* a paper by David G. Carta ("Two Fast Implementations of the
* `Minimal Standard' Random Number Generator," Communications of the
* ACM, January, 1990).
*
* Copyright (C) 2002 Hewlett-Packard Co
* David Mosberger-Tang <davidm@hpl.hp.com>
*/
#include <asm/asmmacro.h>
// Register aliases used by carta_random32.
#define a r2
#define m r3
#define lo r8
#define hi r9
#define t0 r16
#define t1 r17
#define seed r32
//
// carta_random32: compute the next value of the random number generator
// described in the file header from the value passed in `seed` (r32).
// The result is left in `lo` (r8) when the routine returns.
//
// Outline of the arithmetic performed below:
//   m   = 0x7fffffff
//   lo += (hi & 0x7fff) << 16;  if (lo > m) lo = (lo & m) + 1;
//   lo += hi >> 15;             if (lo > m) lo = (lo & m) + 1;
// where lo/hi are the low/high 32-bit halves of t0 after the parallel
// multiplies by 16807.
//
GLOBAL_ENTRY(carta_random32)
// Place the multiplier 16807 in both of the two low 16-bit lanes of `a`.
movl a = (16807 << 16) | 16807
;;
// Parallel 16-bit multiplies of `a` by `seed`, with shift counts 0 and 16.
// NOTE(review): presumably these yield the low and high 16 bits of each
// per-lane 32-bit product -- confirm against the pmpyshr2 definition.
pmpyshr2.u t0 = a, seed, 0
pmpyshr2.u t1 = a, seed, 16
;;
// Interleave the 16-bit pieces from t1 and t0 into t0.
// NOTE(review): presumably this rebuilds the two full 32-bit products
// side by side in t0 -- confirm against the unpack2.l definition.
unpack2.l t0 = t1, t0
// m = low 31 bits of -1 deposited into zero = 0x7fffffff.
dep m = -1, r0, 0, 31
;;
// Split t0 into its low (zero-extended) and high 32-bit halves.
zxt4 lo = t0
shr.u hi = t0, 32
;;
dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff)
;;
shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16
shr t1 = hi, 15 // t1 = (hi >> 15)
;;
add lo = lo, t0
;;
// First reduction: if the sum exceeds m, keep the low 31 bits and add 1.
cmp.gtu p6, p0 = lo, m
;;
(p6) and lo = lo, m
;;
(p6) add lo = 1, lo
;;
add lo = lo, t1
;;
// Second reduction, same rule as above, after adding (hi >> 15).
cmp.gtu p6, p0 = lo, m
;;
(p6) and lo = lo, m
;;
(p6) add lo = 1, lo
br.ret.sptk.many rp
END(carta_random32)

View File

@@ -0,0 +1,101 @@
/*
* Network checksum routines
*
* Copyright (C) 1999, 2003 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
*
* Most of the code coming from arch/alpha/lib/checksum.c
*
* This file contains network checksum routines that are better done
* in an architecture-specific manner due to speed..
*/
#include <linux/module.h>
#include <linux/string.h>
#include <asm/byteorder.h>
/*
 * Fold a 64-bit one's-complement accumulator down to a 16-bit value by
 * repeatedly adding the upper part back into the lower part.
 */
static inline unsigned short
from64to16 (unsigned long x)
{
	int pass;

	/* 64 -> 33 bits: add the two 32-bit halves together */
	x = (x >> 32) + (x & 0xffffffff);

	/*
	 * Three 16-bit folds.  The first leaves at most 18 bits, the second
	 * at most 17, and the third absorbs the last possible carry so the
	 * value fits in 16 bits.
	 */
	for (pass = 0; pass < 3; pass++)
		x = (x >> 16) + (x & 0xffff);

	return x;
}
/*
* computes the checksum of the TCP/UDP pseudo-header
* returns a 16-bit checksum, already complemented.
*/
__sum16
csum_tcpudp_magic (__be32 saddr, __be32 daddr, unsigned short len,
		   unsigned short proto, __wsum sum)
{
	/* accumulate the pseudo-header fields in 64 bits so no carry is lost */
	u64 acc = (__force u64)saddr;

	acc += (__force u64)daddr;
	acc += (__force u64)sum;
	acc += (len + proto) << 8;

	/* fold to 16 bits and return the complement */
	return (__force __sum16)~from64to16(acc);
}
EXPORT_SYMBOL(csum_tcpudp_magic);
/*
 * Same pseudo-header sum as csum_tcpudp_magic(), but the result is only
 * reduced to 32 bits and is not complemented.
 */
__wsum
csum_tcpudp_nofold (__be32 saddr, __be32 daddr, unsigned short len,
		    unsigned short proto, __wsum sum)
{
	unsigned long acc;
	int round;

	acc = (__force u64)saddr;
	acc += (__force u64)daddr;
	acc += (__force u64)sum;
	acc += (len + proto) << 8;

	/*
	 * Fold down to 32 bits for the caller: the first round goes from
	 * 64 to 33 bits, the second from 33 to 32.
	 */
	for (round = 0; round < 2; round++)
		acc = (acc >> 32) + (acc & 0xffffffff);

	return (__force __wsum)acc;
}
EXPORT_SYMBOL(csum_tcpudp_nofold);
extern unsigned long do_csum (const unsigned char *, long);
/*
 * Checksum the len bytes at buff and add the previous 32-bit partial
 * checksum "sum" to the result.
 *
 * The return value is a 32-bit number that can be fed back into this
 * function or passed to csum_tcpudp_magic.
 *
 * Every fragment except the last must have an even length; only the
 * final fragment may be odd.
 *
 * Aligning buff on a 32-bit boundary gives the best results.
 */
__wsum csum_partial(const void *buff, int len, __wsum sum)
{
	/* start from the old sum, then add the checksum of this buffer */
	u64 acc = (__force u32)sum;

	acc += do_csum(buff, len);

	/* wrap the carry out of bit 31 back in: 32+c bits -> 32 bits */
	return (__force __wsum)((acc >> 32) + (acc & 0xffffffff));
}
EXPORT_SYMBOL(csum_partial);
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
__sum16 ip_compute_csum (const void *buff, int len)
{
	/* checksum of the buffer as computed by do_csum() */
	unsigned long csum = do_csum(buff, len);

	/* the caller receives the complemented value, truncated to 16 bits */
	return (__force __sum16)~csum;
}
EXPORT_SYMBOL(ip_compute_csum);

View File

@@ -0,0 +1,76 @@
/*
* Copyright (C) 1999-2002 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger-Tang <davidm@hpl.hp.com>
* Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
*
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
*/
#include <asm/asmmacro.h>
#include <asm/page.h>
// Line size and look-ahead distance are chosen per CPU model.
#ifdef CONFIG_ITANIUM
# define L3_LINE_SIZE 64 // Itanium L3 line size
# define PREFETCH_LINES 9 // magic number
#else
# define L3_LINE_SIZE 128 // McKinley L3 line size
# define PREFETCH_LINES 12 // magic number
#endif
// Register aliases.  dst_fetch walks the page one L3 line at a time, ahead
// of dst1..dst4, which fill in the remaining 16-byte slots of each line.
#define saved_lc r2
#define dst_fetch r3
#define dst1 r8
#define dst2 r9
#define dst3 r10
#define dst4 r11
#define dst_last r31
//
// clear_page: zero the PAGE_SIZE bytes starting at the address in in0.
//
// All stores are "stf.spill.nta [ptr] = f0, incr": f0 is written through
// ptr and ptr is then advanced by incr.  The pointers are spaced 16 bytes
// apart, one per 16-byte slot of a line.
//
// Phase 1 (.fetch loop): touch the first slot of the first PREFETCH_LINES
// lines.
// Phase 2 (loop at 1:): one L3 line per iteration; fill the remaining
// slots of the current line and, while dst_fetch is still inside the page,
// touch the first slot of a line PREFETCH_LINES further on.
//
GLOBAL_ENTRY(clear_page)
.prologue
.regstk 1,0,0,0
mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
.save ar.lc, saved_lc
mov saved_lc = ar.lc
.body
// br.cloop counts down from ar.lc, so PREFETCH_LINES-1 gives PREFETCH_LINES passes.
mov ar.lc = (PREFETCH_LINES - 1)
mov dst_fetch = in0
adds dst1 = 16, in0
adds dst2 = 32, in0
;;
.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
;;
// dst_fetch is now in0 + PREFETCH_LINES*L3_LINE_SIZE, so dst_last = in0 + PAGE_SIZE.
addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
mov ar.lc = r16 // one L3 line per iteration
adds dst4 = 64, in0
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
// 64-byte line: slots at +16 and +32 here, +48 after the #endif; the slot
// at +0 is written through dst_fetch.
1: stf.spill.nta [dst1] = f0, 64
stf.spill.nta [dst2] = f0, 64
cmp.lt p8,p0=dst_fetch, dst_last
;;
#else
// Optimized for McKinley
// 128-byte line: slots at +16, +32, +48, +64 in this group, then +80 and
// +96 below and +112 after the #endif.  dst4 advances by a whole line
// because it is used only once per iteration.
1: stf.spill.nta [dst1] = f0, 64
stf.spill.nta [dst2] = f0, 64
stf.spill.nta [dst3] = f0, 64
stf.spill.nta [dst4] = f0, 128
cmp.lt p8,p0=dst_fetch, dst_last
;;
stf.spill.nta [dst1] = f0, 64
stf.spill.nta [dst2] = f0, 64
#endif
stf.spill.nta [dst3] = f0, 64
// p8 is true only while dst_fetch is still below the end of the page.
(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
br.cloop.sptk.few 1b
;;
mov ar.lc = saved_lc // restore lc
br.ret.sptk.many rp
END(clear_page)

View File

@@ -0,0 +1,209 @@
/*
* This routine clears to zero a linear memory buffer in user space.
*
* Inputs:
* in0: address of buffer
* in1: length of buffer in bytes
* Outputs:
* r8: number of bytes that didn't get cleared due to a fault
*
* Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
*/
#include <asm/asmmacro.h>
//
// arguments
//
#define buf r32
#define len r33
//
// local registers
//
#define cnt r16
#define buf2 r17
#define saved_lc r18
#define saved_pfs r19
#define tmp r20
#define len2 r21
#define len3 r22
//
// Theory of operations:
// - we check whether or not the buffer is small, i.e., less than 17
// in which case we do the byte by byte loop.
//
// - Otherwise we go progressively from 1 byte store to 8byte store in
// the head part, the body is a 16byte store loop and we finish with the
// tail for the last 15 bytes.
// The good point about this breakdown is that the long buffer handling
// contains only 2 branches.
//
// The reason for not using shifting & masking for both the head and the
// tail is to stay semantically correct. This routine is not supposed
// to write bytes outside of the buffer. While most of the time this would
// be ok, we can't tolerate a mistake. A classical example is the case
// of multithreaded code where the extra bytes touched are actually owned
// by another thread which runs concurrently with ours. Another, less likely,
// example is with device drivers where reading an I/O mapped location may
// have side effects (same thing for writing).
//
// Register/state contract of this routine:
// - buf (in0) and len (in1) are consumed as the clear progresses.
// - ret0 holds the number of bytes NOT cleared: 0 on success.
// - ar.lc is used as a loop counter and is restored from saved_lc on
// every exit path that is reached after ar.lc has been modified.
//
GLOBAL_ENTRY(__do_clear_user)
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,2,0,0,0
cmp.eq p6,p0=r0,len // check for zero length
.save ar.lc, saved_lc
mov saved_lc=ar.lc // preserve ar.lc (slow)
.body
;; // avoid WAW on CFM
adds tmp=-1,len // br.ctop is repeat/until
mov ret0=len // return value is length at this point
(p6) br.ret.spnt.many rp
;;
cmp.lt p6,p0=16,len // if len > 16 then long memset
mov ar.lc=tmp // initialize lc for small count
// NOTE: ar.lc has been overwritten above even when the branch below is
// taken, so every path through .long_do_clear must restore it as well.
(p6) br.cond.dptk .long_do_clear
;; // WAR on ar.lc
//
// worst case 16 iterations, avg 8 iterations
//
// We could have played with the predicates to use the extra
// M slot for 2 stores/iteration but the cost the initialization
// the various counters compared to how long the loop is supposed
// to last on average does not make this solution viable.
//
1:
EX( .Lexit1, st1 [buf]=r0,1 )
adds len=-1,len // countdown length using len
br.cloop.dptk 1b
;; // avoid RAW on ar.lc
//
// .Lexit1: comes from byte by byte loop (fall-through or fault)
// len contains bytes left
.Lexit1:
mov ret0=len // faster than using ar.lc
mov ar.lc=saved_lc
br.ret.sptk.many rp // end of short clear_user
//
// At this point we know we have more than 16 bytes to clear
// so we focus on alignment (no branches required)
//
// The use of len/len2 for countdown of the number of bytes left
// instead of ret0 is due to the fact that the exception code
// changes the values of r8.
//
.long_do_clear:
tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
;;
EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
(p6) adds len=-1,len;; // sync because buf is modified
tbit.nz p6,p0=buf,1
;;
EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
(p6) adds len=-2,len;;
tbit.nz p6,p0=buf,2
;;
EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
(p6) adds len=-4,len;;
tbit.nz p6,p0=buf,3
;;
EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
(p6) adds len=-8,len;;
shr.u cnt=len,4 // number of 128-bit (2x64bit) words
;;
cmp.eq p6,p0=r0,cnt
adds tmp=-1,cnt
(p6) br.cond.dpnt .dotail // we have less than 16 bytes left
;;
adds buf2=8,buf // setup second base pointer
mov ar.lc=tmp
;;
//
// 16bytes/iteration core loop
//
// The second store can never generate a fault because
// we come into the loop only when we are 16-byte aligned.
// This means that if we cross a page then it will always be
// in the first store and never in the second.
//
//
// We need to keep track of the remaining length. A possible (optimistic)
// way would be to use ar.lc and derive how many byte were left by
// doing : left= 16*ar.lc + 16. this would avoid the addition at
// every iteration.
// However we need to keep the synchronization point. A template
// M;;MB does not exist and thus we can keep the addition at no
// extra cycle cost (use a nop slot anyway). It also simplifies the
// (unlikely) error recovery code
//
2: EX(.Lexit3, st8 [buf]=r0,16 )
;; // needed to get len correct when error
st8 [buf2]=r0,16
adds len=-16,len
br.cloop.dptk 2b
;;
//
// tail correction based on len only
//
// We alternate the use of len3,len2 to allow parallelism and correct
// error handling. We also reuse p6/p7 to return correct value.
// The addition of len2/len3 does not cost anything more compared to
// the regular memset as we had empty slots.
//
.dotail:
// Restore ar.lc here, after the label, so that it is executed both when
// falling out of the core loop and when the head code branches directly
// to .dotail (fewer than 16 bytes left after alignment). In the latter
// case ar.lc still holds the "small count" value written before the
// branch to .long_do_clear, and the success path below would otherwise
// return with the caller's ar.lc clobbered.
mov ar.lc=saved_lc
mov len2=len // for parallelization of error handling
mov len3=len
tbit.nz p6,p0=len,3
;;
EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
(p6) adds len3=-8,len2
tbit.nz p7,p6=len,2
;;
EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
(p7) adds len2=-4,len3
tbit.nz p6,p7=len,1
;;
EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
(p6) adds len3=-2,len2
tbit.nz p7,p6=len,0
;;
EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
mov ret0=r0 // success
br.ret.sptk.many rp // end of most likely path
//
// Outlined error handling code
//
//
// .Lexit2: comes from the tail
// if p6 -> coming from st8 or st2 : len2 contains what's left
// if p7 -> coming from st4 or st1 : len3 contains what's left
// Falls through into .Lexit3, which restores lc (harmless if it was
// already restored).
.Lexit2:
.pred.rel "mutex", p6, p7
(p6) mov len=len2
(p7) mov len=len3
;;
//
// .Lexit3: comes from the head or the core loop (and from .Lexit2 above)
// len contains bytes left; lc must be restored because it was
// overwritten before we got here
//
.Lexit3:
mov ret0=len
mov ar.lc=saved_lc
br.ret.sptk.many rp
END(__do_clear_user)

View File

@@ -0,0 +1,98 @@
/*
*
* Optimized version of the standard copy_page() function
*
* Inputs:
* in0: address of target page
* in1: address of source page
* Output:
* no return value
*
* Copyright (C) 1999, 2001 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger <davidm@hpl.hp.com>
*
* 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies.
*/
#include <asm/asmmacro.h>
#include <asm/page.h>
// Number of stages in the software pipeline; EPI is the predicate of the
// last stage, which guards the stores.
#define PIPE_DEPTH 3
#define EPI p[PIPE_DEPTH-1]
// Scratch registers.
#define lcount r16
#define saved_pr r17
#define saved_lc r18
#define saved_pfs r19
#define src1 r20
#define src2 r21
#define tgt1 r22
#define tgt2 r23
#define srcf r24
#define tgtf r25
#define tgt_last r26
// 8 rotating sets of PIPE_DEPTH registers, rounded up to a multiple of 8.
#define Nrot ((8*PIPE_DEPTH+7)&~7)
//
// copy_page: copy PAGE_SIZE bytes from the page at in1 to the page at in0.
//
// Each loop iteration moves 64 bytes using two interleaved pointer pairs
// (src1/tgt1 at offset 0, src2/tgt2 at offset 8, both stepping by 16).
// Loads are issued under p[0]; the matching stores are issued
// PIPE_DEPTH-1 iterations later under EPI from the rotated registers.
// srcf/tgtf run 512 bytes ahead and are used for lfetch while tgtf is
// still below the end of the target page.
//
GLOBAL_ENTRY(copy_page)
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
.rotp p[PIPE_DEPTH]
.save ar.lc, saved_lc
mov saved_lc=ar.lc
mov ar.ec=PIPE_DEPTH
// 64 bytes per iteration; -1 because the counted loop is repeat/until.
mov lcount=PAGE_SIZE/64-1
.save pr, saved_pr
mov saved_pr=pr
// Only the first rotating predicate (p16, i.e. p[0]) starts out true.
mov pr.rot=1<<16
.body
mov src1=in1
adds src2=8,in1
mov tgt_last = PAGE_SIZE
;;
adds tgt2=8,in0
add srcf=512,in1
mov ar.lc=lcount
mov tgt1=in0
add tgtf=512,in0
// tgt_last = in0 + PAGE_SIZE: first byte past the target page.
add tgt_last = tgt_last, in0
;;
1:
(p[0]) ld8 t1[0]=[src1],16
(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16
(p[0]) ld8 t2[0]=[src2],16
(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16
// p6 = prefetch pointer still inside the target page; it gates both
// lfetch instructions at the end of this iteration.
cmp.ltu p6,p0 = tgtf, tgt_last
;;
(p[0]) ld8 t3[0]=[src1],16
(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16
(p[0]) ld8 t4[0]=[src2],16
(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16
;;
(p[0]) ld8 t5[0]=[src1],16
(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16
(p[0]) ld8 t6[0]=[src2],16
(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16
;;
(p[0]) ld8 t7[0]=[src1],16
(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16
(p[0]) ld8 t8[0]=[src2],16
(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16
(p6) lfetch [srcf], 64
(p6) lfetch [tgtf], 64
br.ctop.sptk.few 1b
;;
// Put back the state saved in the prologue: rotating predicates (upper 48
// bits of pr), ar.pfs and ar.lc.
mov pr=saved_pr,0xffffffffffff0000 // restore predicates
mov ar.pfs=saved_pfs
mov ar.lc=saved_lc
br.ret.sptk.many rp
END(copy_page)

View File

@@ -0,0 +1,185 @@
/*
* McKinley-optimized version of copy_page().
*
* Copyright (C) 2002 Hewlett-Packard Co
* David Mosberger <davidm@hpl.hp.com>
*
* Inputs:
* in0: address of target page
* in1: address of source page
* Output:
* no return value
*
* General idea:
* - use regular loads and stores to prefetch data to avoid consuming M-slot just for
* lfetches => good for in-cache performance
* - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
* cycle
*
* Principle of operation:
* First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
* To avoid secondary misses in L2, we prefetch both source and destination with a line-size
* of 128 bytes. When both of these lines are in the L2 and the first half of the
* source line is in L1, we start copying the remaining words. The second half of the
* source line is prefetched in an earlier iteration, so that by the time we start
* accessing it, it's also present in the L1.
*
* We use a software-pipelined loop to control the overall operation. The pipeline
* has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching
* source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination
* cache-lines, the last K stages are used to copy the cache-line words not copied by
* the prefetches. The four relevant points in the pipelined are called A, B, C, D:
* p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
* should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
* into L1D and p[D] is TRUE if a cacheline needs to be copied.
*
* This all sounds very complicated, but thanks to the modulo-scheduled loop support,
* the resulting code is very regular and quite easy to follow (once you get the idea).
*
* As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
* as the separate .prefetch_loop. Logically, this loop performs exactly like the
* main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
* so that each loop iteration is faster (again, good for cached case).
*
* When reading the code, it helps to keep the following picture in mind:
*
* word 0 word 1
* +------+------+---
* | v[x] | t1 | ^
* | t2 | t3 | |
* | t4 | t5 | |
* | t6 | t7 | | 128 bytes
* | n[y] | t9 | | (L2 cache line)
* | t10 | t11 | |
* | t12 | t13 | |
* | t14 | t15 | v
* +------+------+---
*
* Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C]
* to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
* an order that avoids bank conflicts.
*/
#include <asm/asmmacro.h>
#include <asm/page.h>
#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
// Pointers for the words copied in the main part of each line.
#define src0 r2
#define src1 r3
#define dst0 r9
#define dst1 r10
// Pointers stepping one 128-byte line at a time: *_pre_mem touch word 0 of
// a line, *_pre_l2 touch the word at offset 64 (8*8) of a line.
#define src_pre_mem r11
#define dst_pre_mem r14
#define src_pre_l2 r15
#define dst_pre_l2 r16
// Temporaries for the copied words.  Several names share a register (see
// the "alias!" lines), so the load/store order in .line_copy matters.
#define t1 r17
#define t2 r18
#define t3 r19
#define t4 r20
#define t5 t1 // alias!
#define t6 t2 // alias!
#define t7 t3 // alias!
#define t9 t5 // alias!
#define t10 t4 // alias!
#define t11 t7 // alias!
#define t12 t6 // alias!
#define t14 t10 // alias!
#define t13 r21
#define t15 r22
#define saved_lc r23
#define saved_pr r24
// Pipeline stage numbers, used as indices into the rotating predicates
// p[].  With PREFETCH_DIST = 8: A=0, B=8, C=16, D=19, N=20, Nrot=24.
#define A 0
#define B (PREFETCH_DIST)
#define C (B + PREFETCH_DIST)
#define D (C + 3)
#define N (D + 1)
#define Nrot ((N + 7) & ~7)
GLOBAL_ENTRY(copy_page)
.prologue
// The previous ar.pfs value is placed in r8; this routine does not
// reference it again.
alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
.rotr v[2*PREFETCH_DIST], n[D-C+1]
.rotp p[N]
.save ar.lc, saved_lc
mov saved_lc = ar.lc
.save pr, saved_pr
mov saved_pr = pr
.body
mov src_pre_mem = in1
// Start with only the first rotating predicate (p16 = p[A]) set.
mov pr.rot = 0x10000
mov ar.ec = 1 // special unrolled loop
mov dst_pre_mem = in0
// The prefetch loop below runs 2*PREFETCH_DIST times.
mov ar.lc = 2*PREFETCH_DIST - 1
add src_pre_l2 = 8*8, in1
add dst_pre_l2 = 8*8, in0
add src0 = 8, in1 // first t1 src
add src1 = 3*8, in1 // first t3 src
add dst0 = 8, in0 // first t1 dst
add dst1 = 3*8, in0 // first t3 dst
// Loop count for .line_copy: lines per page minus the iterations already
// done by .prefetch_loop, minus 1.  t1 is only a carrier for this value
// until it is moved into ar.lc; afterwards t1 is reused as a data temporary.
mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
nop.m 0
nop.i 0
;;
// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0
(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2
br.ctop.sptk .prefetch_loop
;;
cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero)
mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits!
mov ar.ec = N // # of stages in pipeline
;;
// Main loop: one 128-byte line per iteration.  Word 0 of a line travels
// through v[] (loaded at p[A], stored at p[B]); the word at offset 64
// travels through n[] (loaded at p[C], stored at p[D]); the other 14 words
// are moved through t1..t15 under p[D].
.line_copy:
(p[D]) ld8 t2 = [src0], 3*8 // M0
(p[D]) ld8 t4 = [src1], 3*8 // M1
(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory
(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2
;;
(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory
(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2
(p[D]) st8 [dst0] = t1, 8 // M2
(p[D]) st8 [dst1] = t3, 8 // M3
;;
(p[D]) ld8 t5 = [src0], 8
(p[D]) ld8 t7 = [src1], 3*8
(p[D]) st8 [dst0] = t2, 3*8
(p[D]) st8 [dst1] = t4, 3*8
;;
(p[D]) ld8 t6 = [src0], 3*8
(p[D]) ld8 t10 = [src1], 8
(p[D]) st8 [dst0] = t5, 8
(p[D]) st8 [dst1] = t7, 3*8
;;
(p[D]) ld8 t9 = [src0], 3*8
(p[D]) ld8 t11 = [src1], 3*8
(p[D]) st8 [dst0] = t6, 3*8
(p[D]) st8 [dst1] = t10, 8
;;
(p[D]) ld8 t12 = [src0], 8
(p[D]) ld8 t14 = [src1], 8
(p[D]) st8 [dst0] = t9, 3*8
(p[D]) st8 [dst1] = t11, 3*8
;;
(p[D]) ld8 t13 = [src0], 4*8
(p[D]) ld8 t15 = [src1], 4*8
(p[D]) st8 [dst0] = t12, 8
(p[D]) st8 [dst1] = t14, 8
;;
// t1/t3 are loaded one stage early (p[D-1]) because they are the first
// words stored at the top of the next iteration.
(p[D-1])ld8 t1 = [src0], 8
(p[D-1])ld8 t3 = [src1], 8
(p[D]) st8 [dst0] = t13, 4*8
(p[D]) st8 [dst1] = t15, 4*8
br.ctop.sptk .line_copy
;;
// Restore the loop counter and all predicate registers saved on entry.
mov ar.lc = saved_lc
mov pr = saved_pr, -1
br.ret.sptk.many rp
END(copy_page)

View File

@@ -0,0 +1,610 @@
/*
*
* Optimized version of the copy_user() routine.
* It is used to copy data across the kernel/user boundary.
*
* The source and destination are always on opposite side of
* the boundary. When reading from user space we must catch
* faults on loads. When writing to user space we must catch
* errors on stores. Note that because of the nature of the copy
* we don't need to worry about overlapping regions.
*
*
* Inputs:
* in0 address of destination buffer
* in1 address of source buffer
* in2 number of bytes to copy
*
* Outputs:
* ret0 0 in case of success. The number of bytes NOT copied in
* case of error.
*
* Copyright (C) 2000-2001 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
*
* Fixme:
* - handle the case where we have more than 16 bytes and the alignment
* are different.
* - more benchmarking
* - fix extraneous stop bit introduced by the EX() macro.
*/
#include <asm/asmmacro.h>
//
// Tuneable parameters
//
#define COPY_BREAK 16 // we do byte copy below (must be >=16)
#define PIPE_DEPTH 21 // pipe depth
#define EPI p[PIPE_DEPTH-1]
//
// arguments
//
#define dst in0
#define src in1
#define len in2
//
// local registers
//
#define t1 r2 // rshift in bytes
#define t2 r3 // lshift in bytes
#define rshift r14 // right shift in bits
#define lshift r15 // left shift in bits
#define word1 r16
#define word2 r17
#define cnt r18
#define len2 r19
#define saved_lc r20
#define saved_pr r21
#define tmp r22
#define val r23
#define src1 r24
#define dst1 r25
#define src2 r26
#define dst2 r27
#define len1 r28
#define enddst r29
#define endsrc r30
#define saved_pfs r31
GLOBAL_ENTRY(__copy_user)
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
.rotp p[PIPE_DEPTH]
adds len2=-1,len // br.ctop is repeat/until
mov ret0=r0
;; // RAW of cfm when len=0
cmp.eq p8,p0=r0,len // check for zero length
.save ar.lc, saved_lc
mov saved_lc=ar.lc // preserve ar.lc (slow)
(p8) br.ret.spnt.many rp // empty mempcy()
;;
add enddst=dst,len // first byte after end of source
add endsrc=src,len // first byte after end of destination
.save pr, saved_pr
mov saved_pr=pr // preserve predicates
.body
mov dst1=dst // copy because of rotation
mov ar.ec=PIPE_DEPTH
mov pr.rot=1<<16 // p16=true all others are false
mov src1=src // copy because of rotation
mov ar.lc=len2 // initialize lc for small count
cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
xor tmp=src,dst // same alignment test prepare
(p10) br.cond.dptk .long_copy_user
;; // RAW pr.rot/p16 ?
//
// Now we do the byte by byte loop with software pipeline
//
// p7 is necessarily false by now
1:
EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
br.ctop.dptk.few 1b
;;
mov ar.lc=saved_lc
mov pr=saved_pr,0xffffffffffff0000
mov ar.pfs=saved_pfs // restore ar.ec
br.ret.sptk.many rp // end of short memcpy
//
// Not 8-byte aligned
//
.diff_align_copy_user:
// At this point we know we have more than 16 bytes to copy
// and also that src and dest do _not_ have the same alignment.
and src2=0x7,src1 // src offset
and dst2=0x7,dst1 // dst offset
;;
// The basic idea is that we copy byte-by-byte at the head so
// that we can reach 8-byte alignment for both src1 and dst1.
// Then copy the body using software pipelined 8-byte copy,
// shifting the two back-to-back words right and left, then copy
// the tail by copying byte-by-byte.
//
// Fault handling. If the byte-by-byte at the head fails on the
// load, then restart and finish the pipleline by copying zeros
// to the dst1. Then copy zeros for the rest of dst1.
// If 8-byte software pipeline fails on the load, do the same as
// failure_in3 does. If the byte-by-byte at the tail fails, it is
// handled simply by failure_in_pipe1.
//
// The case p14 represents the source has more bytes in the
// the first word (by the shifted part), whereas the p15 needs to
// copy some bytes from the 2nd word of the source that has the
// tail of the 1st of the destination.
//
//
// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
// to copy the head to dst1, to start 8-byte copy software pipeline.
// We know src1 is not 8-byte aligned in this case.
//
cmp.eq p14,p15=r0,dst2
(p15) br.cond.spnt 1f
;;
sub t1=8,src2
mov t2=src2
;;
shl rshift=t2,3
sub len1=len,t1 // set len1
;;
sub lshift=64,rshift
;;
br.cond.spnt .word_copy_user
;;
1:
cmp.leu p14,p15=src2,dst2
sub t1=dst2,src2
;;
.pred.rel "mutex", p14, p15
(p14) sub word1=8,src2 // (8 - src offset)
(p15) sub t1=r0,t1 // absolute value
(p15) sub word1=8,dst2 // (8 - dst offset)
;;
// For the case p14, we don't need to copy the shifted part to
// the 1st word of destination.
sub t2=8,t1
(p14) sub word1=word1,t1
;;
sub len1=len,word1 // resulting len
(p15) shl rshift=t1,3 // in bits
(p14) shl rshift=t2,3
;;
(p14) sub len1=len1,t1
adds cnt=-1,word1
;;
sub lshift=64,rshift
mov ar.ec=PIPE_DEPTH
mov pr.rot=1<<16 // p16=true all others are false
mov ar.lc=cnt
;;
2:
EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
br.ctop.dptk.few 2b
;;
clrrrb
;;
.word_copy_user:
cmp.gtu p9,p0=16,len1
(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
;;
shr.u cnt=len1,3 // number of 64-bit words
;;
adds cnt=-1,cnt
;;
.pred.rel "mutex", p14, p15
(p14) sub src1=src1,t2
(p15) sub src1=src1,t1
//
// Now both src1 and dst1 point to an 8-byte aligned address. And
// we have more than 8 bytes to copy.
//
mov ar.lc=cnt
mov ar.ec=PIPE_DEPTH
mov pr.rot=1<<16 // p16=true all others are false
;;
3:
//
// The pipleline consists of 3 stages:
// 1 (p16): Load a word from src1
// 2 (EPI_1): Shift right pair, saving to tmp
// 3 (EPI): Store tmp to dst1
//
// To make it simple, use at least 2 (p16) loops to set up val1[n]
// because we need 2 back-to-back val1[] to get tmp.
// Note that this implies EPI_2 must be p18 or greater.
//
#define EPI_1 p[PIPE_DEPTH-2]
#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
#define CASE(pred, shift) \
(pred) br.cond.spnt .copy_user_bit##shift
#define BODY(rshift) \
.copy_user_bit##rshift: \
1: \
EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
EX(3f,(p16) ld8 val1[1]=[src1],8); \
(p16) mov val1[0]=r0; \
br.ctop.dptk 1b; \
;; \
br.cond.sptk.many .diff_align_do_tail; \
2: \
(EPI) st8 [dst1]=tmp,8; \
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
3: \
(p16) mov val1[1]=r0; \
(p16) mov val1[0]=r0; \
br.ctop.dptk 2b; \
;; \
br.cond.sptk.many .failure_in2
//
// Since the instruction 'shrp' requires a fixed 128-bit value
// specifying the bits to shift, we need to provide 7 cases
// below.
//
SWITCH(p6, 8)
SWITCH(p7, 16)
SWITCH(p8, 24)
SWITCH(p9, 32)
SWITCH(p10, 40)
SWITCH(p11, 48)
SWITCH(p12, 56)
;;
CASE(p6, 8)
CASE(p7, 16)
CASE(p8, 24)
CASE(p9, 32)
CASE(p10, 40)
CASE(p11, 48)
CASE(p12, 56)
;;
BODY(8)
BODY(16)
BODY(24)
BODY(32)
BODY(40)
BODY(48)
BODY(56)
;;
.diff_align_do_tail:
.pred.rel "mutex", p14, p15
(p14) sub src1=src1,t1
(p14) adds dst1=-8,dst1
(p15) sub dst1=dst1,t1
;;
4:
// Tail correction.
//
// The problem with this piplelined loop is that the last word is not
// loaded and thus parf of the last word written is not correct.
// To fix that, we simply copy the tail byte by byte.
sub len1=endsrc,src1,1
clrrrb
;;
mov ar.ec=PIPE_DEPTH
mov pr.rot=1<<16 // p16=true all others are false
mov ar.lc=len1
;;
5:
EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
br.ctop.dptk.few 5b
;;
mov ar.lc=saved_lc
mov pr=saved_pr,0xffffffffffff0000
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
//
// Beginning of long mempcy (i.e. > 16 bytes)
//
.long_copy_user:
tbit.nz p6,p7=src1,0 // odd alignment
and tmp=7,tmp
;;
cmp.eq p10,p8=r0,tmp
mov len1=len // copy because of rotation
(p8) br.cond.dpnt .diff_align_copy_user
;;
// At this point we know we have more than 16 bytes to copy
// and also that both src and dest have the same alignment
// which may not be the one we want. So for now we must move
// forward slowly until we reach 16byte alignment: no need to
// worry about reaching the end of buffer.
//
EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
(p6) adds len1=-1,len1;;
tbit.nz p7,p0=src1,1
;;
EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
(p7) adds len1=-2,len1;;
tbit.nz p8,p0=src1,2
;;
//
// Stop bit not required after ld4 because if we fail on ld4
// we have never executed the ld1, therefore st1 is not executed.
//
EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
;;
EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
tbit.nz p9,p0=src1,3
;;
//
// Stop bit not required after ld8 because if we fail on ld8
// we have never executed the ld2, therefore st2 is not executed.
//
EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
(p8) adds len1=-4,len1
;;
EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
(p9) adds len1=-8,len1;;
shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
;;
EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
tbit.nz p6,p0=len1,3
cmp.eq p7,p0=r0,cnt
adds tmp=-1,cnt // br.ctop is repeat/until
(p7) br.cond.dpnt .dotail // we have less than 16 bytes left
;;
adds src2=8,src1
adds dst2=8,dst1
mov ar.lc=tmp
;;
//
// 16bytes/iteration
//
2:
EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
(p16) ld8 val2[0]=[src2],16
EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
br.ctop.dptk 2b
;; // RAW on src1 when fall through from loop
//
// Tail correction based on len only
//
// No matter where we come from (loop or test) the src1 pointer
// is 16 byte aligned AND we have less than 16 bytes to copy.
//
.dotail:
EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
tbit.nz p7,p0=len1,2
;;
EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
tbit.nz p8,p0=len1,1
;;
EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
tbit.nz p9,p0=len1,0
;;
EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
;;
EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
mov ar.lc=saved_lc
;;
EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
mov pr=saved_pr,0xffffffffffff0000
;;
EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
mov ar.pfs=saved_pfs
;;
EX(.failure_out, (p9) st1 [dst1]=val2[1])
br.ret.sptk.many rp
//
// Here we handle the case where the byte by byte copy fails
// on the load.
// Several factors make the zeroing of the rest of the buffer kind of
// tricky:
// - the pipeline: loads/stores are not in sync (pipeline)
//
// In the same loop iteration, the dst1 pointer does not directly
// reflect where the faulty load was.
//
// - pipeline effect
// When you get a fault on load, you may have valid data from
// previous loads not yet store in transit. Such data must be
// store normally before moving onto zeroing the rest.
//
// - single/multi dispersal independence.
//
// solution:
// - we don't disrupt the pipeline, i.e. data in transit in
// the software pipeline will be eventually move to memory.
// We simply replace the load with a simple mov and keep the
// pipeline going. We can't really do this inline because
// p16 is always reset to 1 when lc > 0.
//
.failure_in_pipe1:
sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
1:
(p16) mov val1[0]=r0
(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
br.ctop.dptk 1b
;;
mov pr=saved_pr,0xffffffffffff0000
mov ar.lc=saved_lc
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
//
// This is the case where the byte by byte copy fails on the load
// when we copy the head. We need to finish the pipeline and copy
// zeros for the rest of the destination. Since this happens
// at the top we still need to fill the body and tail.
.failure_in_pipe2:
sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
2:
(p16) mov val1[0]=r0
(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
br.ctop.dptk 2b
;;
sub len=enddst,dst1,1 // precompute len
br.cond.dptk.many .failure_in1bis
;;
//
// Here we handle the head & tail part when we check for alignment.
// The following code handles only the load failures. The
// main difficulty comes from the fact that loads/stores are
// scheduled. So when you fail on a load, the stores corresponding
// to previous successful loads must be executed.
//
// However some simplifications are possible given the way
// things work.
//
// 1) HEAD
// Theory of operation:
//
// Page A | Page B
// ---------|-----
// 1|8 x
// 1 2|8 x
// 4|8 x
// 1 4|8 x
// 2 4|8 x
// 1 2 4|8 x
// |1
// |2 x
// |4 x
//
// page_size >= 4k (2^12). (x means 4, 2, 1)
// Here we suppose Page A exists and Page B does not.
//
// As we move towards eight byte alignment we may encounter faults.
// The numbers on each page show the size of the load (current alignment).
//
// Key point:
// - if you fail on 1, 2, 4 then you have never executed any smaller
// size loads, e.g. failing ld4 means no ld1 nor ld2 executed
// before.
//
// This allows us to simplify the cleanup code, because basically you
// only have to worry about "pending" stores in the case of a failing
// ld8(). Given the way the code is written today, this means only
// worry about st2, st4. There we can use the information encapsulated
// into the predicates.
//
// Other key point:
// - if you fail on the ld8 in the head, it means you went straight
// to it, i.e. 8byte alignment within an unexisting page.
// Again this comes from the fact that if you crossed just for the ld8 then
// you are 8byte aligned but also 16byte align, therefore you would
// either go for the 16byte copy loop OR the ld8 in the tail part.
// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
// because it would mean you had 15bytes to copy in which case you
// would have defaulted to the byte by byte copy.
//
//
// 2) TAIL
// Here we know we have fewer than 16 bytes AND we are either 8 or 16 byte
// aligned.
//
// Key point:
// This means that we either:
// - are right on a page boundary
// OR
// - are at more than 16 bytes from a page boundary with
// at most 15 bytes to copy: no chance of crossing.
//
// This allows us to assume that if we fail on a load we haven't possibly
// executed any of the previous (tail) ones, so we don't need to do
// any stores. For instance, if we fail on ld2, this means we had
// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
//
// This means that we are in a situation similar to a fault in the
// head part. That's nice!
//
.failure_in1:
sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
sub len=endsrc,src1,1
//
// we know that ret0 can never be zero at this point
// because we failed while trying to do a load, i.e. there is still
// some work to do.
// The failure_in1bis and length problem is taken care of at the
// calling side.
//
;;
.failure_in1bis: // from (.failure_in3)
mov ar.lc=len // Continue with a stupid byte store.
;;
5:
st1 [dst1]=r0,1
br.cloop.dptk 5b
;;
mov pr=saved_pr,0xffffffffffff0000
mov ar.lc=saved_lc
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
//
// Here we simply restart the loop but instead
// of doing loads we fill the pipeline with zeroes
// We can't simply store r0 because we may have valid
// data in transit in the pipeline.
// ar.lc and ar.ec are setup correctly at this point
//
// we MUST use src1/endsrc here and not dst1/enddst because
// of the pipeline effect.
//
.failure_in3:
sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
;;
2:
(p16) mov val1[0]=r0
(p16) mov val2[0]=r0
(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
br.ctop.dptk 2b
;;
cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
sub len=enddst,dst1,1 // precompute len
(p6) br.cond.dptk .failure_in1bis
;;
mov pr=saved_pr,0xffffffffffff0000
mov ar.lc=saved_lc
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
.failure_in2:
sub ret0=endsrc,src1
cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
sub len=enddst,dst1,1 // precompute len
(p6) br.cond.dptk .failure_in1bis
;;
mov pr=saved_pr,0xffffffffffff0000
mov ar.lc=saved_lc
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
//
// handling of failures on stores: that's the easy part
//
.failure_out:
sub ret0=enddst,dst1
mov pr=saved_pr,0xffffffffffff0000
mov ar.lc=saved_lc
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
END(__copy_user)

View File

@@ -0,0 +1,140 @@
/*
* Network Checksum & Copy routine
*
* Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
*
* Most of the code has been imported from Linux/Alpha
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
#include <asm/uaccess.h>
/*
* XXX Fixme: those 2 inlines are meant for debugging and will go away
*/
/*
 * Fold a 64-bit ones'-complement accumulator down to 16 bits by repeatedly
 * adding the high part onto the low part (end-around carry).
 */
static inline unsigned
short from64to16(unsigned long x)
{
	/* add up 32-bit words for 33 bits */
	x = (x & 0xffffffff) + (x >> 32);
	/* add up 16-bit and 17-bit words for 17+c bits */
	x = (x & 0xffff) + (x >> 16);
	/* add up 16-bit and 2-bit for 16+c bit */
	x = (x & 0xffff) + (x >> 16);
	/* add up carry.. */
	x = (x & 0xffff) + (x >> 16);
	return x;
}

/*
 * Reference C implementation of the Internet checksum over buff[0..len).
 *
 * buff: start of the data (any alignment)
 * len:  number of bytes; for len <= 0 psum is returned unchanged
 * psum: partial sum from earlier data, folded into the result
 *
 * Returns the 16-bit folded ones'-complement sum of the data plus psum.
 *
 * The data is summed starting from zero and psum is only added after the
 * optional byte swap.  Seeding the accumulator with psum (as this routine
 * used to do) loses psum entirely when buff is odd, because the odd-byte
 * path overwrites the accumulator, and merely turning that assignment into
 * an addition would still be wrong because the final swap would then also
 * swap the bytes of psum.
 */
static inline
unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
{
	int odd, count;
	unsigned long result = 0;

	if (len <= 0)
		return (unsigned long)psum;

	/*
	 * An odd start address means every byte lands in the "wrong" half
	 * of its 16-bit word; sum as if shifted by one byte and swap the
	 * folded result at the end.
	 */
	odd = 1 & (unsigned long) buff;
	if (odd) {
		result = *buff << 8;
		len--;
		buff++;
	}
	count = len >> 1;		/* nr of 16-bit words.. */
	if (count) {
		if (2 & (unsigned long) buff) {
			result += *(unsigned short *) buff;
			count--;
			len -= 2;
			buff += 2;
		}
		count >>= 1;		/* nr of 32-bit words.. */
		if (count) {
			if (4 & (unsigned long) buff) {
				result += *(unsigned int *) buff;
				count--;
				len -= 4;
				buff += 4;
			}
			count >>= 1;	/* nr of 64-bit words.. */
			if (count) {
				unsigned long carry = 0;
				do {
					unsigned long w = *(unsigned long *) buff;
					count--;
					buff += 8;
					result += carry;
					result += w;
					/* unsigned wrap-around => carry out */
					carry = (w > result);
				} while (count);
				result += carry;
				result = (result & 0xffffffff) + (result >> 32);
			}
			if (len & 4) {
				result += *(unsigned int *) buff;
				buff += 4;
			}
		}
		if (len & 2) {
			result += *(unsigned short *) buff;
			buff += 2;
		}
	}
	if (len & 1)
		result += *buff;
	result = from64to16(result);
	if (odd)
		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
	/* only now add the caller's partial sum, then fold once more */
	result = from64to16(result + psum);
	return result;
}
/*
* XXX Fixme
*
* This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
* But it's very tricky to get right even in C.
*/
extern unsigned long do_csum(const unsigned char *, long);
/*
 * Copy len bytes from the user buffer src to dst and return the 32-bit
 * partial checksum of the copied data combined with psum.
 *
 * If __copy_from_user() reports a non-zero result and errp is non-NULL,
 * *errp is set to -EFAULT.  The checksum is computed over dst in either
 * case.
 */
__wsum
csum_partial_copy_from_user(const void __user *src, void *dst,
				int len, __wsum psum, int *errp)
{
	unsigned long sum;

	/*
	 * XXX Fixme
	 * Copy first, checksum afterwards: doing both in one pass runs into
	 * alignment difficulties (see the Alpha code).
	 */
	if (__copy_from_user(dst, src, len) != 0) {
		if (errp)
			*errp = -EFAULT;
	}

	/* checksum of the destination plus the incoming partial sum */
	sum = do_csum(dst, len);
	sum += (__force u32)psum;

	/* 32+c bits -> 32 bits */
	return (__force __wsum)((sum & 0xffffffff) + (sum >> 32));
}
EXPORT_SYMBOL(csum_partial_copy_from_user);
/*
 * Copy-and-checksum for a kernel source buffer: forwards to
 * csum_partial_copy_from_user() with the source pointer cast to a __user
 * pointer and no error pointer.
 */
__wsum
csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
{
	const void __user *from = (__force const void __user *)src;

	return csum_partial_copy_from_user(from, dst, len, sum, NULL);
}
EXPORT_SYMBOL(csum_partial_copy_nocheck);

View File

@@ -0,0 +1,323 @@
/*
*
* Optimized version of the standard do_csum() function
*
* Return: a 64bit quantity containing the 16bit Internet checksum
*
* Inputs:
* in0: address of buffer to checksum (char *)
* in1: length of the buffer (int)
*
* Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
*
* 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
* Data locality study on the checksum buffer.
* More optimization cleanup - remove excessive stop bits.
* 02/04/08 David Mosberger <davidm@hpl.hp.com>
* More cleanup and tuning.
* 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
* Clean up and optimize and the software pipeline, loading two
* back-to-back 8-byte words per loop. Clean up the initialization
* for the loop. Support the cases where load latency = 1 or 2.
* Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
*/
#include <asm/asmmacro.h>
//
// Theory of operations:
// The goal is to go as quickly as possible to the point where
// we can checksum 16 bytes/loop. Before reaching that point we must
// take care of incorrect alignment of first byte.
//
// The code hereafter also takes care of the "tail" part of the buffer
// before entering the core loop, if any. The checksum is a sum so it
// allows us to commute operations. So we do the "head" and "tail"
// first to finish at full speed in the body. Once we get the head and
// tail values, we feed them into the pipeline, very handy initialization.
//
// Of course we deal with the special case where the whole buffer fits
// into one 8 byte word. In this case we have only one entry in the pipeline.
//
// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
// possible load latency and also to accommodate for head and tail.
//
// The end of the function deals with folding the checksum from 64bits
// down to 16bits taking care of the carry.
//
// This version avoids synchronization in the core loop by also using a
// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
//
// wordx[] (x=1,2)
// |---|
// | | 0 : new value loaded in pipeline
// |---|
// | | - : in transit data
// |---|
// | | LOAD_LATENCY : current value to add to checksum
// |---|
// | | LOAD_LATENCY+1 : previous value added to checksum
// |---| (previous iteration)
//
// resultx[] (x=1,2)
// |---|
// | | 0 : initial value
// |---|
// | | LOAD_LATENCY-1 : new checksum
// |---|
// | | LOAD_LATENCY : previous value of checksum
// |---|
// | | LOAD_LATENCY+1 : final checksum when out of the loop
// |---|
//
//
// See RFC1071 "Computing the Internet Checksum" for various techniques for
// calculating the Internet checksum.
//
// NOT YET DONE:
// - Maybe another algorithm which would take care of the folding at the
// end in a different manner
// - Work with people more knowledgeable than me on the network stack
// to figure out if we could not split the function depending on the
// type of packet or alignment we get. Like the ip_fast_csum() routine
// where we know we have at least 20bytes worth of data to checksum.
// - Do a better job of handling small packets.
// - Note on prefetching: it was found that under various loads, i.e. ftp read/write,
// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
// on the data that buffer points to (partly because the checksum is often preceded by
// a copy_from_user()). This finding indicates that lfetch will not be beneficial since
// the data is already in the cache.
//
#define saved_pfs r11
#define hmask r16
#define tmask r17
#define first1 r18
#define firstval r19
#define firstoff r20
#define last r21
#define lastval r22
#define lastoff r23
#define saved_lc r24
#define saved_pr r25
#define tmp1 r26
#define tmp2 r27
#define tmp3 r28
#define carry1 r29
#define carry2 r30
#define first2 r31
#define buf in0
#define len in1
#define LOAD_LATENCY 2 // XXX fix me
#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
#endif
#define PIPE_DEPTH (LOAD_LATENCY+2)
#define ELD p[LOAD_LATENCY] // end of load
#define ELD_1 p[LOAD_LATENCY+1] // and next stage
// unsigned long do_csum(unsigned char *buf,long len)
GLOBAL_ENTRY(do_csum)
//
// Overview of the steps below:
//  1. return 0 immediately when len <= 0;
//  2. load the aligned 8-byte words that contain the first and the last
//     byte of the buffer and mask off the bytes outside [buf, buf+len);
//  3. add the remaining full 8-byte words: a single word first when their
//     count is odd, then two words per iteration in the software-pipelined
//     loop at label 1:;
//  4. fold the 64-bit sum down to 16 bits and swap the two result bytes
//     when buf was an odd address (p15).
//
.prologue
.save ar.pfs, saved_pfs
// frame: 2 inputs (buf, len), 16 locals, 0 outputs, 16 rotating registers
alloc saved_pfs=ar.pfs,2,16,0,16
// names for the rotating general and predicate registers used by the loop
.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
mov ret0=r0 // in case we have zero length
cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
;;
add tmp1=buf,len // address one past the last byte
.save pr, saved_pr
mov saved_pr=pr // preserve predicates (rotation)
(p6) br.ret.spnt.many rp // return if zero or negative length
mov hmask=-1 // initialize head mask
tbit.nz p15,p0=buf,0 // is buf an odd address?
and first1=-8,buf // 8-byte align down address of first1 element
and firstoff=7,buf // how many bytes off for first1 element
mov tmask=-1 // initialize tail mask
;;
adds tmp2=-1,tmp1 // address of the last byte
and lastoff=7,tmp1 // how many bytes off for last element
;;
sub tmp1=8,lastoff // complement to lastoff
and last=-8,tmp2 // address of word containing last byte
;;
sub tmp3=last,first1 // tmp3=distance from first1 to last
.save ar.lc, saved_lc
mov saved_lc=ar.lc // save lc
cmp.eq p8,p9=last,first1 // everything fits in one word ?
ld8 firstval=[first1],8 // load, ahead of time, "first1" word
and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
shl tmp2=firstoff,3 // number of bits
;;
(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
shl tmp1=tmp1,3 // number of bits
(p9) adds tmp3=-8,tmp3 // effectively loaded
;;
(p8) mov lastval=r0 // we don't need lastval if first1==last
shl hmask=hmask,tmp2 // build head mask, mask off [0,firstoff[
shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
;;
.body
// from here on tmp3 holds the number of bytes (then words) still to be loaded
#define count tmp3
(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
(p9) and word2[0]=lastval,tmask // mask last it as appropriate
shr.u count=count,3 // how many 8-byte?
;;
// If count is odd, finish this 8-byte word so that we can
// load two back-to-back 8-byte words per loop thereafter.
and word1[0]=firstval,hmask // and mask it as appropriate
tbit.nz p10,p11=count,0 // if (count is odd)
;;
// seed the sum with the masked head word (plus the masked tail word if distinct)
(p8) mov result1[0]=word1[0]
(p9) add result1[0]=word1[0],word2[0]
;;
cmp.ltu p6,p0=result1[0],word1[0] // check the carry
cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
;;
(p6) adds result1[0]=1,result1[0] // end-around carry
(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
(p11) br.cond.dptk .do_csum16 // if (count is even)
// Here count is odd.
ld8 word1[1]=[first1],8 // load an 8-byte word
cmp.eq p9,p10=1,count // if (count == 1)
adds count=-1,count // loaded an 8-byte word
;;
add result1[0]=result1[0],word1[1]
;;
cmp.ltu p6,p0=result1[0],word1[1] // unsigned wrap-around means carry out
;;
(p6) adds result1[0]=1,result1[0]
(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
// Fall through to calculate the checksum, feeding result1[0] as
// the initial value in result1[0].
//
// Calculate the checksum loading two 8-byte words per loop.
//
.do_csum16:
add first2=8,first1 // second load pointer, one word ahead of first1
shr.u count=count,1 // we do 16 bytes per loop
;;
adds count=-1,count // ar.lc wants (iterations - 1)
mov carry1=r0
mov carry2=r0
brp.loop.imp 1f,2f
;;
mov ar.ec=PIPE_DEPTH // epilogue count = number of pipeline stages
mov ar.lc=count // set lc
mov pr.rot=1<<16 // only p16 (stage 0) set on entry
// result1[0] must be initialized in advance.
mov result2[0]=r0
;;
.align 32
// Software-pipelined loop: stage 0 (p[0]) issues the two loads, stage ELD adds
// the loaded words to the running sums, stage ELD_1 detects a carry out of
// that addition, and pC1[1]/pC2[1] count the carries one iteration later.
1:
(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(pC1[1])adds carry1=1,carry1
(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(pC2[1])adds carry2=1,carry2
(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
2:
(p[0]) ld8 word1[0]=[first1],16
(p[0]) ld8 word2[0]=[first2],16
br.ctop.sptk 1b
;;
// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
(pC1[1])adds carry1=1,carry1 // since we miss the last one
(pC2[1])adds carry2=1,carry2
;;
// add the accumulated carries back into each partial sum (end-around carry)
add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
;;
cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
;;
(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
;;
// merge the two partial sums, again with end-around carry
add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
;;
cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
;;
(p6) adds result1[0]=1,result1[0]
;;
.do_csum_exit:
//
// now fold 64 into 16 bits taking care of carry
// that's not very good because it has lots of sequentiality
//
mov tmp3=0xffff
zxt4 tmp1=result1[0]
shr.u tmp2=result1[0],32
;;
add result1[0]=tmp1,tmp2 // low 32 bits + high 32 bits
;;
and tmp1=result1[0],tmp3
shr.u tmp2=result1[0],16
;;
add result1[0]=tmp1,tmp2 // low 16 bits + the rest
;;
and tmp1=result1[0],tmp3
shr.u tmp2=result1[0],16
;;
add result1[0]=tmp1,tmp2 // and again for the carry
;;
and tmp1=result1[0],tmp3
shr.u tmp2=result1[0],16
;;
add ret0=tmp1,tmp2 // final fold of a last possible carry
mov pr=saved_pr,0xffffffffffff0000
;;
// if buf was odd then swap bytes
mov ar.pfs=saved_pfs // restore ar.ec
(p15) mux1 ret0=ret0,@rev // reverse word
;;
mov ar.lc=saved_lc
(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
br.ret.sptk.many rp
// I (Jun Nakajima) wrote an equivalent code (see below), but it was
// not much better than the original. So keep the original there so that
// someone else can challenge.
//
// shr.u word1[0]=result1[0],32
// zxt4 result1[0]=result1[0]
// ;;
// add result1[0]=result1[0],word1[0]
// ;;
// zxt2 result2[0]=result1[0]
// extr.u word1[0]=result1[0],16,16
// shr.u carry1=result1[0],32
// ;;
// add result2[0]=result2[0],word1[0]
// ;;
// add result2[0]=result2[0],carry1
// ;;
// extr.u ret0=result2[0],16,16
// ;;
// add ret0=ret0,result2[0]
// ;;
// zxt2 ret0=ret0
// mov ar.pfs=saved_pfs // restore ar.ec
// mov pr=saved_pr,0xffffffffffff0000
// ;;
// // if buf was odd then swap bytes
// mov ar.lc=saved_lc
//(p15) mux1 ret0=ret0,@rev // reverse word
// ;;
//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
// br.ret.sptk.many rp
END(do_csum)

View File

@@ -0,0 +1,117 @@
/*
* Cache flushing routines.
*
* Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
* David Mosberger-Tang <davidm@hpl.hp.com>
*
* 05/28/05 Zoltan Menyhart Dynamic stride size
*/
#include <asm/asmmacro.h>
/*
* flush_icache_range(start,end)
*
* Make i-cache(s) coherent with d-caches.
*
* Must deal with range from start to end-1 but nothing else (need to
* be careful not to touch addresses that may be unmapped).
*
* Note: "in0" and "in1" are preserved for debugging purposes.
*/
.section .kprobes.text,"ax"
GLOBAL_ENTRY(flush_icache_range)
//
// in0 = start, in1 = end (exclusive).  Issues one "fc.i" per i-cache stride
// for every stride that overlaps [start, end-1], then sync.i / srlz.i.
//
// Register use:
//   r3  = address of ia64_i_cache_stride_shift, later reused to save ar.lc
//   r20 = stride shift            r21 = stride size (1 << r20)
//   r23 = start / stride size     r22 = (end-1) / stride size
//   r24 = address passed to fc.i  r8  = loop count (number of strides - 1)
//
.prologue
alloc r2=ar.pfs,2,0,0,0
movl r3=ia64_i_cache_stride_shift
mov r21=1
;;
ld8 r20=[r3] // r20: stride shift
sub r22=in1,r0,1 // last byte address
;;
shr.u r23=in0,r20 // start / (stride size)
shr.u r22=r22,r20 // (last byte address) / (stride size)
shl r21=r21,r20 // r21: stride size of the i-cache(s)
;;
// NOTE(review): when end == start and start is stride-aligned, r22 is one
// less than r23 and r8 becomes -1; callers presumably never pass an empty
// range -- confirm.
sub r8=r22,r23 // number of strides - 1
shl r24=r23,r20 // r24: addresses for "fc.i" =
// "start" rounded down to stride boundary
.save ar.lc,r3
mov r3=ar.lc // save ar.lc
;;
.body
mov ar.lc=r8
;;
/*
* 32 byte aligned loop, even number of (actually 2) bundles
*/
.Loop: fc.i r24 // issuable on M0 only
add r24=r21,r24 // we flush "stride size" bytes per iteration
nop.i 0
br.cloop.sptk.few .Loop
;;
sync.i
;;
srlz.i
;;
mov ar.lc=r3 // restore ar.lc
br.ret.sptk.many rp
END(flush_icache_range)
/*
* clflush_cache_range(start,size)
*
* Flush cache lines from start to start+size-1.
*
* Must deal with range from start to start+size-1 but nothing else
* (need to be careful not to touch addresses that may be
* unmapped).
*
* Note: "in0" and "in1" are preserved for debugging purposes.
*/
.section .kprobes.text,"ax"
GLOBAL_ENTRY(clflush_cache_range)
//
// in0 = start, in1 = size.  Issues one "fc" per cache stride for every
// stride that overlaps [start, start+size-1], then sync.i / srlz.i.
//
// Register use:
//   r3  = address of ia64_cache_stride_shift, later reused to save ar.lc
//   r20 = stride shift            r21 = stride size (1 << r20)
//   r23 = start / stride size     r22 = (start+size-1) / stride size
//   r24 = address passed to fc    r8  = loop count (number of strides - 1)
//
.prologue
alloc r2=ar.pfs,2,0,0,0
movl r3=ia64_cache_stride_shift
mov r21=1
add r22=in1,in0 // r22 = start + size (one past the last byte)
;;
ld8 r20=[r3] // r20: stride shift
sub r22=r22,r0,1 // last byte address
;;
shr.u r23=in0,r20 // start / (stride size)
shr.u r22=r22,r20 // (last byte address) / (stride size)
shl r21=r21,r20 // r21: stride size (from ia64_cache_stride_shift)
;;
// NOTE(review): with size == 0 and a stride-aligned start, r8 becomes -1;
// callers presumably never pass size 0 -- confirm.
sub r8=r22,r23 // number of strides - 1
shl r24=r23,r20 // r24: addresses for "fc" =
// "start" rounded down to stride
// boundary
.save ar.lc,r3
mov r3=ar.lc // save ar.lc
;;
.body
mov ar.lc=r8
;;
/*
* 32 byte aligned loop, even number of (actually 2) bundles
*/
.Loop_fc:
fc r24 // issuable on M0 only
add r24=r21,r24 // we flush "stride size" bytes per iteration
nop.i 0
br.cloop.sptk.few .Loop_fc
;;
sync.i
;;
srlz.i
;;
mov ar.lc=r3 // restore ar.lc
br.ret.sptk.many rp
END(clflush_cache_range)

View File

@@ -0,0 +1,83 @@
/*
* Copyright (C) 2000 Hewlett-Packard Co
* Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
*
* 32-bit integer division.
*
* This code is based on the application note entitled "Divide, Square Root
* and Remainder Algorithms for the IA-64 Architecture". This document
* is available as Intel document number 248725-002 or via the web at
* http://developer.intel.com/software/opensource/numerics/
*
* For more details on the theory behind these algorithms, see "IA-64
* and Elementary Functions" by Peter Markstein; HP Professional Books
* (http://www.hp.com/go/retailbooks/)
*/
#include <asm/asmmacro.h>
#ifdef MODULO
# define OP mod
#else
# define OP div
#endif
#ifdef UNSIGNED
# define SGN u
# define EXTEND zxt4
# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
#else
# define SGN
# define EXTEND sxt4
# define INT_TO_FP(a,b) fcvt.xf a=b
# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
#endif
#define PASTE1(a,b) a##b
#define PASTE(a,b) PASTE1(a,b)
#define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3))
//
// NAME expands to __divsi3, __udivsi3, __modsi3 or __umodsi3 depending on
// whether UNSIGNED and/or MODULO are defined when this file is assembled.
//
// in0 = a (dividend), in1 = b (divisor); the result is returned in r8.
// The quotient is computed in floating point: frcpa supplies a first
// reciprocal approximation which is refined by the fma steps below, then the
// quotient is truncated to an integer.  With MODULO the remainder is formed
// as q*(-b) + a with an integer multiply-add.
//
GLOBAL_ENTRY(NAME)
.regstk 2,0,0,0
// Transfer inputs to FP registers.
mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias)
EXTEND in0 = in0 // in0 = a
EXTEND in1 = in1 // in1 = b
;;
setf.sig f8 = in0
setf.sig f9 = in1
#ifdef MODULO
sub in1 = r0, in1 // in1 = -b
#endif
;;
// Convert the inputs to FP, to avoid FP software-assist faults.
INT_TO_FP(f8, f8)
INT_TO_FP(f9, f9)
;;
setf.exp f7 = r2 // f7 = 2^-34
// The refinement steps are predicated on p6; presumably frcpa clears p6 when
// it has already written the final quotient to f6 (special operands such as
// b == 0) -- confirm against the ISA manual.
frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b)
;;
(p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0
(p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1
;;
#ifdef MODULO
setf.sig f9 = in1 // f9 = -b
#endif
(p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0
(p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34
;;
#ifdef MODULO
setf.sig f7 = in0 // f7 = a (as integer, for the xma below)
#endif
(p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1
;;
FP_TO_INT(f6, f6) // q = trunc(q2)
;;
#ifdef MODULO
xma.l f6 = f6, f9, f7 // r = q*(-b) + a
;;
#endif
getf.sig r8 = f6 // transfer result to result register
br.ret.sptk.many rp
END(NAME)

View File

@@ -0,0 +1,80 @@
/*
* Copyright (C) 1999-2000 Hewlett-Packard Co
* Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
*
* 64-bit integer division.
*
* This code is based on the application note entitled "Divide, Square Root
* and Remainder Algorithms for the IA-64 Architecture". This document
* is available as Intel document number 248725-002 or via the web at
* http://developer.intel.com/software/opensource/numerics/
*
* For more details on the theory behind these algorithms, see "IA-64
* and Elementary Functions" by Peter Markstein; HP Professional Books
* (http://www.hp.com/go/retailbooks/)
*/
#include <asm/asmmacro.h>
#ifdef MODULO
# define OP mod
#else
# define OP div
#endif
#ifdef UNSIGNED
# define SGN u
# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
#else
# define SGN
# define INT_TO_FP(a,b) fcvt.xf a=b
# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
#endif
#define PASTE1(a,b) a##b
#define PASTE(a,b) PASTE1(a,b)
#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3))
//
// NAME expands to __divdi3, __udivdi3, __moddi3 or __umoddi3 depending on
// whether UNSIGNED and/or MODULO are defined when this file is assembled.
//
// in0 = a (dividend), in1 = b (divisor); the result is returned in r8.
// The quotient is computed in floating point: frcpa supplies a first
// reciprocal approximation, the fma steps refine both the quotient (q0..q3)
// and the reciprocal (y0..y2), and the final quotient is truncated to an
// integer.  With MODULO the remainder is formed as q*(-b) + a with an
// integer multiply-add.
//
GLOBAL_ENTRY(NAME)
.regstk 2,0,0,0
// Transfer inputs to FP registers.
setf.sig f8 = in0
setf.sig f9 = in1
;;
// Convert the inputs to FP, to avoid FP software-assist faults.
INT_TO_FP(f8, f8)
INT_TO_FP(f9, f9)
;;
// The refinement steps are predicated on p6; presumably frcpa clears p6 when
// it has already written the final quotient to f11 (special operands such as
// b == 0) -- confirm against the ISA manual.
frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b)
;;
(p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0
(p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1
;;
(p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0
(p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0
;;
#ifdef MODULO
sub in1 = r0, in1 // in1 = -b
#endif
(p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1
(p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0
;;
(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1
(p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a
;;
#ifdef MODULO
// f8/f9 are no longer needed as FP operands; reload them with the integer
// values used by the xma below
setf.sig f8 = in0 // f8 = a
setf.sig f9 = in1 // f9 = -b
#endif
(p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2
;;
FP_TO_INT(f11, f11) // q = trunc(q3)
;;
#ifdef MODULO
xma.l f11 = f11, f9, f8 // r = q*(-b) + a
;;
#endif
getf.sig r8 = f11 // transfer result to result register
br.ret.sptk.many rp
END(NAME)

164
kernel/arch/ia64/lib/io.c Normal file
View File

@@ -0,0 +1,164 @@
#include <linux/module.h>
#include <linux/types.h>
#include <asm/io.h>
/*
* Copy data from IO memory space to "real" memory space.
* This needs to be optimized.
*/
/*
 * Copy count bytes from I/O memory at "from" into ordinary memory at "to",
 * one readb() per byte.
 */
void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
{
	char *out = to;

	for (; count; count--)
		*out++ = readb(from++);
}
EXPORT_SYMBOL(memcpy_fromio);
/*
* Copy data from "real" memory space to IO memory space.
* This needs to be optimized.
*/
/*
 * Copy count bytes from ordinary memory at "from" into I/O memory at "to",
 * one writeb() per byte.
 */
void memcpy_toio(volatile void __iomem *to, const void *from, long count)
{
	const char *in = from;

	for (; count; count--)
		writeb(*in++, to++);
}
EXPORT_SYMBOL(memcpy_toio);
/*
* "memset" on IO memory space.
* This needs to be optimized.
*/
/*
 * Store the low byte of c into each of the count bytes of I/O memory
 * starting at dst, one writeb() per byte.
 */
void memset_io(volatile void __iomem *dst, int c, long count)
{
	unsigned char fill = (char)(c & 0xff);

	for (; count; count--) {
		writeb(fill, dst);
		dst++;
	}
}
EXPORT_SYMBOL(memset_io);
#ifdef CONFIG_IA64_GENERIC
#undef __ia64_inb
#undef __ia64_inw
#undef __ia64_inl
#undef __ia64_outb
#undef __ia64_outw
#undef __ia64_outl
#undef __ia64_readb
#undef __ia64_readw
#undef __ia64_readl
#undef __ia64_readq
#undef __ia64_readb_relaxed
#undef __ia64_readw_relaxed
#undef __ia64_readl_relaxed
#undef __ia64_readq_relaxed
#undef __ia64_writeb
#undef __ia64_writew
#undef __ia64_writel
#undef __ia64_writeq
#undef __ia64_mmiowb
/*
 * Out-of-line versions of the I/O accessors for CONFIG_IA64_GENERIC builds.
 * Each one simply forwards to the corresponding ___ia64_* primitive.
 */

/* Port input. */
unsigned int __ia64_inb(unsigned long port)
{
	return ___ia64_inb(port);
}

unsigned int __ia64_inw(unsigned long port)
{
	return ___ia64_inw(port);
}

unsigned int __ia64_inl(unsigned long port)
{
	return ___ia64_inl(port);
}

/* Port output. */
void __ia64_outb(unsigned char val, unsigned long port)
{
	___ia64_outb(val, port);
}

void __ia64_outw(unsigned short val, unsigned long port)
{
	___ia64_outw(val, port);
}

void __ia64_outl(unsigned int val, unsigned long port)
{
	___ia64_outl(val, port);
}

/* Memory-mapped reads. */
unsigned char __ia64_readb(void __iomem *addr)
{
	return ___ia64_readb(addr);
}

unsigned short __ia64_readw(void __iomem *addr)
{
	return ___ia64_readw(addr);
}

unsigned int __ia64_readl(void __iomem *addr)
{
	return ___ia64_readl(addr);
}

unsigned long __ia64_readq(void __iomem *addr)
{
	return ___ia64_readq(addr);
}

/* The "relaxed" reads forward to the same primitives as the plain reads. */
unsigned char __ia64_readb_relaxed(void __iomem *addr)
{
	return ___ia64_readb(addr);
}

unsigned short __ia64_readw_relaxed(void __iomem *addr)
{
	return ___ia64_readw(addr);
}

unsigned int __ia64_readl_relaxed(void __iomem *addr)
{
	return ___ia64_readl(addr);
}

unsigned long __ia64_readq_relaxed(void __iomem *addr)
{
	return ___ia64_readq(addr);
}

/* MMIO write barrier. */
void __ia64_mmiowb(void)
{
	___ia64_mmiowb();
}
#endif /* CONFIG_IA64_GENERIC */

View File

@@ -0,0 +1,144 @@
/*
* Optimized version of the ip_fast_csum() function
* Used for calculating IP header checksum
*
* Return: 16bit checksum, complemented
*
* Inputs:
* in0: address of buffer to checksum (char *)
* in1: length of the buffer in 32-bit words (int); 5, i.e. 20 bytes, is the fast path
*
* Copyright (C) 2002, 2006 Intel Corp.
* Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
*/
#include <asm/asmmacro.h>
/*
* Since we know that most likely this function is called with buf aligned
* on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
* versus calling the generic version of do_csum, which has lots of overhead in
* handling various alignments and sizes. However, due to the lack of constraints
* put on the function input arguments, cases with alignment not on 4 bytes or
* size not equal to 20 bytes will be handled by the generic do_csum function.
*/
#define in0 r32
#define in1 r33
#define in2 r34
#define in3 r35
#define in4 r36
#define ret0 r8
GLOBAL_ENTRY(ip_fast_csum)
//
// in0 = buffer address, in1 = length in 32-bit words.
// Fast path (p7): in1 == 5 and in0 is 4-byte aligned.  The five words are
// loaded through two interleaved pointers, summed in 64 bits, folded to
// 16 bits and complemented.
// Slow path (p6): anything else branches to .generic, which calls do_csum
// with the length converted to bytes.
//
.prologue
.body
cmp.ne p6,p7=5,in1 // size other than 20 byte?
and r14=3,in0 // is it aligned on 4-byte?
add r15=4,in0 // second source pointer
;;
cmp.ne.or.andcm p6,p7=r14,r0 // unaligned: also take the generic path
;;
(p7) ld4 r20=[in0],8 // word 0
(p7) ld4 r21=[r15],8 // word 1
(p6) br.spnt .generic
;;
ld4 r22=[in0],8 // word 2
ld4 r23=[r15],8 // word 3
;;
ld4 r24=[in0] // word 4
add r20=r20,r21
add r22=r22,r23
;;
add r20=r20,r22
;;
add r20=r20,r24 // r20 = sum of the five words
;;
shr.u ret0=r20,16 // now need to add the carry
zxt2 r20=r20
;;
add r20=ret0,r20
;;
shr.u ret0=r20,16 // add carry again
zxt2 r20=r20
;;
add r20=ret0,r20
;;
shr.u ret0=r20,16
zxt2 r20=r20
;;
add r20=ret0,r20
mov r9=0xffff
;;
andcm ret0=r9,r20 // ret0 = 0xffff & ~sum: the complemented 16-bit checksum
.restore sp // reset frame state
br.ret.sptk.many b0
;;
.generic:
.prologue
.save ar.pfs, r35
alloc r35=ar.pfs,2,2,2,0 // 2 inputs, 2 locals (r34, r35), 2 outputs
.save rp, r34
mov r34=b0 // keep the return address across the call
.body
dep.z out1=in1,2,30 // out1 = in1 << 2: length in bytes for do_csum
mov out0=in0
;;
br.call.sptk.many b0=do_csum
;;
andcm ret0=-1,ret0 // complement do_csum's result
mov ar.pfs=r35
mov b0=r34
br.ret.sptk.many b0
END(ip_fast_csum)
//
// Checksum over four 32-bit words read from [in0], four 32-bit words read
// from [in1], a value built from in2 and in3, and in4; the sum is folded to
// 16 bits and returned complemented in r8.
// NOTE(review): the arguments are presumably (saddr, daddr, len, proto, sum)
// as in the kernel's csum_ipv6_magic() prototype -- confirm against the
// header.
//
GLOBAL_ENTRY(csum_ipv6_magic)
ld4 r20=[in0],4 // first of four words from [in0]
ld4 r21=[in1],4 // first of four words from [in1]
zxt4 in2=in2 // keep only the low 32 bits of in2
;;
ld4 r22=[in0],4
ld4 r23=[in1],4
dep r15=in3,in2,32,16 // r15 = in2 with the low 16 bits of in3 placed at bits 32..47
;;
ld4 r24=[in0],4
ld4 r25=[in1],4
mux1 r15=r15,@rev // reverse the byte order of r15
add r16=r20,r21
add r17=r22,r23
zxt4 in4=in4 // keep only the low 32 bits of in4
;;
ld4 r26=[in0],4
ld4 r27=[in1],4
shr.u r15=r15,16 // drop the two (zero) low bytes left by the byte reversal
add r18=r24,r25
add r8=r16,r17
;;
add r19=r26,r27
add r8=r8,r18
;;
add r8=r8,r19 // r8 = sum of all eight address words
add r15=r15,in4
;;
add r8=r8,r15
;;
shr.u r10=r8,32 // now fold sum into short
zxt4 r11=r8
;;
add r8=r10,r11
;;
shr.u r10=r8,16 // yeah, keep it rolling
zxt2 r11=r8
;;
add r8=r10,r11
;;
shr.u r10=r8,16 // three times lucky
zxt2 r11=r8
;;
add r8=r10,r11
mov r9=0xffff
;;
andcm r8=r9,r8 // r8 = 0xffff & ~sum
br.ret.sptk.many b0
END(csum_ipv6_magic)

View File

@@ -0,0 +1,301 @@
/*
*
* Optimized version of the standard memcpy() function
*
* Inputs:
* in0: destination address
* in1: source address
* in2: number of bytes to copy
* Output:
* no return value
*
* Copyright (C) 2000-2001 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger-Tang <davidm@hpl.hp.com>
*/
#include <asm/asmmacro.h>
GLOBAL_ENTRY(memcpy)
# define MEM_LAT 21 /* latency to memory */
# define dst r2
# define src r3
# define retval r8
# define saved_pfs r9
# define saved_lc r10
# define saved_pr r11
# define cnt r16
# define src2 r17
# define t0 r18
# define t1 r19
# define t2 r20
# define t3 r21
# define t4 r22
# define src_end r23
# define N (MEM_LAT + 4)
# define Nrot ((N + 7) & ~7)
/*
* First, check if everything (src, dst, len) is a multiple of eight. If
* so, we handle everything with no taken branches (other than the loop
* itself) and a small icache footprint. Otherwise, we jump off to
* the more general copy routine handling arbitrary
* sizes/alignment etc.
*/
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
.save ar.lc, saved_lc
mov saved_lc=ar.lc
or t0=in0,in1
;;
or t0=t0,in2
.save pr, saved_pr
mov saved_pr=pr
.body
cmp.eq p6,p0=in2,r0 // zero length?
mov retval=in0 // return dst
(p6) br.ret.spnt.many rp // zero length, return immediately
;;
mov dst=in0 // copy because of rotation
shr.u cnt=in2,3 // number of 8-byte words to copy
mov pr.rot=1<<16
;;
adds cnt=-1,cnt // br.ctop is repeat/until
cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
mov ar.ec=N
;;
and t0=0x7,t0
mov ar.lc=cnt
;;
cmp.ne p6,p0=t0,r0
mov src=in1 // copy because of rotation
(p7) br.cond.spnt.few .memcpy_short
(p6) br.cond.spnt.few .memcpy_long
;;
nop.m 0
;;
nop.m 0
nop.i 0
;;
nop.m 0
;;
.rotr val[N]
.rotp p[N]
.align 32
1: { .mib
(p[0]) ld8 val[0]=[src],8
nop.i 0
brp.loop.imp 1b, 2f
}
2: { .mfb
(p[N-1])st8 [dst]=val[N-1],8
nop.f 0
br.ctop.dptk.few 1b
}
;;
mov ar.lc=saved_lc
mov pr=saved_pr,-1
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
/*
* Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
* copy loop. This performs relatively poorly on Itanium, but it doesn't
* get used very often (gcc inlines small copies) and due to atomicity
* issues, we want to avoid read-modify-write of entire words.
*/
.align 32
.memcpy_short:
adds cnt=-1,in2 // br.ctop is repeat/until
mov ar.ec=MEM_LAT
brp.loop.imp 1f, 2f
;;
mov ar.lc=cnt
;;
nop.m 0
;;
nop.m 0
nop.i 0
;;
nop.m 0
;;
nop.m 0
;;
/*
* It is faster to put a stop bit in the loop here because it makes
* the pipeline shorter (and latency is what matters on short copies).
*/
.align 32
1: { .mib
(p[0]) ld1 val[0]=[src],1
nop.i 0
brp.loop.imp 1b, 2f
} ;;
2: { .mfb
(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
nop.f 0
br.ctop.dptk.few 1b
} ;;
mov ar.lc=saved_lc
mov pr=saved_pr,-1
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
/*
* Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
* an overriding concern here, but throughput is. We first do
* sub-word copying until the destination is aligned, then we check
* if the source is also aligned. If so, we do a simple load/store-loop
* until there are less than 8 bytes left over and then we do the tail,
* by storing the last few bytes using sub-word copying. If the source
* is not aligned, we branch off to the non-congruent loop.
*
* stage: op:
* 0 ld
* :
* MEM_LAT+3 shrp
* MEM_LAT+4 st
*
* On Itanium, the pipeline itself runs without stalls. However, br.ctop
* seems to introduce an unavoidable bubble in the pipeline so the overall
* latency is 2 cycles/iteration. This gives us a _copy_ throughput
* of 4 byte/cycle. Still not bad.
*/
// Redefine the pipeline depth (N) and the rotating-register count (Nrot) for
// the loops generated by COPY() in the .memcpy_loops table.
# undef N
# undef Nrot
# define N (MEM_LAT + 5) /* number of stages */
# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
// Shift that turns (src & 7) into a byte offset inside the .memcpy_loops table.
// This relies on every COPY() expansion occupying 1 << LOG_LOOP_SIZE bytes.
#define LOG_LOOP_SIZE 6
// .memcpy_long: large-copy path. Steps visible below:
//   1. assemble the first 8 source bytes in t0 from two aligned ld8 loads,
//   2. store 1/2/4 bytes under p3/p4/p5 until dst is 8-byte aligned,
//   3. set up ar.lc/ar.ec and the predicates used later by .memcpy_tail,
//   4. jump to the COPY() variant in .memcpy_loops selected by (src & 7).
.memcpy_long:
alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
and t0=-8,src // t0 = src & ~7
and t2=7,src // t2 = src & 7
;;
ld8 t0=[t0] // t0 = 1st source word
adds src2=7,src // src2 = (src + 7)
sub t4=r0,dst // t4 = -dst
;;
and src2=-8,src2 // src2 = (src + 7) & ~7
shl t2=t2,3 // t2 = 8*(src & 7)
shl t4=t4,3 // t4 = 8*(-dst): bits 3-5 now hold (-dst & 7)
;;
ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
sub t3=64,t2 // t3 = 64-8*(src & 7)
shr.u t0=t0,t2
;;
add src_end=src,in2
shl t1=t1,t3
mov pr=t4,0x38 // (p5,p4,p3) = (-dst & 7) = number of bytes needed to 8-byte align dst
;;
or t0=t0,t1 // t0 = first 8 (possibly unaligned) source bytes
mov cnt=r0 // cnt counts the bytes stored while aligning dst
adds src_end=-1,src_end
;;
(p3) st1 [dst]=t0,1
(p3) shr.u t0=t0,8
(p3) adds cnt=1,cnt
;;
(p4) st2 [dst]=t0,2
(p4) shr.u t0=t0,16
(p4) adds cnt=2,cnt
;;
(p5) st4 [dst]=t0,4
(p5) adds cnt=4,cnt
and src_end=-8,src_end // src_end = last word of source buffer
;;
// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
1:{ add src=cnt,src // make src point to remainder of source buffer
sub cnt=in2,cnt // cnt = number of bytes left to copy
mov t4=ip // t4 = address of the bundle at label 1
} ;;
and src2=-8,src // align source pointer
adds t4=.memcpy_loops-1b,t4 // t4 = address of .memcpy_loops
mov ar.ec=N
and t0=7,src // t0 = src & 7
shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy
shl cnt=cnt,3 // move bits 0-2 to 3-5
;;
.rotr val[N+1], w[2]
.rotp p[N]
cmp.ne p6,p0=t0,r0 // is src aligned, too?
shl t0=t0,LOG_LOOP_SIZE // t0 = (src & 7) << LOG_LOOP_SIZE = offset of the matching COPY() variant
adds t2=-1,t2 // br.ctop is repeat/until
;;
add t4=t0,t4 // t4 = entry address of the selected COPY() variant
mov pr=cnt,0x38 // set (p5,p4,p3) to the # of last-word bytes to copy (cnt & 7)
mov ar.lc=t2
;;
// NOTE(review): the nop-only groups below presumably pad the distance between
// the ar.lc write and the loop entry -- confirm before removing.
nop.m 0
;;
nop.m 0
nop.i 0
;;
nop.m 0
;;
(p6) ld8 val[1]=[src2],8 // prime the pump...
mov b6=t4
br.sptk.few b6 // enter the selected copy loop
;;
// .memcpy_tail: store the final 0..7 bytes held in t0 as a 4/2/1-byte sequence
// selected by p5/p4/p3, restore ar.lc, ar.pfs and pr from the saved_* registers
// (interleaved with the stores), and return.
.memcpy_tail:
// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
// less than 8) and t0 contains the last few bytes of the src buffer:
(p5) st4 [dst]=t0,4
(p5) shr.u t0=t0,32 // drop the 4 bytes just stored
mov ar.lc=saved_lc
;;
(p4) st2 [dst]=t0,2
(p4) shr.u t0=t0,16 // drop the 2 bytes just stored
mov ar.pfs=saved_pfs
;;
(p3) st1 [dst]=t0
mov pr=saved_pr,-1
br.ret.sptk.many rp
///////////////////////////////////////////////////////
.align 64
/*
* COPY(shift,index) expands to one software-pipelined copy loop plus its exit
* sequence:
*   shift - bit count passed to shrp to merge two consecutive source words;
*           the table below uses 8*(src & 7).
*   index - selects the older shrp operand (val[MEM_LAT+4-index]); it is 1 for
*           the shift-0 variant and 0 for all others.
* After the loop the last source word is loaded from [src_end], merged into
* t0, and control passes to .memcpy_tail.
* .memcpy_long jumps into this table at (src & 7) << LOG_LOOP_SIZE, so every
* expansion must stay exactly 1 << LOG_LOOP_SIZE bytes long.
*/
#define COPY(shift,index) \
1: { .mib \
(p[0]) ld8 val[0]=[src2],8; \
(p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
brp.loop.imp 1b, 2f \
}; \
2: { .mfb \
(p[MEM_LAT+4]) st8 [dst]=w[1],8; \
nop.f 0; \
br.ctop.dptk.few 1b; \
}; \
;; \
ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \
;; \
shrp t0=val[N-1],val[N-index],shift; \
br .memcpy_tail
// One COPY() variant per possible value of (src & 7), in ascending order.
.memcpy_loops:
COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
COPY(8, 0)
COPY(16, 0)
COPY(24, 0)
COPY(32, 0)
COPY(40, 0)
COPY(48, 0)
COPY(56, 0)
END(memcpy)

View File

@@ -0,0 +1,666 @@
/*
* Itanium 2-optimized version of memcpy and copy_user function
*
* Inputs:
* in0: destination address
* in1: source address
* in2: number of bytes to copy
* Output:
* for memcpy: return dest
* for copy_user: return 0 if success,
* or number of bytes NOT copied if an error occurred.
*
* Copyright (C) 2002 Intel Corp.
* Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
*/
#include <asm/asmmacro.h>
#include <asm/page.h>
/*
* EK() is an alias for EX().
* NOTE(review): EX() presumably comes from <asm/asmmacro.h> and attaches an
* exception fixup label (its first argument) to the wrapped instruction --
* confirm against that header.
*/
#define EK(y...) EX(y)
/* McKinley specific optimization */
/* Return value and registers that preserve caller state and the arguments. */
#define retval r8
#define saved_pfs r31
#define saved_lc r10
#define saved_pr r11
#define saved_in0 r14
#define saved_in1 r15
#define saved_in2 r16
/* Working source/destination pointers (two interleaved streams) and a counter. */
#define src0 r2
#define src1 r3
#define dst0 r17
#define dst1 r18
#define cnt r9
/* r19-r30 are temp for each code section */
/* Prefetch pointers and distance used by .aligned_src and .long_copy. */
#define PREFETCH_DIST 8
#define src_pre_mem r19
#define dst_pre_mem r20
#define src_pre_l2 r21
#define dst_pre_l2 r22
/* Data temporaries; several names deliberately share one register. */
#define t1 r23
#define t2 r24
#define t3 r25
#define t4 r26
#define t5 t1 // alias!
#define t6 t2 // alias!
#define t7 t3 // alias!
#define n8 r27
#define t9 t5 // alias!
#define t10 t4 // alias!
#define t11 t7 // alias!
#define t12 t6 // alias!
#define t14 t10 // alias!
#define t13 r28
#define t15 r29
#define tmp r30
/* defines for long_copy block */
/*
* A..D are stage indices into the rotating predicates p[] used by .long_copy,
* N is the number of stages, and Nrot rounds N up to a multiple of 8.
*/
#define A 0
#define B (PREFETCH_DIST)
#define C (B + PREFETCH_DIST)
#define D (C + 1)
#define N (D + 1)
#define Nrot ((N + 7) & ~7)
/* alias */
#define in0 r32
#define in1 r33
#define in2 r34
// memcpy(in0 = dest, in1 = src, in2 = len): returns dest in retval.
// Computes the alignment of both pointers, marks the call as memcpy by
// setting f6 = f0 (the exception handler tests f6 against f0), and then
// continues in the code shared with __copy_user at .common_code.
GLOBAL_ENTRY(memcpy)
and r28=0x7,in0 // r28 = dest & 7
and r29=0x7,in1 // r29 = src & 7
mov f6=f0 // f6 == f0 identifies the memcpy entry point
mov retval=in0 // memcpy returns the destination pointer
br.cond.sptk .common_code
;;
END(memcpy)
// __copy_user(in0 = dest, in1 = src, in2 = len): retval starts at 0 and is
// only changed by the exception handler. f6 = f1 marks this entry point
// (memcpy uses f6 = f0). The original arguments are kept in saved_in0/1/2
// because the exception handler reads them.
GLOBAL_ENTRY(__copy_user)
.prologue
// check dest alignment
and r28=0x7,in0 // r28 = dest & 7
and r29=0x7,in1 // r29 = src & 7
mov f6=f1
mov saved_in0=in0 // save dest pointer
mov saved_in1=in1 // save src pointer
mov retval=r0 // initialize return value
;;
// .common_code: dispatch on length and alignment (memcpy also enters here).
//   p15: len < 8         -> .memcpy_short
//   p13: dest unaligned  -> .align_dest
//   p14: src unaligned   -> .unaligned_src
//   otherwise fall through to .aligned_src
.common_code:
cmp.gt p15,p0=8,in2 // check for small size
cmp.ne p13,p0=0,r28 // check dest alignment
cmp.ne p14,p0=0,r29 // check src alignment
add src0=0,in1
sub r30=8,r28 // for .align_dest: r30 = 8 - (dest & 7)
mov saved_in2=in2 // save len
;;
add dst0=0,in0
add dst1=1,in0 // dest odd index
cmp.le p6,p0 = 1,r30 // for .align_dest
(p15) br.cond.dpnt .memcpy_short
(p13) br.cond.dpnt .align_dest
(p14) br.cond.dpnt .unaligned_src
;;
// both dest and src are aligned on 8-byte boundary
// Saves ar.pfs, pr and ar.lc, then either branches to .long_copy (more than
// 2*PREFETCH_DIST lines of 128 bytes) or prefetches every 128-byte line of
// both buffers and falls through to .medium_copy.
.aligned_src:
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
.save pr, saved_pr
mov saved_pr=pr
shr.u cnt=in2,7 // this much cache line (cnt = len / 128)
;;
cmp.lt p6,p0=2*PREFETCH_DIST,cnt // p6: enough lines for .long_copy
cmp.lt p7,p8=1,cnt // p7: more than one line, p8: zero or one line
.save ar.lc, saved_lc
mov saved_lc=ar.lc
.body
add cnt=-1,cnt
add src_pre_mem=0,in1 // prefetch src pointer
add dst_pre_mem=0,in0 // prefetch dest pointer
;;
(p7) mov ar.lc=cnt // prefetch count
(p8) mov ar.lc=r0 // single pass through the prefetch loop
(p6) br.cond.dpnt .long_copy
;;
// Touch one address per 128 bytes in each buffer.
.prefetch:
lfetch.fault [src_pre_mem], 128
lfetch.fault.excl [dst_pre_mem], 128
br.cloop.dptk.few .prefetch
;;
// .medium_copy: copy in2 bytes (both pointers 8-byte aligned) in 32-byte
// iterations using a two-stage (ar.ec=2) rotating-register loop, then copy up
// to three remaining 8-byte words in .aligned_src_tail and pass the last
// (in2 & 7) bytes to .memcpy_short.
.medium_copy:
and tmp=31,in2 // copy length after iteration
shr.u r29=in2,5 // number of 32-byte iteration
add dst1=8,dst0 // 2nd dest pointer
;;
add cnt=-1,r29 // ctop iteration adjustment
cmp.eq p10,p0=r29,r0 // do we really need to loop?
add src1=8,src0 // 2nd src pointer
cmp.le p6,p0=8,tmp // p6: at least one full word remains after the loop
;;
cmp.le p7,p0=16,tmp // p7: at least two full words remain after the loop
mov ar.lc=cnt // loop setup
cmp.eq p16,p17 = r0,r0 // p16 = 1, p17 = 0
mov ar.ec=2
(p10) br.dpnt.few .aligned_src_tail
;;
TEXT_ALIGN(32)
// Each iteration loads 32 bytes through src0/src1 (8 bytes apart, stepping by
// 16). The first group stores the words loaded by the second group of the
// previous iteration (seen one rotation later as r33/r37); the second group
// stores the words the first group has just loaded.
1:
EX(.ex_handler, (p16) ld8 r34=[src0],16)
EK(.ex_handler, (p16) ld8 r38=[src1],16)
EX(.ex_handler, (p17) st8 [dst0]=r33,16)
EK(.ex_handler, (p17) st8 [dst1]=r37,16)
;;
EX(.ex_handler, (p16) ld8 r32=[src0],16)
EK(.ex_handler, (p16) ld8 r36=[src1],16)
EX(.ex_handler, (p16) st8 [dst0]=r34,16)
EK(.ex_handler, (p16) st8 [dst1]=r38,16)
br.ctop.dptk.few 1b
;;
// Copy the 0..3 whole words left in tmp, restoring ar.lc/ar.pfs/pr on the way.
.aligned_src_tail:
EX(.ex_handler, (p6) ld8 t1=[src0])
mov ar.lc=saved_lc
mov ar.pfs=saved_pfs
EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8)
cmp.le p8,p0=24,tmp // p8: three full words remain
and r21=-8,tmp // r21 = number of bytes covered by the whole words
;;
EX(.ex_hndlr_s, (p8) ld8 t3=[src1])
EX(.ex_handler, (p6) st8 [dst0]=t1) // store 1st 8-byte word
and in2=7,tmp // remaining length
EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store 2nd 8-byte word
add src0=src0,r21 // setting up src pointer
add dst0=dst0,r21 // setting up dest pointer
;;
EX(.ex_handler, (p8) st8 [dst1]=t3) // store 3rd 8-byte word
mov pr=saved_pr,-1
br.dptk.many .memcpy_short
;;
/* code taken from copy_page_mck */
// .long_copy: copy whole 128-byte lines for large aligned buffers.
//   .prefetch_loop runs 2*PREFETCH_DIST times and copies the first word of
//   each of the leading lines (load at stage A, store at stage B), which also
//   pulls those lines in ahead of the main loop.
//   .line_copy then moves the other words of each line through src0/src1 and
//   dst0/dst1 while src_pre_mem/dst_pre_mem and src_pre_l2/dst_pre_l2 keep
//   running ahead.
// On exit the remaining (in2 & 127) bytes are handed to .medium_copy.
.long_copy:
.rotr v[2*PREFETCH_DIST]
.rotp p[N]
mov src_pre_mem = src0
mov pr.rot = 0x10000 // p16 = 1, all other rotating predicates = 0
mov ar.ec = 1 // special unrolled loop
mov dst_pre_mem = dst0
add src_pre_l2 = 8*8, src0
add dst_pre_l2 = 8*8, dst0
;;
add src0 = 8, src_pre_mem // first t1 src
mov ar.lc = 2*PREFETCH_DIST - 1
shr.u cnt=in2,7 // number of lines
add src1 = 3*8, src_pre_mem // first t3 src
add dst0 = 8, dst_pre_mem // first t1 dst
add dst1 = 3*8, dst_pre_mem // first t3 dst
;;
and tmp=127,in2 // remaining bytes after this block
add cnt = -(2*PREFETCH_DIST) - 1, cnt // lines left for .line_copy, minus 1 for br.ctop
// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0
EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2
br.ctop.sptk .prefetch_loop
;;
cmp.eq p16, p0 = r0, r0 // reset p16 to 1
mov ar.lc = cnt
mov ar.ec = N // # of stages in pipeline
;;
// Note that t5..t14 alias t1..t4 (see the defines at the top of the file), so
// the order of loads and stores within the loop body matters.
.line_copy:
EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0
EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1
EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory
EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2
;;
EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory
EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2
EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2
EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3
;;
EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8)
EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8)
EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8)
EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8)
;;
EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8)
EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8)
EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8)
EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8)
;;
EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8)
EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8)
EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8)
EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8)
;;
EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8)
EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8)
EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8)
EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8)
;;
EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8)
EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8)
EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8)
EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8)
;;
EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8)
EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8)
EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8)
EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8)
br.ctop.sptk .line_copy
;;
// src0/dst0 were started 8 bytes into the line; step them back by 8 before
// handing the remaining tmp bytes to .medium_copy.
add dst0=-8,dst0
add src0=-8,src0
mov in2=tmp
.restore sp
br.sptk.many .medium_copy
;;
/*
* The unaligned-source path (.4k_block / .unaligned_src_tail) copies at most
* BLOCK_SIZE bytes per pass. The value is parenthesized, like the other
* arithmetic macros in this file, so it expands safely inside any expression.
*/
#define BLOCK_SIZE (128*32)
/* blocksize and curlen share r23/r24 with t1/t2. */
#define blocksize r23
#define curlen r24
// dest is on 8-byte boundary, src is not. We need to do
// ld8-ld8, shrp, then st8. Max 8 byte copy per cycle.
// The copy is processed in passes of at most BLOCK_SIZE bytes (.4k_block).
// Each pass prefetches its lines, then runs one of the COPYU() loops from
// .jump_table, selected by the source misalignment r30 = src0 & 7.
.unaligned_src:
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,3,5,0,8
.save ar.lc, saved_lc
mov saved_lc=ar.lc
.save pr, saved_pr
mov saved_pr=pr
.body
.4k_block:
mov saved_in0=dst0 // need to save all input arguments
mov saved_in2=in2
mov blocksize=BLOCK_SIZE
;;
cmp.lt p6,p7=blocksize,in2 // p6: more than one block left
mov saved_in1=src0
;;
(p6) mov in2=blocksize // cap this pass at blocksize bytes
;;
shr.u r21=in2,7 // this much cache line
shr.u r22=in2,4 // number of 16-byte iteration
and curlen=15,in2 // copy length after iteration
and r30=7,src0 // source alignment
;;
cmp.lt p7,p8=1,r21
add cnt=-1,r21
;;
add src_pre_mem=0,src0 // prefetch src pointer
add dst_pre_mem=0,dst0 // prefetch dest pointer
and src0=-8,src0 // 1st src pointer
(p7) mov ar.lc = cnt
(p8) mov ar.lc = r0
;;
TEXT_ALIGN(32)
1: lfetch.fault [src_pre_mem], 128
lfetch.fault.excl [dst_pre_mem], 128
br.cloop.dptk.few 1b
;;
// The loop copies two streams of r22 words each: src0/dst0 cover the first
// r22*8 bytes and src1/dst1 the following r22*8 bytes.
shladd dst1=r22,3,dst0 // 2nd dest pointer
shladd src1=r22,3,src0 // 2nd src pointer
cmp.eq p8,p9=r22,r0 // do we really need to loop?
cmp.le p6,p7=8,curlen; // have at least 8 byte remaining?
add cnt=-1,r22 // ctop iteration adjustment
;;
EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer
EK(.ex_handler, (p9) ld8 r37=[src1],8)
(p8) br.dpnt.few .noloop
;;
// The jump address is calculated based on src alignment. The COPYU
// macro below needs to confine its size to a power of two, so an entry
// can be calculated using shl instead of an expensive multiply. The
// size is then hard coded by the following #define to match the
// actual size. This makes it somewhat tedious when the COPYU macro gets
// changed, as this needs to be adjusted to match.
#define LOOP_SIZE 6
1:
mov r29=ip // jmp_table thread
mov ar.lc=cnt
;;
// r29 = address of .jump_table minus the size of one entry, so that r30 = 1
// selects the first entry (COPYU(8)).
add r29=.jump_table - 1b - (.jmp1-.jump_table), r29
shl r28=r30, LOOP_SIZE // jmp_table thread
mov ar.ec=2 // loop setup
;;
add r29=r29,r28 // jmp_table thread
cmp.eq p16,p17=r0,r0
;;
mov b6=r29 // jmp_table thread
;;
br.cond.sptk.few b6
// for 8-15 byte case
// We will skip the loop, but need to replicate the side effect
// that the loop produces.
// When p6 is set (curlen >= 8) this builds the next 8 unaligned source bytes
// in r21 from two aligned words: r21 = (w0 >> 8*r30) | (w1 << (64 - 8*r30)).
// It then falls through to .unaligned_src_tail.
.noloop:
EX(.ex_handler, (p6) ld8 r37=[src1],8)
add src0=8,src0
(p6) shl r25=r30,3 // r25 = 8 * source misalignment, in bits
;;
EX(.ex_handler, (p6) ld8 r27=[src1])
(p6) shr.u r28=r37,r25
(p6) sub r26=64,r25
;;
(p6) shl r27=r27,r26
;;
(p6) or r21=r28,r27
// .unaligned_src_tail: reached after a COPYU() loop or .noloop. Starts the
// next BLOCK_SIZE pass if more than blocksize bytes were pending. Otherwise
// it stores the prepared word in r21 (when p6 is set), restores
// ar.pfs/ar.lc/pr, and falls through to .memcpy_short for the last
// curlen (< 8) bytes.
.unaligned_src_tail:
/* check if we have more than blocksize to copy, if so go back */
cmp.gt p8,p0=saved_in2,blocksize
;;
(p8) add dst0=saved_in0,blocksize
(p8) add src0=saved_in1,blocksize
(p8) sub in2=saved_in2,blocksize
(p8) br.dpnt .4k_block
;;
/* we have up to 15 byte to copy in the tail.
* part of work is already done in the jump table code
* we are at the following state.
* src side:
*
* xxxxxx xx <----- r21 has xxxxxxxx already
* -------- -------- --------
* 0 8 16
* ^
* |
* src1
*
* dst
* -------- -------- --------
* ^
* |
* dst1
*/
EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 byte to copy
(p6) add curlen=-8,curlen // update length
mov ar.pfs=saved_pfs
;;
mov ar.lc=saved_lc
mov pr=saved_pr,-1
mov in2=curlen // remaining length
mov dst0=dst1 // dest pointer
add src0=src1,r30 // forward by src alignment
;;
// 7 byte or smaller.
// Copies in2 (0..7) bytes from src0 to dst0 with ld1/st1. Two pointers are
// used per buffer (src0/dst0 for even offsets, src1/dst1 for odd offsets),
// each stepping by 2, and the routine returns as soon as the requested
// number of bytes has been stored.
.memcpy_short:
cmp.le p8,p9 = 1,in2
cmp.le p10,p11 = 2,in2
cmp.le p12,p13 = 3,in2
cmp.le p14,p15 = 4,in2
add src1=1,src0 // second src pointer
add dst1=1,dst0 // second dest pointer
;;
EX(.ex_handler_short, (p8) ld1 t1=[src0],2)
EK(.ex_handler_short, (p10) ld1 t2=[src1],2)
(p9) br.ret.dpnt rp // 0 byte copy
;;
EX(.ex_handler_short, (p8) st1 [dst0]=t1,2)
EK(.ex_handler_short, (p10) st1 [dst1]=t2,2)
(p11) br.ret.dpnt rp // 1 byte copy
EX(.ex_handler_short, (p12) ld1 t3=[src0],2)
EK(.ex_handler_short, (p14) ld1 t4=[src1],2)
(p13) br.ret.dpnt rp // 2 byte copy
;;
// p6..p11 are reused here for the 5/6/7-byte tests.
cmp.le p6,p7 = 5,in2
cmp.le p8,p9 = 6,in2
cmp.le p10,p11 = 7,in2
EX(.ex_handler_short, (p12) st1 [dst0]=t3,2)
EK(.ex_handler_short, (p14) st1 [dst1]=t4,2)
(p15) br.ret.dpnt rp // 3 byte copy
;;
EX(.ex_handler_short, (p6) ld1 t5=[src0],2)
EK(.ex_handler_short, (p8) ld1 t6=[src1],2)
(p7) br.ret.dpnt rp // 4 byte copy
;;
EX(.ex_handler_short, (p6) st1 [dst0]=t5,2)
EK(.ex_handler_short, (p8) st1 [dst1]=t6,2)
(p9) br.ret.dptk rp // 5 byte copy
EX(.ex_handler_short, (p10) ld1 t7=[src0],2)
(p11) br.ret.dptk rp // 6 byte copy
;;
EX(.ex_handler_short, (p10) st1 [dst0]=t7,2)
br.ret.dptk rp // done all cases
/* Align dest to nearest 8-byte boundary. We know we have at
* least 8 bytes to copy (smaller lengths were already diverted to
* .memcpy_short by .common_code), enough to crawl to 8-byte boundary.
* Actual number of bytes to crawl depends on the dest alignment.
* 7 bytes or less are taken care of at .memcpy_short
* src0 - source even index
* src1 - source odd index
* dst0 - dest even index
* dst1 - dest odd index
* r30 - distance to 8-byte boundary
*
* p6..p12 are set when r30 >= 1..7 and enable the 1st..7th byte copy.
* Afterwards in2, dst0 and src0 are advanced by r30 and control goes to
* .aligned_src when dest and src had the same alignment (r28 == r29),
* otherwise to .unaligned_src.
*/
.align_dest:
add src1=1,in1 // source odd index
cmp.le p7,p0 = 2,r30 // for .align_dest
cmp.le p8,p0 = 3,r30 // for .align_dest
EX(.ex_handler_short, (p6) ld1 t1=[src0],2)
cmp.le p9,p0 = 4,r30 // for .align_dest
cmp.le p10,p0 = 5,r30
;;
EX(.ex_handler_short, (p7) ld1 t2=[src1],2)
EK(.ex_handler_short, (p8) ld1 t3=[src0],2)
cmp.le p11,p0 = 6,r30
EX(.ex_handler_short, (p6) st1 [dst0] = t1,2)
cmp.le p12,p0 = 7,r30
;;
EX(.ex_handler_short, (p9) ld1 t4=[src1],2)
EK(.ex_handler_short, (p10) ld1 t5=[src0],2)
EX(.ex_handler_short, (p7) st1 [dst1] = t2,2)
EK(.ex_handler_short, (p8) st1 [dst0] = t3,2)
;;
EX(.ex_handler_short, (p11) ld1 t6=[src1],2)
EK(.ex_handler_short, (p12) ld1 t7=[src0],2)
cmp.eq p6,p7=r28,r29 // p6: src and dest share the same alignment
EX(.ex_handler_short, (p9) st1 [dst1] = t4,2)
EK(.ex_handler_short, (p10) st1 [dst0] = t5,2)
sub in2=in2,r30 // bytes left after the alignment crawl
;;
EX(.ex_handler_short, (p11) st1 [dst1] = t6,2)
EK(.ex_handler_short, (p12) st1 [dst0] = t7)
add dst0=in0,r30 // setup arguments
add src0=in1,r30
(p6) br.cond.dptk .aligned_src
(p7) br.cond.dpnt .unaligned_src
;;
/* main loop body in jump table format */
/*
* COPYU(shift) expands to one two-stream copy loop for a source that is
* misaligned by shift/8 bytes: each stream loads aligned words, merges two
* neighbouring words with shrp, and stores the result to the aligned
* destination. After the loop it prepares r21 (the next merged word) for
* .unaligned_src_tail and branches there.
* .unaligned_src computes the entry address as a multiple of
* (1 << LOOP_SIZE) bytes, so every expansion must keep exactly that size.
*/
#define COPYU(shift) \
1: \
EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \
EK(.ex_handler, (p16) ld8 r36=[src1],8); \
(p17) shrp r35=r33,r34,shift;; /* 1 */ \
EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \
nop.m 0; \
(p16) shrp r38=r36,r37,shift; \
EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \
EK(.ex_handler, (p17) st8 [dst1]=r39,8); \
br.ctop.dptk.few 1b;; \
(p7) add src1=-8,src1; /* back out for <8 byte case */ \
shrp r21=r22,r38,shift; /* speculative work */ \
br.sptk.few .unaligned_src_tail /* branch out of jump table */ \
;;
TEXT_ALIGN(32)
// One entry per source misalignment 1..7. .jmp1 marks the second entry so
// that (.jmp1 - .jump_table) gives the size of one entry.
.jump_table:
COPYU(8) // unaligned cases
.jmp1:
COPYU(16)
COPYU(24)
COPYU(32)
COPYU(40)
COPYU(48)
COPYU(56)
// The stage-index names are released here because the exception handler
// reuses A..D as register names.
#undef A
#undef B
#undef C
#undef D
/*
* Due to lack of local tag support in gcc 2.x assembler, it is not clear which
* instruction failed in the bundle. The exception algorithm is that we
* first figure out the faulting address, then detect if there is any
* progress made on the copy, if so, redo the copy from last known copied
* location up to the faulting address (exclusive). In the copy_from_user
* case, remaining byte in kernel buffer will be zeroed.
*
* Take copy_from_user as an example, in the code there are multiple loads
* in a bundle and those multiple loads could span over two pages, the
* faulting address is calculated as page_round_down(max(src0, src1)).
* This is based on knowledge that if we can access one byte in a page, we
* can access any byte in that page.
*
* predicate used in the exception handler:
* p6-p7: direction
* p10-p11: src faulting addr calculation
* p12-p13: dst faulting addr calculation
*/
/*
* Register names used by the exception handler. A, B, C and D hold the
* quantities named in the handler's "remaining byte" comment, and F holds the
* page-rounded faulting address. memset_arg0/memset_arg2 and the saved_*
* names live in the register frame allocated inside the handler.
*/
#define A r19
#define B r20
#define C r21
#define D r22
#define F r28
#define memset_arg0 r32
#define memset_arg2 r33
#define saved_retval loc0
#define saved_rtlink loc1
#define saved_pfs_stack loc2
// Fixup entry used by the second-pointer loads in .aligned_src_tail:
// advance src0 by 8 before entering the common handler.
.ex_hndlr_s:
add src0=8,src0
br.sptk .ex_handler
;;
// Fixup entry used by the second-pointer store in .aligned_src_tail:
// advance dst0 by 8 before entering the common handler.
.ex_hndlr_d:
add dst0=8,dst0
br.sptk .ex_handler
;;
// Fixup entry for .prefetch_loop in .long_copy: src1/dst1 take the prefetch
// pointers, and src0/dst0 are set to the original pointers, plus 8 if the
// corresponding prefetch pointer has already moved past its start.
.ex_hndlr_lcpy_1:
mov src1=src_pre_mem
mov dst1=dst_pre_mem
cmp.gtu p10,p11=src_pre_mem,saved_in1
cmp.gtu p12,p13=dst_pre_mem,saved_in0
;;
(p10) add src0=8,saved_in1
(p11) mov src0=saved_in1
(p12) add dst0=8,saved_in0
(p13) mov dst0=saved_in0
br.sptk .ex_handler
// Fixup entry for the prefetch accesses in .line_copy; falls through into
// .ex_handler.
.ex_handler_lcpy:
// in line_copy block, the preload addresses should always ahead
// of the other two src/dst pointers. Furthermore, src1/dst1 should
// always ahead of src0/dst0.
mov src1=src_pre_mem
mov dst1=dst_pre_mem
// .ex_handler: common fault recovery for memcpy/__copy_user.
//   1. Restore pr, ar.lc and ar.pfs (skipped when entered at
//      .ex_handler_short).
//   2. Work out the copy direction and the page-rounded faulting address F.
//   3. If no progress was made, return saved_in2 (the whole length) in retval.
//      For memcpy (f6 == f0) the fault is forced again to produce an oops.
//   4. Otherwise call __copy_user again for the C bytes that can still be
//      copied, call memset to zero D bytes for the copy_from_user direction,
//      and return D plus whatever the recursive call could not copy.
.ex_handler:
mov pr=saved_pr,-1 // first restore pr, lc, and pfs
mov ar.lc=saved_lc
mov ar.pfs=saved_pfs
;;
.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction
cmp.ltu p10,p11=src0,src1
cmp.ltu p12,p13=dst0,dst1
fcmp.eq p8,p0=f6,f0 // is it memcpy?
mov tmp = dst0
;;
(p11) mov src1 = src0 // pick the larger of the two
(p13) mov dst0 = dst1 // make dst0 the smaller one
(p13) mov dst1 = tmp // and dst1 the larger one
;;
(p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
(p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
;;
(p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store
(p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load
mov retval=saved_in2 // default result: nothing was copied
(p8) ld1 tmp=[src1] // force an oops for memcpy call
(p8) st1 [dst1]=r0 // force an oops for memcpy call
(p14) br.ret.sptk.many rp
/*
* The remaining byte to copy is calculated as:
*
* A = (faulting_addr - orig_src) -> len to faulting ld address
* or
* (faulting_addr - orig_dst) -> len to faulting st address
* B = (cur_dst - orig_dst) -> len copied so far
* C = A - B -> len need to be copied
* D = orig_len - A -> len need to be zeroed
*/
(p6) sub A = F, saved_in0
(p7) sub A = F, saved_in1
clrrrb
;;
// New frame with 3 inputs, 3 locals (saved_retval, saved_rtlink,
// saved_pfs_stack) and 3 outputs for the calls below.
alloc saved_pfs_stack=ar.pfs,3,3,3,0
cmp.lt p8,p0=A,r0
sub B = dst0, saved_in0 // how many byte copied so far
;;
(p8) mov A = 0; // A shouldn't be negative, cap it
;;
sub C = A, B
sub D = saved_in2, A
;;
cmp.gt p8,p0=C,r0 // anything left to copy (C > 0)?
add memset_arg0=saved_in0, A
(p6) mov memset_arg2=0 // copy_to_user should not call memset
(p7) mov memset_arg2=D // copy_from_user need to have kbuf zeroed
mov r8=0 // so the add below sees 0 when the recursive call is skipped
mov saved_retval = D
mov saved_rtlink = b0
add out0=saved_in0, B
add out1=saved_in1, B
mov out2=C
(p8) br.call.sptk.few b0=__copy_user // recursive call
;;
add saved_retval=saved_retval,r8 // above might return non-zero value
cmp.gt p8,p0=memset_arg2,r0 // anything to zero (memset_arg2 > 0)?
mov out0=memset_arg0 // *s
mov out1=r0 // c
mov out2=memset_arg2 // n
(p8) br.call.sptk.few b0=memset
;;
mov retval=saved_retval
mov ar.pfs=saved_pfs_stack
mov b0=saved_rtlink
br.ret.sptk.many rp
/* end of McKinley specific optimization */
END(__copy_user)

View File

@@ -0,0 +1,362 @@
/* Optimized version of the standard memset() function.
Copyright (c) 2002 Hewlett-Packard Co/CERN
Sverre Jarp <Sverre.Jarp@cern.ch>
Return: dest
Inputs:
in0: dest
in1: value
in2: count
The algorithm is fairly straightforward: set byte by byte until
we get to a 16B-aligned address, then loop on 128 B chunks using an
early store as prefetching, then loop on 32B chunks, then clear remaining
words, finally clear remaining bytes.
Since a stf.spill f0 can store 16B in one go, we use this instruction
to get peak speed when value = 0. */
#include <asm/asmmacro.h>
#undef ret
/* Argument names. */
#define dest in0
#define value in1
#define cnt in2
/* Scratch registers: temporaries, saved ar.lc, store pointers and counters. */
#define tmp r31
#define save_lc r30
#define ptr0 r29
#define ptr1 r28
#define ptr2 r27
#define ptr3 r26
#define ptr9 r24
#define loopcnt r23
#define linecnt r22
#define bytecnt r21
/* FP register that carries the replicated fill value for stf8 stores. */
#define fvalue f6
// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr p6 // default register for same-cycle branches
#define p_nz p7
#define p_zr p8
#define p_unalgn p9
#define p_y p11
#define p_n p12
#define p_yy p13
#define p_nn p14
/* MIN1 is the 16-byte alignment mask; MIN1P1HALF is half of MIN1+1. */
#define MIN1 15
#define MIN1P1HALF 8
/* The main loops work on LINE_SIZE-byte lines; LSIZE_SH is log2(LINE_SIZE). */
#define LINE_SIZE 128
#define LSIZE_SH 7 // shift amount
/* Maximum number of lines pre-touched ahead of the fill loops. */
#define PREF_AHEAD 8
// memset(dest, value, cnt): returns dest in ret0.
// Entry section: returns at once for cnt == 0, sends counts below 16 to
// .move_bytes_unaligned, otherwise fills up to the next 16-byte boundary with
// st8/st4/st2/st1 pieces selected by the bits of bytecnt, and then picks the
// bulk path: .fraction_of_line for fewer than LINE_SIZE+1 bytes, .l1b when
// value is zero, or the L1A loop that follows.
GLOBAL_ENTRY(memset)
{ .mmi
.prologue
alloc tmp = ar.pfs, 3, 0, 0, 0
lfetch.nt1 [dest] // start fetching the first destination line
.save ar.lc, save_lc
mov.i save_lc = ar.lc
.body
} { .mmi
mov ret0 = dest // return value
cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
cmp.eq p_scr, p0 = cnt, r0
;; }
{ .mmi
and ptr2 = -(MIN1+1), dest // aligned address
and tmp = MIN1, dest // prepare to check for correct alignment
tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
} { .mib
mov ptr1 = dest
mux1 value = value, @brcst // create 8 identical bytes in word
(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
;; }
{ .mib
cmp.ne p_unalgn, p0 = tmp, r0 // is dest off a 16-byte boundary?
} { .mib
sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt
cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
;; }
// Alignment fill: ptr1 becomes the next 16-byte boundary, while ptr2 starts in
// the middle of the current 16-byte block and is stepped down (after a store)
// or up (when a piece is skipped) so that each piece lands on its own
// naturally aligned slot below the boundary.
{ .mmi
(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
;; }
{ .mib
(p_y) add cnt = -8, cnt // account for the st8
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
} { .mib
(p_y) st8 [ptr2] = value,-4 // fill 8 bytes, then step down to the st4 slot
(p_n) add ptr2 = 4, ptr2 // no st8: step up to the st4 slot
;; }
{ .mib
(p_yy) add cnt = -4, cnt // account for the st4
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
} { .mib
(p_yy) st4 [ptr2] = value,-2 // fill 4 bytes, then step down to the st2 slot
(p_nn) add ptr2 = 2, ptr2 // no st4: step up to the st2 slot
;; }
{ .mmi
mov tmp = LINE_SIZE+1 // for compare
(p_y) add cnt = -2, cnt // account for the st2
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
} { .mmi
setf.sig fvalue=value // transfer value to FLP side
(p_y) st2 [ptr2] = value,-1 // fill 2 bytes, then step down to the st1 slot
(p_n) add ptr2 = 1, ptr2 // no st2: step up to the st1 slot
;; }
{ .mmi
(p_yy) st1 [ptr2] = value // fill the last alignment byte
cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
} { .mbb
(p_yy) add cnt = -1, cnt // account for the st1
(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
;; }
{ .mib
nop.m 0
shr.u linecnt = cnt, LSIZE_SH // number of whole LINE_SIZE-byte lines
(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
;; }
TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
// Non-zero value path. .pref_l1a stores the first 8-byte word of up to
// PREF_AHEAD lines (one store per LINE_SIZE bytes, via ptr9). .l1ax then
// fills the other 15 words of each line through ptr2 and ptr0, and keeps
// storing ahead through ptr9 while ptr9 is still below the end of the
// whole-line range (ptr1).
{ .mmi
and tmp = -(LINE_SIZE), cnt // compute end of range
mov ptr9 = ptr1 // used for prefetching
and cnt = (LINE_SIZE-1), cnt // remainder
} { .mmi
mov loopcnt = PREF_AHEAD-1 // default prefetch loop
cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
;; }
{ .mmi
(p_scr) add loopcnt = -1, linecnt // fewer than PREF_AHEAD lines: pre-touch them all
add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores)
add ptr1 = tmp, ptr1 // first address beyond total range
;; }
{ .mmi
add tmp = -1, linecnt // next loop count
mov.i ar.lc = loopcnt // loop count for .pref_l1a
;; }
.pref_l1a:
{ .mib
stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart
nop.i 0
br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
add ptr0 = 16, ptr2 // Two stores in parallel
mov.i ar.lc = tmp // loop count for .l1ax (one iteration per line)
;; }
// Per iteration ptr2 writes 8 words and ptr0 writes 7; together with the word
// stored through ptr9 that covers one full 128-byte line.
.l1ax:
{ .mmi
stf8 [ptr2] = fvalue, 8
stf8 [ptr0] = fvalue, 8
;; }
{ .mmi
stf8 [ptr2] = fvalue, 24
stf8 [ptr0] = fvalue, 24
;; }
{ .mmi
stf8 [ptr2] = fvalue, 8
stf8 [ptr0] = fvalue, 8
;; }
{ .mmi
stf8 [ptr2] = fvalue, 24
stf8 [ptr0] = fvalue, 24
;; }
{ .mmi
stf8 [ptr2] = fvalue, 8
stf8 [ptr0] = fvalue, 8
;; }
{ .mmi
stf8 [ptr2] = fvalue, 24
stf8 [ptr0] = fvalue, 24
;; }
{ .mmi
stf8 [ptr2] = fvalue, 8
stf8 [ptr0] = fvalue, 32
cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
;; }
{ .mmb
stf8 [ptr2] = fvalue, 24
(p_scr) stf8 [ptr9] = fvalue, 128
br.cloop.dptk.few .l1ax
;; }
// Whole lines done: 8 or more bytes left go to .fraction_of_line, fewer go
// straight to .move_bytes_from_alignment.
{ .mbb
cmp.le p_scr, p0 = 8, cnt // at least 8 bytes left ?
(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
;; }
TEXT_ALIGN(32)
// Zero value path. Same structure as the L1A loops, but every store is a
// 16-byte stf.spill of f0: .pref_l1b pre-touches up to PREF_AHEAD lines through
// ptr9, and .l1bx fills the other seven 16-byte slots of each line through
// ptr2 (4 stores) and ptr0 (3 stores).
.l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
{ .mmi
and tmp = -(LINE_SIZE), cnt // compute end of range
mov ptr9 = ptr1 // used for prefetching
and cnt = (LINE_SIZE-1), cnt // remainder
} { .mmi
mov loopcnt = PREF_AHEAD-1 // default prefetch loop
cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
;; }
{ .mmi
(p_scr) add loopcnt = -1, linecnt
add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
add ptr1 = tmp, ptr1 // first address beyond total range
;; }
{ .mmi
add tmp = -1, linecnt // next loop count
mov.i ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
nop.i 0
br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
add ptr0 = 16, ptr2 // Two stores in parallel
mov.i ar.lc = tmp
;; }
.l1bx:
{ .mmi
stf.spill [ptr2] = f0, 32
stf.spill [ptr0] = f0, 32
;; }
{ .mmi
stf.spill [ptr2] = f0, 32
stf.spill [ptr0] = f0, 32
;; }
{ .mmi
stf.spill [ptr2] = f0, 32
stf.spill [ptr0] = f0, 64
cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
;; }
{ .mmb
stf.spill [ptr2] = f0, 32
(p_scr) stf.spill [ptr9] = f0, 128
br.cloop.dptk.few .l1bx
;; }
// Fewer than 8 bytes left: skip the word loops. Otherwise fall through into
// .fraction_of_line.
{ .mib
cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
(p_scr) br.cond.dpnt.many .move_bytes_from_alignment //
;; }
// .fraction_of_line: fill the remaining cnt bytes in 32-byte steps.
// ptr1 and ptr2 (= ptr1 + 16) each write two 8-byte words per iteration of
// .l2. If cnt is below 32 the loop is skipped and .store_words takes over.
.fraction_of_line:
{ .mib
add ptr2 = 16, ptr1
shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
;; }
{ .mib
cmp.eq p_scr, p0 = loopcnt, r0
add loopcnt = -1, loopcnt // br.cloop runs loopcnt + 1 times
(p_scr) br.cond.dpnt.many .store_words
;; }
{ .mib
and cnt = 0x1f, cnt // compute the remaining cnt
mov.i ar.lc = loopcnt
;; }
TEXT_ALIGN(32)
.l2: // ------------------------------------ // L2A: store 32B in 2 cycles
{ .mmb
stf8 [ptr1] = fvalue, 8
stf8 [ptr2] = fvalue, 8
;; } { .mmb
stf8 [ptr1] = fvalue, 24
stf8 [ptr2] = fvalue, 24
br.cloop.dptk.many .l2
;; }
// .store_words: store up to three more 8-byte words through ptr1 while at
// least 8 bytes remain, then fall through to .move_bytes_from_alignment with
// cnt below 8.
.store_words:
{ .mib
cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
;; }
{ .mmi
stf8 [ptr1] = fvalue, 8 // store
cmp.le p_y, p_n = 16, cnt // p_y: a second word is needed
add cnt = -8, cnt // subtract
;; }
{ .mmi
(p_y) stf8 [ptr1] = fvalue, 8 // store
(p_y) cmp.le.unc p_yy, p_nn = 16, cnt // p_yy: a third word is needed
(p_y) add cnt = -8, cnt // subtract
;; }
{ .mmi // store
(p_yy) stf8 [ptr1] = fvalue, 8
(p_yy) add cnt = -8, cnt // subtract
;; }
// .move_bytes_from_alignment: store the last cnt (< 8) bytes through ptr1 as
// st4/st2/st1 pieces selected by bits 2/1/0 of cnt, then restore ar.lc and
// return.
.move_bytes_from_alignment:
{ .mib
cmp.eq p_scr, p0 = cnt, r0
tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
(p_scr) br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y) st4 [ptr1] = value,4
tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
;; }
{ .mib
(p_yy) st2 [ptr1] = value,2
tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ?
;; }
{ .mib
(p_y) st1 [ptr1] = value
;; }
// Common exit: put the caller's loop count back and return.
.restore_and_exit:
{ .mib
nop.m 0
mov.i ar.lc = save_lc
br.ret.sptk.many rp
;; }
// .move_bytes_unaligned: fill fewer than 16 bytes starting at ptr1 (= dest).
// On entry p_y/p_n say whether dest is odd/even (set by the tbit at function
// entry). An optional leading st1 makes the address even; after that, pairs of
// st2 stores through ptr1 and ptr2 (= ptr1 + 2) fill 4 bytes per step, and a
// final st2 and/or st1 finishes the job. ptr3 is precomputed to point at the
// last byte of the range for the closing st1. ar.lc is restored on the way.
.move_bytes_unaligned:
{ .mmi
.pred.rel "mutex",p_y, p_n
.pred.rel "mutex",p_yy, p_nn
(p_n) cmp.le p_yy, p_nn = 4, cnt
(p_y) cmp.le p_yy, p_nn = 5, cnt
(p_n) add ptr2 = 2, ptr1
} { .mmi
(p_y) add ptr2 = 3, ptr1
(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y) add cnt = -1, cnt
;; }
{ .mmi
(p_yy) cmp.le.unc p_y, p0 = 8, cnt
add ptr3 = ptr1, cnt // prepare last store
mov.i ar.lc = save_lc
} { .mmi
(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy) add cnt = -4, cnt
;; }
{ .mmi
(p_y) cmp.le.unc p_yy, p0 = 8, cnt
add ptr3 = -1, ptr3 // last store
tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
} { .mmi
(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y) add cnt = -4, cnt
;; }
{ .mmi
(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left]
tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
} { .mmi
(p_yy) add cnt = -4, cnt
;; }
{ .mmb
(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
(p_y) st1 [ptr3] = value // fill last byte (using ptr3)
br.ret.sptk.many rp
}
END(memset)

View File

@@ -0,0 +1,192 @@
/*
*
* Optimized version of the standard strlen() function
*
*
* Inputs:
* in0 address of string
*
* Outputs:
* ret0 the number of characters in the string (0 if empty string)
* does not count the \0
*
* Copyright (C) 1999, 2001 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
*
* 09/24/99 S.Eranian add speculation recovery code
*/
#include <asm/asmmacro.h>
//
//
// This is an enhanced version of the basic strlen. it includes a combination
// of compute zero index (czx), parallel comparisons, speculative loads and
// loop unroll using rotating registers.
//
// General Ideas about the algorithm:
// The goal is to look at the string in chunks of 8 bytes.
// so we need to do a few extra checks at the beginning because the
// string may not be 8-byte aligned. In this case we load the 8byte
// quantity which includes the start of the string and mask the unused
// bytes with 0xff to avoid confusing czx.
// We use speculative loads and software pipelining to hide memory
// latency and do read ahead safely. This way we defer any exception.
//
// Because we don't want the kernel to be relying on particular
// settings of the DCR register, we provide recovery code in case
// speculation fails. The recovery code is going to "redo" the work using
// only normal loads. If we still get a fault then we generate a
// kernel panic. Otherwise we return the strlen as usual.
//
// The fact that speculation may fail can be caused, for instance, by
// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
// a NaT bit will be set if the translation is not present. The normal
// load, on the other hand, will cause the translation to be inserted
// if the mapping exists.
//
// It should be noted that we execute recovery code only when we need
// to use the data that has been speculatively loaded: we don't execute
// recovery code on pure read ahead data.
//
// Remarks:
// - the cmp r0,r0 is used as a fast way to initialize a predicate
// register to 1. This is required to make sure that we get the parallel
// compare correct.
//
// - we don't use the epilogue counter to exit the loop but we need to set
// it to zero beforehand.
//
// - after the loop we must test for Nat values because neither the
// czx nor cmp instruction raise a NaT consumption fault. We must be
// careful not to look too far for a Nat for which we don't care.
// For instance we don't need to look at a NaT in val2 if the zero byte
// was in val1.
//
// - Clearly performance tuning is required.
//
//
//
/* Register names used by strlen. */
#define saved_pfs r11 /* ar.pfs saved by the alloc at entry */
#define tmp r10 /* low 3 bits of the address, then the mask shift count */
#define base r16 /* aligned base address (src - 16 after the first two loads) */
#define orig r17 /* original string address */
#define saved_pr r18 /* predicates saved at entry */
#define src r19 /* 8-byte aligned read pointer */
#define mask r20 /* mask OR-ed into the first word */
#define val r21 /* NOTE(review): not used in the part of strlen visible here -- confirm */
#define val1 r22 /* czx1.r result for v[1] */
#define val2 r23 /* czx1.r result for w[1] */
GLOBAL_ENTRY(strlen)
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
.rotr v[2], w[2] // declares our 4 aliases
extr.u tmp=in0,0,3 // tmp=least significant 3 bits
mov orig=in0 // keep trackof initial byte address
dep src=0,in0,0,3 // src=8byte-aligned in0 address
.save pr, saved_pr
mov saved_pr=pr // preserve predicates (rotation)
;;
.body
ld8 v[1]=[src],8 // must not speculate: can fail here
shl tmp=tmp,3 // multiply by 8bits/byte
mov mask=-1 // our mask
;;
ld8.s w[1]=[src],8 // speculatively load next
cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and
sub tmp=64,tmp // how many bits to shift our mask on the right
;;
shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
;;
add base=-16,src // keep track of aligned base
or v[1]=v[1],mask // now we have a safe initial byte pattern
;;
1:
ld8.s v[0]=[src],8 // speculatively load next
czx1.r val1=v[1] // search 0 byte from right
czx1.r val2=w[1] // search 0 byte from right following 8bytes
;;
ld8.s w[0]=[src],8 // speculatively load next to next
cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8
(p6) br.wtop.dptk 1b // loop until p6 == 0
;;
//
// We must return try the recovery code iff
// val1_is_nat || (val1==8 && val2_is_nat)
//
// XXX Fixme
// - there must be a better way of doing the test
//
cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate)
tnat.nz p6,p7=val1 // test NaT on val1
(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
;;
//
// if we come here p7 is true, i.e., initialized for // cmp
//
cmp.eq.and p7,p0=8,val1// val1==8?
tnat.nz.and p7,p0=val2 // test NaT if val2
(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
;;
(p8) mov val1=val2 // the other test got us out of the loop
(p8) adds src=-16,src // correct position when 3 ahead
(p9) adds src=-24,src // correct position when 4 ahead
;;
sub ret0=src,orig // distance from base
sub tmp=8,val1 // which byte in word
mov pr=saved_pr,0xffffffffffff0000
;;
sub ret0=ret0,tmp // adjust
mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
br.ret.sptk.many rp // end of normal execution
//
// Outlined recovery code when speculation failed
//
// This time we don't use speculation and rely on the normal exception
// mechanism. that's why the loop is not as good as the previous one
// because read ahead is not possible
//
// IMPORTANT:
// Please note that in the case of strlen() as opposed to strlen_user()
// we don't use the exception mechanism, as this function is not
// supposed to fail. If that happens it means we have a bug and the
// code will cause of kernel fault.
//
// XXX Fixme
// - today we restart from the beginning of the string instead
// of trying to continue where we left off.
//
.recover:
ld8 val=[base],8 // will fail if unrecoverable fault
;;
or val=val,mask // remask first bytes
cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
;;
//
// ar.ec is still zero here
//
2:
(p6) ld8 val=[base],8 // will fail if unrecoverable fault
;;
czx1.r val1=val // search 0 byte from right
;;
cmp.eq p6,p0=8,val1 // val1==8 ?
(p6) br.wtop.dptk 2b // loop until p6 == 0
;; // (avoid WAW on p63)
sub ret0=base,orig // distance from base
sub tmp=8,val1
mov pr=saved_pr,0xffffffffffff0000
;;
sub ret0=ret0,tmp // length=now - back -1
mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
br.ret.sptk.many rp // end of successful recovery code
END(strlen)

View File

@@ -0,0 +1,198 @@
/*
* Optimized version of the strlen_user() function
*
* Inputs:
* in0 address of buffer
*
* Outputs:
* ret0 0 in case of fault, strlen(buffer)+1 otherwise
*
* Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
* David Mosberger-Tang <davidm@hpl.hp.com>
* Stephane Eranian <eranian@hpl.hp.com>
*
* 01/19/99 S.Eranian heavily enhanced version (see details below)
* 09/24/99 S.Eranian added speculation recovery code
*/
#include <asm/asmmacro.h>
//
// int strlen_user(char *)
// ------------------------
// Returns:
// - length of string + 1
// - 0 in case an exception is raised
//
// This is an enhanced version of the basic strlen_user. It includes a
// combination of compute zero index (czx), parallel comparisons, speculative
// loads and loop unroll using rotating registers.
//
// General Ideas about the algorithm:
// The goal is to look at the string in chunks of 8 bytes.
// so we need to do a few extra checks at the beginning because the
// string may not be 8-byte aligned. In this case we load the 8byte
// quantity which includes the start of the string and mask the unused
// bytes with 0xff to avoid confusing czx.
// We use speculative loads and software pipelining to hide memory
// latency and do read ahead safely. This way we defer any exception.
//
// Because we don't want the kernel to be relying on particular
// settings of the DCR register, we provide recovery code in case
// speculation fails. The recovery code is going to "redo" the work using
// only normal loads. If we still get a fault then we return an
// error (ret0=0). Otherwise we return the strlen+1 as usual.
// The fact that speculation may fail can be caused, for instance, by
// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
// a NaT bit will be set if the translation is not present. The normal
// load, on the other hand, will cause the translation to be inserted
// if the mapping exists.
//
// It should be noted that we execute recovery code only when we need
// to use the data that has been speculatively loaded: we don't execute
// recovery code on pure read ahead data.
//
// Remarks:
// - the cmp r0,r0 is used as a fast way to initialize a predicate
// register to 1. This is required to make sure that we get the parallel
// compare correct.
//
// - we don't use the epilogue counter to exit the loop but we need to set
// it to zero beforehand.
//
// - after the loop we must test for Nat values because neither the
// czx nor cmp instruction raise a NaT consumption fault. We must be
// careful not to look too far for a Nat for which we don't care.
// For instance we don't need to look at a NaT in val2 if the zero byte
// was in val1.
//
// - Clearly performance tuning is required.
//
//
// Register roles used by __strlen_user (the rotating aliases v[] and w[]
// are declared with .rotr inside the function):
//   saved_pfs  copy of ar.pfs made by alloc, restored before returning
//   tmp        scratch: misalignment of in0 in bits, later the byte adjustment
//   base       8-byte-aligned start of the string, re-read by the recovery code
//   orig       original (possibly unaligned) string address
//   saved_pr   copy of the predicate registers, restored before returning
//   src        read-ahead pointer of the speculative loop
//   mask       0xff in the unused leading bytes of the first word
//   val        word currently examined by the recovery loop
//   val1/val2  czx1.r results for the two words examined per iteration
//
#define saved_pfs r11
#define tmp r10
#define base r16
#define orig r17
#define saved_pr r18
#define src r19
#define mask r20
#define val r21
#define val1 r22
#define val2 r23
//
// __strlen_user: in0 = address of the string.
// ret0 = strlen+1 on success, 0 if a normal load in the recovery code faults.
//
// Fast path: scan two 8-byte words per iteration using speculative loads
// only (including the very first one). Slow path (.recover): rescan from the
// aligned start with normal loads wrapped in EX() so that a fault ends up
// at .Lexit1.
//
GLOBAL_ENTRY(__strlen_user)
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
.rotr v[2], w[2] // declares our 4 aliases
extr.u tmp=in0,0,3 // tmp=least significant 3 bits
mov orig=in0 // keep track of initial byte address
dep src=0,in0,0,3 // src=8byte-aligned in0 address
.save pr, saved_pr
mov saved_pr=pr // preserve predicates (rotation)
;;
.body
ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate)
shl tmp=tmp,3 // multiply by 8bits/byte
mov mask=-1 // our mask
;;
ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline
cmp.eq p6,p0=r0,r0 // sets p6 (required because of the parallel cmp.and)
sub tmp=64,tmp // how many bits to shift our mask on the right
;;
shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
;;
add base=-16,src // keep track of aligned base
chk.s v[1], .recover // if already NaT, then directly skip to recover
or v[1]=v[1],mask // now we have a safe initial byte pattern
;;
//
// Main loop: each pass examines the two words loaded by the previous pass
// (v[1], w[1]) while loading the next two (v[0], w[0]); src advances by
// 16 bytes per pass.
//
1:
ld8.s v[0]=[src],8 // speculatively load next
czx1.r val1=v[1] // search 0 byte from right
czx1.r val2=w[1] // search 0 byte from right following 8bytes
;;
ld8.s w[0]=[src],8 // speculatively load next to next
cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
cmp.eq.and p6,p0=8,val2 // p6 = p6 and val2==8
(p6) br.wtop.dptk.few 1b // loop until p6 == 0
;;
//
// We must try the recovery code iff
// val1_is_nat || (val1==8 && val2_is_nat)
//
// XXX Fixme
// - there must be a better way of doing the test
//
cmp.eq p8,p9=8,val1 // p8 = no zero byte in val1, p9 = zero byte in val1 (disambiguate)
tnat.nz p6,p7=val1 // test NaT on val1
(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
;;
//
// if we come here p7 is true, i.e., initialized for the parallel cmp
//
cmp.eq.and p7,p0=8,val1// val1==8?
tnat.nz.and p7,p0=val2 // test NaT of val2
(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
;;
//
// src has run ahead of the word that holds the zero byte; pull it back so
// that it points just past that word (16 bytes if the zero byte is in the
// second word, 24 bytes if it is in the first).
//
(p8) mov val1=val2 // val2 contains the value
(p8) adds src=-16,src // correct position when 3 ahead
(p9) adds src=-24,src // correct position when 4 ahead
;;
sub ret0=src,orig // distance from origin
sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
mov pr=saved_pr,0xffffffffffff0000
;;
sub ret0=ret0,tmp // result = distance - adjustment = strlen+1
mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
br.ret.sptk.many rp // end of normal execution
//
// Outlined recovery code when speculation failed
//
// This time we don't use speculation and rely on the normal exception
// mechanism. That's why the loop is not as good as the previous one
// because read ahead is not possible
//
// XXX Fixme
// - today we restart from the beginning of the string instead
// of trying to continue where we left off.
//
.recover:
EX(.Lexit1, ld8 val=[base],8) // load the initial bytes
;;
or val=val,mask // remask first bytes
cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
;;
//
// ar.ec is still zero here
//
// One word per pass; on exit base points just past the word that holds
// the zero byte.
//
2:
EX(.Lexit1, (p6) ld8 val=[base],8)
;;
czx1.r val1=val // search 0 byte from right
;;
cmp.eq p6,p0=8,val1 // val1==8 ?
(p6) br.wtop.dptk.few 2b // loop until p6 == 0
;;
sub ret0=base,orig // distance from base
sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
mov pr=saved_pr,0xffffffffffff0000
;;
sub ret0=ret0,tmp // result = distance - adjustment = strlen+1
mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
br.ret.sptk.many rp // end of successful recovery code
//
// We failed even on the normal load (called from exception handler)
//
.Lexit1:
mov ret0=0 // report the fault to the caller
mov pr=saved_pr,0xffffffffffff0000
mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
br.ret.sptk.many rp
END(__strlen_user)

View File

@@ -0,0 +1,44 @@
/*
* Just like strncpy() except that if a fault occurs during copying,
* -EFAULT is returned.
*
* Inputs:
* in0: address of destination buffer
* in1: address of string to be copied
* in2: length of buffer in bytes
* Outputs:
* r8: -EFAULT in case of fault or number of bytes copied if no fault
*
* Copyright (C) 1998-2001 Hewlett-Packard Co
* Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
*
* 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
* Andreas Schwab <schwab@suse.de>).
*/
#include <asm/asmmacro.h>
//
// __strncpy_from_user: in0 = destination, in1 = source string,
// in2 = size of the destination buffer in bytes.
//
// Copies one byte at a time and stops once a NUL has been copied or in2
// bytes have been copied. Result in r8:
//   - 0 if in2 is zero,
//   - in2 if the buffer filled up before a NUL was copied,
//   - otherwise the string length, not counting the NUL.
// Both memory accesses are wrapped in EX(.Lexit, ...).
// NOTE(review): EX() comes from <asm/asmmacro.h> and is not visible here;
// presumably the fault handler stores -EFAULT in r8 and resumes at .Lexit,
// as the file header describes -- confirm.
//
GLOBAL_ENTRY(__strncpy_from_user)
alloc r2=ar.pfs,3,0,0,0
mov r8=0 // return value when in2 == 0
mov r9=in1 // remember where the source string starts
;;
add r10=in1,in2 // r10 = first source address beyond the limit
cmp.eq p6,p0=r0,in2 // zero-length buffer?
(p6) br.ret.spnt.many rp // yes: nothing to copy, return 0
// XXX braindead copy loop---this needs to be optimized
.Loop1:
// load one source byte and advance in1
EX(.Lexit, ld1 r8=[in1],1)
;;
// store it to the destination and advance in0
EX(.Lexit, st1 [in0]=r8,1)
cmp.ne p6,p7=r8,r0 // p6 = copied a non-NUL byte, p7 = copied the NUL
;;
(p6) cmp.ne.unc p8,p0=in1,r10 // p8 = non-NUL byte and limit not reached (.unc clears p8 when p6 is false)
(p8) br.cond.dpnt.few .Loop1
;;
(p6) mov r8=in2 // buffer filled up---return buffer length
(p7) sub r8=in1,r9,1 // return string length (excluding NUL character)
[.Lexit:]
br.ret.sptk.many rp
END(__strncpy_from_user)

View File

@@ -0,0 +1,45 @@
/*
* Returns 0 if an exception occurs before finding NUL or reaching the
* supplied limit (N), a value greater than N if the string is longer than
* the limit, else strlen+1 (the count includes the terminating NUL).
*
* Inputs:
* in0: address of buffer
* in1: string length limit N
* Outputs:
* r8: 0 in case of fault, strlen(buffer)+1 otherwise
*
* Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
*/
#include <asm/asmmacro.h>
//
// __strnlen_user: in0 = address of the string, in1 = limit N.
//
// Reads one byte at a time, counting in r9, for at most N bytes. Result in r8:
//   - the number of bytes read including the NUL (strlen+1) if a NUL is found,
//   - N+1 if no NUL is found within the first N bytes.
// NOTE(review): EXCLR() comes from <asm/asmmacro.h> and is not visible here;
// presumably a faulting load resumes at .Lexit with r9 cleared, giving the
// documented return value of 0 -- confirm.
// NOTE(review): in1 == 0 would load ar.lc with -1; callers presumably never
// pass a zero limit -- confirm.
//
GLOBAL_ENTRY(__strnlen_user)
.prologue
alloc r2=ar.pfs,2,0,0,0
.save ar.lc, r16
mov r16=ar.lc // preserve ar.lc
.body
add r3=-1,in1 // loop count register takes N-1 so the body runs N times
;;
mov ar.lc=r3
mov r9=0 // r9 = number of bytes examined so far
;;
// XXX braindead strlen loop---this needs to be optimized
.Loop1:
// load one byte and advance in0
EXCLR(.Lexit, ld1 r8=[in0],1)
add r9=1,r9 // count this byte (the NUL is counted too)
;;
cmp.eq p6,p0=r8,r0 // found the NUL?
(p6) br.cond.dpnt .Lexit // yes: r9 already holds strlen+1
br.cloop.dptk.few .Loop1 // no: continue while ar.lc has iterations left
add r9=1,in1 // NUL not found---return N+1
;;
.Lexit:
mov r8=r9 // return the count
mov ar.lc=r16 // restore ar.lc
br.ret.sptk.many rp
END(__strnlen_user)

184
kernel/arch/ia64/lib/xor.S Normal file
View File

@@ -0,0 +1,184 @@
/*
* arch/ia64/lib/xor.S
*
* Optimized RAID-5 checksumming functions for IA-64.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <asm/asmmacro.h>
//
// xor_ia64_2: XOR the buffer at in2 into the buffer at in1, 8 bytes at a time.
//
// in0 is shifted right by 3 to form the word count, so it is presumably the
// length in bytes -- TODO confirm against the C prototype. in1 is both the
// first source and the destination (r16 reads it, r8 writes it).
// NOTE(review): in0 < 8 would load ar.lc with -1; callers presumably always
// pass at least one full word -- confirm.
//
// The loop is software-pipelined with 6+2 stages: loads in stage 0, xor in
// stage 6, store in stage 7.
//
GLOBAL_ENTRY(xor_ia64_2)
.prologue
.fframe 0
.save ar.pfs, r31
alloc r31 = ar.pfs, 3, 0, 13, 16
.save ar.lc, r30
mov r30 = ar.lc // preserve ar.lc
.save pr, r29
mov r29 = pr // preserve predicates (rotation)
;;
.body
mov r8 = in1 // r8 = store pointer (results overwrite the first buffer)
mov ar.ec = 6 + 2 // epilogue count = number of pipeline stages
shr in0 = in0, 3 // in0 = number of 8-byte words
;;
adds in0 = -1, in0 // loop count register takes words-1
mov r16 = in1 // r16 = load pointer for the first buffer
mov r17 = in2 // r17 = load pointer for the second buffer
;;
mov ar.lc = in0
mov pr.rot = 1 << 16 // enable only the first stage predicate (p16)
;;
.rotr s1[6+1], s2[6+1], d[2]
.rotp p[6+2]
0:
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6]
(p[6+1])st8.nta [r8] = d[1], 8
nop.f 0
br.ctop.dptk.few 0b
;;
mov ar.lc = r30 // restore ar.lc
mov pr = r29, -1 // restore all predicates
br.ret.sptk.few rp
END(xor_ia64_2)
//
// xor_ia64_3: XOR the buffers at in2 and in3 into the buffer at in1,
// 8 bytes at a time.
//
// in0 is shifted right by 3 to form the word count, so it is presumably the
// length in bytes -- TODO confirm against the C prototype. in1 is both the
// first source and the destination (r16 reads it, r8 writes it).
// NOTE(review): in0 < 8 would load ar.lc with -1; callers presumably always
// pass at least one full word -- confirm.
//
// The loop is software-pipelined with 6+2 stages: loads in stage 0, xors in
// stage 6, store in stage 7.
//
GLOBAL_ENTRY(xor_ia64_3)
.prologue
.fframe 0
.save ar.pfs, r31
alloc r31 = ar.pfs, 4, 0, 20, 24
.save ar.lc, r30
mov r30 = ar.lc // preserve ar.lc
.save pr, r29
mov r29 = pr // preserve predicates (rotation)
;;
.body
mov r8 = in1 // r8 = store pointer (results overwrite the first buffer)
mov ar.ec = 6 + 2 // epilogue count = number of pipeline stages
shr in0 = in0, 3 // in0 = number of 8-byte words
;;
adds in0 = -1, in0 // loop count register takes words-1
mov r16 = in1 // r16 = load pointer for the first buffer
mov r17 = in2 // r17 = load pointer for the second buffer
;;
mov r18 = in3 // r18 = load pointer for the third buffer
mov ar.lc = in0
mov pr.rot = 1 << 16 // enable only the first stage predicate (p16)
;;
.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
.rotp p[6+2]
0:
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6] // d[0] = s1 ^ s2
;;
(p[0]) ld8.nta s3[0] = [r18], 8
(p[6+1])st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], s3[6] // d[0] ^= s3
br.ctop.dptk.few 0b
;;
mov ar.lc = r30 // restore ar.lc
mov pr = r29, -1 // restore all predicates
br.ret.sptk.few rp
END(xor_ia64_3)
//
// xor_ia64_4: XOR the buffers at in2, in3 and in4 into the buffer at in1,
// 8 bytes at a time.
//
// in0 is shifted right by 3 to form the word count, so it is presumably the
// length in bytes -- TODO confirm against the C prototype. in1 is both the
// first source and the destination (r16 reads it, r8 writes it).
// NOTE(review): in0 < 8 would load ar.lc with -1; callers presumably always
// pass at least one full word -- confirm.
//
// The loop is software-pipelined with 6+2 stages: loads in stage 0, xors in
// stage 6, store in stage 7. r20 is a non-rotating temporary for s3 ^ s4.
//
GLOBAL_ENTRY(xor_ia64_4)
.prologue
.fframe 0
.save ar.pfs, r31
alloc r31 = ar.pfs, 5, 0, 27, 32
.save ar.lc, r30
mov r30 = ar.lc // preserve ar.lc
.save pr, r29
mov r29 = pr // preserve predicates (rotation)
;;
.body
mov r8 = in1 // r8 = store pointer (results overwrite the first buffer)
mov ar.ec = 6 + 2 // epilogue count = number of pipeline stages
shr in0 = in0, 3 // in0 = number of 8-byte words
;;
adds in0 = -1, in0 // loop count register takes words-1
mov r16 = in1 // r16 = load pointer for the first buffer
mov r17 = in2 // r17 = load pointer for the second buffer
;;
mov r18 = in3 // r18 = load pointer for the third buffer
mov ar.lc = in0
mov pr.rot = 1 << 16 // enable only the first stage predicate (p16)
mov r19 = in4 // r19 = load pointer for the fourth buffer
;;
.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
.rotp p[6+2]
0:
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6] // d[0] = s1 ^ s2
(p[0]) ld8.nta s3[0] = [r18], 8
(p[0]) ld8.nta s4[0] = [r19], 8
(p[6]) xor r20 = s3[6], s4[6] // r20 = s3 ^ s4
;;
(p[6+1])st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], r20 // d[0] = s1 ^ s2 ^ s3 ^ s4
br.ctop.dptk.few 0b
;;
mov ar.lc = r30 // restore ar.lc
mov pr = r29, -1 // restore all predicates
br.ret.sptk.few rp
END(xor_ia64_4)
//
// xor_ia64_5: XOR the buffers at in2, in3, in4 and in5 into the buffer at
// in1, 8 bytes at a time.
//
// in0 is shifted right by 3 to form the word count, so it is presumably the
// length in bytes -- TODO confirm against the C prototype. in1 is both the
// first source and the destination (r16 reads it, r8 writes it).
// NOTE(review): in0 < 8 would load ar.lc with -1; callers presumably always
// pass at least one full word -- confirm.
//
// The loop is software-pipelined with 6+2 stages: loads in stage 0, xors in
// stage 6, store in stage 7. r21 is a non-rotating temporary for s3 ^ s4.
//
GLOBAL_ENTRY(xor_ia64_5)
.prologue
.fframe 0
.save ar.pfs, r31
alloc r31 = ar.pfs, 6, 0, 34, 40
.save ar.lc, r30
mov r30 = ar.lc // preserve ar.lc
.save pr, r29
mov r29 = pr // preserve predicates (rotation)
;;
.body
mov r8 = in1 // r8 = store pointer (results overwrite the first buffer)
mov ar.ec = 6 + 2 // epilogue count = number of pipeline stages
shr in0 = in0, 3 // in0 = number of 8-byte words
;;
adds in0 = -1, in0 // loop count register takes words-1
mov r16 = in1 // r16 = load pointer for the first buffer
mov r17 = in2 // r17 = load pointer for the second buffer
;;
mov r18 = in3 // r18 = load pointer for the third buffer
mov ar.lc = in0
mov pr.rot = 1 << 16 // enable only the first stage predicate (p16)
mov r19 = in4 // r19 = load pointer for the fourth buffer
mov r20 = in5 // r20 = load pointer for the fifth buffer
;;
.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
.rotp p[6+2]
0:
(p[0]) ld8.nta s1[0] = [r16], 8
(p[0]) ld8.nta s2[0] = [r17], 8
(p[6]) xor d[0] = s1[6], s2[6] // d[0] = s1 ^ s2
(p[0]) ld8.nta s3[0] = [r18], 8
(p[0]) ld8.nta s4[0] = [r19], 8
(p[6]) xor r21 = s3[6], s4[6] // r21 = s3 ^ s4
;;
(p[0]) ld8.nta s5[0] = [r20], 8
(p[6+1])st8.nta [r8] = d[1], 8
(p[6]) xor d[0] = d[0], r21 // d[0] = s1 ^ s2 ^ s3 ^ s4
;;
(p[6]) xor d[0] = d[0], s5[6] // d[0] ^= s5
nop.f 0
br.ctop.dptk.few 0b
;;
mov ar.lc = r30 // restore ar.lc
mov pr = r29, -1 // restore all predicates
br.ret.sptk.few rp
END(xor_ia64_5)