add idl4k kernel firmware version 1.13.0.105

Jaroslav Kysela
2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions

@@ -0,0 +1,28 @@
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o gup.o setup_nx.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
obj-$(CONFIG_SMP) += tlb.o
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck/
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
obj-$(CONFIG_K8_NUMA) += k8topology_64.o
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_MEMTEST) += memtest.o

@@ -0,0 +1,355 @@
/*
* Debug helper to dump the current kernel pagetables of the system
* so that we can see what the various memory ranges are set to.
*
* (C) Copyright 2008 Intel Corporation
*
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <asm/pgtable.h>
/*
* The dumper groups pagetable entries of the same type into one, and for
* that it needs to keep some state when walking, and flush this state
* when a "break" in the continuity is found.
*/
struct pg_state {
int level;
pgprot_t current_prot;
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
};
struct addr_marker {
unsigned long start_address;
const char *name;
};
/* Address space markers hints */
static struct addr_marker address_markers[] = {
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ 0x8000000000000000UL, "Kernel Space" },
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
#else
{ PAGE_OFFSET, "Kernel Mapping" },
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
{ 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
{ -1, NULL } /* End of list */
};
/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
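As a worked example (assuming the usual x86_64 configuration of 4 KiB pages and 512 entries at every level, which is an assumption about the build rather than something stated in this file), the multipliers expand to:
/*
 * Illustrative arithmetic only, not part of the original source:
 *   PTE_LEVEL_MULT = 4 KiB                    (address span of one pte)
 *   PMD_LEVEL_MULT = 512 * 4 KiB  =   2 MiB   (one pmd entry)
 *   PUD_LEVEL_MULT = 512 * 2 MiB  =   1 GiB   (one pud entry)
 *   PGD_LEVEL_MULT = 512 * 1 GiB  = 512 GiB   (one pgd entry)
 */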
/*
* Print a readable form of a pgprot_t to the seq_file
*/
static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
{
pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
{ "cr3", "pgd", "pud", "pmd", "pte" };
if (!pgprot_val(prot)) {
/* Not present */
seq_printf(m, " ");
} else {
if (pr & _PAGE_USER)
seq_printf(m, "USR ");
else
seq_printf(m, " ");
if (pr & _PAGE_RW)
seq_printf(m, "RW ");
else
seq_printf(m, "ro ");
if (pr & _PAGE_PWT)
seq_printf(m, "PWT ");
else
seq_printf(m, " ");
if (pr & _PAGE_PCD)
seq_printf(m, "PCD ");
else
seq_printf(m, " ");
/* Bit 9 has a different meaning on level 3 vs 4 */
if (level <= 3) {
if (pr & _PAGE_PSE)
seq_printf(m, "PSE ");
else
seq_printf(m, " ");
} else {
if (pr & _PAGE_PAT)
seq_printf(m, "pat ");
else
seq_printf(m, " ");
}
if (pr & _PAGE_GLOBAL)
seq_printf(m, "GLB ");
else
seq_printf(m, " ");
if (pr & _PAGE_NX)
seq_printf(m, "NX ");
else
seq_printf(m, "x ");
}
seq_printf(m, "%s\n", level_name[level]);
}
/*
* On 64 bits, sign-extend the 48 bit address to 64 bit
*/
static unsigned long normalize_addr(unsigned long u)
{
#ifdef CONFIG_X86_64
return (signed long)(u << 16) >> 16;
#else
return u;
#endif
}
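A quick numeric sketch of what normalize_addr() does on x86_64 (indices chosen purely for illustration):
/*
 * Illustrative values, not from the original file:
 *   i = 0x100:  i * PGD_LEVEL_MULT = 0x0000800000000000
 *               normalize_addr(..) = 0xffff800000000000  (bit 47 set, so the value is
 *                                                          sign-extended into the kernel half)
 *   i = 0x001:  i * PGD_LEVEL_MULT = 0x0000008000000000
 *               normalize_addr(..) = 0x0000008000000000  (bit 47 clear, unchanged)
 */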
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
* print what we collected so far.
*/
static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
static const char units[] = "KMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
if (!st->level) {
/* First entry */
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
unsigned long delta;
int width = sizeof(unsigned long) * 2;
/*
* Now print the actual finished series
*/
seq_printf(m, "0x%0*lx-0x%0*lx ",
width, st->start_address,
width, st->current_address);
delta = (st->current_address - st->start_address) >> 10;
while (!(delta & 1023) && unit[1]) {
delta >>= 10;
unit++;
}
seq_printf(m, "%9lu%c ", delta, *unit);
printk_prot(m, st->current_prot, st->level);
/*
* We print markers for special areas of address space,
* such as the start of vmalloc space etc.
* This helps in the interpretation.
*/
if (st->current_address >= st->marker[1].start_address) {
st->marker++;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
}
st->start_address = st->current_address;
st->current_prot = new_prot;
st->level = level;
}
}
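A worked example of the size pretty-printing above (values chosen for illustration):
/*
 * A contiguous 2 MiB run:  delta = 0x200000 >> 10 = 2048   (KiB)
 *   2048 & 1023 == 0 and units[1] exists  ->  delta = 2, unit = 'M'
 *   2 & 1023 != 0                         ->  loop stops, size column reads "2M"
 */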
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
unsigned long P)
{
int i;
pte_t *start;
start = (pte_t *) pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
pgprot_t prot = pte_pgprot(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
note_page(m, st, prot, 4);
start++;
}
}
#if PTRS_PER_PMD > 1
static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
unsigned long P)
{
int i;
pmd_t *start;
start = (pmd_t *) pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
if (pmd_large(*start) || !pmd_present(*start))
note_page(m, st, __pgprot(prot), 3);
else
walk_pte_level(m, st, *start,
P + i * PMD_LEVEL_MULT);
} else
note_page(m, st, __pgprot(0), 3);
start++;
}
}
#else
#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a) pmd_none(__pmd(pud_val(a)))
#endif
#if PTRS_PER_PUD > 1
static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
unsigned long P)
{
int i;
pud_t *start;
start = (pud_t *) pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
if (!pud_none(*start)) {
pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
if (pud_large(*start) || !pud_present(*start))
note_page(m, st, __pgprot(prot), 2);
else
walk_pmd_level(m, st, *start,
P + i * PUD_LEVEL_MULT);
} else
note_page(m, st, __pgprot(0), 2);
start++;
}
}
#else
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
#define pgd_large(a) pud_large(__pud(pgd_val(a)))
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
static void walk_pgd_level(struct seq_file *m)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
#else
pgd_t *start = swapper_pg_dir;
#endif
int i;
struct pg_state st;
memset(&st, 0, sizeof(st));
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
if (!pgd_none(*start)) {
pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
if (pgd_large(*start) || !pgd_present(*start))
note_page(m, &st, __pgprot(prot), 1);
else
walk_pud_level(m, &st, *start,
i * PGD_LEVEL_MULT);
} else
note_page(m, &st, __pgprot(0), 1);
start++;
}
/* Flush out the last page */
st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
note_page(m, &st, __pgprot(0), 0);
}
static int ptdump_show(struct seq_file *m, void *v)
{
walk_pgd_level(m);
return 0;
}
static int ptdump_open(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show, NULL);
}
static const struct file_operations ptdump_fops = {
.open = ptdump_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int pt_dump_init(void)
{
struct dentry *pe;
#ifdef CONFIG_X86_32
/* Not a compile-time constant on x86-32 */
address_markers[2].start_address = VMALLOC_START;
address_markers[3].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
address_markers[4].start_address = PKMAP_BASE;
address_markers[5].start_address = FIXADDR_START;
# else
address_markers[4].start_address = FIXADDR_START;
# endif
#endif
pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
&ptdump_fops);
if (!pe)
return -ENOMEM;
return 0;
}
__initcall(pt_dump_init);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
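The dumper registers a debugfs file rather than a /proc entry. A minimal user-space sketch for reading it, assuming the conventional /sys/kernel/debug mount point, CONFIG_X86_PTDUMP=y and root privileges (the file is created with mode 0600); these are assumptions about the running system, not guarantees made by this code:
/* Hedged example, not part of the original commit. */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/debug/kernel_page_tables", "r");

	if (!f) {
		perror("kernel_page_tables");
		return 1;
	}
	/* Lines carry "start-end  size  decoded flags  level", separated by ---[ marker ]--- headers. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}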

@@ -0,0 +1,68 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <asm/uaccess.h>
int fixup_exception(struct pt_regs *regs)
{
const struct exception_table_entry *fixup;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
extern u32 pnp_bios_is_utter_crap;
pnp_bios_is_utter_crap = 1;
printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
__asm__ volatile(
"movl %0, %%esp\n\t"
"jmp *%1\n\t"
: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
panic("do_trap: can't hit this");
}
#endif
fixup = search_exception_tables(regs->ip);
if (fixup) {
/* If fixup is less than 16, it means uaccess error */
if (fixup->fixup < 16) {
current_thread_info()->uaccess_err = -EFAULT;
regs->ip += fixup->fixup;
return 1;
}
regs->ip = fixup->fixup;
return 1;
}
return 0;
}
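fixup_exception() works against the __ex_table section that the kernel's user-access macros emit next to instructions that may fault. A hedged, hand-written sketch of that idiom on x86_64 (illustrative only; real code uses the uaccess/_ASM_EXTABLE helpers, and the function name here is hypothetical):
/*
 * Illustrative only: read a quadword that may fault.  If the movq faults,
 * the page fault handler ends up in fixup_exception(), which finds the
 * (1b, 3b) pair in __ex_table and resumes execution at label 3.  On
 * failure *val is left undefined.
 */
static long probe_quad(const unsigned long *addr, unsigned long *val)
{
	long ret = 0;

	asm volatile("1:	movq (%[src]), %[out]\n"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
		     "3:	movq $-14, %[err]\n"	/* -EFAULT */
		     "	jmp 2b\n"
		     ".previous\n"
		     ".section __ex_table,\"a\"\n"
		     "	.quad 1b, 3b\n"
		     ".previous\n"
		     : [err] "+r" (ret), [out] "=r" (*val)
		     : [src] "r" (addr)
		     : "memory");
	return ret;
}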
#ifdef CONFIG_X86_64
/*
* Need to define our own search_extable on X86_64 to work around
* a B stepping K8 bug.
*/
const struct exception_table_entry *
search_extable(const struct exception_table_entry *first,
const struct exception_table_entry *last,
unsigned long value)
{
/* B stepping K8 bug */
if ((value >> 32) == 0)
value |= 0xffffffffUL << 32;
while (first <= last) {
const struct exception_table_entry *mid;
long diff;
mid = (last - first) / 2 + first;
diff = mid->insn - value;
if (diff == 0)
return mid;
else if (diff < 0)
first = mid+1;
else
last = mid-1;
}
return NULL;
}
#endif

kernel/arch/x86/mm/fault.c (1148 lines): new file; diff suppressed because it is too large.

kernel/arch/x86/mm/gup.c (375 lines): new file.

@@ -0,0 +1,375 @@
/*
* Lockless get_user_pages_fast for x86
*
* Copyright (C) 2008 Nick Piggin
* Copyright (C) 2008 Novell Inc.
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <asm/pgtable.h>
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
return ACCESS_ONCE(*ptep);
#else
/*
* With get_user_pages_fast, we walk down the pagetables without taking
* any locks. For this we would like to load the pointers atomically,
* but that is not possible (without expensive cmpxchg8b) on PAE. What
* we do have is the guarantee that a pte will only either go from not
* present to present, or present to not present or both -- it will not
* switch to a completely different present page without a TLB flush in
* between; something that we are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
* ptep->pte_high = h;
* smp_wmb();
* ptep->pte_low = l;
*
* And present to not present goes:
* ptep->pte_low = 0;
* smp_wmb();
* ptep->pte_high = 0;
*
* We must ensure here that the load of pte_low sees l iff pte_high
* sees h. We load pte_high *after* loading pte_low, which ensures we
* don't see an older value of pte_high. *Then* we recheck pte_low,
* which ensures that we haven't picked up a changed pte high. We might
* have got rubbish values from pte_low and pte_high, but we are
* guaranteed that pte_low will not have the present bit set *unless*
* it is 'l'. And get_user_pages_fast only operates on present ptes, so
* we're safe.
*
* gup_get_pte should not be used or copied outside gup.c without being
* very careful -- it does not atomically load the pte or anything that
* is likely to be useful for you.
*/
pte_t pte;
retry:
pte.pte_low = ptep->pte_low;
smp_rmb();
pte.pte_high = ptep->pte_high;
smp_rmb();
if (unlikely(pte.pte_low != ptep->pte_low))
goto retry;
return pte;
#endif
}
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
*/
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
pte_t *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
struct page *page;
if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
pages[*nr] = page;
(*nr)++;
} while (ptep++, addr += PAGE_SIZE, addr != end);
pte_unmap(ptep - 1);
return 1;
}
static inline void get_head_page_multiple(struct page *page, int nr)
{
VM_BUG_ON(page != compound_head(page));
VM_BUG_ON(page_count(page) == 0);
atomic_add(nr, &page->_count);
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
pte_t pte = *(pte_t *)&pmd;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pte_flags(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
head = pte_page(pte);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
get_head_page_multiple(head, refs);
return 1;
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
pmdp = pmd_offset(&pud, addr);
do {
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
return 0;
} else {
if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
}
} while (pmdp++, addr = next, addr != end);
return 1;
}
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
pte_t pte = *(pte_t *)&pud;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pte_flags(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
head = pte_page(pte);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
get_head_page_multiple(head, refs);
return 1;
}
static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
pudp = pud_offset(&pgd, addr);
do {
pud_t pud = *pudp;
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
if (unlikely(pud_large(pud))) {
if (!gup_huge_pud(pud, addr, next, write, pages, nr))
return 0;
} else {
if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
}
} while (pudp++, addr = next, addr != end);
return 1;
}
/*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
* back to the regular GUP.
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
unsigned long flags;
pgd_t *pgdp;
int nr = 0;
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
(void __user *)start, len)))
return 0;
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
* important workloads (eg. DB2), and whether limiting the batch size
* will decrease performance.
*
* It seems like we're in the clear for the moment. Direct-IO is
* the main guy that batches up lots of get_user_pages, and even
* they are limited to 64-at-a-time which is not so many.
*/
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
*
* So long as we atomically load page table pointers versus teardown
* (which we do on x86, with the above PAE exception), we can follow the
* address down to the page and take a ref on it.
*/
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = *pgdp;
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
break;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
return nr;
}
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @write: whether pages will be written to
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Attempt to pin user pages in memory without taking mm->mmap_sem.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
pgd_t *pgdp;
int nr = 0;
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
if (end < start)
goto slow_irqon;
#ifdef CONFIG_X86_64
if (end >> __VIRTUAL_MASK_SHIFT)
goto slow_irqon;
#endif
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
* important workloads (eg. DB2), and whether limiting the batch size
* will decrease performance.
*
* It seems like we're in the clear for the moment. Direct-IO is
* the main guy that batches up lots of get_user_pages, and even
* they are limited to 64-at-a-time which is not so many.
*/
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
*
* So long as we atomically load page table pointers versus teardown
* (which we do on x86, with the above PAE exception), we can follow the
* address down to the page and take a ref on it.
*/
local_irq_disable();
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = *pgdp;
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
return nr;
{
int ret;
slow:
local_irq_enable();
slow_irqon:
/* Try to get the remaining pages with get_user_pages */
start += nr << PAGE_SHIFT;
pages += nr;
down_read(&mm->mmap_sem);
ret = get_user_pages(current, mm, start,
(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
up_read(&mm->mmap_sem);
/* Have to be a bit careful with return values */
if (nr > 0) {
if (ret < 0)
ret = nr;
else
ret += nr;
}
return ret;
}
}
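A short usage sketch for the fast path above, roughly as a driver might pin a page-aligned user buffer before DMA and drop the references afterwards (pin_user_buffer is a hypothetical name and error handling is trimmed):
/*
 * Hedged example, not part of the original file.  Assumes 'uaddr' is
 * page aligned and 'pages' has room for the whole buffer.
 */
static int pin_user_buffer(unsigned long uaddr, size_t len, struct page **pages)
{
	int nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int pinned, i;

	pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
	if (pinned == nr_pages)
		return 0;

	/* Partial or failed pin: release whatever was taken and give up. */
	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned < 0 ? pinned : -EFAULT;
}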

@@ -0,0 +1,131 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h> /* for totalram_pages */
void *kmap(struct page *page)
{
might_sleep();
if (!PageHighMem(page))
return page_address(page);
return kmap_high(page);
}
void kunmap(struct page *page)
{
if (in_interrupt())
BUG();
if (!PageHighMem(page))
return;
kunmap_high(page);
}
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
* invalidation when the kmap pool wraps.
*
* However when holding an atomic kmap it is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
pagefault_disable();
if (!PageHighMem(page))
return page_address(page);
debug_kmap_atomic(type);
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
return (void *)vaddr;
}
void *kmap_atomic(struct page *page, enum km_type type)
{
return kmap_atomic_prot(page, type, kmap_prot);
}
void kunmap_atomic(void *kvaddr, enum km_type type)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
/*
* Force other mappings to Oops if they'll try to access this pte
* without first remapping it. Keeping stale mappings around is a bad idea
* also, in case the page changes cacheability attributes or becomes
* a protected page in a hypervisor.
*/
if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
kpte_clear_flush(kmap_pte-idx, vaddr);
else {
#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(vaddr < PAGE_OFFSET);
BUG_ON(vaddr >= (unsigned long)high_memory);
#endif
}
pagefault_enable();
}
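kmap_atomic() and kunmap_atomic() must be strictly paired on the same slot, and the code between them may not sleep. A small sketch of the classic pattern with the km_type slots used by this kernel generation (illustrative only; the helper name is made up):
/* Hedged example: zero a (possibly highmem) page from atomic context. */
static void zero_page_atomic(struct page *page)
{
	void *vaddr = kmap_atomic(page, KM_USER0);

	memset(vaddr, 0, PAGE_SIZE);
	kunmap_atomic(vaddr, KM_USER0);		/* same slot as the kmap */
}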
/*
* This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
{
return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
struct page *kmap_atomic_to_page(void *ptr)
{
unsigned long idx, vaddr = (unsigned long)ptr;
pte_t *pte;
if (vaddr < FIXADDR_START)
return virt_to_page(ptr);
idx = virt_to_fix(vaddr);
pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
return pte_page(*pte);
}
EXPORT_SYMBOL(kmap);
EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
EXPORT_SYMBOL(kmap_atomic_prot);
EXPORT_SYMBOL(kmap_atomic_to_page);
void __init set_highmem_pages_init(void)
{
struct zone *zone;
int nid;
for_each_zone(zone) {
unsigned long zone_start_pfn, zone_end_pfn;
if (!is_highmem(zone))
continue;
zone_start_pfn = zone->zone_start_pfn;
zone_end_pfn = zone_start_pfn + zone->spanned_pages;
nid = zone_to_nid(zone);
printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
zone->name, nid, zone_start_pfn, zone_end_pfn);
add_highpages_with_active_regions(nid, zone_start_pfn,
zone_end_pfn);
}
totalram_pages += totalhigh_pages;
}

@@ -0,0 +1,448 @@
/*
* IA-32 Huge TLB Page Support for Kernel.
*
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
*/
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
static unsigned long page_table_shareable(struct vm_area_struct *svma,
struct vm_area_struct *vma,
unsigned long addr, pgoff_t idx)
{
unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
svma->vm_start;
unsigned long sbase = saddr & PUD_MASK;
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
/*
* match the virtual addresses, permission and the alignment of the
* page table page.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
sbase < svma->vm_start || svma->vm_end < s_end)
return 0;
return saddr;
}
static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
unsigned long base = addr & PUD_MASK;
unsigned long end = base + PUD_SIZE;
/*
* check on proper vm_flags and page table alignment
*/
if (vma->vm_flags & VM_MAYSHARE &&
vma->vm_start <= base && end <= vma->vm_end)
return 1;
return 0;
}
/*
* search for a shareable pmd page for hugetlb.
*/
static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
struct vm_area_struct *vma = find_vma(mm, addr);
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
struct prio_tree_iter iter;
struct vm_area_struct *svma;
unsigned long saddr;
pte_t *spte = NULL;
if (!vma_shareable(vma, addr))
return;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr);
if (spte) {
get_page(virt_to_page(spte));
break;
}
}
}
if (!spte)
goto out;
spin_lock(&mm->page_table_lock);
if (pud_none(*pud))
pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
else
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
spin_unlock(&mapping->i_mmap_lock);
}
/*
* unmap huge page backed by shared pte.
*
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
* called with vma->vm_mm->page_table_lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
pud_t *pud = pud_offset(pgd, *addr);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
pud_clear(pud);
put_page(virt_to_page(ptep));
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
pud = pud_alloc(mm, pgd, addr);
if (pud) {
if (sz == PUD_SIZE) {
pte = (pte_t *)pud;
} else {
BUG_ON(sz != PMD_SIZE);
if (pud_none(*pud))
huge_pmd_share(mm, addr, pud);
pte = (pte_t *) pmd_alloc(mm, pud, addr);
}
}
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
return pte;
}
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd = NULL;
pgd = pgd_offset(mm, addr);
if (pgd_present(*pgd)) {
pud = pud_offset(pgd, addr);
if (pud_present(*pud)) {
if (pud_large(*pud))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
}
}
return (pte_t *) pmd;
}
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
unsigned long start = address;
int length = 1;
int nr;
struct page *page;
struct vm_area_struct *vma;
vma = find_vma(mm, addr);
if (!vma || !is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);
pte = huge_pte_offset(mm, address);
/* hugetlb should be locked, and hence, prefaulted */
WARN_ON(!pte || pte_none(*pte));
page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
WARN_ON(!PageHead(page));
return page;
}
int pmd_huge(pmd_t pmd)
{
return 0;
}
int pud_huge(pud_t pud)
{
return 0;
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
{
return NULL;
}
#else
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
return ERR_PTR(-EINVAL);
}
int pmd_huge(pmd_t pmd)
{
return !!(pmd_val(pmd) & _PAGE_PSE);
}
int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
{
struct page *page;
page = pte_page(*(pte_t *)pmd);
if (page)
page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
return page;
}
struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write)
{
struct page *page;
page = pte_page(*(pte_t *)pud);
if (page)
page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
return page;
}
#endif
/* x86_64 also uses this file */
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long start_addr;
if (len > mm->cached_hole_size) {
start_addr = mm->free_area_cache;
} else {
start_addr = TASK_UNMAPPED_BASE;
mm->cached_hole_size = 0;
}
full_search:
addr = ALIGN(start_addr, huge_page_size(h));
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
* Start a new search - just in case we missed
* some holes.
*/
if (start_addr != TASK_UNMAPPED_BASE) {
start_addr = TASK_UNMAPPED_BASE;
mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
}
if (!vma || addr + len <= vma->vm_start) {
mm->free_area_cache = addr + len;
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
mm->cached_hole_size = vma->vm_start - addr;
addr = ALIGN(vma->vm_end, huge_page_size(h));
}
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long addr0, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev_vma;
unsigned long base = mm->mmap_base, addr = addr0;
unsigned long largest_hole = mm->cached_hole_size;
int first_time = 1;
/* don't allow allocations above current base */
if (mm->free_area_cache > base)
mm->free_area_cache = base;
if (len <= largest_hole) {
largest_hole = 0;
mm->free_area_cache = base;
}
try_again:
/* make sure it can fit in the remaining address space */
if (mm->free_area_cache < len)
goto fail;
/* either no address requested or can't fit in the requested address hole */
addr = (mm->free_area_cache - len) & huge_page_mask(h);
do {
/*
* Lookup failure means no vma is above this address,
* i.e. return with success:
*/
if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
return addr;
/*
* new region fits between prev_vma->vm_end and
* vma->vm_start, use it:
*/
if (addr + len <= vma->vm_start &&
(!prev_vma || (addr >= prev_vma->vm_end))) {
/* remember the address as a hint for next time */
mm->cached_hole_size = largest_hole;
return (mm->free_area_cache = addr);
} else {
/* pull free_area_cache down to the first hole */
if (mm->free_area_cache == vma->vm_end) {
mm->free_area_cache = vma->vm_start;
mm->cached_hole_size = largest_hole;
}
}
/* remember the largest hole we saw so far */
if (addr + largest_hole < vma->vm_start)
largest_hole = vma->vm_start - addr;
/* try just below the current vma->vm_start */
addr = (vma->vm_start - len) & huge_page_mask(h);
} while (len <= vma->vm_start);
fail:
/*
* if hint left us with no space for the requested
* mapping then try again:
*/
if (first_time) {
mm->free_area_cache = base;
largest_hole = 0;
first_time = 0;
goto try_again;
}
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
len, pgoff, flags);
/*
* Restore the topdown base:
*/
mm->free_area_cache = base;
mm->cached_hole_size = ~0UL;
return addr;
}
unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
if (len & ~huge_page_mask(h))
return -EINVAL;
if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED) {
if (prepare_hugepage_range(file, addr, len))
return -EINVAL;
return addr;
}
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
if (mm->get_unmapped_area == arch_get_unmapped_area)
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
else
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
}
#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
unsigned long ps = memparse(opt, &opt);
if (ps == PMD_SIZE) {
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
} else if (ps == PUD_SIZE && cpu_has_gbpages) {
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
} else {
printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
ps >> 20);
return 0;
}
return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif
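Both halves of this file meet in normal use through hugetlbfs: setup_hugepagesz() selects the supported page sizes at boot (for example hugepagesz=1G on gbpages-capable CPUs), and an mmap() of a hugetlbfs file is what reaches hugetlb_get_unmapped_area(). A hedged user-space sketch, assuming hugetlbfs is mounted at /mnt/huge and huge pages were reserved beforehand (both assumptions, e.g. via /proc/sys/vm/nr_hugepages):
/* Illustrative only: map and touch one 2 MiB huge page. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HUGE_SZ	(2UL * 1024 * 1024)

int main(void)
{
	void *p;
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Address selection goes through hugetlb_get_unmapped_area(). */
	p = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	((char *)p)[0] = 1;	/* fault in the huge page */
	munmap(p, HUGE_SZ);
	close(fd);
	unlink("/mnt/huge/example");
	return 0;
}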

kernel/arch/x86/mm/init.c (392 lines): new file.

@@ -0,0 +1,392 @@
#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
#include <asm/cacheflush.h>
#include <asm/e820.h>
#include <asm/init.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long __initdata e820_table_start;
unsigned long __meminitdata e820_table_end;
unsigned long __meminitdata e820_table_top;
int after_bootmem;
int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
= 1
#endif
;
static void __init find_early_table_space(unsigned long end, int use_pse,
int use_gbpages)
{
unsigned long puds, pmds, ptes, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
if (use_gbpages) {
unsigned long extra;
extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
} else
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
if (use_pse) {
unsigned long extra;
extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
#ifdef CONFIG_X86_32
extra += PMD_SIZE;
#endif
ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
} else
ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
#ifdef CONFIG_X86_32
/* for fixmap */
tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
#endif
/*
* RED-PEN putting page tables only on node 0 could
* cause a hotspot and fill up ZONE_DMA. The page tables
* need roughly 0.5KB per GB.
*/
#ifdef CONFIG_X86_32
start = 0x7000;
#else
start = 0x8000;
#endif
e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
tables, PAGE_SIZE);
if (e820_table_start == -1UL)
panic("Cannot find space for the kernel page tables");
e820_table_start >>= PAGE_SHIFT;
e820_table_end = e820_table_start;
e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
}
struct map_range {
unsigned long start;
unsigned long end;
unsigned page_size_mask;
};
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
#define NR_RANGE_MR 5
#endif
static int __meminit save_mr(struct map_range *mr, int nr_range,
unsigned long start_pfn, unsigned long end_pfn,
unsigned long page_size_mask)
{
if (start_pfn < end_pfn) {
if (nr_range >= NR_RANGE_MR)
panic("run out of range for init_memory_mapping\n");
mr[nr_range].start = start_pfn<<PAGE_SHIFT;
mr[nr_range].end = end_pfn<<PAGE_SHIFT;
mr[nr_range].page_size_mask = page_size_mask;
nr_range++;
}
return nr_range;
}
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
unsigned long __init_refok init_memory_mapping(unsigned long start,
unsigned long end)
{
unsigned long page_size_mask = 0;
unsigned long start_pfn, end_pfn;
unsigned long ret = 0;
unsigned long pos;
struct map_range mr[NR_RANGE_MR];
int nr_range, i;
int use_pse, use_gbpages;
printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
use_pse = use_gbpages = 0;
#else
use_pse = cpu_has_pse;
use_gbpages = direct_gbpages;
#endif
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
/* Enable PSE if available */
if (cpu_has_pse)
set_in_cr4(X86_CR4_PSE);
/* Enable PGE if available */
if (cpu_has_pge) {
set_in_cr4(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
if (use_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
if (use_pse)
page_size_mask |= 1 << PG_LEVEL_2M;
memset(mr, 0, sizeof(mr));
nr_range = 0;
/* head if not big page alignment ? */
start_pfn = start >> PAGE_SHIFT;
pos = start_pfn << PAGE_SHIFT;
#ifdef CONFIG_X86_32
/*
* Don't use a large page for the first 2/4MB of memory
* because there are often fixed size MTRRs in there
* and overlapping MTRRs into large pages can cause
* slowdowns.
*/
if (pos == 0)
end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
else
end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
#else /* CONFIG_X86_64 */
end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
#endif
if (end_pfn > (end >> PAGE_SHIFT))
end_pfn = end >> PAGE_SHIFT;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
pos = end_pfn << PAGE_SHIFT;
}
/* big page (2M) range */
start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
#ifdef CONFIG_X86_32
end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
#else /* CONFIG_X86_64 */
end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
<< (PUD_SHIFT - PAGE_SHIFT);
if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
#endif
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
pos = end_pfn << PAGE_SHIFT;
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
<< (PUD_SHIFT - PAGE_SHIFT);
end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
pos = end_pfn << PAGE_SHIFT;
}
/* tail is not big page (1G) alignment */
start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
pos = end_pfn << PAGE_SHIFT;
}
#endif
/* tail is not big page (2M) alignment */
start_pfn = pos>>PAGE_SHIFT;
end_pfn = end>>PAGE_SHIFT;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
/* try to merge adjacent ranges with the same page size */
for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
unsigned long old_start;
if (mr[i].end != mr[i+1].start ||
mr[i].page_size_mask != mr[i+1].page_size_mask)
continue;
/* move it */
old_start = mr[i].start;
memmove(&mr[i], &mr[i+1],
(nr_range - 1 - i) * sizeof(struct map_range));
mr[i--].start = old_start;
nr_range--;
}
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG " %010lx - %010lx page %s\n",
mr[i].start, mr[i].end,
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
/*
* Find space for the kernel direct mapping tables.
*
* Later we should allocate these tables in the local node of the
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
if (!after_bootmem)
find_early_table_space(end, use_pse, use_gbpages);
#ifdef CONFIG_X86_32
for (i = 0; i < nr_range; i++)
kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
ret = end;
#else /* CONFIG_X86_64 */
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
#endif
#ifdef CONFIG_X86_32
early_ioremap_page_table_range_init();
load_cr3(swapper_pg_dir);
#endif
#ifdef CONFIG_X86_64
if (!after_bootmem && !start) {
pud_t *pud;
pmd_t *pmd;
mmu_cr4_features = read_cr4();
/*
* _brk_end cannot change anymore, but it and _end may be
* located on different 2M pages. cleanup_highmap(), however,
* can only consider _end when it runs, so destroy any
* mappings beyond _brk_end here.
*/
pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
pmd = pmd_offset(pud, _brk_end - 1);
while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
pmd_clear(pmd);
}
#endif
__flush_tlb_all();
if (!after_bootmem && e820_table_end > e820_table_start)
reserve_early(e820_table_start << PAGE_SHIFT,
e820_table_end << PAGE_SHIFT, "PGTABLE");
if (!after_bootmem)
early_memtest(start, end);
return ret >> PAGE_SHIFT;
}
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
* is valid. The argument is a physical page number.
*
*
* On x86, access has to be given to the first megabyte of ram because that area
* contains bios code and data regions used by X and dosemu and similar apps.
* Access has to be given to non-kernel-ram areas as well, these contain the PCI
* mmio resources as well as potential bios/acpi data regions.
*/
int devmem_is_allowed(unsigned long pagenr)
{
if (pagenr <= 256)
return 1;
if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
return 0;
if (!page_is_ram(pagenr))
return 1;
return 0;
}
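For concreteness, the arithmetic behind the first check (with the usual 4 KiB page size, an assumption not spelled out here): page number 256 corresponds to physical address 256 * 4096 = 1 MiB, so "pagenr <= 256" always permits /dev/mem access to the legacy BIOS/VGA region in the first megabyte, before the RAM and exclusive-region tests are even consulted.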
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
unsigned long addr = begin;
if (addr >= end)
return;
/*
* If debugging page accesses then do not free this memory but
* mark them not present - any buggy init-section access will
* create a kernel page fault:
*/
#ifdef CONFIG_DEBUG_PAGEALLOC
printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
begin, PAGE_ALIGN(end));
set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
/*
* We just marked the kernel text read only above, now that
* we are going to free part of that, we need to make that
* writeable first.
*/
set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
for (; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
memset((void *)(addr & ~(PAGE_SIZE-1)),
POISON_FREE_INITMEM, PAGE_SIZE);
free_page(addr);
totalram_pages++;
}
#endif
}
void free_initmem(void)
{
free_init_pages("unused kernel memory",
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
free_init_pages("initrd memory", start, end);
}
#endif

kernel/arch/x86/mm/init_32.c (1071 lines): new file; diff suppressed because it is too large.

@@ -0,0 +1,994 @@
/*
* linux/arch/x86_64/mm/init.c
*
* Copyright (C) 1995 Linus Torvalds
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
* Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
*/
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
#include <linux/bootmem.h>
static unsigned long dma_reserve __initdata;
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);
static int __init parse_direct_gbpages_on(char *arg)
{
direct_gbpages = 1;
return 0;
}
early_param("gbpages", parse_direct_gbpages_on);
/*
* NOTE: pagetable_init() allocates all the fixmap pagetables contiguously in
* physical space so we can cache the place of the first one and move
* around without checking the pgd every time.
*/
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
int force_personality32;
/*
* noexec32=on|off
* Control non executable heap for 32bit processes.
* To control the stack too use noexec=off
*
* on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
* off PROT_READ implies PROT_EXEC
*/
static int __init nonx32_setup(char *str)
{
if (!strcmp(str, "on"))
force_personality32 &= ~READ_IMPLIES_EXEC;
else if (!strcmp(str, "off"))
force_personality32 |= READ_IMPLIES_EXEC;
return 1;
}
__setup("noexec32=", nonx32_setup);
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
*/
static __ref void *spp_getpage(void)
{
void *ptr;
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
else
ptr = alloc_bootmem_pages(PAGE_SIZE);
if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
panic("set_pte_phys: cannot allocate page data %s\n",
after_bootmem ? "after bootmem" : "");
}
pr_debug("spp_getpage %p\n", ptr);
return ptr;
}
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
if (pgd_none(*pgd)) {
pud_t *pud = (pud_t *)spp_getpage();
pgd_populate(&init_mm, pgd, pud);
if (pud != pud_offset(pgd, 0))
printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
pud, pud_offset(pgd, 0));
}
return pud_offset(pgd, vaddr);
}
static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
if (pud_none(*pud)) {
pmd_t *pmd = (pmd_t *) spp_getpage();
pud_populate(&init_mm, pud, pmd);
if (pmd != pmd_offset(pud, 0))
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
}
return pmd_offset(pud, vaddr);
}
static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
if (pmd_none(*pmd)) {
pte_t *pte = (pte_t *) spp_getpage();
pmd_populate_kernel(&init_mm, pmd, pte);
if (pte != pte_offset_kernel(pmd, 0))
printk(KERN_ERR "PAGETABLE BUG #02!\n");
}
return pte_offset_kernel(pmd, vaddr);
}
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pud = pud_page + pud_index(vaddr);
pmd = fill_pmd(pud, vaddr);
pte = fill_pte(pmd, vaddr);
set_pte(pte, new_pte);
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
}
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud_page;
pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
pgd = pgd_offset_k(vaddr);
if (pgd_none(*pgd)) {
printk(KERN_ERR
"PGD FIXMAP MISSING, it should be setup in head.S!\n");
return;
}
pud_page = (pud_t*)pgd_page_vaddr(*pgd);
set_pte_vaddr_pud(pud_page, vaddr, pteval);
}
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
pgd_t *pgd;
pud_t *pud;
pgd = pgd_offset_k(vaddr);
pud = fill_pud(pgd, vaddr);
return fill_pmd(pud, vaddr);
}
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
pmd_t *pmd;
pmd = populate_extra_pmd(vaddr);
return fill_pte(pmd, vaddr);
}
/*
* Create large page table mappings for a range of physical addresses.
*/
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
pgprot_t prot)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
pgd = pgd_offset_k((unsigned long)__va(phys));
if (pgd_none(*pgd)) {
pud = (pud_t *) spp_getpage();
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
_PAGE_USER));
}
pud = pud_offset(pgd, (unsigned long)__va(phys));
if (pud_none(*pud)) {
pmd = (pmd_t *) spp_getpage();
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
_PAGE_USER));
}
pmd = pmd_offset(pud, phys);
BUG_ON(!pmd_none(*pmd));
set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
}
}
void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}
void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}
/*
* The head.S code sets up the kernel high mapping:
*
* from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
*
* phys_addr holds the negative offset to the kernel, which is added
* to the compile time generated pmds. This results in invalid pmds up
* to the point where we hit the physaddr 0 mapping.
*
* We limit the mappings to the region from _text to _end. _end is
* rounded up to the 2MB boundary. This catches the invalid pmds as
* well, as they are located before _text:
*/
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
pmd_t *last_pmd = pmd + PTRS_PER_PMD;
for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
if (pmd_none(*pmd))
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
}
}
static __ref void *alloc_low_page(unsigned long *phys)
{
unsigned long pfn = e820_table_end++;
void *adr;
if (after_bootmem) {
adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
*phys = __pa(adr);
return adr;
}
if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
memset(adr, 0, PAGE_SIZE);
*phys = pfn * PAGE_SIZE;
return adr;
}
static __ref void unmap_low_page(void *adr)
{
if (after_bootmem)
return;
early_iounmap(adr, PAGE_SIZE);
}
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
pgprot_t prot)
{
unsigned pages = 0;
unsigned long last_map_addr = end;
int i;
pte_t *pte = pte_page + pte_index(addr);
for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
if (addr >= end) {
if (!after_bootmem) {
for(; i < PTRS_PER_PTE; i++, pte++)
set_pte(pte, __pte(0));
}
break;
}
/*
* We will re-use the existing mapping.
* Xen for example has some special requirements, like mapping
* pagetable pages as RO. So assume someone who pre-setup
* these mappings are more intelligent.
*/
if (pte_val(*pte)) {
pages++;
continue;
}
if (0)
printk(" pte=%p addr=%lx pte=%016lx\n",
pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
pages++;
set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
}
update_page_count(PG_LEVEL_4K, pages);
return last_map_addr;
}
static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
pgprot_t prot)
{
pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
return phys_pte_init(pte, address, end, prot);
}
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
unsigned long page_size_mask, pgprot_t prot)
{
unsigned long pages = 0;
unsigned long last_map_addr = end;
int i = pmd_index(address);
for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
pgprot_t new_prot = prot;
if (address >= end) {
if (!after_bootmem) {
for (; i < PTRS_PER_PMD; i++, pmd++)
set_pmd(pmd, __pmd(0));
}
break;
}
if (pmd_val(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
last_map_addr = phys_pte_update(pmd, address,
end, prot);
spin_unlock(&init_mm.page_table_lock);
continue;
}
/*
* If we are ok with PG_LEVEL_2M mapping, then we will
* use the existing mapping,
*
* Otherwise, we will split the large page mapping but
* use the same existing protection bits except for
* large page, so that we don't violate Intel's TLB
* Application note (317080) which says, while changing
* the page sizes, new and old translations should
* not differ with respect to page frame and
* attributes.
*/
if (page_size_mask & (1 << PG_LEVEL_2M)) {
pages++;
continue;
}
new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
}
if (page_size_mask & (1<<PG_LEVEL_2M)) {
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
pfn_pte(address >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = (address & PMD_MASK) + PMD_SIZE;
continue;
}
pte = alloc_low_page(&pte_phys);
last_map_addr = phys_pte_init(pte, address, end, new_prot);
unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
return last_map_addr;
}
static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
unsigned long page_size_mask, pgprot_t prot)
{
pmd_t *pmd = pmd_offset(pud, 0);
unsigned long last_map_addr;
last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
__flush_tlb_all();
return last_map_addr;
}
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
unsigned long page_size_mask)
{
unsigned long pages = 0;
unsigned long last_map_addr = end;
int i = pud_index(addr);
for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
if (addr >= end)
break;
if (!after_bootmem &&
!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
set_pud(pud, __pud(0));
continue;
}
if (pud_val(*pud)) {
if (!pud_large(*pud)) {
last_map_addr = phys_pmd_update(pud, addr, end,
page_size_mask, prot);
continue;
}
/*
* If we are ok with PG_LEVEL_1G mapping, then we will
* use the existing mapping.
*
* Otherwise, we will split the gbpage mapping but use
* the same existing protection bits except for large
* page, so that we don't violate Intel's TLB
* Application note (317080) which says, while changing
* the page sizes, new and old translations should
* not differ with respect to page frame and
* attributes.
*/
if (page_size_mask & (1 << PG_LEVEL_1G)) {
pages++;
continue;
}
prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
}
if (page_size_mask & (1<<PG_LEVEL_1G)) {
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
continue;
}
pmd = alloc_low_page(&pmd_phys);
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
prot);
unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
pud_populate(&init_mm, pud, __va(pmd_phys));
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
update_page_count(PG_LEVEL_1G, pages);
return last_map_addr;
}
static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
unsigned long page_size_mask)
{
pud_t *pud;
pud = (pud_t *)pgd_page_vaddr(*pgd);
return phys_pud_init(pud, addr, end, page_size_mask);
}
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
{
unsigned long next, last_map_addr = end;
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
unsigned long pud_phys;
pud_t *pud;
next = (start + PGDIR_SIZE) & PGDIR_MASK;
if (next > end)
next = end;
if (pgd_val(*pgd)) {
last_map_addr = phys_pud_update(pgd, __pa(start),
__pa(end), page_size_mask);
continue;
}
pud = alloc_low_page(&pud_phys);
last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
page_size_mask);
unmap_low_page(pud);
spin_lock(&init_mm.page_table_lock);
pgd_populate(&init_mm, pgd, __va(pud_phys));
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
return last_map_addr;
}
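/*
 * Editor's sketch (not part of the original file): kernel_physical_mapping_init()
 * is steered by page_size_mask, whose PG_LEVEL_2M / PG_LEVEL_1G bits are tested
 * above. A hedged illustration of how a caller might build that mask from CPU
 * features; the helper name is hypothetical, the real logic lives in
 * init_memory_mapping().
 */
static unsigned long __init example_build_page_size_mask(void)
{
	unsigned long mask = 0;

	if (cpu_has_pse)			/* 2M pages available */
		mask |= 1UL << PG_LEVEL_2M;
	if (direct_gbpages)			/* 1G pages requested */
		mask |= 1UL << PG_LEVEL_1G;
	return mask;
}
/* last_map = kernel_physical_mapping_init(start, end, example_build_page_size_mask()); */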
#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long bootmap_size, bootmap;
bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
/* don't touch min_low_pfn */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
0, end_pfn);
e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
#endif
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
max_zone_pfns[ZONE_NORMAL] = max_pfn;
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
/*
* Clear the default state for node 0.
* Note: don't use nodes_clear() here; when NUMA support is not
* compiled in it really clears the state, and a later
* node_set_state() will not set it back.
*/
node_clear_state(0, N_NORMAL_MEMORY);
free_area_init_nodes(max_zone_pfns);
}
/*
* Memory hotplug specific functions
*/
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
* updating.
*/
static void update_end_of_memory_vars(u64 start, u64 size)
{
unsigned long end_pfn = PFN_UP(start + size);
if (end_pfn > max_pfn) {
max_pfn = end_pfn;
max_low_pfn = end_pfn;
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
}
}
/*
* Memory is always added to the NORMAL zone. This means you will never get
* additional DMA/DMA32 memory.
*/
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
last_mapped_pfn = init_memory_mapping(start, start + size);
if (last_mapped_pfn > max_pfn_mapped)
max_pfn_mapped = last_mapped_pfn;
ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
update_end_of_memory_vars(start, size);
return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);
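/*
 * Editor's sketch (illustrative only): arch_add_memory() is normally reached
 * through the generic add_memory() entry point when a hotplug driver (for
 * example the ACPI memory-hotplug driver) discovers a new range. The function
 * below is hypothetical; only add_memory() itself is a real kernel API.
 */
static int example_probe_new_dimm(int nid, u64 start, u64 size)
{
	int ret = add_memory(nid, start, size);	/* ends up in arch_add_memory() */

	if (ret)
		printk(KERN_ERR "hotplug: add_memory(%d, 0x%llx, 0x%llx) failed: %d\n",
		       nid, (unsigned long long)start, (unsigned long long)size, ret);
	return ret;
}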
#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_vsyscall;
void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
unsigned long absent_pages;
pci_iommu_alloc();
/* clear_bss() already cleared the empty_zero_page */
reservedpages = 0;
/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
totalram_pages = numa_free_all_bootmem();
#else
totalram_pages = free_all_bootmem();
#endif
absent_pages = absent_pages_in_range(0, max_pfn);
reservedpages = max_pfn - totalram_pages - absent_pages;
after_bootmem = 1;
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
/* Register memory areas for /proc/kcore */
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
"%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
nr_free_pages() << (PAGE_SHIFT-10),
max_pfn << (PAGE_SHIFT-10),
codesize >> 10,
absent_pages << (PAGE_SHIFT-10),
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10);
}
#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
static int kernel_set_to_readonly;
void set_kernel_text_rw(void)
{
unsigned long start = PFN_ALIGN(_stext);
unsigned long end = PFN_ALIGN(__start_rodata);
if (!kernel_set_to_readonly)
return;
pr_debug("Set kernel text: %lx - %lx for read write\n",
start, end);
set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}
void set_kernel_text_ro(void)
{
unsigned long start = PFN_ALIGN(_stext);
unsigned long end = PFN_ALIGN(__start_rodata);
if (!kernel_set_to_readonly)
return;
pr_debug("Set kernel text: %lx - %lx for read only\n",
start, end);
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}
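/*
 * Editor's sketch: set_kernel_text_rw()/set_kernel_text_ro() exist so that
 * code patched after mark_rodata_ro() can temporarily lift the write
 * protection. A hedged illustration of that pattern; the function below is
 * made up (real users such as ftrace go through their own patching helpers).
 */
static void example_patch_kernel_text(void *addr, const void *opcode, size_t len)
{
	set_kernel_text_rw();		/* no-op until kernel_set_to_readonly is set */
	memcpy(addr, opcode, len);	/* the actual patch */
	set_kernel_text_ro();
	sync_core();			/* serialize after self-modifying code */
}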
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
unsigned long rodata_start =
((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
kernel_set_to_readonly = 1;
/*
* The rodata section (but not the kernel text!) should also be
* not-executable.
*/
set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
rodata_test();
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
set_memory_rw(start, (end-start) >> PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: again\n");
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}
#endif
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
#ifdef CONFIG_NUMA
int nid, next_nid;
int ret;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
if (pfn >= max_pfn) {
/*
* This can happen with kdump kernels when accessing
* firmware tables:
*/
if (pfn < max_pfn_mapped)
return -EFAULT;
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
phys, len);
return -EFAULT;
}
/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
nid = phys_to_nid(phys);
next_nid = phys_to_nid(phys + len - 1);
if (nid == next_nid)
ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
else
ret = reserve_bootmem(phys, len, flags);
if (ret != 0)
return ret;
#else
reserve_bootmem(phys, len, flags);
#endif
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
set_dma_reserve(dma_reserve);
}
return 0;
}
int kern_addr_valid(unsigned long addr)
{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (above != 0 && above != -1UL)
return 0;
pgd = pgd_offset_k(addr);
if (pgd_none(*pgd))
return 0;
pud = pud_offset(pgd, addr);
if (pud_none(*pud))
return 0;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return 0;
if (pmd_large(*pmd))
return pfn_valid(pmd_pfn(*pmd));
pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte))
return 0;
return pfn_valid(pte_pfn(*pte));
}
/*
* A pseudo VMA to allow ptrace access for the vsyscall page. This only
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
* not need special handling anymore:
*/
static struct vm_area_struct gate_vma = {
.vm_start = VSYSCALL_START,
.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
.vm_page_prot = PAGE_READONLY_EXEC,
.vm_flags = VM_READ | VM_EXEC
};
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
if (test_tsk_thread_flag(tsk, TIF_IA32))
return NULL;
#endif
return &gate_vma;
}
int in_gate_area(struct task_struct *task, unsigned long addr)
{
struct vm_area_struct *vma = get_gate_vma(task);
if (!vma)
return 0;
return (addr >= vma->vm_start) && (addr < vma->vm_end);
}
/*
* Use this when you have no reliable task/vma, typically from interrupt
* context. It is less reliable than using the task's vma and may give
* false positives:
*/
int in_gate_area_no_task(unsigned long addr)
{
return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
const char *arch_vma_name(struct vm_area_struct *vma)
{
if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
return "[vdso]";
if (vma == &gate_vma)
return "[vsyscall]";
return NULL;
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
*/
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
unsigned long addr = (unsigned long)start_page;
unsigned long end = (unsigned long)(start_page + size);
unsigned long next;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
for (; addr < end; addr = next) {
void *p = NULL;
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
return -ENOMEM;
pud = vmemmap_pud_populate(pgd, addr, node);
if (!pud)
return -ENOMEM;
if (!cpu_has_pse) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
pmd = vmemmap_pmd_populate(pud, addr, node);
if (!pmd)
return -ENOMEM;
p = vmemmap_pte_populate(pmd, addr, node);
if (!p)
return -ENOMEM;
addr_end = addr + PAGE_SIZE;
p_end = p + PAGE_SIZE;
} else {
next = pmd_addr_end(addr, end);
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
pte_t entry;
p = vmemmap_alloc_block(PMD_SIZE, node);
if (!p)
return -ENOMEM;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
PAGE_KERNEL_LARGE);
set_pmd(pmd, __pmd(pte_val(entry)));
/* check to see if we have contiguous blocks */
if (p_end != p || node_start != node) {
if (p_start)
printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
addr_start, addr_end-1, p_start, p_end-1, node_start);
addr_start = addr;
node_start = node;
p_start = p;
}
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
} else
vmemmap_verify((pte_t *)pmd, node, addr, next);
}
}
return 0;
}
void __meminit vmemmap_populate_print_last(void)
{
if (p_start) {
printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
addr_start, addr_end-1, p_start, p_end-1, node_start);
p_start = NULL;
p_end = NULL;
node_start = 0;
}
}
#endif

View File

@@ -0,0 +1,110 @@
/*
* Copyright © 2008 Ingo Molnar
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#include <asm/iomap.h>
#include <asm/pat.h>
#include <linux/module.h>
#include <linux/highmem.h>
static int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
/* There is no way to map greater than 1 << 32 address without PAE */
if (base + size > 0x100000000ULL)
return 0;
#endif
return 1;
}
int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
{
unsigned long flag = _PAGE_CACHE_WC;
int ret;
if (!is_io_mapping_possible(base, size))
return -EINVAL;
ret = io_reserve_memtype(base, base + size, &flag);
if (ret)
return ret;
*prot = __pgprot(__PAGE_KERNEL | flag);
return 0;
}
EXPORT_SYMBOL_GPL(iomap_create_wc);
void
iomap_free(resource_size_t base, unsigned long size)
{
io_free_memtype(base, base + size);
}
EXPORT_SYMBOL_GPL(iomap_free);
void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
pagefault_disable();
debug_kmap_atomic(type);
idx = type + KM_TYPE_NR * smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
/*
* Map 'pfn' using fixed map 'type' and protections 'prot'
*/
void *
iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
/*
* For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
* PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
* MTRR is UC or WC. UC_MINUS gets the real intention of the
* user, which is "WC if the MTRR is WC, UC if you can't do that."
*/
if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
prot = PAGE_KERNEL_UC_MINUS;
return kmap_atomic_prot_pfn(pfn, type, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
void
iounmap_atomic(void *kvaddr, enum km_type type)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
/*
* Force other mappings to Oops if they try to access this pte
* without first remapping it. Keeping stale mappings around is a bad idea
* also, in case the page changes cacheability attributes or becomes
* a protected page in a hypervisor.
*/
if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
kpte_clear_flush(kmap_pte-idx, vaddr);
pagefault_enable();
}
EXPORT_SYMBOL_GPL(iounmap_atomic);
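/*
 * Editor's sketch: iomap_atomic_prot_pfn()/iounmap_atomic() back the
 * io_mapping helpers that graphics drivers use for short-lived, per-CPU
 * write-combined mappings of single pages. A hedged illustration; 'pfn' and
 * 'data' are assumed to be provided by the caller and KM_USER0 is just one
 * of the km_type slots.
 */
static void example_write_wc_page(unsigned long pfn, const void *data)
{
	void *vaddr = iomap_atomic_prot_pfn(pfn, KM_USER0, PAGE_KERNEL_WC);

	memcpy(vaddr, data, PAGE_SIZE);	/* WC on PAT systems, UC- otherwise */
	iounmap_atomic(vaddr, KM_USER0);
}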

View File

@@ -0,0 +1,668 @@
/*
* Re-map IO memory to kernel address space so that we can access it.
* This is needed for high PCI addresses that aren't mapped in the
* 640k-1MB IO memory area on PC's
*
* (C) Copyright 1995 1996 Linus Torvalds
*/
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
#include <asm/cacheflush.h>
#include <asm/e820.h>
#include <asm/fixmap.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/pat.h>
#include "physaddr.h"
int page_is_ram(unsigned long pagenr)
{
resource_size_t addr, end;
int i;
/*
* A special case is the first 4Kb of memory:
* this is a BIOS-owned area, not kernel RAM, but it is generally
* not listed as such in the E820 table.
*/
if (pagenr == 0)
return 0;
/*
* Second special case: Some BIOSen report the PC BIOS
* area (640->1Mb) as ram even though it is not.
*/
if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
pagenr < (BIOS_END >> PAGE_SHIFT))
return 0;
for (i = 0; i < e820.nr_map; i++) {
/*
* Not usable memory:
*/
if (e820.map[i].type != E820_RAM)
continue;
addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
if ((pagenr >= addr) && (pagenr < end))
return 1;
}
return 0;
}
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
*/
int ioremap_change_attr(unsigned long vaddr, unsigned long size,
unsigned long prot_val)
{
unsigned long nrpages = size >> PAGE_SHIFT;
int err;
switch (prot_val) {
case _PAGE_CACHE_UC:
default:
err = _set_memory_uc(vaddr, nrpages);
break;
case _PAGE_CACHE_WC:
err = _set_memory_wc(vaddr, nrpages);
break;
case _PAGE_CACHE_WB:
err = _set_memory_wb(vaddr, nrpages);
break;
}
return err;
}
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
* directly.
*
* NOTE! We need to allow non-page-aligned mappings too: we will obviously
* have to convert them into an offset in a page-aligned mapping, but the
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, unsigned long prot_val, void *caller)
{
unsigned long pfn, offset, vaddr;
resource_size_t last_addr;
const resource_size_t unaligned_phys_addr = phys_addr;
const unsigned long unaligned_size = size;
struct vm_struct *area;
unsigned long new_prot_val;
pgprot_t prot;
int retval;
void __iomem *ret_addr;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
if (!phys_addr_valid(phys_addr)) {
printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
(unsigned long long)phys_addr);
WARN_ON_ONCE(1);
return NULL;
}
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
if (is_ISA_range(phys_addr, last_addr))
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
* Check if the request spans more than any BAR in the iomem resource
* tree.
*/
WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
for (pfn = phys_addr >> PAGE_SHIFT;
(pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
pfn++) {
int is_ram = page_is_ram(pfn);
if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
return NULL;
WARN_ON_ONCE(is_ram);
}
/*
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
prot_val, &new_prot_val);
if (retval) {
printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
return NULL;
}
if (prot_val != new_prot_val) {
if (!is_new_memtype_allowed(phys_addr, size,
prot_val, new_prot_val)) {
printk(KERN_ERR
"ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
(unsigned long long)phys_addr,
(unsigned long long)(phys_addr + size),
prot_val, new_prot_val);
goto err_free_memtype;
}
prot_val = new_prot_val;
}
switch (prot_val) {
case _PAGE_CACHE_UC:
default:
prot = PAGE_KERNEL_IO_NOCACHE;
break;
case _PAGE_CACHE_UC_MINUS:
prot = PAGE_KERNEL_IO_UC_MINUS;
break;
case _PAGE_CACHE_WC:
prot = PAGE_KERNEL_IO_WC;
break;
case _PAGE_CACHE_WB:
prot = PAGE_KERNEL_IO;
break;
}
/*
* Ok, go for it..
*/
area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
goto err_free_memtype;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
if (kernel_map_sync_memtype(phys_addr, size, prot_val))
goto err_free_area;
if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
goto err_free_area;
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
return ret_addr;
err_free_area:
free_vm_area(area);
err_free_memtype:
free_memtype(phys_addr, phys_addr + size);
return NULL;
}
/**
* ioremap_nocache - map bus memory into CPU space
* @phys_addr: bus address of the memory
* @size: size of the resource to map
*
* ioremap_nocache performs a platform specific sequence of operations to
* make bus memory CPU accessible via the readb/readw/readl/writeb/
* writew/writel functions and the other mmio helpers. The returned
* address is not guaranteed to be usable directly as a virtual
* address.
*
* This version of ioremap ensures that the memory is marked uncachable
* on the CPU as well as honouring existing caching rules from things like
* the PCI bus. Note that there are other caches and buffers on many
* busses. In particular driver authors should read up on PCI writes.
*
* It's useful if some control registers are in such an area and
* write combining or read caching is not desirable:
*
* Must be freed with iounmap.
*/
void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
/*
* Ideally, this should be:
* pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
*
* Until we fix all X drivers to use ioremap_wc(), we will use
* UC MINUS.
*/
unsigned long val = _PAGE_CACHE_UC_MINUS;
return __ioremap_caller(phys_addr, size, val,
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_nocache);
/**
* ioremap_wc - map memory into CPU space write combined
* @phys_addr: bus address of the memory
* @size: size of the resource to map
*
* This version of ioremap ensures that the memory is marked write combining.
* Write combining allows faster writes to some hardware devices.
*
* Must be freed with iounmap.
*/
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
if (pat_enabled)
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
__builtin_return_address(0));
else
return ioremap_nocache(phys_addr, size);
}
EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_cache);
static void __iomem *ioremap_default(resource_size_t phys_addr,
unsigned long size)
{
unsigned long flags;
void __iomem *ret;
int err;
/*
* - WB for WB-able memory and no other conflicting mappings
* - UC_MINUS for non-WB-able memory with no other conflicting mappings
* - Inherit from conflicting mappings otherwise
*/
err = reserve_memtype(phys_addr, phys_addr + size,
_PAGE_CACHE_WB, &flags);
if (err < 0)
return NULL;
ret = __ioremap_caller(phys_addr, size, flags,
__builtin_return_address(0));
free_memtype(phys_addr, phys_addr + size);
return ret;
}
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
unsigned long prot_val)
{
return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_prot);
/**
* iounmap - Free an IO remapping
* @addr: virtual address from ioremap_*
*
* Caller must ensure there is only one unmapping for the same pointer.
*/
void iounmap(volatile void __iomem *addr)
{
struct vm_struct *p, *o;
if ((void __force *)addr <= high_memory)
return;
/*
* __ioremap special-cases the PCI/ISA range by not instantiating a
* vm_area and by simply returning an address into the kernel mapping
* of ISA space. So handle that here.
*/
if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
(void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
return;
addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);
mmiotrace_iounmap(addr);
/* Use the vm area unlocked, assuming the caller
ensures there isn't another iounmap for the same address
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
read_lock(&vmlist_lock);
for (p = vmlist; p; p = p->next) {
if (p->addr == (void __force *)addr)
break;
}
read_unlock(&vmlist_lock);
if (!p) {
printk(KERN_ERR "iounmap: bad address %p\n", addr);
dump_stack();
return;
}
free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
/* Finally remove it */
o = remove_vm_area((void __force *)addr);
BUG_ON(p != o || o == NULL);
kfree(p);
}
EXPORT_SYMBOL(iounmap);
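/*
 * Editor's sketch: the usual driver-side pattern is a paired
 * ioremap_nocache()/iounmap() around MMIO register access. Everything below
 * is illustrative; the BAR address, length and register offset are
 * placeholders.
 */
static int example_toggle_device_bit(resource_size_t bar_phys, unsigned long bar_len)
{
	void __iomem *regs = ioremap_nocache(bar_phys, bar_len);

	if (!regs)
		return -ENOMEM;

	writel(readl(regs + 0x10) | 0x1, regs + 0x10);	/* hypothetical control reg */
	iounmap(regs);
	return 0;
}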
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
*/
void *xlate_dev_mem_ptr(unsigned long phys)
{
void *addr;
unsigned long start = phys & PAGE_MASK;
/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);
addr = (void __force *)ioremap_default(start, PAGE_SIZE);
if (addr)
addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
return addr;
}
void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
{
if (page_is_ram(phys >> PAGE_SHIFT))
return;
iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
return;
}
static int __initdata early_ioremap_debug;
static int __init early_ioremap_debug_setup(char *str)
{
early_ioremap_debug = 1;
return 0;
}
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
/* Don't assume we're using swapper_pg_dir at this point */
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(addr)];
pud_t *pud = pud_offset(pgd, addr);
pmd_t *pmd = pmd_offset(pud, addr);
return pmd;
}
static inline pte_t * __init early_ioremap_pte(unsigned long addr)
{
return &bm_pte[pte_index(addr)];
}
static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
void __init early_ioremap_init(void)
{
pmd_t *pmd;
int i;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
pmd_populate_kernel(&init_mm, pmd, bm_pte);
/*
* The boot-ioremap range spans multiple pmds, for which
* we are not prepared:
*/
if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
WARN_ON(1);
printk(KERN_WARNING "pmd %p != %p\n",
pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
fix_to_virt(FIX_BTMAP_BEGIN));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
fix_to_virt(FIX_BTMAP_END));
printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
FIX_BTMAP_BEGIN);
}
}
void __init early_ioremap_reset(void)
{
after_paging_init = 1;
}
static void __init __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
{
unsigned long addr = __fix_to_virt(idx);
pte_t *pte;
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
}
pte = early_ioremap_pte(addr);
if (pgprot_val(flags))
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
pte_clear(&init_mm, addr, pte);
__flush_tlb_one(addr);
}
static inline void __init early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t prot)
{
if (after_paging_init)
__set_fixmap(idx, phys, prot);
else
__early_set_fixmap(idx, phys, prot);
}
static inline void __init early_clear_fixmap(enum fixed_addresses idx)
{
if (after_paging_init)
clear_fixmap(idx);
else
__early_set_fixmap(idx, 0, __pgprot(0));
}
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
static int __init check_early_ioremap_leak(void)
{
int count = 0;
int i;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
if (prev_map[i])
count++;
if (!count)
return 0;
WARN(1, KERN_WARNING
"Debug warning: early ioremap leak of %d areas detected.\n",
count);
printk(KERN_WARNING
"please boot with early_ioremap_debug and report the dmesg.\n");
return 1;
}
late_initcall(check_early_ioremap_leak);
static void __init __iomem *
__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
{
unsigned long offset;
resource_size_t last_addr;
unsigned int nrpages;
enum fixed_addresses idx0, idx;
int i, slot;
WARN_ON(system_state != SYSTEM_BOOTING);
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (!prev_map[i]) {
slot = i;
break;
}
}
if (slot < 0) {
printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n",
(u64)phys_addr, size);
WARN_ON(1);
return NULL;
}
if (early_ioremap_debug) {
printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ",
(u64)phys_addr, size, slot);
dump_stack();
}
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr) {
WARN_ON(1);
return NULL;
}
prev_size[slot] = size;
/*
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr + 1) - phys_addr;
/*
* Mappings have to fit in the FIX_BTMAP area.
*/
nrpages = size >> PAGE_SHIFT;
if (nrpages > NR_FIX_BTMAPS) {
WARN_ON(1);
return NULL;
}
/*
* Ok, go for it..
*/
idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
idx = idx0;
while (nrpages > 0) {
early_set_fixmap(idx, phys_addr, prot);
phys_addr += PAGE_SIZE;
--idx;
--nrpages;
}
if (early_ioremap_debug)
printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
}
/* Remap an IO device */
void __init __iomem *
early_ioremap(resource_size_t phys_addr, unsigned long size)
{
return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
}
/* Remap memory */
void __init __iomem *
early_memremap(resource_size_t phys_addr, unsigned long size)
{
return __early_ioremap(phys_addr, size, PAGE_KERNEL);
}
void __init early_iounmap(void __iomem *addr, unsigned long size)
{
unsigned long virt_addr;
unsigned long offset;
unsigned int nrpages;
enum fixed_addresses idx;
int i, slot;
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (prev_map[i] == addr) {
slot = i;
break;
}
}
if (slot < 0) {
printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
addr, size);
WARN_ON(1);
return;
}
if (prev_size[slot] != size) {
printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
addr, size, slot, prev_size[slot]);
WARN_ON(1);
return;
}
if (early_ioremap_debug) {
printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
size, slot);
dump_stack();
}
virt_addr = (unsigned long)addr;
if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
WARN_ON(1);
return;
}
offset = virt_addr & ~PAGE_MASK;
nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
early_clear_fixmap(idx);
--idx;
--nrpages;
}
prev_map[slot] = NULL;
}
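/*
 * Editor's sketch: early_ioremap()/early_iounmap() cover the window before
 * ioremap() works, typically to inspect firmware tables during setup. A
 * hedged illustration of that pattern; the caller supplies the physical
 * address and length, and the function name is made up.
 */
static int __init example_copy_firmware_header(resource_size_t phys, void *buf,
					       unsigned long len)
{
	void __iomem *map = early_ioremap(phys, len);

	if (!map)
		return -ENOMEM;

	memcpy_fromio(buf, map, len);
	early_iounmap(map, len);	/* size must match the mapping above */
	return 0;
}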

View File

@@ -0,0 +1,222 @@
/*
* AMD K8 NUMA support.
* Discover the memory map and associated nodes.
*
* This version reads it directly from the K8 northbridge.
*
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <asm/io.h>
#include <linux/pci_ids.h>
#include <linux/acpi.h>
#include <asm/types.h>
#include <asm/mmzone.h>
#include <asm/proto.h>
#include <asm/e820.h>
#include <asm/pci-direct.h>
#include <asm/numa.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <asm/k8.h>
static __init int find_northbridge(void)
{
int num;
for (num = 0; num < 32; num++) {
u32 header;
header = read_pci_config(0, num, 0, 0x00);
if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
continue;
header = read_pci_config(0, num, 1, 0x00);
if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
continue;
return num;
}
return -1;
}
static __init void early_get_boot_cpu_id(void)
{
/*
* need to get boot_cpu_id so we can use it to create apicid_to_node
* in k8_scan_nodes()
*/
/*
* Find possible boot-time SMP configuration:
*/
#ifdef CONFIG_X86_MPPARSE
early_find_smp_config();
#endif
#ifdef CONFIG_ACPI
/*
* Read APIC information from ACPI tables.
*/
early_acpi_boot_init();
#endif
#ifdef CONFIG_X86_MPPARSE
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
early_get_smp_config();
#endif
early_init_lapic_mapping();
}
int __init k8_scan_nodes(unsigned long start, unsigned long end)
{
unsigned numnodes, cores, bits, apicid_base;
unsigned long prevbase;
struct bootnode nodes[8];
int i, j, nb, found = 0;
u32 nodeid, reg;
if (!early_pci_allowed())
return -1;
nb = find_northbridge();
if (nb < 0)
return nb;
printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
reg = read_pci_config(0, nb, 0, 0x60);
numnodes = ((reg >> 4) & 0xF) + 1;
if (numnodes <= 1)
return -1;
printk(KERN_INFO "Number of nodes %d\n", numnodes);
memset(&nodes, 0, sizeof(nodes));
prevbase = 0;
for (i = 0; i < 8; i++) {
unsigned long base, limit;
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
printk("Skipping disabled node %d\n", i);
continue;
}
if (nodeid >= numnodes) {
printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
base, limit);
continue;
}
if (!limit) {
printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
i, base);
continue;
}
if ((base >> 8) & 3 || (limit >> 8) & 3) {
printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
nodeid, (base>>8)&3, (limit>>8) & 3);
return -1;
}
if (node_isset(nodeid, node_possible_map)) {
printk(KERN_INFO "Node %d already present. Skipping\n",
nodeid);
continue;
}
limit >>= 16;
limit <<= 24;
limit |= (1<<24)-1;
limit++;
if (limit > max_pfn << PAGE_SHIFT)
limit = max_pfn << PAGE_SHIFT;
if (limit <= base)
continue;
base >>= 16;
base <<= 24;
if (base < start)
base = start;
if (limit > end)
limit = end;
if (limit == base) {
printk(KERN_ERR "Empty node %d\n", nodeid);
continue;
}
if (limit < base) {
printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
nodeid, base, limit);
continue;
}
/* Could sort here, but punt for now. Should not happen anyway. */
if (prevbase > base) {
printk(KERN_ERR "Node map not sorted %lx,%lx\n",
prevbase, base);
return -1;
}
printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
nodeid, base, limit);
found++;
nodes[nodeid].start = base;
nodes[nodeid].end = limit;
prevbase = base;
node_set(nodeid, node_possible_map);
}
if (!found)
return -1;
memnode_shift = compute_hash_shift(nodes, 8, NULL);
if (memnode_shift < 0) {
printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
return -1;
}
printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
/* use the coreid bits from early_identify_cpu */
bits = boot_cpu_data.x86_coreid_bits;
cores = (1<<bits);
apicid_base = 0;
/* need to get boot_cpu_id early for systems with apicid lifting */
early_get_boot_cpu_id();
if (boot_cpu_physical_apicid > 0) {
printk(KERN_INFO "BSP APIC ID: %02x\n",
boot_cpu_physical_apicid);
apicid_base = boot_cpu_physical_apicid;
}
for (i = 0; i < 8; i++) {
if (nodes[i].start == nodes[i].end)
continue;
e820_register_active_regions(i,
nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
for (j = apicid_base; j < cores + apicid_base; j++)
apicid_to_node[(i << bits) + j] = i;
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
numa_init_array();
return 0;
}
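/*
 * Editor's note: a worked example of the base/limit decoding above, using
 * made-up register values. Bits [31:16] of the DRAM Base/Limit registers
 * carry physical address bits [39:24]:
 *
 *   base  register 0x00100003 -> (0x0010 << 24)                 = 0x10000000 (256 MB)
 *   limit register 0x001f0000 -> (0x001f << 24) | 0xffffff, + 1 = 0x20000000 (512 MB)
 *
 * so this (hypothetical) node would cover physical memory [256 MB, 512 MB).
 */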

View File

@@ -0,0 +1 @@
obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o

View File

@@ -0,0 +1,228 @@
#include <linux/interrupt.h>
#include <linux/kdebug.h>
#include <linux/kmemcheck.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include "error.h"
#include "shadow.h"
enum kmemcheck_error_type {
KMEMCHECK_ERROR_INVALID_ACCESS,
KMEMCHECK_ERROR_BUG,
};
#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
struct kmemcheck_error {
enum kmemcheck_error_type type;
union {
/* KMEMCHECK_ERROR_INVALID_ACCESS */
struct {
/* Kind of access that caused the error */
enum kmemcheck_shadow state;
/* Address and size of the erroneous read */
unsigned long address;
unsigned int size;
};
};
struct pt_regs regs;
struct stack_trace trace;
unsigned long trace_entries[32];
/* We compress it to a char. */
unsigned char shadow_copy[SHADOW_COPY_SIZE];
unsigned char memory_copy[SHADOW_COPY_SIZE];
};
/*
* Create a ring queue of errors to output. We can't call printk() directly
* from the kmemcheck traps, since this may call the console drivers and
* result in a recursive fault.
*/
static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
static unsigned int error_count;
static unsigned int error_rd;
static unsigned int error_wr;
static unsigned int error_missed_count;
static struct kmemcheck_error *error_next_wr(void)
{
struct kmemcheck_error *e;
if (error_count == ARRAY_SIZE(error_fifo)) {
++error_missed_count;
return NULL;
}
e = &error_fifo[error_wr];
if (++error_wr == ARRAY_SIZE(error_fifo))
error_wr = 0;
++error_count;
return e;
}
static struct kmemcheck_error *error_next_rd(void)
{
struct kmemcheck_error *e;
if (error_count == 0)
return NULL;
e = &error_fifo[error_rd];
if (++error_rd == ARRAY_SIZE(error_fifo))
error_rd = 0;
--error_count;
return e;
}
void kmemcheck_error_recall(void)
{
static const char *desc[] = {
[KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
[KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
[KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
[KMEMCHECK_SHADOW_FREED] = "freed",
};
static const char short_desc[] = {
[KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
[KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
[KMEMCHECK_SHADOW_INITIALIZED] = 'i',
[KMEMCHECK_SHADOW_FREED] = 'f',
};
struct kmemcheck_error *e;
unsigned int i;
e = error_next_rd();
if (!e)
return;
switch (e->type) {
case KMEMCHECK_ERROR_INVALID_ACCESS:
printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
"from %s memory (%p)\n",
8 * e->size, e->state < ARRAY_SIZE(desc) ?
desc[e->state] : "(invalid shadow state)",
(void *) e->address);
printk(KERN_INFO);
for (i = 0; i < SHADOW_COPY_SIZE; ++i)
printk("%02x", e->memory_copy[i]);
printk("\n");
printk(KERN_INFO);
for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
printk(" %c", short_desc[e->shadow_copy[i]]);
else
printk(" ?");
}
printk("\n");
printk(KERN_INFO "%*c\n", 2 + 2
* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
break;
case KMEMCHECK_ERROR_BUG:
printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
break;
}
__show_regs(&e->regs, 1);
print_stack_trace(&e->trace, 0);
}
static void do_wakeup(unsigned long data)
{
while (error_count > 0)
kmemcheck_error_recall();
if (error_missed_count > 0) {
printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
"the queue was too small\n", error_missed_count);
error_missed_count = 0;
}
}
static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
/*
* Save the context of an error report.
*/
void kmemcheck_error_save(enum kmemcheck_shadow state,
unsigned long address, unsigned int size, struct pt_regs *regs)
{
static unsigned long prev_ip;
struct kmemcheck_error *e;
void *shadow_copy;
void *memory_copy;
/* Don't report several adjacent errors from the same EIP. */
if (regs->ip == prev_ip)
return;
prev_ip = regs->ip;
e = error_next_wr();
if (!e)
return;
e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
e->state = state;
e->address = address;
e->size = size;
/* Save regs */
memcpy(&e->regs, regs, sizeof(*regs));
/* Save stack trace */
e->trace.nr_entries = 0;
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
save_stack_trace_bp(&e->trace, regs->bp);
/* Round the address down to the nearest SHADOW_COPY_SIZE boundary */
shadow_copy = kmemcheck_shadow_lookup(address
& ~(SHADOW_COPY_SIZE - 1));
BUG_ON(!shadow_copy);
memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
kmemcheck_show_addr(address);
memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
kmemcheck_hide_addr(address);
tasklet_hi_schedule_first(&kmemcheck_tasklet);
}
/*
* Save the context of a kmemcheck bug.
*/
void kmemcheck_error_save_bug(struct pt_regs *regs)
{
struct kmemcheck_error *e;
e = error_next_wr();
if (!e)
return;
e->type = KMEMCHECK_ERROR_BUG;
memcpy(&e->regs, regs, sizeof(*regs));
e->trace.nr_entries = 0;
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 1;
save_stack_trace(&e->trace);
tasklet_hi_schedule_first(&kmemcheck_tasklet);
}

View File

@@ -0,0 +1,15 @@
#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
#define ARCH__X86__MM__KMEMCHECK__ERROR_H
#include <linux/ptrace.h>
#include "shadow.h"
void kmemcheck_error_save(enum kmemcheck_shadow state,
unsigned long address, unsigned int size, struct pt_regs *regs);
void kmemcheck_error_save_bug(struct pt_regs *regs);
void kmemcheck_error_recall(void);
#endif

View File

@@ -0,0 +1,651 @@
/**
* kmemcheck - a heavyweight memory checker for the linux kernel
* Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
* (With a lot of help from Ingo Molnar and Pekka Enberg.)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2) as
* published by the Free Software Foundation.
*/
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/page-flags.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/types.h>
#include <asm/cacheflush.h>
#include <asm/kmemcheck.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include "error.h"
#include "opcode.h"
#include "pte.h"
#include "selftest.h"
#include "shadow.h"
#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
# define KMEMCHECK_ENABLED 0
#endif
#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
# define KMEMCHECK_ENABLED 1
#endif
#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
# define KMEMCHECK_ENABLED 2
#endif
int kmemcheck_enabled = KMEMCHECK_ENABLED;
int __init kmemcheck_init(void)
{
#ifdef CONFIG_SMP
/*
* Limit SMP to use a single CPU. We rely on the fact that this code
* runs before SMP is set up.
*/
if (setup_max_cpus > 1) {
printk(KERN_INFO
"kmemcheck: Limiting number of CPUs to 1.\n");
setup_max_cpus = 1;
}
#endif
if (!kmemcheck_selftest()) {
printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
kmemcheck_enabled = 0;
return -EINVAL;
}
printk(KERN_INFO "kmemcheck: Initialized\n");
return 0;
}
early_initcall(kmemcheck_init);
/*
* We need to parse the kmemcheck= option before any memory is allocated.
*/
static int __init param_kmemcheck(char *str)
{
if (!str)
return -EINVAL;
sscanf(str, "%d", &kmemcheck_enabled);
return 0;
}
early_param("kmemcheck", param_kmemcheck);
int kmemcheck_show_addr(unsigned long address)
{
pte_t *pte;
pte = kmemcheck_pte_lookup(address);
if (!pte)
return 0;
set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
__flush_tlb_one(address);
return 1;
}
int kmemcheck_hide_addr(unsigned long address)
{
pte_t *pte;
pte = kmemcheck_pte_lookup(address);
if (!pte)
return 0;
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
__flush_tlb_one(address);
return 1;
}
struct kmemcheck_context {
bool busy;
int balance;
/*
* There can be at most two memory operands to an instruction, but
* each address can cross a page boundary -- so we may need up to
* four addresses that must be hidden/revealed for each fault.
*/
unsigned long addr[4];
unsigned long n_addrs;
unsigned long flags;
/* Data size of the instruction that caused a fault. */
unsigned int size;
};
static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
bool kmemcheck_active(struct pt_regs *regs)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
return data->balance > 0;
}
/* Save an address that needs to be shown/hidden */
static void kmemcheck_save_addr(unsigned long addr)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
data->addr[data->n_addrs++] = addr;
}
static unsigned int kmemcheck_show_all(void)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
unsigned int i;
unsigned int n;
n = 0;
for (i = 0; i < data->n_addrs; ++i)
n += kmemcheck_show_addr(data->addr[i]);
return n;
}
static unsigned int kmemcheck_hide_all(void)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
unsigned int i;
unsigned int n;
n = 0;
for (i = 0; i < data->n_addrs; ++i)
n += kmemcheck_hide_addr(data->addr[i]);
return n;
}
/*
* Called from the #PF handler.
*/
void kmemcheck_show(struct pt_regs *regs)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
BUG_ON(!irqs_disabled());
if (unlikely(data->balance != 0)) {
kmemcheck_show_all();
kmemcheck_error_save_bug(regs);
data->balance = 0;
return;
}
/*
* None of the addresses actually belonged to kmemcheck. Note that
* this is not an error.
*/
if (kmemcheck_show_all() == 0)
return;
++data->balance;
/*
* The IF needs to be cleared as well, so that the faulting
* instruction can run "uninterrupted". Otherwise, we might take
* an interrupt and start executing that before we've had a chance
* to hide the page again.
*
* NOTE: In the rare case of multiple faults, we must not override
* the original flags:
*/
if (!(regs->flags & X86_EFLAGS_TF))
data->flags = regs->flags;
regs->flags |= X86_EFLAGS_TF;
regs->flags &= ~X86_EFLAGS_IF;
}
/*
* Called from the #DB handler.
*/
void kmemcheck_hide(struct pt_regs *regs)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
int n;
BUG_ON(!irqs_disabled());
if (unlikely(data->balance != 1)) {
kmemcheck_show_all();
kmemcheck_error_save_bug(regs);
data->n_addrs = 0;
data->balance = 0;
if (!(data->flags & X86_EFLAGS_TF))
regs->flags &= ~X86_EFLAGS_TF;
if (data->flags & X86_EFLAGS_IF)
regs->flags |= X86_EFLAGS_IF;
return;
}
if (kmemcheck_enabled)
n = kmemcheck_hide_all();
else
n = kmemcheck_show_all();
if (n == 0)
return;
--data->balance;
data->n_addrs = 0;
if (!(data->flags & X86_EFLAGS_TF))
regs->flags &= ~X86_EFLAGS_TF;
if (data->flags & X86_EFLAGS_IF)
regs->flags |= X86_EFLAGS_IF;
}
void kmemcheck_show_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i) {
unsigned long address;
pte_t *pte;
unsigned int level;
address = (unsigned long) page_address(&p[i]);
pte = lookup_address(address, &level);
BUG_ON(!pte);
BUG_ON(level != PG_LEVEL_4K);
set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
__flush_tlb_one(address);
}
}
bool kmemcheck_page_is_tracked(struct page *p)
{
/* This will also check the "hidden" flag of the PTE. */
return kmemcheck_pte_lookup((unsigned long) page_address(p));
}
void kmemcheck_hide_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i) {
unsigned long address;
pte_t *pte;
unsigned int level;
address = (unsigned long) page_address(&p[i]);
pte = lookup_address(address, &level);
BUG_ON(!pte);
BUG_ON(level != PG_LEVEL_4K);
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
__flush_tlb_one(address);
}
}
/* Access may NOT cross page boundary */
static void kmemcheck_read_strict(struct pt_regs *regs,
unsigned long addr, unsigned int size)
{
void *shadow;
enum kmemcheck_shadow status;
shadow = kmemcheck_shadow_lookup(addr);
if (!shadow)
return;
kmemcheck_save_addr(addr);
status = kmemcheck_shadow_test(shadow, size);
if (status == KMEMCHECK_SHADOW_INITIALIZED)
return;
if (kmemcheck_enabled)
kmemcheck_error_save(status, addr, size, regs);
if (kmemcheck_enabled == 2)
kmemcheck_enabled = 0;
/* Don't warn about it again. */
kmemcheck_shadow_set(shadow, size);
}
bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
{
enum kmemcheck_shadow status;
void *shadow;
shadow = kmemcheck_shadow_lookup(addr);
if (!shadow)
return true;
status = kmemcheck_shadow_test(shadow, size);
return status == KMEMCHECK_SHADOW_INITIALIZED;
}
/* Access may cross page boundary */
static void kmemcheck_read(struct pt_regs *regs,
unsigned long addr, unsigned int size)
{
unsigned long page = addr & PAGE_MASK;
unsigned long next_addr = addr + size - 1;
unsigned long next_page = next_addr & PAGE_MASK;
if (likely(page == next_page)) {
kmemcheck_read_strict(regs, addr, size);
return;
}
/*
* What we do is basically to split the access across the
* two pages and handle each part separately. Yes, this means
* that we may now see reads that are 3 + 5 bytes, for
* example (and if both are uninitialized, there will be two
* reports), but it makes the code a lot simpler.
*/
kmemcheck_read_strict(regs, addr, next_page - addr);
kmemcheck_read_strict(regs, next_page, next_addr - next_page);
}
static void kmemcheck_write_strict(struct pt_regs *regs,
unsigned long addr, unsigned int size)
{
void *shadow;
shadow = kmemcheck_shadow_lookup(addr);
if (!shadow)
return;
kmemcheck_save_addr(addr);
kmemcheck_shadow_set(shadow, size);
}
static void kmemcheck_write(struct pt_regs *regs,
unsigned long addr, unsigned int size)
{
unsigned long page = addr & PAGE_MASK;
unsigned long next_addr = addr + size - 1;
unsigned long next_page = next_addr & PAGE_MASK;
if (likely(page == next_page)) {
kmemcheck_write_strict(regs, addr, size);
return;
}
/* See comment in kmemcheck_read(). */
kmemcheck_write_strict(regs, addr, next_page - addr);
kmemcheck_write_strict(regs, next_page, next_addr - next_page);
}
/*
* Copying is hard. We have two addresses, each of which may be split across
* a page boundary (and each page will have different shadow addresses).
*/
static void kmemcheck_copy(struct pt_regs *regs,
unsigned long src_addr, unsigned long dst_addr, unsigned int size)
{
uint8_t shadow[8];
enum kmemcheck_shadow status;
unsigned long page;
unsigned long next_addr;
unsigned long next_page;
uint8_t *x;
unsigned int i;
unsigned int n;
BUG_ON(size > sizeof(shadow));
page = src_addr & PAGE_MASK;
next_addr = src_addr + size - 1;
next_page = next_addr & PAGE_MASK;
if (likely(page == next_page)) {
/* Same page */
x = kmemcheck_shadow_lookup(src_addr);
if (x) {
kmemcheck_save_addr(src_addr);
for (i = 0; i < size; ++i)
shadow[i] = x[i];
} else {
for (i = 0; i < size; ++i)
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
} else {
n = next_page - src_addr;
BUG_ON(n > sizeof(shadow));
/* First page */
x = kmemcheck_shadow_lookup(src_addr);
if (x) {
kmemcheck_save_addr(src_addr);
for (i = 0; i < n; ++i)
shadow[i] = x[i];
} else {
/* Not tracked */
for (i = 0; i < n; ++i)
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
/* Second page */
x = kmemcheck_shadow_lookup(next_page);
if (x) {
kmemcheck_save_addr(next_page);
for (i = n; i < size; ++i)
shadow[i] = x[i - n];
} else {
/* Not tracked */
for (i = n; i < size; ++i)
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
}
page = dst_addr & PAGE_MASK;
next_addr = dst_addr + size - 1;
next_page = next_addr & PAGE_MASK;
if (likely(page == next_page)) {
/* Same page */
x = kmemcheck_shadow_lookup(dst_addr);
if (x) {
kmemcheck_save_addr(dst_addr);
for (i = 0; i < size; ++i) {
x[i] = shadow[i];
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
}
} else {
n = next_page - dst_addr;
BUG_ON(n > sizeof(shadow));
/* First page */
x = kmemcheck_shadow_lookup(dst_addr);
if (x) {
kmemcheck_save_addr(dst_addr);
for (i = 0; i < n; ++i) {
x[i] = shadow[i];
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
}
/* Second page */
x = kmemcheck_shadow_lookup(next_page);
if (x) {
kmemcheck_save_addr(next_page);
for (i = n; i < size; ++i) {
x[i - n] = shadow[i];
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
}
}
status = kmemcheck_shadow_test(shadow, size);
if (status == KMEMCHECK_SHADOW_INITIALIZED)
return;
if (kmemcheck_enabled)
kmemcheck_error_save(status, src_addr, size, regs);
if (kmemcheck_enabled == 2)
kmemcheck_enabled = 0;
}
enum kmemcheck_method {
KMEMCHECK_READ,
KMEMCHECK_WRITE,
};
static void kmemcheck_access(struct pt_regs *regs,
unsigned long fallback_address, enum kmemcheck_method fallback_method)
{
const uint8_t *insn;
const uint8_t *insn_primary;
unsigned int size;
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
/* Recursive fault -- ouch. */
if (data->busy) {
kmemcheck_show_addr(fallback_address);
kmemcheck_error_save_bug(regs);
return;
}
data->busy = true;
insn = (const uint8_t *) regs->ip;
insn_primary = kmemcheck_opcode_get_primary(insn);
kmemcheck_opcode_decode(insn, &size);
switch (insn_primary[0]) {
#ifdef CONFIG_KMEMCHECK_BITOPS_OK
/* AND, OR, XOR */
/*
* Unfortunately, these instructions have to be excluded from
* our regular checking since they access only some (and not
* all) bits. This clears out "bogus" bitfield-access warnings.
*/
case 0x80:
case 0x81:
case 0x82:
case 0x83:
switch ((insn_primary[1] >> 3) & 7) {
/* OR */
case 1:
/* AND */
case 4:
/* XOR */
case 6:
kmemcheck_write(regs, fallback_address, size);
goto out;
/* ADD */
case 0:
/* ADC */
case 2:
/* SBB */
case 3:
/* SUB */
case 5:
/* CMP */
case 7:
break;
}
break;
#endif
/* MOVS, MOVSB, MOVSW, MOVSD */
case 0xa4:
case 0xa5:
/*
* These instructions are special because they take two
* addresses, but we only get one page fault.
*/
kmemcheck_copy(regs, regs->si, regs->di, size);
goto out;
/* CMPS, CMPSB, CMPSW, CMPSD */
case 0xa6:
case 0xa7:
kmemcheck_read(regs, regs->si, size);
kmemcheck_read(regs, regs->di, size);
goto out;
}
/*
* If the opcode isn't special in any way, we use the data from the
* page fault handler to determine the address and type of memory
* access.
*/
switch (fallback_method) {
case KMEMCHECK_READ:
kmemcheck_read(regs, fallback_address, size);
goto out;
case KMEMCHECK_WRITE:
kmemcheck_write(regs, fallback_address, size);
goto out;
}
out:
data->busy = false;
}
bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
pte_t *pte;
/*
* XXX: Is it safe to assume that memory accesses from virtual 86
* mode or non-kernel code segments will _never_ access kernel
* memory (e.g. tracked pages)? For now, we need this to avoid
* invoking kmemcheck for PnP BIOS calls.
*/
if (regs->flags & X86_VM_MASK)
return false;
if (regs->cs != __KERNEL_CS)
return false;
pte = kmemcheck_pte_lookup(address);
if (!pte)
return false;
if (error_code & 2)
kmemcheck_access(regs, address, KMEMCHECK_WRITE);
else
kmemcheck_access(regs, address, KMEMCHECK_READ);
kmemcheck_show(regs);
return true;
}
bool kmemcheck_trap(struct pt_regs *regs)
{
if (!kmemcheck_active(regs))
return false;
/* We're done. */
kmemcheck_hide(regs);
return true;
}
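/*
 * Editor's sketch: kmemcheck_fault() is meant to be called from the x86 page
 * fault handler and kmemcheck_trap() from the debug-exception handler once
 * the single step completes. A simplified, hypothetical illustration of the
 * #PF-side hook; the real checks live in do_page_fault().
 */
static int example_page_fault_hook(struct pt_regs *regs, unsigned long address,
				   unsigned long error_code)
{
	/* A fault while kmemcheck is single-stepping: re-hide the pages first. */
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);

	/* Fault on a kmemcheck-hidden page: decode and emulate the access. */
	if (kmemcheck_fault(regs, address, error_code))
		return 1;	/* handled; the access will be single-stepped */

	return 0;		/* not ours; continue with normal #PF handling */
}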

View File

@@ -0,0 +1,106 @@
#include <linux/types.h>
#include "opcode.h"
static bool opcode_is_prefix(uint8_t b)
{
return
/* Group 1 */
b == 0xf0 || b == 0xf2 || b == 0xf3
/* Group 2 */
|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
|| b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
/* Group 3 */
|| b == 0x66
/* Group 4 */
|| b == 0x67;
}
#ifdef CONFIG_X86_64
static bool opcode_is_rex_prefix(uint8_t b)
{
return (b & 0xf0) == 0x40;
}
#else
static bool opcode_is_rex_prefix(uint8_t b)
{
return false;
}
#endif
#define REX_W (1 << 3)
/*
* This is a VERY crude opcode decoder. We only need to find the size of the
* load/store that caused our #PF and this should work for all the opcodes
* that we care about. Moreover, the ones who invented this instruction set
* should be shot.
*/
void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
{
/* Default operand size */
int operand_size_override = 4;
/* prefixes */
for (; opcode_is_prefix(*op); ++op) {
if (*op == 0x66)
operand_size_override = 2;
}
/* REX prefix */
if (opcode_is_rex_prefix(*op)) {
uint8_t rex = *op;
++op;
if (rex & REX_W) {
switch (*op) {
case 0x63:
*size = 4;
return;
case 0x0f:
++op;
switch (*op) {
case 0xb6:
case 0xbe:
*size = 1;
return;
case 0xb7:
case 0xbf:
*size = 2;
return;
}
break;
}
*size = 8;
return;
}
}
/* escape opcode */
if (*op == 0x0f) {
++op;
/*
* This is move with zero-extend and sign-extend, respectively;
* we don't have to think about 0xb6/0xbe, because this is
* already handled in the conditional below.
*/
if (*op == 0xb7 || *op == 0xbf)
operand_size_override = 2;
}
*size = (*op & 1) ? operand_size_override : 1;
}
const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
{
/* skip prefixes */
while (opcode_is_prefix(*op))
++op;
if (opcode_is_rex_prefix(*op))
++op;
return op;
}

View File

@@ -0,0 +1,9 @@
#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
#include <linux/types.h>
void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
#endif

View File

@@ -0,0 +1,22 @@
#include <linux/mm.h>
#include <asm/pgtable.h>
#include "pte.h"
pte_t *kmemcheck_pte_lookup(unsigned long address)
{
pte_t *pte;
unsigned int level;
pte = lookup_address(address, &level);
if (!pte)
return NULL;
if (level != PG_LEVEL_4K)
return NULL;
if (!pte_hidden(*pte))
return NULL;
return pte;
}

View File

@@ -0,0 +1,10 @@
#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
#define ARCH__X86__MM__KMEMCHECK__PTE_H
#include <linux/mm.h>
#include <asm/pgtable.h>
pte_t *kmemcheck_pte_lookup(unsigned long address);
#endif

View File

@@ -0,0 +1,69 @@
#include <linux/kernel.h>
#include "opcode.h"
#include "selftest.h"
struct selftest_opcode {
unsigned int expected_size;
const uint8_t *insn;
const char *desc;
};
static const struct selftest_opcode selftest_opcodes[] = {
/* REP MOVS */
{1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
{4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
/* MOVZX / MOVZXD */
{1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
{1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
/* MOVSX / MOVSXD */
{1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
{1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
#ifdef CONFIG_X86_64
/* MOVZX / MOVZXD */
{1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
{2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
/* MOVSX / MOVSXD */
{1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
{2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
{4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
#endif
};
static bool selftest_opcode_one(const struct selftest_opcode *op)
{
unsigned size;
kmemcheck_opcode_decode(op->insn, &size);
if (size == op->expected_size)
return true;
printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
op->desc, op->expected_size, size);
return false;
}
static bool selftest_opcodes_all(void)
{
bool pass = true;
unsigned int i;
for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
return pass;
}
bool kmemcheck_selftest(void)
{
bool pass = true;
pass = pass && selftest_opcodes_all();
return pass;
}

View File

@@ -0,0 +1,6 @@
#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
bool kmemcheck_selftest(void);
#endif

View File

@@ -0,0 +1,161 @@
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "pte.h"
#include "shadow.h"
/*
* Return the shadow address for the given address. Returns NULL if the
* address is not tracked.
*
* We need to be extremely careful not to follow any invalid pointers,
* because this function can be called for *any* possible address.
*/
void *kmemcheck_shadow_lookup(unsigned long address)
{
pte_t *pte;
struct page *page;
if (!virt_addr_valid(address))
return NULL;
pte = kmemcheck_pte_lookup(address);
if (!pte)
return NULL;
page = virt_to_page(address);
if (!page->shadow)
return NULL;
return page->shadow + (address & (PAGE_SIZE - 1));
}
static void mark_shadow(void *address, unsigned int n,
enum kmemcheck_shadow status)
{
unsigned long addr = (unsigned long) address;
unsigned long last_addr = addr + n - 1;
unsigned long page = addr & PAGE_MASK;
unsigned long last_page = last_addr & PAGE_MASK;
unsigned int first_n;
void *shadow;
/* If the memory range crosses a page boundary, stop there. */
if (page == last_page)
first_n = n;
else
first_n = page + PAGE_SIZE - addr;
shadow = kmemcheck_shadow_lookup(addr);
if (shadow)
memset(shadow, status, first_n);
addr += first_n;
n -= first_n;
/* Do full-page memset()s. */
while (n >= PAGE_SIZE) {
shadow = kmemcheck_shadow_lookup(addr);
if (shadow)
memset(shadow, status, PAGE_SIZE);
addr += PAGE_SIZE;
n -= PAGE_SIZE;
}
/* Do the remaining page, if any. */
if (n > 0) {
shadow = kmemcheck_shadow_lookup(addr);
if (shadow)
memset(shadow, status, n);
}
}
void kmemcheck_mark_unallocated(void *address, unsigned int n)
{
mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
}
void kmemcheck_mark_uninitialized(void *address, unsigned int n)
{
mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
}
/*
* Fill the shadow memory of the given address such that the memory at that
* address is marked as being initialized.
*/
void kmemcheck_mark_initialized(void *address, unsigned int n)
{
mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
}
EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
void kmemcheck_mark_freed(void *address, unsigned int n)
{
mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
}
void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i)
kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
}
void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i)
kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
}
void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i)
kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
}
enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
{
uint8_t *x;
unsigned int i;
x = shadow;
#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
/*
* Make sure _some_ bytes are initialized. Gcc frequently generates
* code to access neighboring bytes.
*/
for (i = 0; i < size; ++i) {
if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
return x[i];
}
#else
/* All bytes must be initialized. */
for (i = 0; i < size; ++i) {
if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
return x[i];
}
#endif
return x[0];
}
void kmemcheck_shadow_set(void *shadow, unsigned int size)
{
uint8_t *x;
unsigned int i;
x = shadow;
for (i = 0; i < size; ++i)
x[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
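The range splitting done by mark_shadow() above is easiest to see with concrete numbers: the byte range is cut into a head piece up to the first page boundary, whole pages, and a tail, and each piece is resolved through kmemcheck_shadow_lookup() separately. Below is a hypothetical, standalone userspace sketch of just that arithmetic (4 KiB pages assumed; it prints the chunks instead of touching shadow memory, and mark_range() is a made-up name).

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static void mark_range(unsigned long addr, unsigned long n)
{
	unsigned long last = addr + n - 1;
	unsigned long first_n = ((addr & PAGE_MASK) == (last & PAGE_MASK)) ?
				n : (addr & PAGE_MASK) + PAGE_SIZE - addr;

	printf("head: %#lx, %lu bytes\n", addr, first_n);
	addr += first_n;
	n -= first_n;

	while (n >= PAGE_SIZE) {		/* the full-page memset()s */
		printf("page: %#lx, %lu bytes\n", addr, PAGE_SIZE);
		addr += PAGE_SIZE;
		n -= PAGE_SIZE;
	}
	if (n > 0)				/* the remaining tail */
		printf("tail: %#lx, %lu bytes\n", addr, n);
}

int main(void)
{
	mark_range(0x1ff0, 0x2030);	/* splits into 16 + 4096 + 4096 + 32 bytes */
	return 0;
}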

View File

@@ -0,0 +1,16 @@
#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
enum kmemcheck_shadow {
KMEMCHECK_SHADOW_UNALLOCATED,
KMEMCHECK_SHADOW_UNINITIALIZED,
KMEMCHECK_SHADOW_INITIALIZED,
KMEMCHECK_SHADOW_FREED,
};
void *kmemcheck_shadow_lookup(unsigned long address);
enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
void kmemcheck_shadow_set(void *shadow, unsigned int size);
#endif

572
kernel/arch/x86/mm/kmmio.c Normal file
View File

@@ -0,0 +1,572 @@
/* Support for MMIO probes.
* Benefits from a lot of kprobes code
* (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
* 2007 Alexander Eichner
* 2008 Pekka Paalanen <pq@iki.fi>
*/
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/errno.h>
#include <asm/debugreg.h>
#include <linux/mmiotrace.h>
#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
unsigned long page; /* location of the fault page */
pteval_t old_presence; /* page presence prior to arming */
bool armed;
/*
* Number of times this page has been registered as a part
* of a probe. If zero, page is disarmed and this may be freed.
* Used only by writers (RCU) and post_kmmio_handler().
* Protected by kmmio_lock, when linked into kmmio_page_table.
*/
int count;
};
struct kmmio_delayed_release {
struct rcu_head rcu;
struct kmmio_fault_page *release_list;
};
struct kmmio_context {
struct kmmio_fault_page *fpage;
struct kmmio_probe *probe;
unsigned long saved_flags;
unsigned long addr;
int active;
};
static DEFINE_SPINLOCK(kmmio_lock);
/* Protected by kmmio_lock */
unsigned int kmmio_count;
/* Read-protected by RCU, write-protected by kmmio_lock. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);
static struct list_head *kmmio_page_list(unsigned long page)
{
return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
}
/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
/*
* this is basically a dynamic stabbing problem:
* Could use the existing prio tree code, or one of these possibly
* better implementations:
* The Interval Skip List: A Data Structure for Finding All Intervals That
* Overlap a Point (might be simple)
* Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
*/
/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
struct kmmio_probe *p;
list_for_each_entry_rcu(p, &kmmio_probes, list) {
if (addr >= p->addr && addr < (p->addr + p->len))
return p;
}
return NULL;
}
/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
{
struct list_head *head;
struct kmmio_fault_page *f;
page &= PAGE_MASK;
head = kmmio_page_list(page);
list_for_each_entry_rcu(f, head, list) {
if (f->page == page)
return f;
}
return NULL;
}
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
pmdval_t v = pmd_val(*pmd);
if (clear) {
*old = v & _PAGE_PRESENT;
v &= ~_PAGE_PRESENT;
} else /* presume this has been called with clear==true previously */
v |= *old;
set_pmd(pmd, __pmd(v));
}
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
if (clear) {
*old = v & _PAGE_PRESENT;
v &= ~_PAGE_PRESENT;
} else /* presume this has been called with clear==true previously */
v |= *old;
set_pte_atomic(pte, __pte(v));
}
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
unsigned int level;
pte_t *pte = lookup_address(f->page, &level);
if (!pte) {
pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
return -1;
}
switch (level) {
case PG_LEVEL_2M:
clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
break;
case PG_LEVEL_4K:
clear_pte_presence(pte, clear, &f->old_presence);
break;
default:
pr_err("kmmio: unexpected page level 0x%x.\n", level);
return -1;
}
__flush_tlb_one(f->page);
return 0;
}
/*
* Mark the given page as not present. Access to it will trigger a fault.
*
* Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
* protection is ignored here. RCU read lock is assumed held, so the struct
* will not disappear unexpectedly. Furthermore, the caller must guarantee,
* that double arming the same virtual address (page) cannot occur.
*
* Double disarming on the other hand is allowed, and may occur when a fault
* and mmiotrace shutdown happen simultaneously.
*/
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
int ret;
WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
if (f->armed) {
pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
f->page, f->count, !!f->old_presence);
}
ret = clear_page_presence(f, true);
WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
f->armed = true;
return ret;
}
/** Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
int ret = clear_page_presence(f, false);
WARN_ONCE(ret < 0,
KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
f->armed = false;
}
/*
* This is being called from do_page_fault().
*
* We may be in an interrupt or a critical section. Also, prefetching may
* trigger a page fault. We may be in the middle of a process switch.
* We cannot take any locks, because we could be executing especially
* within a kmmio critical section.
*
* Local interrupts are disabled, so preemption cannot happen.
* Do not enable interrupts, do not sleep, and watch out for other CPUs.
*/
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate
* and they remain disabled throughout this function.
*/
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
struct kmmio_context *ctx;
struct kmmio_fault_page *faultpage;
int ret = 0; /* default to fault not handled */
/*
* Preemption is now disabled to prevent process switch during
* single stepping. We can only handle one active kmmio trace
* per cpu, so ensure that we finish it before something else
* gets to run. We also hold the RCU read lock over single
* stepping to avoid looking up the probe and kmmio_fault_page
* again.
*/
preempt_disable();
rcu_read_lock();
faultpage = get_kmmio_fault_page(addr);
if (!faultpage) {
/*
* Either this page fault is not caused by kmmio, or
* another CPU just pulled the kmmio probe from under
* our feet. The latter case should not be possible.
*/
goto no_kmmio;
}
ctx = &get_cpu_var(kmmio_ctx);
if (ctx->active) {
if (addr == ctx->addr) {
/*
* A second fault on the same page means some other
* condition needs handling by do_page_fault(), the
* page really not being present is the most common.
*/
pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
addr, smp_processor_id());
if (!faultpage->old_presence)
pr_info("kmmio: unexpected secondary hit for "
"address 0x%08lx on CPU %d.\n", addr,
smp_processor_id());
} else {
/*
* Prevent overwriting already in-flight context.
* This should not happen, let's hope disarming at
* least prevents a panic.
*/
pr_emerg("kmmio: recursive probe hit on CPU %d, "
"for address 0x%08lx. Ignoring.\n",
smp_processor_id(), addr);
pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
ctx->addr);
disarm_kmmio_fault_page(faultpage);
}
goto no_kmmio_ctx;
}
ctx->active++;
ctx->fpage = faultpage;
ctx->probe = get_kmmio_probe(addr);
ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
ctx->addr = addr;
if (ctx->probe && ctx->probe->pre_handler)
ctx->probe->pre_handler(ctx->probe, regs, addr);
/*
* Enable single-stepping and disable interrupts for the faulting
* context. Local interrupts must not get enabled during stepping.
*/
regs->flags |= X86_EFLAGS_TF;
regs->flags &= ~X86_EFLAGS_IF;
/* Now we set present bit in PTE and single step. */
disarm_kmmio_fault_page(ctx->fpage);
/*
* If another cpu accesses the same page while we are stepping,
* the access will not be caught. It will simply succeed and the
* only downside is we lose the event. If this becomes a problem,
* the user should drop to single cpu before tracing.
*/
put_cpu_var(kmmio_ctx);
return 1; /* fault handled */
no_kmmio_ctx:
put_cpu_var(kmmio_ctx);
no_kmmio:
rcu_read_unlock();
preempt_enable_no_resched();
return ret;
}
/*
* Interrupts are disabled on entry as trap1 is an interrupt gate
* and they remain disabled throughout this function.
* This must always get called as the pair to kmmio_handler().
*/
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
int ret = 0;
struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
if (!ctx->active) {
/*
* debug traps without an active context are due to either
* something external causing them (f.e. using a debugger while
* mmio tracing enabled), or erroneous behaviour
*/
pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
smp_processor_id());
goto out;
}
if (ctx->probe && ctx->probe->post_handler)
ctx->probe->post_handler(ctx->probe, condition, regs);
/* Prevent racing against release_kmmio_fault_page(). */
spin_lock(&kmmio_lock);
if (ctx->fpage->count)
arm_kmmio_fault_page(ctx->fpage);
spin_unlock(&kmmio_lock);
regs->flags &= ~X86_EFLAGS_TF;
regs->flags |= ctx->saved_flags;
/* These were acquired in kmmio_handler(). */
ctx->active--;
BUG_ON(ctx->active);
rcu_read_unlock();
preempt_enable_no_resched();
/*
* if somebody else is singlestepping across a probe point, flags
* will have TF set, in which case, continue the remaining processing
* of do_debug, as if this is not a probe hit.
*/
if (!(regs->flags & X86_EFLAGS_TF))
ret = 1;
out:
put_cpu_var(kmmio_ctx);
return ret;
}
/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long page)
{
struct kmmio_fault_page *f;
page &= PAGE_MASK;
f = get_kmmio_fault_page(page);
if (f) {
if (!f->count)
arm_kmmio_fault_page(f);
f->count++;
return 0;
}
f = kzalloc(sizeof(*f), GFP_ATOMIC);
if (!f)
return -1;
f->count = 1;
f->page = page;
if (arm_kmmio_fault_page(f)) {
kfree(f);
return -1;
}
list_add_rcu(&f->list, kmmio_page_list(f->page));
return 0;
}
/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long page,
struct kmmio_fault_page **release_list)
{
struct kmmio_fault_page *f;
page &= PAGE_MASK;
f = get_kmmio_fault_page(page);
if (!f)
return;
f->count--;
BUG_ON(f->count < 0);
if (!f->count) {
disarm_kmmio_fault_page(f);
f->release_next = *release_list;
*release_list = f;
}
}
/*
* With page-unaligned ioremaps, one or two armed pages may contain
* addresses from outside the intended mapping. Events for these addresses
* are currently silently dropped. The events may result only from programming
* mistakes by accessing addresses before the beginning or past the end of a
* mapping.
*/
int register_kmmio_probe(struct kmmio_probe *p)
{
unsigned long flags;
int ret = 0;
unsigned long size = 0;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
spin_lock_irqsave(&kmmio_lock, flags);
if (get_kmmio_probe(p->addr)) {
ret = -EEXIST;
goto out;
}
kmmio_count++;
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
if (add_kmmio_fault_page(p->addr + size))
pr_err("kmmio: Unable to set page fault.\n");
size += PAGE_SIZE;
}
out:
spin_unlock_irqrestore(&kmmio_lock, flags);
/*
* XXX: What should I do here?
* Here was a call to global_flush_tlb(), but it does not exist
* anymore. It seems it's not needed after all.
*/
return ret;
}
EXPORT_SYMBOL(register_kmmio_probe);
static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
{
struct kmmio_delayed_release *dr = container_of(
head,
struct kmmio_delayed_release,
rcu);
struct kmmio_fault_page *f = dr->release_list;
while (f) {
struct kmmio_fault_page *next = f->release_next;
BUG_ON(f->count);
kfree(f);
f = next;
}
kfree(dr);
}
static void remove_kmmio_fault_pages(struct rcu_head *head)
{
struct kmmio_delayed_release *dr =
container_of(head, struct kmmio_delayed_release, rcu);
struct kmmio_fault_page *f = dr->release_list;
struct kmmio_fault_page **prevp = &dr->release_list;
unsigned long flags;
spin_lock_irqsave(&kmmio_lock, flags);
while (f) {
if (!f->count) {
list_del_rcu(&f->list);
prevp = &f->release_next;
} else {
*prevp = f->release_next;
}
f = f->release_next;
}
spin_unlock_irqrestore(&kmmio_lock, flags);
/* This is the real RCU destroy call. */
call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
}
/*
* Remove a kmmio probe. You have to synchronize_rcu() before you can be
* sure that the callbacks will not be called anymore. Only after that
* you may actually release your struct kmmio_probe.
*
* Unregistering a kmmio fault page has three steps:
* 1. release_kmmio_fault_page()
* Disarm the page, wait a grace period to let all faults finish.
* 2. remove_kmmio_fault_pages()
* Remove the pages from kmmio_page_table.
* 3. rcu_free_kmmio_fault_pages()
* Actually free the kmmio_fault_page structs, deferred via RCU.
*/
void unregister_kmmio_probe(struct kmmio_probe *p)
{
unsigned long flags;
unsigned long size = 0;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
struct kmmio_fault_page *release_list = NULL;
struct kmmio_delayed_release *drelease;
spin_lock_irqsave(&kmmio_lock, flags);
while (size < size_lim) {
release_kmmio_fault_page(p->addr + size, &release_list);
size += PAGE_SIZE;
}
list_del_rcu(&p->list);
kmmio_count--;
spin_unlock_irqrestore(&kmmio_lock, flags);
drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
if (!drelease) {
pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
return;
}
drelease->release_list = release_list;
/*
* This is not really RCU here. We have just disarmed a set of
* pages so that they cannot trigger page faults anymore. However,
* we cannot remove the pages from kmmio_page_table,
* because a probe hit might be in flight on another CPU. The
* pages are collected into a list, and they will be removed from
* kmmio_page_table when it is certain that no probe hit related to
* these pages can be in flight. RCU grace period sounds like a
* good choice.
*
* If we removed the pages too early, kmmio page fault handler might
* not find the respective kmmio_fault_page and determine it's not
* a kmmio fault, when it actually is. This would lead to madness.
*/
call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
}
EXPORT_SYMBOL(unregister_kmmio_probe);
static int
kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
{
struct die_args *arg = args;
if (val == DIE_DEBUG && (arg->err & DR_STEP))
if (post_kmmio_handler(arg->err, arg->regs) == 1)
return NOTIFY_STOP;
return NOTIFY_DONE;
}
static struct notifier_block nb_die = {
.notifier_call = kmmio_die_notifier
};
int kmmio_init(void)
{
int i;
for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
INIT_LIST_HEAD(&kmmio_page_table[i]);
return register_die_notifier(&nb_die);
}
void kmmio_cleanup(void)
{
int i;
unregister_die_notifier(&nb_die);
for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
WARN_ONCE(!list_empty(&kmmio_page_table[i]),
KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
}
}
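One detail of register_kmmio_probe()/unregister_kmmio_probe() above that is easy to miss: size_lim folds the in-page offset of an unaligned probe back in, so every page touched by [addr, addr+len) gets armed, not just len/PAGE_SIZE pages. A hypothetical standalone sketch of that walk (the addresses are made up):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0xd0000f00UL;	/* made-up unaligned MMIO mapping */
	unsigned long len = 0x200UL;
	unsigned long size_lim = len + (addr & ~PAGE_MASK);
	unsigned long size;

	for (size = 0; size < size_lim; size += PAGE_SIZE)
		printf("arm page %#lx\n", (addr + size) & PAGE_MASK);
	/* prints 0xd0000000 and 0xd0001000: both touched pages get armed */
	return 0;
}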

View File

@@ -0,0 +1,130 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pfn.h>
#include <asm/e820.h>
static u64 patterns[] __initdata = {
0,
0xffffffffffffffffULL,
0x5555555555555555ULL,
0xaaaaaaaaaaaaaaaaULL,
0x1111111111111111ULL,
0x2222222222222222ULL,
0x4444444444444444ULL,
0x8888888888888888ULL,
0x3333333333333333ULL,
0x6666666666666666ULL,
0x9999999999999999ULL,
0xccccccccccccccccULL,
0x7777777777777777ULL,
0xbbbbbbbbbbbbbbbbULL,
0xddddddddddddddddULL,
0xeeeeeeeeeeeeeeeeULL,
0x7a6c7258554e494cULL, /* yeah ;-) */
};
static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
{
printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
(unsigned long long) pattern,
(unsigned long long) start_bad,
(unsigned long long) end_bad);
reserve_early(start_bad, end_bad, "BAD RAM");
}
static void __init memtest(u64 pattern, u64 start_phys, u64 size)
{
u64 *p, *start, *end;
u64 start_bad, last_bad;
u64 start_phys_aligned;
const size_t incr = sizeof(pattern);
start_phys_aligned = ALIGN(start_phys, incr);
start = __va(start_phys_aligned);
end = start + (size - (start_phys_aligned - start_phys)) / incr;
start_bad = 0;
last_bad = 0;
for (p = start; p < end; p++)
*p = pattern;
for (p = start; p < end; p++, start_phys_aligned += incr) {
if (*p == pattern)
continue;
if (start_phys_aligned == last_bad + incr) {
last_bad += incr;
continue;
}
if (start_bad)
reserve_bad_mem(pattern, start_bad, last_bad + incr);
start_bad = last_bad = start_phys_aligned;
}
if (start_bad)
reserve_bad_mem(pattern, start_bad, last_bad + incr);
}
static void __init do_one_pass(u64 pattern, u64 start, u64 end)
{
u64 size = 0;
while (start < end) {
start = find_e820_area_size(start, &size, 1);
/* done ? */
if (start >= end)
break;
if (start + size > end)
size = end - start;
printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
(unsigned long long) start,
(unsigned long long) start + size,
(unsigned long long) cpu_to_be64(pattern));
memtest(pattern, start, size);
start += size;
}
}
/* default is disabled */
static int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
if (arg)
memtest_pattern = simple_strtoul(arg, NULL, 0);
else
memtest_pattern = ARRAY_SIZE(patterns);
return 0;
}
early_param("memtest", parse_memtest);
void __init early_memtest(unsigned long start, unsigned long end)
{
unsigned int i;
unsigned int idx = 0;
if (!memtest_pattern)
return;
printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
for (i = 0; i < memtest_pattern; i++) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
}
if (idx > 0) {
printk(KERN_INFO "early_memtest: wipe out "
"test pattern from memory\n");
/* additional test with pattern 0 will do this */
do_one_pass(0, start, end);
}
}
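The verify loop in memtest() above coalesces consecutive failing cells into one [start_bad, last_bad + incr) range and only reports a range once the run is broken or the scan ends. A hypothetical userspace sketch of the same coalescing, with a made-up is_bad() standing in for the "*p != pattern" check:

#include <stdio.h>
#include <stdbool.h>

#define INCR 8UL

static bool is_bad(unsigned long addr)
{
	/* pretend the 8-byte cells at 0x1010-0x1027 and 0x1040-0x1047 fail */
	return (addr >= 0x1010 && addr < 0x1028) || addr == 0x1040;
}

int main(void)
{
	unsigned long addr, start_bad = 0, last_bad = 0;

	for (addr = 0x1000; addr < 0x1080; addr += INCR) {
		if (!is_bad(addr))
			continue;
		if (addr == last_bad + INCR) {	/* extends the current bad run */
			last_bad += INCR;
			continue;
		}
		if (start_bad)			/* a gap: close the previous run */
			printf("reserve %#lx-%#lx\n", start_bad, last_bad + INCR);
		start_bad = last_bad = addr;
	}
	if (start_bad)				/* flush the final run */
		printf("reserve %#lx-%#lx\n", start_bad, last_bad + INCR);
	return 0;				/* prints 0x1010-0x1028 and 0x1040-0x1048 */
}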

136
kernel/arch/x86/mm/mmap.c Normal file
View File

@@ -0,0 +1,136 @@
/*
* Flexible mmap layout support
*
* Based on code by Ingo Molnar and Andi Kleen, copyrighted
* as follows:
*
* Copyright 2003-2009 Red Hat Inc.
* All Rights Reserved.
* Copyright 2005 Andi Kleen, SUSE Labs.
* Copyright 2007 Jiri Kosina, SUSE Labs.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/limits.h>
#include <linux/sched.h>
#include <asm/elf.h>
static unsigned int stack_maxrandom_size(void)
{
unsigned int max = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
}
return max;
}
/*
* Top of mmap area (just below the process stack).
*
* Leave at least a ~128 MB hole, plus room for possible stack randomization.
*/
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)
/*
* True on X86_32 or when emulating IA32 on X86_64
*/
static int mmap_is_ia32(void)
{
#ifdef CONFIG_X86_32
return 1;
#endif
#ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32))
return 1;
#endif
return 0;
}
static int mmap_is_legacy(void)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
static unsigned long mmap_rnd(void)
{
unsigned long rnd = 0;
/*
* 8 bits of randomness in 32bit mmaps, 20 address space bits
* 28 bits of randomness in 64bit mmaps, 40 address space bits
*/
if (current->flags & PF_RANDOMIZE) {
if (mmap_is_ia32())
rnd = (long)get_random_int() % (1<<8);
else
rnd = (long)(get_random_int() % (1<<28));
}
return rnd << PAGE_SHIFT;
}
static unsigned long mmap_base(void)
{
unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
if (gap < MIN_GAP)
gap = MIN_GAP;
else if (gap > MAX_GAP)
gap = MAX_GAP;
return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
}
/*
* Bottom-up (legacy) layout on X86_32 did not support randomization; X86_64
* does, but not when emulating X86_32.
*/
static unsigned long mmap_legacy_base(void)
{
if (mmap_is_ia32())
return TASK_UNMAPPED_BASE;
else
return TASK_UNMAPPED_BASE + mmap_rnd();
}
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
if (mmap_is_legacy()) {
mm->mmap_base = mmap_legacy_base();
mm->get_unmapped_area = arch_get_unmapped_area;
mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
mm->unmap_area = arch_unmap_area_topdown;
}
}
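For the top-down case chosen above, the base comes from mmap_base(): clamp the stack rlimit between MIN_GAP and MAX_GAP, subtract the random offset, and page-align. A hypothetical userspace sketch with 32-bit example values (TASK_SIZE = 3 GiB, 8 bits of randomness; the stack_maxrandom_size() term in MIN_GAP is left out for brevity, and demo_mmap_base() is a made-up name):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define TASK_SIZE	0xC0000000UL			/* 3 GiB, 32-bit example */
#define MIN_GAP		(128UL * 1024 * 1024)
#define MAX_GAP		(TASK_SIZE / 6 * 5)

static unsigned long demo_mmap_base(unsigned long stack_rlimit)
{
	unsigned long gap = stack_rlimit;
	unsigned long rnd = ((unsigned long)rand() % (1 << 8)) << PAGE_SHIFT;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;
	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}

int main(void)
{
	printf("mmap base for an 8 MiB stack rlimit: %#lx\n",
	       demo_mmap_base(8UL * 1024 * 1024));	/* a bit below 0xb8000000 */
	return 0;
}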

View File

@@ -0,0 +1,481 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2005
* Jeff Muizelaar, 2006, 2007
* Pekka Paalanen, 2008 <pq@iki.fi>
*
* Derived from the read-mod example from relay-examples by Tom Zanussi.
*/
#define DEBUG 1
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/version.h>
#include <linux/kallsyms.h>
#include <asm/pgtable.h>
#include <linux/mmiotrace.h>
#include <asm/e820.h> /* for ISA_START_ADDRESS */
#include <asm/atomic.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include "pf_in.h"
#define NAME "mmiotrace: "
struct trap_reason {
unsigned long addr;
unsigned long ip;
enum reason_type type;
int active_traces;
};
struct remap_trace {
struct list_head list;
struct kmmio_probe probe;
resource_size_t phys;
unsigned long id;
};
/* Accessed per-cpu. */
static DEFINE_PER_CPU(struct trap_reason, pf_reason);
static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
static DEFINE_MUTEX(mmiotrace_mutex);
static DEFINE_SPINLOCK(trace_lock);
static atomic_t mmiotrace_enabled;
static LIST_HEAD(trace_list); /* struct remap_trace */
/*
* Locking in this file:
* - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
* - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
* and trace_lock.
* - Routines depending on is_enabled() must take trace_lock.
* - trace_list users must hold trace_lock.
* - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
* - pre/post callbacks assume the effect of is_enabled() being true.
*/
/* module parameters */
static unsigned long filter_offset;
static int nommiotrace;
static int trace_pc;
module_param(filter_offset, ulong, 0);
module_param(nommiotrace, bool, 0);
module_param(trace_pc, bool, 0);
MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
static bool is_enabled(void)
{
return atomic_read(&mmiotrace_enabled);
}
static void print_pte(unsigned long address)
{
unsigned int level;
pte_t *pte = lookup_address(address, &level);
if (!pte) {
pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
__func__, address);
return;
}
if (level == PG_LEVEL_2M) {
pr_emerg(NAME "4MB pages are not currently supported: "
"0x%08lx\n", address);
BUG();
}
pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
(unsigned long long)pte_val(*pte),
(unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
}
/*
* For some reason the pre/post pairs have been called in an
* unmatched order. Report and die.
*/
static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
{
const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
"last fault for address: 0x%08lx\n",
addr, my_reason->addr);
print_pte(addr);
print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
#ifdef __i386__
pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
regs->si, regs->di, regs->bp, regs->sp);
#else
pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
regs->ax, regs->cx, regs->dx);
pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
regs->si, regs->di, regs->bp, regs->sp);
#endif
put_cpu_var(pf_reason);
BUG();
}
static void pre(struct kmmio_probe *p, struct pt_regs *regs,
unsigned long addr)
{
struct trap_reason *my_reason = &get_cpu_var(pf_reason);
struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
const unsigned long instptr = instruction_pointer(regs);
const enum reason_type type = get_ins_type(instptr);
struct remap_trace *trace = p->private;
/* it doesn't make sense to have more than one active trace per cpu */
if (my_reason->active_traces)
die_kmmio_nesting_error(regs, addr);
else
my_reason->active_traces++;
my_reason->type = type;
my_reason->addr = addr;
my_reason->ip = instptr;
my_trace->phys = addr - trace->probe.addr + trace->phys;
my_trace->map_id = trace->id;
/*
* Only record the program counter when requested.
* It may taint clean-room reverse engineering.
*/
if (trace_pc)
my_trace->pc = instptr;
else
my_trace->pc = 0;
/*
* XXX: the timestamp recorded will be *after* the tracing has been
* done, not at the time we hit the instruction. SMP implications
* on event ordering?
*/
switch (type) {
case REG_READ:
my_trace->opcode = MMIO_READ;
my_trace->width = get_ins_mem_width(instptr);
break;
case REG_WRITE:
my_trace->opcode = MMIO_WRITE;
my_trace->width = get_ins_mem_width(instptr);
my_trace->value = get_ins_reg_val(instptr, regs);
break;
case IMM_WRITE:
my_trace->opcode = MMIO_WRITE;
my_trace->width = get_ins_mem_width(instptr);
my_trace->value = get_ins_imm_val(instptr);
break;
default:
{
unsigned char *ip = (unsigned char *)instptr;
my_trace->opcode = MMIO_UNKNOWN_OP;
my_trace->width = 0;
my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
*(ip + 2);
}
}
put_cpu_var(cpu_trace);
put_cpu_var(pf_reason);
}
static void post(struct kmmio_probe *p, unsigned long condition,
struct pt_regs *regs)
{
struct trap_reason *my_reason = &get_cpu_var(pf_reason);
struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
/* this should always return the active_trace count to 0 */
my_reason->active_traces--;
if (my_reason->active_traces) {
pr_emerg(NAME "unexpected post handler");
BUG();
}
switch (my_reason->type) {
case REG_READ:
my_trace->value = get_ins_reg_val(my_reason->ip, regs);
break;
default:
break;
}
mmio_trace_rw(my_trace);
put_cpu_var(cpu_trace);
put_cpu_var(pf_reason);
}
static void ioremap_trace_core(resource_size_t offset, unsigned long size,
void __iomem *addr)
{
static atomic_t next_id;
struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
/* These are page-unaligned. */
struct mmiotrace_map map = {
.phys = offset,
.virt = (unsigned long)addr,
.len = size,
.opcode = MMIO_PROBE
};
if (!trace) {
pr_err(NAME "kmalloc failed in ioremap\n");
return;
}
*trace = (struct remap_trace) {
.probe = {
.addr = (unsigned long)addr,
.len = size,
.pre_handler = pre,
.post_handler = post,
.private = trace
},
.phys = offset,
.id = atomic_inc_return(&next_id)
};
map.map_id = trace->id;
spin_lock_irq(&trace_lock);
if (!is_enabled()) {
kfree(trace);
goto not_enabled;
}
mmio_trace_mapping(&map);
list_add_tail(&trace->list, &trace_list);
if (!nommiotrace)
register_kmmio_probe(&trace->probe);
not_enabled:
spin_unlock_irq(&trace_lock);
}
void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
void __iomem *addr)
{
if (!is_enabled()) /* recheck and proper locking in *_core() */
return;
pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
(unsigned long long)offset, size, addr);
if ((filter_offset) && (offset != filter_offset))
return;
ioremap_trace_core(offset, size, addr);
}
static void iounmap_trace_core(volatile void __iomem *addr)
{
struct mmiotrace_map map = {
.phys = 0,
.virt = (unsigned long)addr,
.len = 0,
.opcode = MMIO_UNPROBE
};
struct remap_trace *trace;
struct remap_trace *tmp;
struct remap_trace *found_trace = NULL;
pr_debug(NAME "Unmapping %p.\n", addr);
spin_lock_irq(&trace_lock);
if (!is_enabled())
goto not_enabled;
list_for_each_entry_safe(trace, tmp, &trace_list, list) {
if ((unsigned long)addr == trace->probe.addr) {
if (!nommiotrace)
unregister_kmmio_probe(&trace->probe);
list_del(&trace->list);
found_trace = trace;
break;
}
}
map.map_id = (found_trace) ? found_trace->id : -1;
mmio_trace_mapping(&map);
not_enabled:
spin_unlock_irq(&trace_lock);
if (found_trace) {
synchronize_rcu(); /* unregister_kmmio_probe() requirement */
kfree(found_trace);
}
}
void mmiotrace_iounmap(volatile void __iomem *addr)
{
might_sleep();
if (is_enabled()) /* recheck and proper locking in *_core() */
iounmap_trace_core(addr);
}
int mmiotrace_printk(const char *fmt, ...)
{
int ret = 0;
va_list args;
unsigned long flags;
va_start(args, fmt);
spin_lock_irqsave(&trace_lock, flags);
if (is_enabled())
ret = mmio_trace_printk(fmt, args);
spin_unlock_irqrestore(&trace_lock, flags);
va_end(args);
return ret;
}
EXPORT_SYMBOL(mmiotrace_printk);
static void clear_trace_list(void)
{
struct remap_trace *trace;
struct remap_trace *tmp;
/*
* No locking required, because the caller ensures we are in a
* critical section via mutex, and is_enabled() is false,
* i.e. nothing can traverse or modify this list.
* Caller also ensures is_enabled() cannot change.
*/
list_for_each_entry(trace, &trace_list, list) {
pr_notice(NAME "purging non-iounmapped "
"trace @0x%08lx, size 0x%lx.\n",
trace->probe.addr, trace->probe.len);
if (!nommiotrace)
unregister_kmmio_probe(&trace->probe);
}
synchronize_rcu(); /* unregister_kmmio_probe() requirement */
list_for_each_entry_safe(trace, tmp, &trace_list, list) {
list_del(&trace->list);
kfree(trace);
}
}
#ifdef CONFIG_HOTPLUG_CPU
static cpumask_var_t downed_cpus;
static void enter_uniprocessor(void)
{
int cpu;
int err;
if (downed_cpus == NULL &&
!alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
pr_notice(NAME "Failed to allocate mask\n");
goto out;
}
get_online_cpus();
cpumask_copy(downed_cpus, cpu_online_mask);
cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
if (num_online_cpus() > 1)
pr_notice(NAME "Disabling non-boot CPUs...\n");
put_online_cpus();
for_each_cpu(cpu, downed_cpus) {
err = cpu_down(cpu);
if (!err)
pr_info(NAME "CPU%d is down.\n", cpu);
else
pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
}
out:
if (num_online_cpus() > 1)
pr_warning(NAME "multiple CPUs still online, "
"may miss events.\n");
}
/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
static void __ref leave_uniprocessor(void)
{
int cpu;
int err;
if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
return;
pr_notice(NAME "Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
err = cpu_up(cpu);
if (!err)
pr_info(NAME "enabled CPU%d.\n", cpu);
else
pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
}
}
#else /* !CONFIG_HOTPLUG_CPU */
static void enter_uniprocessor(void)
{
if (num_online_cpus() > 1)
pr_warning(NAME "multiple CPUs are online, may miss events. "
"Suggest booting with maxcpus=1 kernel argument.\n");
}
static void leave_uniprocessor(void)
{
}
#endif
void enable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (is_enabled())
goto out;
if (nommiotrace)
pr_info(NAME "MMIO tracing disabled.\n");
kmmio_init();
enter_uniprocessor();
spin_lock_irq(&trace_lock);
atomic_inc(&mmiotrace_enabled);
spin_unlock_irq(&trace_lock);
pr_info(NAME "enabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
}
void disable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (!is_enabled())
goto out;
spin_lock_irq(&trace_lock);
atomic_dec(&mmiotrace_enabled);
BUG_ON(is_enabled());
spin_unlock_irq(&trace_lock);
clear_trace_list(); /* guarantees: no more kmmio callbacks */
leave_uniprocessor();
kmmio_cleanup();
pr_info(NAME "disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
}
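The pre() handler above logs physical MMIO addresses even though the fault delivers a virtual one: it rebases the faulting address from the ioremap'd virtual start recorded in the remap_trace onto the physical resource start. A tiny hypothetical sketch of that translation (all addresses made up):

#include <stdio.h>

int main(void)
{
	unsigned long probe_virt = 0xf8800000UL;	/* made-up ioremap() result */
	unsigned long probe_phys = 0xfe9f0000UL;	/* made-up MMIO resource start */
	unsigned long fault_virt = 0xf8800124UL;	/* address the driver touched */

	/* same arithmetic as my_trace->phys = addr - trace->probe.addr + trace->phys */
	printf("logged phys = %#lx\n", fault_virt - probe_virt + probe_phys);
	return 0;					/* prints 0xfe9f0124 */
}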

67
kernel/arch/x86/mm/numa.c Normal file
View File

@@ -0,0 +1,67 @@
/* Common code for 32 and 64-bit NUMA */
#include <linux/topology.h>
#include <linux/module.h>
#include <linux/bootmem.h>
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
#else
# define DBG(x...)
#endif
/*
* Which logical CPUs are on which nodes
*/
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
/*
* Allocate node_to_cpumask_map based on number of available nodes
* Requires node_possible_map to be valid.
*
* Note: node_to_cpumask() is not valid until after this is done.
* (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
*/
void __init setup_node_to_cpumask_map(void)
{
unsigned int node, num = 0;
/* setup nr_node_ids if not done yet */
if (nr_node_ids == MAX_NUMNODES) {
for_each_node_mask(node, node_possible_map)
num = node;
nr_node_ids = num + 1;
}
/* allocate the map */
for (node = 0; node < nr_node_ids; node++)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
}
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
/*
* Returns a pointer to the bitmask of CPUs on Node 'node'.
*/
const struct cpumask *cpumask_of_node(int node)
{
if (node >= nr_node_ids) {
printk(KERN_WARNING
"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
node, nr_node_ids);
dump_stack();
return cpu_none_mask;
}
if (node_to_cpumask_map[node] == NULL) {
printk(KERN_WARNING
"cpumask_of_node(%d): no node_to_cpumask_map!\n",
node);
dump_stack();
return cpu_online_mask;
}
return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);
#endif

View File

@@ -0,0 +1,454 @@
/*
* Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
* August 2002: added remote node KVA remap - Martin J. Bligh
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/nodemask.h>
#include <linux/module.h>
#include <linux/kexec.h>
#include <linux/pfn.h>
#include <linux/swap.h>
#include <linux/acpi.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
#include <asm/bios_ebda.h>
#include <asm/proto.h>
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
/*
* numa interface - we expect the numa architecture specific code to have
* populated the following initialisation.
*
* 1) node_online_map - the map of all nodes configured (online) in the system
* 2) node_start_pfn - the starting page frame number for a node
* 3) node_end_pfn - the ending page frame number for a node
*/
unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
#ifdef CONFIG_DISCONTIGMEM
/*
* 4) physnode_map - the mapping between a pfn and owning node
* physnode_map keeps track of the physical memory layout of a generic
* numa node on a 64MB break (each element of the array will
* represent 64MB of memory and will be marked by the node id. So,
* if the first gig is on node 0, and the second gig is on node 1
* physnode_map will contain:
*
* physnode_map[0-15] = 0;
* physnode_map[16-31] = 1;
* physnode_map[32- ] = -1;
*/
s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);
void memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
printk(KERN_CONT "%lx ", pfn);
}
printk(KERN_CONT "\n");
}
unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long nr_pages = end_pfn - start_pfn;
if (!nr_pages)
return 0;
return (nr_pages + 1) * sizeof(struct page);
}
#endif
extern unsigned long find_max_low_pfn(void);
extern unsigned long highend_pfn, highstart_pfn;
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
unsigned long node_remap_size[MAX_NUMNODES];
static void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
static unsigned long kva_start_pfn;
static unsigned long kva_pages;
/*
* FLAT - support for basic PC memory model with discontig enabled, essentially
* a single node with all available processors in it with a flat
* memory map.
*/
int __init get_memcfg_numa_flat(void)
{
printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
node_start_pfn[0] = 0;
node_end_pfn[0] = max_pfn;
e820_register_active_regions(0, 0, max_pfn);
memory_present(0, 0, max_pfn);
node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
/* Indicate there is one node available. */
nodes_clear(node_online_map);
node_set_online(0);
return 1;
}
/*
* Find the highest page frame number we have available for the node
*/
static void __init propagate_e820_map_node(int nid)
{
if (node_end_pfn[nid] > max_pfn)
node_end_pfn[nid] = max_pfn;
/*
* if a user has given mem=XXXX, then we need to make sure
* that the node _starts_ before that, too, not just ends
*/
if (node_start_pfn[nid] > max_pfn)
node_start_pfn[nid] = max_pfn;
BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
}
/*
* Allocate memory for the pg_data_t for this node via a crude pre-bootmem
* method. For node zero take this from the bottom of memory, for
* subsequent nodes place them at node_remap_start_vaddr which contains
* node local data in physically node local memory. See setup_memory()
* for details.
*/
static void __init allocate_pgdat(int nid)
{
char buf[16];
if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
else {
unsigned long pgdat_phys;
pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
max_pfn_mapped<<PAGE_SHIFT,
sizeof(pg_data_t),
PAGE_SIZE);
NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
memset(buf, 0, sizeof(buf));
sprintf(buf, "NODE_DATA %d", nid);
reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
}
printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
nid, (unsigned long)NODE_DATA(nid));
}
/*
* In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
* virtual address space (KVA) is reserved and portions of nodes are mapped
* using it. This is to allow node-local memory to be allocated for
* structures that would normally require ZONE_NORMAL. The memory is
* allocated with alloc_remap() and callers should be prepared to allocate
* from the bootmem allocator instead.
*/
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
static void *node_remap_end_vaddr[MAX_NUMNODES];
static void *node_remap_alloc_vaddr[MAX_NUMNODES];
static unsigned long node_remap_offset[MAX_NUMNODES];
void *alloc_remap(int nid, unsigned long size)
{
void *allocation = node_remap_alloc_vaddr[nid];
size = ALIGN(size, L1_CACHE_BYTES);
if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
return NULL;
node_remap_alloc_vaddr[nid] += size;
memset(allocation, 0, size);
return allocation;
}
static void __init remap_numa_kva(void)
{
void *vaddr;
unsigned long pfn;
int node;
for_each_online_node(node) {
printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
(unsigned long)vaddr,
node_remap_start_pfn[node] + pfn);
set_pmd_pfn((ulong) vaddr,
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
}
}
}
#ifdef CONFIG_HIBERNATION
/**
* resume_map_numa_kva - add KVA mapping to the temporary page tables created
* during resume from hibernation
* @pgd_base - temporary resume page directory
*/
void resume_map_numa_kva(pgd_t *pgd_base)
{
int node;
for_each_online_node(node) {
unsigned long start_va, start_pfn, size, pfn;
start_va = (unsigned long)node_remap_start_vaddr[node];
start_pfn = node_remap_start_pfn[node];
size = node_remap_size[node];
printk(KERN_DEBUG "%s: node %d\n", __func__, node);
for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
pgd_t *pgd = pgd_base + pgd_index(vaddr);
pud_t *pud = pud_offset(pgd, vaddr);
pmd_t *pmd = pmd_offset(pud, vaddr);
set_pmd(pmd, pfn_pmd(start_pfn + pfn,
PAGE_KERNEL_LARGE_EXEC));
printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
__func__, vaddr, start_pfn + pfn);
}
}
}
#endif
static __init unsigned long calculate_numa_remap_pages(void)
{
int nid;
unsigned long size, reserve_pages = 0;
for_each_online_node(nid) {
u64 node_kva_target;
u64 node_kva_final;
/*
* The ACPI/SRAT node info can show hot-add memory zones
* where memory could be added but is not currently present.
*/
printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
nid, node_start_pfn[nid], node_end_pfn[nid]);
if (node_start_pfn[nid] > max_pfn)
continue;
if (!node_end_pfn[nid])
continue;
if (node_end_pfn[nid] > max_pfn)
node_end_pfn[nid] = max_pfn;
/* ensure the remap includes space for the pgdat. */
size = node_remap_size[nid] + sizeof(pg_data_t);
/* convert size to large (pmd size) pages, rounding up */
size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
/* now the roundup is correct, convert to PAGE_SIZE pages */
size = size * PTRS_PER_PTE;
node_kva_target = round_down(node_end_pfn[nid] - size,
PTRS_PER_PTE);
node_kva_target <<= PAGE_SHIFT;
do {
node_kva_final = find_e820_area(node_kva_target,
((u64)node_end_pfn[nid])<<PAGE_SHIFT,
((u64)size)<<PAGE_SHIFT,
LARGE_PAGE_BYTES);
node_kva_target -= LARGE_PAGE_BYTES;
} while (node_kva_final == -1ULL &&
(node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
if (node_kva_final == -1ULL)
panic("Can not get kva ram\n");
node_remap_size[nid] = size;
node_remap_offset[nid] = reserve_pages;
reserve_pages += size;
printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
" node %d at %llx\n",
size, nid, node_kva_final>>PAGE_SHIFT);
/*
* Prevent the KVA address from falling below max_low_pfn; we want it
* there on systems with less memory later.
* The layout will be: KVA address, then KVA RAM.
*
* We are supposed to record only the range below max_low_pfn, but there
* could be holes in high memory, and the check only looks at
* page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide whether a
* page is free.
* So reserve_early() the range here, and hope we don't run out of that array.
*/
reserve_early(node_kva_final,
node_kva_final+(((u64)size)<<PAGE_SHIFT),
"KVA RAM");
node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
remove_active_range(nid, node_remap_start_pfn[nid],
node_remap_start_pfn[nid] + size);
}
printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
reserve_pages);
return reserve_pages;
}
static void init_remap_allocator(int nid)
{
node_remap_start_vaddr[nid] = pfn_to_kaddr(
kva_start_pfn + node_remap_offset[nid]);
node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
(node_remap_size[nid] * PAGE_SIZE);
node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
ALIGN(sizeof(pg_data_t), PAGE_SIZE);
printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
(ulong) node_remap_start_vaddr[nid],
(ulong) node_remap_end_vaddr[nid]);
}
void __init initmem_init(unsigned long start_pfn,
unsigned long end_pfn)
{
int nid;
long kva_target_pfn;
/*
* When mapping a NUMA machine we allocate the node_mem_map arrays
* from node local memory. They are then mapped directly into KVA
* between zone normal and vmalloc space. Calculate the size of
* this space and use it to adjust the boundary between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
get_memcfg_numa();
kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
do {
kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
max_low_pfn<<PAGE_SHIFT,
kva_pages<<PAGE_SHIFT,
PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
kva_target_pfn -= PTRS_PER_PTE;
} while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
if (kva_start_pfn == -1UL)
panic("Can not get kva space\n");
printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
kva_start_pfn, max_low_pfn);
printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
/* avoid clash with initrd */
reserve_early(kva_start_pfn<<PAGE_SHIFT,
(kva_start_pfn + kva_pages)<<PAGE_SHIFT,
"KVA PG");
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
max_low_pfn, highstart_pfn);
printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
for_each_online_node(nid) {
init_remap_allocator(nid);
allocate_pgdat(nid);
}
remap_numa_kva();
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
for_each_online_node(nid)
propagate_e820_map_node(nid);
for_each_online_node(nid) {
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
}
setup_bootmem_allocator();
}
#ifdef CONFIG_MEMORY_HOTPLUG
static int paddr_to_nid(u64 addr)
{
int nid;
unsigned long pfn = PFN_DOWN(addr);
for_each_node(nid)
if (node_start_pfn[nid] <= pfn &&
pfn < node_end_pfn[nid])
return nid;
return -1;
}
/*
* This function is used to look up the node id BEFORE the memmap and
* mem_section are initialized (pfn_to_nid() can't be used yet).
* If _PXM is not defined in the ACPI DSDT, the node id must be found this way.
*/
int memory_add_physaddr_to_nid(u64 addr)
{
int nid = paddr_to_nid(addr);
return (nid >= 0) ? nid : 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
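The size arithmetic in calculate_numa_remap_pages() above first rounds the per-node mem_map plus pg_data_t up to whole large (PMD-sized) pages and then converts back to 4 KiB pages before carving the KVA area out of the top of the node. A hypothetical userspace sketch with made-up sizes (PAE-style PTRS_PER_PTE = 512 assumed just for illustration):

#include <stdio.h>

#define PAGE_SIZE	 4096UL
#define PTRS_PER_PTE	 512UL
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)	/* 2 MiB */

int main(void)
{
	unsigned long mem_map_bytes = 3UL * 1024 * 1024;	/* made-up lmem_map size */
	unsigned long pgdat_bytes = 32UL * 1024;		/* made-up sizeof(pg_data_t) */
	unsigned long size = mem_map_bytes + pgdat_bytes;

	size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; /* round up to large pages */
	size = size * PTRS_PER_PTE;				  /* back to 4 KiB pages */
	printf("reserve %lu small pages (%lu MiB) of KVA\n",
	       size, size * PAGE_SIZE >> 20);			  /* 1024 pages, 4 MiB */
	return 0;
}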

View File

@@ -0,0 +1,754 @@
/*
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/k8.h>
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
struct memnode memnode;
s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;
DEFINE_PER_CPU(int, node_number) = 0;
EXPORT_PER_CPU_SYMBOL(node_number);
/*
* Map cpu index to node index
*/
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
/*
* Given a shift value, try to populate memnodemap[]
* Returns :
* 1 if OK
* 0 if memnodemap[] too small (or shift too small)
* -1 if nodes overlap or RAM is lost (shift too big)
*/
static int __init populate_memnodemap(const struct bootnode *nodes,
int numnodes, int shift, int *nodeids)
{
unsigned long addr, end;
int i, res = -1;
memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
for (i = 0; i < numnodes; i++) {
addr = nodes[i].start;
end = nodes[i].end;
if (addr >= end)
continue;
if ((end >> shift) >= memnodemapsize)
return 0;
do {
if (memnodemap[addr >> shift] != NUMA_NO_NODE)
return -1;
if (!nodeids)
memnodemap[addr >> shift] = i;
else
memnodemap[addr >> shift] = nodeids[i];
addr += (1UL << shift);
} while (addr < end);
res = 1;
}
return res;
}
static int __init allocate_cachealigned_memnodemap(void)
{
unsigned long addr;
memnodemap = memnode.embedded_map;
if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
return 0;
addr = 0x8000;
nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
nodemap_size, L1_CACHE_BYTES);
if (nodemap_addr == -1UL) {
printk(KERN_ERR
"NUMA: Unable to allocate Memory to Node hash map\n");
nodemap_addr = nodemap_size = 0;
return -1;
}
memnodemap = phys_to_virt(nodemap_addr);
reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
nodemap_addr, nodemap_addr + nodemap_size);
return 0;
}
/*
* The position of the lowest set bit among the node start and end addresses
* gives the maximum possible hash shift.
*/
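/*
 * Editorial worked example (not in the original file): with two nodes
 * covering [0, 16GB) and [16GB, 32GB), the only non-zero start address
 * is 16GB = 1UL << 34, so the lowest set bit gives shift = 34 and
 * memnodemapsize = (32GB >> 34) + 1 = 3; phys_to_nid() then resolves an
 * address with a single memnodemap[addr >> 34] lookup.
 */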
static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
int numnodes)
{
int i, nodes_used = 0;
unsigned long start, end;
unsigned long bitfield = 0, memtop = 0;
for (i = 0; i < numnodes; i++) {
start = nodes[i].start;
end = nodes[i].end;
if (start >= end)
continue;
bitfield |= start;
nodes_used++;
if (end > memtop)
memtop = end;
}
if (nodes_used <= 1)
i = 63;
else
i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
memnodemapsize = (memtop >> i)+1;
return i;
}
int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
int *nodeids)
{
int shift;
shift = extract_lsb_from_nodes(nodes, numnodes);
if (allocate_cachealigned_memnodemap())
return -1;
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
printk(KERN_INFO "Your memory is not aligned you need to "
"rebuild your kernel with a bigger NODEMAPSIZE "
"shift=%d\n", shift);
return -1;
}
return shift;
}
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
return phys_to_nid(pfn << PAGE_SHIFT);
}
static void * __init early_node_mem(int nodeid, unsigned long start,
unsigned long end, unsigned long size,
unsigned long align)
{
unsigned long mem = find_e820_area(start, end, size, align);
void *ptr;
if (mem != -1L)
return __va(mem);
ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
if (ptr == NULL) {
printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
size, nodeid);
return NULL;
}
return ptr;
}
/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
int nid;
if (!end)
return;
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
*/
if (end && (end - start) < NODE_MIN_SIZE)
return;
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
start_pfn = start >> PAGE_SHIFT;
last_pfn = end >> PAGE_SHIFT;
node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
SMP_CACHE_BYTES);
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
nodedata_phys + pgdat_size - 1);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
/*
* Find a place for the bootmem map
* nodedata_phys could have been put on another node by alloc_bootmem,
* so make sure bootmap_start is not too small; otherwise
* early_node_mem() would grab that area with find_e820_area() instead
* of alloc_bootmem(), which could clash with the reserved range.
*/
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
nid = phys_to_nid(nodedata_phys);
if (nid == nodeid)
bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
else
bootmap_start = roundup(start, PAGE_SIZE);
/*
* SMP_CACHE_BYTES alignment could be enough, but init_bootmem_node
* likes bootmap_start to be PAGE_SIZE aligned.
*/
bootmap = early_node_mem(nodeid, bootmap_start, end,
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
if (nodedata_phys < start || nodedata_phys >= end)
free_bootmem(nodedata_phys, pgdat_size);
node_data[nodeid] = NULL;
return;
}
bootmap_start = __pa(bootmap);
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
bootmap_start >> PAGE_SHIFT,
start_pfn, last_pfn);
printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
bootmap_start, bootmap_start + bootmap_size - 1,
bootmap_pages);
free_bootmem_with_active_regions(nodeid, end);
/*
* Convert early reservations to bootmem reservations now;
* otherwise early_node_mem could hand out memory that is
* early-reserved on a previous node.
*/
early_res_to_bootmem(start, end);
/*
* In some cases early_node_mem uses alloc_bootmem and gets a
* range on another node; don't reserve that again.
*/
if (nid != nodeid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
else
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
pgdat_size, BOOTMEM_DEFAULT);
nid = phys_to_nid(bootmap_start);
if (nid != nodeid)
printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
else
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
node_set_online(nodeid);
}
/*
* There are unfortunately some poorly designed mainboards around that
* only connect memory to a single CPU. This breaks the 1:1 cpu->node
* mapping. To avoid this, fill in the mapping for all possible CPUs,
* as the number of CPUs is not known yet. We round-robin over the
* existing online nodes.
*/
void __init numa_init_array(void)
{
int rr, i;
rr = first_node(node_online_map);
for (i = 0; i < nr_cpu_ids; i++) {
if (early_cpu_to_node(i) != NUMA_NO_NODE)
continue;
numa_set_node(i, rr);
rr = next_node(rr, node_online_map);
if (rr == MAX_NUMNODES)
rr = first_node(node_online_map);
}
}
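/*
* Example (hypothetical box): with nodes 0 and 1 online and eight possible
* CPUs that got no node from the firmware, the loop above assigns them
* round-robin as 0, 1, 0, 1, 0, 1, 0, 1.
*/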
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static char *cmdline __initdata;
/*
* Sets up node nid to cover the range from addr to addr + size. If the end
* boundary is greater than max_addr, then max_addr is used instead.
* The return value is 0 if there is additional memory left for
* allocation past addr and -1 otherwise. addr is adjusted to be at
* the end of the node.
*/
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
u64 size, u64 max_addr)
{
int ret = 0;
nodes[nid].start = *addr;
*addr += size;
if (*addr >= max_addr) {
*addr = max_addr;
ret = -1;
}
nodes[nid].end = *addr;
node_set(nid, node_possible_map);
printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
nodes[nid].start, nodes[nid].end,
(nodes[nid].end - nodes[nid].start) >> 20);
return ret;
}
/*
* Splits num_nodes nodes up equally starting at node_start. The return value
* is the number of nodes split up and addr is adjusted to be at the end of the
* last node allocated.
*/
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
u64 max_addr, int node_start,
int num_nodes)
{
unsigned int big;
u64 size;
int i;
if (num_nodes <= 0)
return -1;
if (num_nodes > MAX_NUMNODES)
num_nodes = MAX_NUMNODES;
size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
num_nodes;
/*
* Calculate the number of big nodes that can be allocated as a result
* of consolidating the leftovers.
*/
big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
FAKE_NODE_MIN_SIZE;
/* Round down to nearest FAKE_NODE_MIN_SIZE. */
size &= FAKE_NODE_MIN_HASH_MASK;
if (!size) {
printk(KERN_ERR "Not enough memory for each node. "
"NUMA emulation disabled.\n");
return -1;
}
for (i = node_start; i < num_nodes + node_start; i++) {
u64 end = *addr + size;
if (i < big)
end += FAKE_NODE_MIN_SIZE;
/*
* The final node can have the remaining system RAM. Other
* nodes receive roughly the same amount of available pages.
*/
if (i == num_nodes + node_start - 1)
end = max_addr;
else
while (end - *addr - e820_hole_size(*addr, end) <
size) {
end += FAKE_NODE_MIN_SIZE;
if (end > max_addr) {
end = max_addr;
break;
}
}
if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
break;
}
return i - node_start + 1;
}
/*
* Splits the remaining system RAM into chunks of the given size. The remaining memory is
* always assigned to a final node and can be asymmetric. Returns the number of
* nodes split.
*/
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
u64 max_addr, int node_start, u64 size)
{
int i = node_start;
size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
while (!setup_node_range(i++, nodes, addr, size, max_addr))
;
return i - node_start;
}
/*
* Sets up the system RAM area from start_pfn to last_pfn according to the
* numa=fake command-line option.
*/
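/*
* Command-line forms accepted by the parser below, with example values
* (sizes are in megabytes; the numbers are made up for illustration):
* numa=fake=4 split RAM into 4 equally sized fake nodes
* numa=fake=2*512,1024 two 512MB nodes, one 1024MB node, rest in a final node
* numa=fake=2*512, two 512MB nodes, leave the remaining RAM unallocated
* numa=fake=2*512,*128 two 512MB nodes, split the rest into 128MB nodes
*/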
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
{
u64 size, addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT;
int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
memset(&nodes, 0, sizeof(nodes));
/*
* If the numa=fake command-line is just a single number N, split the
* system RAM into N fake nodes.
*/
if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
long n = simple_strtol(cmdline, NULL, 0);
num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
if (num_nodes < 0)
return num_nodes;
goto out;
}
/* Parse the command line. */
for (coeff_flag = 0; ; cmdline++) {
if (*cmdline && isdigit(*cmdline)) {
num = num * 10 + *cmdline - '0';
continue;
}
if (*cmdline == '*') {
if (num > 0)
coeff = num;
coeff_flag = 1;
}
if (!*cmdline || *cmdline == ',') {
if (!coeff_flag)
coeff = 1;
/*
* Round down to the nearest FAKE_NODE_MIN_SIZE.
* Command-line coefficients are in megabytes.
*/
size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
if (size)
for (i = 0; i < coeff; i++, num_nodes++)
if (setup_node_range(num_nodes, nodes,
&addr, size, max_addr) < 0)
goto done;
if (!*cmdline)
break;
coeff_flag = 0;
coeff = -1;
}
num = 0;
}
done:
if (!num_nodes)
return -1;
/* Fill remainder of system RAM, if appropriate. */
if (addr < max_addr) {
if (coeff_flag && coeff < 0) {
/* Split remaining nodes into num-sized chunks */
num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
num_nodes, num);
goto out;
}
switch (*(cmdline - 1)) {
case '*':
/* Split remaining nodes into coeff chunks */
if (coeff <= 0)
break;
num_nodes += split_nodes_equally(nodes, &addr, max_addr,
num_nodes, coeff);
break;
case ',':
/* Do not allocate remaining system RAM */
break;
default:
/* Give one final node */
setup_node_range(num_nodes, nodes, &addr,
max_addr - addr, max_addr);
num_nodes++;
}
}
out:
memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
if (memnode_shift < 0) {
memnode_shift = 0;
printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
"disabled.\n");
return -1;
}
/*
* We need to vacate all active ranges that may have been registered by
* SRAT and set acpi_numa to -1 so that srat_disabled() always returns
* true. NUMA emulation has succeeded so we will not scan ACPI nodes.
*/
remove_all_active_ranges();
#ifdef CONFIG_ACPI_NUMA
acpi_numa = -1;
#endif
for_each_node_mask(i, node_possible_map) {
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
acpi_fake_nodes(nodes, num_nodes);
numa_init_array();
return 0;
}
#endif /* CONFIG_NUMA_EMU */
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
{
int i;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
if (cmdline && !numa_emulation(start_pfn, last_pfn))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
#ifdef CONFIG_ACPI_NUMA
if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
last_pfn << PAGE_SHIFT))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
#ifdef CONFIG_K8_NUMA
if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
last_pfn<<PAGE_SHIFT))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
printk(KERN_INFO "%s\n",
numa_off ? "NUMA turned off" : "No NUMA configuration found");
printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
start_pfn << PAGE_SHIFT,
last_pfn << PAGE_SHIFT);
/* setup dummy node covering all memory */
memnode_shift = 63;
memnodemap = memnode.embedded_map;
memnodemap[0] = 0;
node_set_online(0);
node_set(0, node_possible_map);
for (i = 0; i < nr_cpu_ids; i++)
numa_set_node(i, 0);
e820_register_active_regions(0, start_pfn, last_pfn);
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
}
unsigned long __init numa_free_all_bootmem(void)
{
unsigned long pages = 0;
int i;
for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
return pages;
}
static __init int numa_setup(char *opt)
{
if (!opt)
return -EINVAL;
if (!strncmp(opt, "off", 3))
numa_off = 1;
#ifdef CONFIG_NUMA_EMU
if (!strncmp(opt, "fake=", 5))
cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
if (!strncmp(opt, "noacpi", 6))
acpi_numa = -1;
#endif
return 0;
}
early_param("numa", numa_setup);
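/*
* "numa=" boot options handled above (example values):
* numa=off disable NUMA handling entirely
* numa=fake=<...> NUMA emulation, see the parser in numa_emulation() (CONFIG_NUMA_EMU)
* numa=noacpi ignore the ACPI SRAT table (CONFIG_ACPI_NUMA only)
*/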
#ifdef CONFIG_NUMA
/*
* Setup early cpu_to_node.
*
* Populate cpu_to_node[] only if x86_cpu_to_apicid[],
* and apicid_to_node[] tables have valid entries for a CPU.
* This means we skip cpu_to_node[] initialisation for NUMA
* emulation and the fake-node case (when running a kernel compiled
* for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
* is already initialized in a round-robin manner at numa_init_array,
* prior to this call, and this initialization is good enough
* for the fake NUMA cases.
*
* Called before the per_cpu areas are setup.
*/
void __init init_cpu_to_node(void)
{
int cpu;
u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
BUG_ON(cpu_to_apicid == NULL);
for_each_possible_cpu(cpu) {
int node;
u16 apicid = cpu_to_apicid[cpu];
if (apicid == BAD_APICID)
continue;
node = apicid_to_node[apicid];
if (node == NUMA_NO_NODE)
continue;
if (!node_online(node))
continue;
numa_set_node(cpu, node);
}
}
#endif
void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
/* early setting, no percpu area yet */
if (cpu_to_node_map) {
cpu_to_node_map[cpu] = node;
return;
}
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
dump_stack();
return;
}
#endif
per_cpu(x86_cpu_to_node_map, cpu) = node;
if (node != NUMA_NO_NODE)
per_cpu(node_number, cpu) = node;
}
void __cpuinit numa_clear_node(int cpu)
{
numa_set_node(cpu, NUMA_NO_NODE);
}
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void __cpuinit numa_add_cpu(int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
void __cpuinit numa_remove_cpu(int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
#else /* CONFIG_DEBUG_PER_CPU_MAPS */
/*
* --------- debug versions of the numa functions ---------
*/
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
int node = early_cpu_to_node(cpu);
struct cpumask *mask;
char buf[64];
mask = node_to_cpumask_map[node];
if (mask == NULL) {
printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
dump_stack();
return;
}
if (enable)
cpumask_set_cpu(cpu, mask);
else
cpumask_clear_cpu(cpu, mask);
cpulist_scnprintf(buf, sizeof(buf), mask);
printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
}
void __cpuinit numa_add_cpu(int cpu)
{
numa_set_cpumask(cpu, 1);
}
void __cpuinit numa_remove_cpu(int cpu)
{
numa_set_cpumask(cpu, 0);
}
int cpu_to_node(int cpu)
{
if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
printk(KERN_WARNING
"cpu_to_node(%d): usage too early!\n", cpu);
dump_stack();
return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
}
return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(cpu_to_node);
/*
* Same function as cpu_to_node() but used if called before the
* per_cpu areas are setup.
*/
int early_cpu_to_node(int cpu)
{
if (early_per_cpu_ptr(x86_cpu_to_node_map))
return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
if (!cpu_possible(cpu)) {
printk(KERN_WARNING
"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
dump_stack();
return NUMA_NO_NODE;
}
return per_cpu(x86_cpu_to_node_map, cpu);
}
/*
* --------- end of debug versions of the numa functions ---------
*/
#endif /* CONFIG_DEBUG_PER_CPU_MAPS */

View File

@@ -0,0 +1,262 @@
/*
* self test for change_page_attr.
*
* Clears a test pte bit on random pages in the direct mapping,
* then reverts and compares the page tables before and afterwards.
*/
#include <linux/bootmem.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include <asm/kdebug.h>
/*
* Only print the results of the first pass:
*/
static __read_mostly int print = 1;
enum {
NTEST = 400,
#ifdef CONFIG_X86_64
LPS = (1 << PMD_SHIFT),
#elif defined(CONFIG_X86_PAE)
LPS = (1 << PMD_SHIFT),
#else
LPS = (1 << 22),
#endif
GPS = (1<<30)
};
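/*
* For reference: PMD_SHIFT is 21 on 64-bit and on 32-bit PAE, so LPS is the
* 2MB large-page size there; on 32-bit non-PAE 1 << 22 is the 4MB large
* page; GPS is the 1GB page size used only by 64-bit gigantic pages.
*/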
#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
static int pte_testbit(pte_t pte)
{
return pte_flags(pte) & _PAGE_UNUSED1;
}
struct split_state {
long lpg, gpg, spg, exec;
long min_exec, max_exec;
};
static int print_split(struct split_state *s)
{
long i, expected, missed = 0;
int err = 0;
s->lpg = s->gpg = s->spg = s->exec = 0;
s->min_exec = ~0UL;
s->max_exec = 0;
for (i = 0; i < max_pfn_mapped; ) {
unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
unsigned int level;
pte_t *pte;
pte = lookup_address(addr, &level);
if (!pte) {
missed++;
i++;
continue;
}
if (level == PG_LEVEL_1G && sizeof(long) == 8) {
s->gpg++;
i += GPS/PAGE_SIZE;
} else if (level == PG_LEVEL_2M) {
if (!(pte_val(*pte) & _PAGE_PSE)) {
printk(KERN_ERR
"%lx level %d but not PSE %Lx\n",
addr, level, (u64)pte_val(*pte));
err = 1;
}
s->lpg++;
i += LPS/PAGE_SIZE;
} else {
s->spg++;
i++;
}
if (!(pte_val(*pte) & _PAGE_NX)) {
s->exec++;
if (addr < s->min_exec)
s->min_exec = addr;
if (addr > s->max_exec)
s->max_exec = addr;
}
}
if (print) {
printk(KERN_INFO
" 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
s->spg, s->lpg, s->gpg, s->exec,
s->min_exec != ~0UL ? s->min_exec : 0,
s->max_exec, missed);
}
expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
if (expected != i) {
printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
max_pfn_mapped, expected);
return 1;
}
return err;
}
static unsigned long addr[NTEST];
static unsigned int len[NTEST];
/* Change the global bit on random pages in the direct mapping */
static int pageattr_test(void)
{
struct split_state sa, sb, sc;
unsigned long *bm;
pte_t *pte, pte0;
int failed = 0;
unsigned int level;
int i, k;
int err;
unsigned long test_addr;
if (print)
printk(KERN_INFO "CPA self-test:\n");
bm = vmalloc((max_pfn_mapped + 7) / 8);
if (!bm) {
printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
return -ENOMEM;
}
memset(bm, 0, (max_pfn_mapped + 7) / 8);
failed += print_split(&sa);
srandom32(100);
for (i = 0; i < NTEST; i++) {
unsigned long pfn = random32() % max_pfn_mapped;
addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
len[i] = random32() % 100;
len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
if (len[i] == 0)
len[i] = 1;
pte = NULL;
pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
for (k = 0; k < len[i]; k++) {
pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
!(pte_val(*pte) & _PAGE_PRESENT)) {
addr[i] = 0;
break;
}
if (k == 0) {
pte0 = *pte;
} else {
if (pgprot_val(pte_pgprot(*pte)) !=
pgprot_val(pte_pgprot(pte0))) {
len[i] = k;
break;
}
}
if (test_bit(pfn + k, bm)) {
len[i] = k;
break;
}
__set_bit(pfn + k, bm);
}
if (!addr[i] || !pte || !k) {
addr[i] = 0;
continue;
}
test_addr = addr[i];
err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;
}
pte = lookup_address(addr[i], &level);
if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
pte ? (u64)pte_val(*pte) : 0ULL);
failed++;
}
if (level != PG_LEVEL_4K) {
printk(KERN_ERR "CPA %lx: unexpected level %d\n",
addr[i], level);
failed++;
}
}
vfree(bm);
failed += print_split(&sb);
for (i = 0; i < NTEST; i++) {
if (!addr[i])
continue;
pte = lookup_address(addr[i], &level);
if (!pte) {
printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
failed++;
continue;
}
test_addr = addr[i];
err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
}
pte = lookup_address(addr[i], &level);
if (!pte || pte_testbit(*pte)) {
printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
failed++;
}
}
failed += print_split(&sc);
if (failed) {
WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
return -EINVAL;
} else {
if (print)
printk(KERN_INFO "ok.\n");
}
return 0;
}
static int do_pageattr_test(void *__unused)
{
while (!kthread_should_stop()) {
schedule_timeout_interruptible(HZ*30);
if (pageattr_test() < 0)
break;
if (print)
print--;
}
return 0;
}
static int start_pageattr_test(void)
{
struct task_struct *p;
p = kthread_create(do_pageattr_test, NULL, "pageattr-test");
if (!IS_ERR(p))
wake_up_process(p);
else
WARN_ON(1);
return 0;
}
module_init(start_pageattr_test);

File diff suppressed because it is too large

1028
kernel/arch/x86/mm/pat.c Normal file

File diff suppressed because it is too large

536
kernel/arch/x86/mm/pf_in.c Normal file
View File

@@ -0,0 +1,536 @@
/*
* Fault Injection Test harness (FI)
* Copyright (C) Intel Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*
*/
/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
* Copyright by Intel Corp., 2002
* Louis Zhuang (louis.zhuang@intel.com)
*
* Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
*/
#include <linux/module.h>
#include <linux/ptrace.h> /* struct pt_regs */
#include "pf_in.h"
#ifdef __i386__
/* IA32 Manual 3, 2-1 */
static unsigned char prefix_codes[] = {
0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
0x65, 0x2E, 0x3E, 0x66, 0x67
};
/* IA32 Manual 3, 3-432*/
static unsigned int reg_rop[] = {
0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
static unsigned int reg_wop[] = { 0x88, 0x89 };
static unsigned int imm_wop[] = { 0xC6, 0xC7 };
/* IA32 Manual 3, 3-432*/
static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
static unsigned int rw32[] = {
0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
static unsigned int mw16[] = { 0xB70F, 0xBF0F };
static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
static unsigned int mw64[] = {};
#else /* not __i386__ */
static unsigned char prefix_codes[] = {
0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
0xF0, 0xF3, 0xF2,
/* REX Prefixes */
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
};
/* AMD64 Manual 3, Appendix A*/
static unsigned int reg_rop[] = {
0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
static unsigned int reg_wop[] = { 0x88, 0x89 };
static unsigned int imm_wop[] = { 0xC6, 0xC7 };
static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
static unsigned int rw32[] = {
0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
/* 8 bit only */
static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
/* 16 bit only */
static unsigned int mw16[] = { 0xB70F, 0xBF0F };
/* 16 or 32 bit */
static unsigned int mw32[] = { 0xC7 };
/* 16, 32 or 64 bit */
static unsigned int mw64[] = { 0x89, 0x8B };
#endif /* not __i386__ */
struct prefix_bits {
unsigned shorted:1;
unsigned enlarged:1;
unsigned rexr:1;
unsigned rex:1;
};
static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
{
int i;
unsigned char *p = addr;
prf->shorted = 0;
prf->enlarged = 0;
prf->rexr = 0;
prf->rex = 0;
restart:
for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
if (*p == prefix_codes[i]) {
if (*p == 0x66)
prf->shorted = 1;
#ifdef __amd64__
if ((*p & 0xf8) == 0x48)
prf->enlarged = 1;
if ((*p & 0xf4) == 0x44)
prf->rexr = 1;
if ((*p & 0xf0) == 0x40)
prf->rex = 1;
#endif
p++;
goto restart;
}
}
return (p - addr);
}
static int get_opcode(unsigned char *addr, unsigned int *opcode)
{
int len;
if (*addr == 0x0F) {
/* 0x0F introduces a two-byte (extension) opcode */
*opcode = *(unsigned short *)addr;
len = 2;
} else {
*opcode = *addr;
len = 1;
}
return len;
}
#define CHECK_OP_TYPE(opcode, array, type) \
for (i = 0; i < ARRAY_SIZE(array); i++) { \
if (array[i] == opcode) { \
rv = type; \
goto exit; \
} \
}
enum reason_type get_ins_type(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
struct prefix_bits prf;
int i;
enum reason_type rv = OTHERS;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
exit:
return rv;
}
#undef CHECK_OP_TYPE
static unsigned int get_ins_reg_width(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
struct prefix_bits prf;
int i;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(rw8); i++)
if (rw8[i] == opcode)
return 1;
for (i = 0; i < ARRAY_SIZE(rw32); i++)
if (rw32[i] == opcode)
return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
}
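/*
* Worked decode example (hand-assembled bytes, purely illustrative): for
* the i386 instruction 66 89 08 (mov %cx,(%eax)), skip_prefix() consumes
* the 0x66 operand-size prefix and sets prf.shorted, get_opcode() returns
* 0x89, which is listed in rw32[], so the reported register width is 2.
*/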
unsigned int get_ins_mem_width(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
struct prefix_bits prf;
int i;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(mw8); i++)
if (mw8[i] == opcode)
return 1;
for (i = 0; i < ARRAY_SIZE(mw16); i++)
if (mw16[i] == opcode)
return 2;
for (i = 0; i < ARRAY_SIZE(mw32); i++)
if (mw32[i] == opcode)
return prf.shorted ? 2 : 4;
for (i = 0; i < ARRAY_SIZE(mw64); i++)
if (mw64[i] == opcode)
return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
}
/*
* Define the register identifiers encoded in the mod/rm byte.
* Note: these are NOT the same as in ptrace-abi.h.
*/
enum {
arg_AL = 0,
arg_CL = 1,
arg_DL = 2,
arg_BL = 3,
arg_AH = 4,
arg_CH = 5,
arg_DH = 6,
arg_BH = 7,
arg_AX = 0,
arg_CX = 1,
arg_DX = 2,
arg_BX = 3,
arg_SP = 4,
arg_BP = 5,
arg_SI = 6,
arg_DI = 7,
#ifdef __amd64__
arg_R8 = 8,
arg_R9 = 9,
arg_R10 = 10,
arg_R11 = 11,
arg_R12 = 12,
arg_R13 = 13,
arg_R14 = 14,
arg_R15 = 15
#endif
};
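/*
* Layout reminder for the mod/rm byte used below (Intel SDM vol. 2,
* table 2-1): bits 7:6 are "mod", bits 5:3 select the register operand and
* bits 2:0 the r/m operand; a REX.R prefix extends the register field to
* four bits, which is why get_ins_reg_val() ORs in (prf.rexr << 3).
*/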
static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
{
unsigned char *rv = NULL;
switch (no) {
case arg_AL:
rv = (unsigned char *)&regs->ax;
break;
case arg_BL:
rv = (unsigned char *)&regs->bx;
break;
case arg_CL:
rv = (unsigned char *)&regs->cx;
break;
case arg_DL:
rv = (unsigned char *)&regs->dx;
break;
#ifdef __amd64__
case arg_R8:
rv = (unsigned char *)&regs->r8;
break;
case arg_R9:
rv = (unsigned char *)&regs->r9;
break;
case arg_R10:
rv = (unsigned char *)&regs->r10;
break;
case arg_R11:
rv = (unsigned char *)&regs->r11;
break;
case arg_R12:
rv = (unsigned char *)&regs->r12;
break;
case arg_R13:
rv = (unsigned char *)&regs->r13;
break;
case arg_R14:
rv = (unsigned char *)&regs->r14;
break;
case arg_R15:
rv = (unsigned char *)&regs->r15;
break;
#endif
default:
break;
}
if (rv)
return rv;
if (rex) {
/*
* If REX prefix exists, access low bytes of SI etc.
* instead of AH etc.
*/
switch (no) {
case arg_SI:
rv = (unsigned char *)&regs->si;
break;
case arg_DI:
rv = (unsigned char *)&regs->di;
break;
case arg_BP:
rv = (unsigned char *)&regs->bp;
break;
case arg_SP:
rv = (unsigned char *)&regs->sp;
break;
default:
break;
}
} else {
switch (no) {
case arg_AH:
rv = 1 + (unsigned char *)&regs->ax;
break;
case arg_BH:
rv = 1 + (unsigned char *)&regs->bx;
break;
case arg_CH:
rv = 1 + (unsigned char *)&regs->cx;
break;
case arg_DH:
rv = 1 + (unsigned char *)&regs->dx;
break;
default:
break;
}
}
if (!rv)
printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
return rv;
}
static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
{
unsigned long *rv = NULL;
switch (no) {
case arg_AX:
rv = &regs->ax;
break;
case arg_BX:
rv = &regs->bx;
break;
case arg_CX:
rv = &regs->cx;
break;
case arg_DX:
rv = &regs->dx;
break;
case arg_SP:
rv = &regs->sp;
break;
case arg_BP:
rv = &regs->bp;
break;
case arg_SI:
rv = &regs->si;
break;
case arg_DI:
rv = &regs->di;
break;
#ifdef __amd64__
case arg_R8:
rv = &regs->r8;
break;
case arg_R9:
rv = &regs->r9;
break;
case arg_R10:
rv = &regs->r10;
break;
case arg_R11:
rv = &regs->r11;
break;
case arg_R12:
rv = &regs->r12;
break;
case arg_R13:
rv = &regs->r13;
break;
case arg_R14:
rv = &regs->r14;
break;
case arg_R15:
rv = &regs->r15;
break;
#endif
default:
printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
}
return rv;
}
unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
{
unsigned int opcode;
unsigned char mod_rm;
int reg;
unsigned char *p;
struct prefix_bits prf;
int i;
unsigned long rv;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
if (reg_rop[i] == opcode) {
rv = REG_READ;
goto do_work;
}
for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
if (reg_wop[i] == opcode) {
rv = REG_WRITE;
goto do_work;
}
printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
"0x%02x\n", opcode);
goto err;
do_work:
mod_rm = *p;
reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
switch (get_ins_reg_width(ins_addr)) {
case 1:
return *get_reg_w8(reg, prf.rex, regs);
case 2:
return *(unsigned short *)get_reg_w32(reg, regs);
case 4:
return *(unsigned int *)get_reg_w32(reg, regs);
#ifdef __amd64__
case 8:
return *(unsigned long *)get_reg_w32(reg, regs);
#endif
default:
printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
}
err:
return 0;
}
unsigned long get_ins_imm_val(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char mod_rm;
unsigned char mod;
unsigned char *p;
struct prefix_bits prf;
int i;
unsigned long rv;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
if (imm_wop[i] == opcode) {
rv = IMM_WRITE;
goto do_work;
}
printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
"0x%02x\n", opcode);
goto err;
do_work:
mod_rm = *p;
mod = mod_rm >> 6;
p++;
switch (mod) {
case 0:
/* if r/m is 5 we have a 32-bit displacement (IA32 Manual 3, Table 2-2) */
/* AMD64: XXX Check for address size prefix? */
if ((mod_rm & 0x7) == 0x5)
p += 4;
break;
case 1:
p += 1;
break;
case 2:
p += 4;
break;
case 3:
default:
printk(KERN_ERR "mmiotrace: not a memory access instruction "
"at 0x%lx, rm_mod=0x%02x\n",
ins_addr, mod_rm);
}
switch (get_ins_reg_width(ins_addr)) {
case 1:
return *(unsigned char *)p;
case 2:
return *(unsigned short *)p;
case 4:
return *(unsigned int *)p;
#ifdef __amd64__
case 8:
return *(unsigned long *)p;
#endif
default:
printk(KERN_ERR "mmiotrace: Error: width.\n");
}
err:
return 0;
}

View File

@@ -0,0 +1,39 @@
/*
* Fault Injection Test harness (FI)
* Copyright (C) Intel Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*
*/
#ifndef __PF_H_
#define __PF_H_
enum reason_type {
NOT_ME, /* page fault is not in regions */
NOTHING, /* access to other points in the regions */
REG_READ, /* read from addr to reg */
REG_WRITE, /* write from reg to addr */
IMM_WRITE, /* write from imm to addr */
OTHERS /* other instructions that we cannot intercept */
};
enum reason_type get_ins_type(unsigned long ins_addr);
unsigned int get_ins_mem_width(unsigned long ins_addr);
unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
unsigned long get_ins_imm_val(unsigned long ins_addr);
#endif /* __PF_H_ */

View File

@@ -0,0 +1,373 @@
#include <linux/mm.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
#else
#define PGALLOC_USER_GFP 0
#endif
gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
return (pte_t *)__get_free_page(PGALLOC_GFP);
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *pte;
pte = alloc_pages(__userpte_alloc_gfp, 0);
if (pte)
pgtable_page_ctor(pte);
return pte;
}
static int __init setup_userpte(char *arg)
{
if (!arg)
return -EINVAL;
/*
* "userpte=nohigh" disables allocation of user pagetables in
* high memory.
*/
if (strcmp(arg, "nohigh") == 0)
__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
else
return -EINVAL;
return 0;
}
early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
tlb_remove_page(tlb, pte);
}
#if PAGETABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pmd));
}
#if PAGETABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
list_add(&page->lru, &pgd_list);
}
static inline void pgd_list_del(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
list_del(&page->lru);
}
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
static void pgd_ctor(pgd_t *pgd)
{
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
if (PAGETABLE_LEVELS == 2 ||
(PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
PAGETABLE_LEVELS == 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
__pa(swapper_pg_dir) >> PAGE_SHIFT,
KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
}
/* list required to sync kernel mapping updates */
if (!SHARED_KERNEL_PMD)
pgd_list_add(pgd);
}
static void pgd_dtor(pgd_t *pgd)
{
unsigned long flags; /* can be called from interrupt context */
if (SHARED_KERNEL_PMD)
return;
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}
/*
* List of all pgd's needed for non-PAE so it can invalidate entries
* in both cached and uncached pgd's; not needed for PAE since the
* kernel pmd is shared. If PAE were not to share the pmd a similar
* tactic would be needed. This is essentially codepath-based locking
* against pageattr.c; it is the unique case in which a valid change
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
* vmalloc faults work because attached pagetables are never freed.
* -- wli
*/
#ifdef CONFIG_X86_PAE
/*
* In PAE mode, we need to do a cr3 reload (=tlb flush) when
* updating the top-level pagetable entries to guarantee the
* processor notices the update. Since this is expensive, and
* all 4 top-level entries are used almost immediately in a
* new process's life, we just pre-populate them here.
*
* Also, if we're in a paravirt environment where the kernel pmd is
* not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
* and initialize the kernel pmds here.
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
/* Note: almost everything apart from _PAGE_PRESENT is
reserved at the pmd (PDPT) level. */
set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
/*
* According to Intel App note "TLBs, Paging-Structure Caches,
* and Their Invalidation", April 2007, document 317080-001,
* section 8.1: in PAE mode we explicitly have to flush the
* TLB via cr3 if the top-level pgd is changed...
*/
flush_tlb_mm(mm);
}
#else /* !CONFIG_X86_PAE */
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
#endif /* CONFIG_X86_PAE */
static void free_pmds(pmd_t *pmds[])
{
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++)
if (pmds[i])
free_page((unsigned long)pmds[i]);
}
static int preallocate_pmds(pmd_t *pmds[])
{
int i;
bool failed = false;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
if (pmd == NULL)
failed = true;
pmds[i] = pmd;
}
if (failed) {
free_pmds(pmds);
return -ENOMEM;
}
return 0;
}
/*
* Mop up any pmd pages which may still be attached to the pgd.
* Normally they will be freed by munmap/exit_mmap, but any pmd we
* preallocate which never got a corresponding vma will need to be
* freed manually.
*/
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pgd_t pgd = pgdp[i];
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
pgdp[i] = native_make_pgd(0);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
}
}
}
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
pud_t *pud;
unsigned long addr;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
return;
pud = pud_offset(pgd, 0);
for (addr = i = 0; i < PREALLOCATED_PMDS;
i++, pud++, addr += PUD_SIZE) {
pmd_t *pmd = pmds[i];
if (i >= KERNEL_PGD_BOUNDARY)
memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
sizeof(pmd_t) * PTRS_PER_PMD);
pud_populate(mm, pud, pmd);
}
}
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *pmds[PREALLOCATED_PMDS];
unsigned long flags;
pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
if (pgd == NULL)
goto out;
mm->pgd = pgd;
if (preallocate_pmds(pmds) != 0)
goto out_free_pgd;
if (paravirt_pgd_alloc(mm) != 0)
goto out_free_pmds;
/*
* Make sure that pre-populating the pmds is atomic with
* respect to anything walking the pgd_list, so that they
* never see a partially populated pgd.
*/
spin_lock_irqsave(&pgd_lock, flags);
pgd_ctor(pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
spin_unlock_irqrestore(&pgd_lock, flags);
return pgd;
out_free_pmds:
free_pmds(pmds);
out_free_pgd:
free_page((unsigned long)pgd);
out:
return NULL;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
free_page((unsigned long)pgd);
}
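/*
* Allocation lifecycle sketch (illustrative only; example_pgd_cycle() is a
* made-up name and the real callers live in the core fork/mm code): a pgd
* obtained from pgd_alloc() must eventually be released with pgd_free() on
* the same mm.
*/
#if 0
static void example_pgd_cycle(struct mm_struct *mm)
{
pgd_t *pgd = pgd_alloc(mm); /* also sets mm->pgd */
if (!pgd)
return;
/* ... use the mm ... */
pgd_free(mm, pgd);
}
#endif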
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
int changed = !pte_same(*ptep, entry);
if (changed && dirty) {
*ptep = entry;
pte_update_defer(vma->vm_mm, address, ptep);
flush_tlb_page(vma, address);
}
return changed;
}
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
int ret = 0;
if (pte_young(*ptep))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &ptep->pte);
if (ret)
pte_update(vma->vm_mm, addr, ptep);
return ret;
}
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
int young;
young = ptep_test_and_clear_young(vma, address, ptep);
if (young)
flush_tlb_page(vma, address);
return young;
}
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
*
* Can be used to relocate the fixmap area and poke a hole in the top
* of kernel address space to make room for a hypervisor.
*/
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
BUG_ON(fixmaps_set > 0);
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
#endif
}
int fixmaps_set;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
unsigned long address = __fix_to_virt(idx);
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
}
set_pte_vaddr(address, pte);
fixmaps_set++;
}
void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
pgprot_t flags)
{
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

View File

@@ -0,0 +1,134 @@
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
*/
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
BUG();
return;
}
pud = pud_offset(pgd, vaddr);
if (pud_none(*pud)) {
BUG();
return;
}
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
BUG();
return;
}
pte = pte_offset_kernel(pmd, vaddr);
if (pte_val(pteval))
set_pte_at(&init_mm, vaddr, pte, pteval);
else
pte_clear(&init_mm, vaddr, pte);
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
}
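/*
* Usage sketch (hypothetical vaddr/pfn values): mapping a single kernel
* page is one call, e.g. the fixmap code effectively does
*
* set_pte_vaddr(vaddr, pfn_pte(pfn, PAGE_KERNEL));
*
* while passing a zero pte clears the mapping again.
*/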
/*
* Associate a large virtual page frame with a given physical page frame
* and protection flags for that frame. pfn is for the base of the page,
* vaddr is what the page gets mapped to - both must be properly aligned.
* The pmd must already be instantiated. Assumes PAE mode.
*/
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
return; /* BUG(); */
}
if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
return; /* BUG(); */
}
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
return; /* BUG(); */
}
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
set_pmd(pmd, pfn_pmd(pfn, flags));
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
}
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
/*
* vmalloc=size forces the vmalloc area to be exactly 'size'
* bytes. This can be used to increase (or decrease) the
* vmalloc area - the default is 128m.
*/
static int __init parse_vmalloc(char *arg)
{
if (!arg)
return -EINVAL;
/* Add VMALLOC_OFFSET to the parsed value due to the vm area guard hole */
__VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
return 0;
}
early_param("vmalloc", parse_vmalloc);
/*
* reservetop=size reserves a hole at the top of the kernel address space which
* a hypervisor can load into later. Needed for dynamically loaded hypervisors,
* so relocating the fixmap can be done before paging initialization.
*/
static int __init parse_reservetop(char *arg)
{
unsigned long address;
if (!arg)
return -EINVAL;
address = memparse(arg, &arg);
reserve_top_address(address);
return 0;
}
early_param("reservetop", parse_reservetop);

View File

@@ -0,0 +1,70 @@
#include <linux/mmdebug.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <asm/page.h>
#include "physaddr.h"
#ifdef CONFIG_X86_64
unsigned long __phys_addr(unsigned long x)
{
if (x >= __START_KERNEL_map) {
x -= __START_KERNEL_map;
VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
x += phys_base;
} else {
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
x -= PAGE_OFFSET;
VIRTUAL_BUG_ON(!phys_addr_valid(x));
}
return x;
}
EXPORT_SYMBOL(__phys_addr);
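/*
* Worked example with made-up offsets: for a kernel-text address
* x = __START_KERNEL_map + 0x200000 the result is 0x200000 + phys_base,
* while for a direct-mapping address x = PAGE_OFFSET + 0x1000000 the
* result is simply 0x1000000.
*/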
bool __virt_addr_valid(unsigned long x)
{
if (x >= __START_KERNEL_map) {
x -= __START_KERNEL_map;
if (x >= KERNEL_IMAGE_SIZE)
return false;
x += phys_base;
} else {
if (x < PAGE_OFFSET)
return false;
x -= PAGE_OFFSET;
if (!phys_addr_valid(x))
return false;
}
return pfn_valid(x >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);
#else
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
/* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
return x - PAGE_OFFSET;
}
EXPORT_SYMBOL(__phys_addr);
#endif
bool __virt_addr_valid(unsigned long x)
{
if (x < PAGE_OFFSET)
return false;
if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
return false;
if (x >= FIXADDR_START)
return false;
return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);
#endif /* CONFIG_X86_64 */

View File

@@ -0,0 +1,10 @@
#include <asm/processor.h>
static inline int phys_addr_valid(resource_size_t addr)
{
#ifdef CONFIG_PHYS_ADDR_T_64BIT
return !(addr >> boot_cpu_data.x86_phys_bits);
#else
return 1;
#endif
}

View File

@@ -0,0 +1,69 @@
#include <linux/spinlock.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <asm/pgtable.h>
int nx_enabled;
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static int disable_nx __cpuinitdata;
/*
* noexec = on|off
*
* Control non-executable mappings for processes.
*
* on Enable
* off Disable
*/
static int __init noexec_setup(char *str)
{
if (!str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
return 0;
}
early_param("noexec", noexec_setup);
#endif
#ifdef CONFIG_X86_PAE
void __init set_nx(void)
{
unsigned int v[4], l, h;
if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
if ((v[3] & (1 << 20)) && !disable_nx) {
rdmsr(MSR_EFER, l, h);
l |= EFER_NX;
wrmsr(MSR_EFER, l, h);
nx_enabled = 1;
__supported_pte_mask |= _PAGE_NX;
}
}
}
#else
void set_nx(void)
{
}
#endif
#ifdef CONFIG_X86_64
void __cpuinit check_efer(void)
{
unsigned long efer;
rdmsrl(MSR_EFER, efer);
if (!(efer & EFER_NX) || disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
#endif

View File

@@ -0,0 +1,283 @@
/*
* Some of the code in this file has been gleaned from the 64 bit
* discontigmem support code base.
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to Pat Gaughen <gone@us.ibm.com>
*/
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/acpi.h>
#include <linux/nodemask.h>
#include <asm/srat.h>
#include <asm/topology.h>
#include <asm/smp.h>
#include <asm/e820.h>
/*
* proximity macros and definitions
*/
#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
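/*
* Example (arbitrary bit number): for proximity domain 10, NODE_ARRAY_INDEX
* is 1 and NODE_ARRAY_OFFSET is 2, so BMAP_SET() sets bit 2 of pxm_bitmap[1].
*/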
/* bitmap length; _PXM is at most 255 */
#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
#define MAX_CHUNKS_PER_NODE 3
#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
struct node_memory_chunk_s {
unsigned long start_pfn;
unsigned long end_pfn;
u8 pxm; // proximity domain of node
u8 nid; // which cnode contains this chunk?
u8 bank; // which mem bank on this node
};
static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
static int __initdata num_memory_chunks; /* total number of memory chunks */
static u8 __initdata apicid_to_pxm[MAX_APICID];
int numa_off __initdata;
int acpi_numa __initdata;
static __init void bad_srat(void)
{
printk(KERN_ERR "SRAT: SRAT not used.\n");
acpi_numa = -1;
num_memory_chunks = 0;
}
static __init inline int srat_disabled(void)
{
return numa_off || acpi_numa < 0;
}
/* Identify CPU proximity domains */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
{
if (srat_disabled())
return;
if (cpu_affinity->header.length !=
sizeof(struct acpi_srat_cpu_affinity)) {
bad_srat();
return;
}
if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
return; /* empty entry */
/* mark this node as "seen" in node bitmap */
BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
}
/*
* Identify memory proximity domains and hot-remove capabilities.
* Fill node memory chunk list structure.
*/
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
{
unsigned long long paddr, size;
unsigned long start_pfn, end_pfn;
u8 pxm;
struct node_memory_chunk_s *p, *q, *pend;
if (srat_disabled())
return;
if (memory_affinity->header.length !=
sizeof(struct acpi_srat_mem_affinity)) {
bad_srat();
return;
}
if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
return; /* empty entry */
pxm = memory_affinity->proximity_domain & 0xff;
/* mark this node as "seen" in node bitmap */
BMAP_SET(pxm_bitmap, pxm);
/* calculate info for memory chunk structure */
paddr = memory_affinity->base_address;
size = memory_affinity->length;
start_pfn = paddr >> PAGE_SHIFT;
end_pfn = (paddr + size) >> PAGE_SHIFT;
if (num_memory_chunks >= MAXCHUNKS) {
printk(KERN_WARNING "Too many mem chunks in SRAT."
" Ignoring %lld MBytes at %llx\n",
size/(1024*1024), paddr);
return;
}
/* Insertion sort based on base address */
pend = &node_memory_chunk[num_memory_chunks];
for (p = &node_memory_chunk[0]; p < pend; p++) {
if (start_pfn < p->start_pfn)
break;
}
if (p < pend) {
for (q = pend; q >= p; q--)
*(q + 1) = *q;
}
p->start_pfn = start_pfn;
p->end_pfn = end_pfn;
p->pxm = pxm;
num_memory_chunks++;
printk(KERN_DEBUG "Memory range %08lx to %08lx"
" in proximity domain %02x %s\n",
start_pfn, end_pfn,
pxm,
((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
"enabled and removable" : "enabled" ) );
}
/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
}
void acpi_numa_arch_fixup(void)
{
}
/*
* The SRAT table always lists ascending addresses, so we can always
* assume that the first "start" address that you see is the real
* start of the node, and that the current "end" address is after
* the previous one.
*/
static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
{
/*
* Only add present memory as told by the e820.
* There is no guarantee from the SRAT that the memory it
* enumerates is present at boot time because it represents
* *possible* memory hotplug areas the same as normal RAM.
*/
if (memory_chunk->start_pfn >= max_pfn) {
printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
memory_chunk->start_pfn, memory_chunk->end_pfn);
return -1;
}
if (memory_chunk->nid != nid)
return -1;
if (!node_has_online_mem(nid))
node_start_pfn[nid] = memory_chunk->start_pfn;
if (node_start_pfn[nid] > memory_chunk->start_pfn)
node_start_pfn[nid] = memory_chunk->start_pfn;
if (node_end_pfn[nid] < memory_chunk->end_pfn)
node_end_pfn[nid] = memory_chunk->end_pfn;
return 0;
}
int __init get_memcfg_from_srat(void)
{
int i, j, nid;
if (srat_disabled())
goto out_fail;
if (num_memory_chunks == 0) {
printk(KERN_DEBUG
"could not find any ACPI SRAT memory areas.\n");
goto out_fail;
}
/* Calculate total number of nodes in system from PXM bitmap and create
* a set of sequential node IDs starting at zero. (ACPI doesn't seem
* to specify the range of _PXM values.)
*/
/*
* MCD - we no longer HAVE to number nodes sequentially. PXM domain
* numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
* 32, so we will continue numbering them in this manner until MAX_NUMNODES
* approaches MAX_PXM_DOMAINS for i386.
*/
nodes_clear(node_online_map);
for (i = 0; i < MAX_PXM_DOMAINS; i++) {
if (BMAP_TEST(pxm_bitmap, i)) {
int nid = acpi_map_pxm_to_node(i);
node_set_online(nid);
}
}
BUG_ON(num_online_nodes() == 0);
/* set cnode id in memory chunk structure */
for (i = 0; i < num_memory_chunks; i++)
node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
printk(KERN_DEBUG "pxm bitmap: ");
for (i = 0; i < sizeof(pxm_bitmap); i++) {
printk(KERN_CONT "%02x ", pxm_bitmap[i]);
}
printk(KERN_CONT "\n");
printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
num_online_nodes());
printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
num_memory_chunks);
for (i = 0; i < MAX_APICID; i++)
apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
for (j = 0; j < num_memory_chunks; j++){
struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
printk(KERN_DEBUG
"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
if (node_read_chunk(chunk->nid, chunk))
continue;
e820_register_active_regions(chunk->nid, chunk->start_pfn,
min(chunk->end_pfn, max_pfn));
}
for_each_online_node(nid) {
unsigned long start = node_start_pfn[nid];
unsigned long end = min(node_end_pfn[nid], max_pfn);
memory_present(nid, start, end);
node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
}
return 1;
out_fail:
printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
" table\n");
return 0;
}

View File

@@ -0,0 +1,498 @@
/*
* ACPI 3.0 based NUMA setup
* Copyright 2004 Andi Kleen, SuSE Labs.
*
* Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
*
* Called from acpi_numa_init while reading the SRAT and SLIT tables.
* Assumes all memory regions belonging to a single proximity domain
* are in one chunk. Holes between them will be included in the node.
*/
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
int acpi_numa __initdata;
static struct acpi_table_slit *acpi_slit;
static nodemask_t nodes_parsed __initdata;
static nodemask_t cpu_nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
static __init int setup_node(int pxm)
{
return acpi_map_pxm_to_node(pxm);
}
static __init int conflicting_memblks(unsigned long start, unsigned long end)
{
int i;
for (i = 0; i < num_node_memblks; i++) {
struct bootnode *nd = &node_memblk_range[i];
if (nd->start == nd->end)
continue;
if (nd->end > start && nd->start < end)
return memblk_nodeid[i];
if (nd->end == end && nd->start == start)
return memblk_nodeid[i];
}
return -1;
}
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
struct bootnode *nd = &nodes[i];
if (nd->start < start) {
nd->start = start;
if (nd->end < nd->start)
nd->start = nd->end;
}
if (nd->end > end) {
nd->end = end;
if (nd->start > nd->end)
nd->start = nd->end;
}
}
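/*
* Example with made-up ranges: if node i currently spans [2GB, 6GB) and
* cutoff_node(i, 0, 4GB) is called, the node is clamped to [2GB, 4GB); a
* node lying entirely outside [start, end) collapses to an empty range.
*/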
static __init void bad_srat(void)
{
int i;
printk(KERN_ERR "SRAT: SRAT not used.\n");
acpi_numa = -1;
for (i = 0; i < MAX_LOCAL_APIC; i++)
apicid_to_node[i] = NUMA_NO_NODE;
for (i = 0; i < MAX_NUMNODES; i++) {
nodes[i].start = nodes[i].end = 0;
nodes_add[i].start = nodes_add[i].end = 0;
}
remove_all_active_ranges();
}
static __init inline int srat_disabled(void)
{
return numa_off || acpi_numa < 0;
}
/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
unsigned length;
unsigned long phys;
length = slit->header.length;
phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
PAGE_SIZE);
if (phys == -1L)
panic(" Can not save slit!\n");
acpi_slit = __va(phys);
memcpy(acpi_slit, slit, length);
reserve_early(phys, phys + length, "ACPI SLIT");
}
/* Callback for Proximity Domain -> x2APIC mapping */
void __init
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
{
int pxm, node;
int apic_id;
if (srat_disabled())
return;
if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
bad_srat();
return;
}
if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
return;
pxm = pa->proximity_domain;
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
bad_srat();
return;
}
apic_id = pa->apic_id;
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
pxm, apic_id, node);
}
/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
int pxm, node;
int apic_id;
if (srat_disabled())
return;
if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
bad_srat();
return;
}
if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
return;
pxm = pa->proximity_domain_lo;
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
bad_srat();
return;
}
if (get_uv_system_type() >= UV_X2APIC)
apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
else
apic_id = pa->apic_id;
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
pxm, apic_id, node);
}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
/*
* Update nodes_add[]
* This code supports one contiguous hot add area per node
*/
static void __init
update_nodes_add(int node, unsigned long start, unsigned long end)
{
unsigned long s_pfn = start >> PAGE_SHIFT;
unsigned long e_pfn = end >> PAGE_SHIFT;
int changed = 0;
struct bootnode *nd = &nodes_add[node];
/* I had some trouble with strange memory hotadd regions breaking
the boot. Be very strict here and reject anything unexpected.
If you want working memory hot-add, write correct SRATs.
The node size check is a basic sanity check to guard against
mistakes. */
if ((signed long)(end - start) < NODE_MIN_SIZE) {
printk(KERN_ERR "SRAT: Hotplug area too small\n");
return;
}
/* This check might be a bit too strict, but I'm keeping it for now. */
if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
printk(KERN_ERR
"SRAT: Hotplug area %lu -> %lu has existing memory\n",
s_pfn, e_pfn);
return;
}
/* Looks good */
if (nd->start == nd->end) {
nd->start = start;
nd->end = end;
changed = 1;
} else {
if (nd->start == end) {
nd->start = start;
changed = 1;
}
if (nd->end == start) {
nd->end = end;
changed = 1;
}
if (!changed)
printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
}
if (changed) {
node_set(node, cpu_nodes_parsed);
printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
nd->start, nd->end);
}
}
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
struct bootnode *nd, oldnode;
unsigned long start, end;
int node, pxm;
int i;
if (srat_disabled())
return;
if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
bad_srat();
return;
}
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
return;
if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
return;
start = ma->base_address;
end = start + ma->length;
pxm = ma->proximity_domain;
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains.\n");
bad_srat();
return;
}
i = conflicting_memblks(start, end);
if (i == node) {
printk(KERN_WARNING
"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
pxm, start, end, nodes[i].start, nodes[i].end);
} else if (i >= 0) {
printk(KERN_ERR
"SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
pxm, start, end, node_to_pxm(i),
nodes[i].start, nodes[i].end);
bad_srat();
return;
}
nd = &nodes[node];
oldnode = *nd;
if (!node_test_and_set(node, nodes_parsed)) {
nd->start = start;
nd->end = end;
} else {
if (start < nd->start)
nd->start = start;
if (nd->end < end)
nd->end = end;
}
printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
start, end);
e820_register_active_regions(node, start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
update_nodes_add(node, start, end);
/* restore nodes[node] */
*nd = oldnode;
if ((nd->start | nd->end) == 0)
node_clear(node, nodes_parsed);
}
node_memblk_range[num_node_memblks].start = start;
node_memblk_range[num_node_memblks].end = end;
memblk_nodeid[num_node_memblks] = node;
num_node_memblks++;
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
Make sure the PXMs cover all memory. */
static int __init nodes_cover_memory(const struct bootnode *nodes)
{
int i;
unsigned long pxmram, e820ram;
pxmram = 0;
for_each_node_mask(i, nodes_parsed) {
unsigned long s = nodes[i].start >> PAGE_SHIFT;
unsigned long e = nodes[i].end >> PAGE_SHIFT;
pxmram += e - s;
pxmram -= absent_pages_in_range(s, e);
if ((long)pxmram < 0)
pxmram = 0;
}
e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
printk(KERN_ERR
"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
(pxmram << PAGE_SHIFT) >> 20,
(e820ram << PAGE_SHIFT) >> 20);
return 0;
}
return 1;
}
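/*
 * Worked example for the slack check above (illustrative numbers, 4 KB
 * pages): on a 4 GB box e820ram is about 1048576 pages and the allowed
 * slack of 1 << (20 - PAGE_SHIFT) is 256 pages (1 MB). The SRAT is only
 * declared unusable when the PXM-covered RAM falls short of the e820 RAM
 * by 256 pages or more; losing a few pages to holes stays under the limit.
 */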
void __init acpi_numa_arch_fixup(void) {}
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
int i;
if (acpi_numa <= 0)
return -1;
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++)
cutoff_node(i, start, end);
if (!nodes_cover_memory(nodes)) {
bad_srat();
return -1;
}
memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
memblk_nodeid);
if (memnode_shift < 0) {
printk(KERN_ERR
"SRAT: No NUMA node hash function found. Contact maintainer\n");
bad_srat();
return -1;
}
/* Account for nodes with cpus and no memory */
nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
/* Finally register nodes */
for_each_node_mask(i, node_possible_map)
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
/* Try again in case setup_node_bootmem missed one due
to missing bootmem */
for_each_node_mask(i, node_possible_map)
if (!node_online(i))
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
for (i = 0; i < nr_cpu_ids; i++) {
int node = early_cpu_to_node(i);
if (node == NUMA_NO_NODE)
continue;
if (!node_online(node))
numa_clear_node(i);
}
numa_init_array();
return 0;
}
#ifdef CONFIG_NUMA_EMU
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
[0 ... MAX_NUMNODES-1] = PXM_INVAL
};
static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
static int __init find_node_by_addr(unsigned long addr)
{
int ret = NUMA_NO_NODE;
int i;
for_each_node_mask(i, nodes_parsed) {
/*
* Find the real node that this emulated node appears on. For
* the sake of simplicity, we only use a real node's starting
* address to determine which emulated node it appears on.
*/
if (addr >= nodes[i].start && addr < nodes[i].end) {
ret = i;
break;
}
}
return ret;
}
/*
* In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
* mappings that respect the real ACPI topology but reflect our emulated
* environment. For each emulated node, we find which real node it appears on
* and create PXM to NID mappings for those fake nodes which mirror that
* locality. SLIT will now represent the correct distances between emulated
* nodes as a result of the real topology.
*/
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
int i, j;
printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
"topology.\n");
for (i = 0; i < num_nodes; i++) {
int nid, pxm;
nid = find_node_by_addr(fake_nodes[i].start);
if (nid == NUMA_NO_NODE)
continue;
pxm = node_to_pxm(nid);
if (pxm == PXM_INVAL)
continue;
fake_node_to_pxm_map[i] = pxm;
/*
* For each apicid_to_node mapping that exists for this real
* node, it must now point to the fake node ID.
*/
for (j = 0; j < MAX_LOCAL_APIC; j++)
if (apicid_to_node[j] == nid)
fake_apicid_to_node[j] = i;
}
for (i = 0; i < num_nodes; i++)
__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
nodes_clear(nodes_parsed);
for (i = 0; i < num_nodes; i++)
if (fake_nodes[i].start != fake_nodes[i].end)
node_set(i, nodes_parsed);
WARN_ON(!nodes_cover_memory(fake_nodes));
}
static int null_slit_node_compare(int a, int b)
{
return node_to_pxm(a) == node_to_pxm(b);
}
#else
static int null_slit_node_compare(int a, int b)
{
return a == b;
}
#endif /* CONFIG_NUMA_EMU */
int __node_distance(int a, int b)
{
int index;
if (!acpi_slit)
return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
REMOTE_DISTANCE;
index = acpi_slit->locality_count * node_to_pxm(a);
return acpi_slit->entry[index + node_to_pxm(b)];
}
EXPORT_SYMBOL(__node_distance);
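/*
 * Worked example (assuming a hypothetical two-PXM SLIT): with
 * locality_count = 2 and entry[] = { 10, 20, 20, 10 }, the row-major
 * lookup above reads entry[node_to_pxm(a) * 2 + node_to_pxm(b)], so the
 * distance is 10 (LOCAL_DISTANCE) when both nodes map to the same PXM
 * and 20 otherwise, matching the no-SLIT fallback values.
 */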
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
int memory_add_physaddr_to_nid(u64 start)
{
int i, ret = 0;
for_each_node(i)
if (nodes_add[i].start <= start && nodes_add[i].end > start)
ret = i;
return ret;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
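/*
 * Illustrative lookup (hypothetical addresses): with nodes_add[1]
 * spanning [0x100000000, 0x200000000), memory hot-added at physical
 * address 0x140000000 resolves to node 1; an address outside every
 * recorded hot-add range falls back to the default return value of 0.
 */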
#endif

View File

@@ -0,0 +1,119 @@
/*
* Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
*/
#include <linux/module.h>
#include <linux/io.h>
#include <linux/mmiotrace.h>
#define MODULE_NAME "testmmiotrace"
static unsigned long mmio_address;
module_param(mmio_address, ulong, 0);
MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
"(or 8 MB if read_far is non-zero).");
static unsigned long read_far = 0x400100;
module_param(read_far, ulong, 0);
MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
"(default: 0x400100).");
static unsigned v16(unsigned i)
{
return i * 12 + 7;
}
static unsigned v32(unsigned i)
{
return i * 212371 + 13;
}
static void do_write_test(void __iomem *p)
{
unsigned int i;
pr_info(MODULE_NAME ": write test.\n");
mmiotrace_printk("Write test.\n");
for (i = 0; i < 256; i++)
iowrite8(i, p + i);
for (i = 1024; i < (5 * 1024); i += 2)
iowrite16(v16(i), p + i);
for (i = (5 * 1024); i < (16 * 1024); i += 4)
iowrite32(v32(i), p + i);
}
static void do_read_test(void __iomem *p)
{
unsigned int i;
unsigned errs[3] = { 0 };
pr_info(MODULE_NAME ": read test.\n");
mmiotrace_printk("Read test.\n");
for (i = 0; i < 256; i++)
if (ioread8(p + i) != i)
++errs[0];
for (i = 1024; i < (5 * 1024); i += 2)
if (ioread16(p + i) != v16(i))
++errs[1];
for (i = (5 * 1024); i < (16 * 1024); i += 4)
if (ioread32(p + i) != v32(i))
++errs[2];
mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
errs[0], errs[1], errs[2]);
}
static void do_read_far_test(void __iomem *p)
{
pr_info(MODULE_NAME ": read far test.\n");
mmiotrace_printk("Read far test.\n");
ioread32(p + read_far);
}
static void do_test(unsigned long size)
{
void __iomem *p = ioremap_nocache(mmio_address, size);
if (!p) {
pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
return;
}
mmiotrace_printk("ioremap returned %p.\n", p);
do_write_test(p);
do_read_test(p);
if (read_far && read_far < size - 4)
do_read_far_test(p);
iounmap(p);
}
static int __init init(void)
{
unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
if (mmio_address == 0) {
pr_err(MODULE_NAME ": you have to use the module argument "
"mmio_address.\n");
pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
" YOU REALLY KNOW WHAT YOU ARE DOING!\n");
return -ENXIO;
}
pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
"address space, and writing 16 kB of rubbish in there.\n",
size >> 10, mmio_address);
do_test(size);
pr_info(MODULE_NAME ": All done.\n");
return 0;
}
static void __exit cleanup(void)
{
pr_debug(MODULE_NAME ": unloaded.\n");
}
module_init(init);
module_exit(cleanup);
MODULE_LICENSE("GPL");
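/*
 * Usage sketch (the address below is a placeholder, not a real device):
 *
 *	modprobe testmmiotrace mmio_address=0xfbcd0000
 *
 * With the default read_far (0x400100) this maps 8 MB at that physical
 * address (pass read_far=0 to map only 16 kB), writes the test patterns,
 * reads them back and performs the far read, typically with the
 * mmiotrace tracer active so the accesses are recorded.
 */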

290
kernel/arch/x86/mm/tlb.c Normal file
View File

@@ -0,0 +1,290 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
= { &init_mm, 0, };
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
* writing to user space from interrupts. (It's not allowed anyway).
*
* Optimizations Manfred Spraul <manfred@colorfullife.com>
*
* More scalable flush, from Andi Kleen
*
* To avoid global state, use 8 different call vectors.
* Each CPU uses a specific vector to trigger flushes on other
* CPUs. Depending on the received vector the target CPUs look into
* the right array slot for the flush data.
*
* With more than 8 CPUs they are hashed to the 8 available
* vectors. The limited global vector space forces us to this right now.
* In future when interrupts are split into per CPU domains this could be
* fixed, at the cost of triggering multiple IPIs in some cases.
*/
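/*
 * Illustrative mapping (assuming the usual NUM_INVALIDATE_TLB_VECTORS of
 * 8): a sender picks smp_processor_id() % 8, so CPU 3 signals on
 * INVALIDATE_TLB_VECTOR_START + 3 and CPU 11 hashes onto that same
 * vector; the receiver recovers the slot from the vector number in
 * smp_invalidate_interrupt() and reads flush_state[3].
 */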
union smp_flush_state {
struct {
struct mm_struct *flush_mm;
unsigned long flush_va;
spinlock_t tlbstate_lock;
DECLARE_BITMAP(flush_cpumask, NR_CPUS);
};
char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
} ____cacheline_internodealigned_in_smp;
/* State is put into the per CPU data section, but padded
to a full cache line because other CPUs can access it and we don't
want false sharing in the per cpu data segment. */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
/*
* We cannot call mmdrop() because we are in interrupt context;
* instead we update mm->cpu_vm_mask.
*/
void leave_mm(int cpu)
{
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
cpumask_clear_cpu(cpu,
mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
/*
*
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
* 1) switch_mm() either 1a) or 1b)
* 1a) thread switch to a different mm
* 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
* Stop ipi delivery for the old mm. This is not synchronized with
* the other cpus, but smp_invalidate_interrupt ignores flush ipis
* for the wrong mm, and in the worst case we perform a superfluous
* tlb flush.
* 1a2) set cpu mmu_state to TLBSTATE_OK
* Now the smp_invalidate_interrupt won't call leave_mm if cpu0
* was in lazy tlb mode.
* 1a3) update cpu active_mm
* Now cpu0 accepts tlb flushes for the new mm.
* 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
* Now the other cpus will send tlb flush ipis.
* 1a5) change cr3.
* 1b) thread switch without mm change
* cpu active_mm is correct, cpu0 already handles
* flush ipis.
* 1b1) set cpu mmu_state to TLBSTATE_OK
* 1b2) test_and_set the cpu bit in cpu_vm_mask.
* Atomically set the bit [other cpus will start sending flush ipis],
* and test the bit.
* 1b3) if the bit was 0: leave_mm was called, flush the tlb.
* 2) switch %%esp, i.e. current
*
* The interrupt must handle 2 special cases:
* - cr3 is changed before %%esp, i.e. it cannot use current->{active_,}mm.
* - the cpu performs speculative tlb reads, i.e. even if the cpu only
* runs in kernel space, the cpu could load tlb entries for user space
* pages.
*
* The good news is that cpu mmu_state is local to each cpu, no
* write/read ordering problems.
*/
/*
* TLB flush IPI:
*
* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
* 2) Leave the mm if we are in the lazy tlb mode.
*
* Interrupts are disabled.
*/
/*
* FIXME: use of asmlinkage is not consistent. On x86_64 it's a no-op
* kept only for documentation purposes, yet the usage is slightly
* inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
* entry calls in with the first parameter in %eax. Maybe define
* intrlinkage?
*/
#ifdef CONFIG_X86_64
asmlinkage
#endif
void smp_invalidate_interrupt(struct pt_regs *regs)
{
unsigned int cpu;
unsigned int sender;
union smp_flush_state *f;
cpu = smp_processor_id();
/*
* orig_rax contains the negated interrupt vector.
* Use that to determine where the sender put the data.
*/
sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
f = &flush_state[sender];
if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
goto out;
/*
* This was a BUG() but until someone can quote me the
* line from the Intel manual that guarantees an IPI to
* multiple CPUs is retried _only_ on the erroring CPUs,
* it's staying as a return
*
* BUG();
*/
if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
__flush_tlb_one(f->flush_va);
} else
leave_mm(cpu);
}
out:
ack_APIC_irq();
smp_mb__before_clear_bit();
cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
smp_mb__after_clear_bit();
inc_irq_stat(irq_tlb_count);
}
static void flush_tlb_others_ipi(const struct cpumask *cpumask,
struct mm_struct *mm, unsigned long va)
{
unsigned int sender;
union smp_flush_state *f;
/* Caller has disabled preemption */
sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
f = &flush_state[sender];
/*
* Could avoid this lock when
* num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
* probably not worth checking this for a cache-hot lock.
*/
spin_lock(&f->tlbstate_lock);
f->flush_mm = mm;
f->flush_va = va;
if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
/*
* We have to send the IPI only to
* CPUs affected.
*/
apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
INVALIDATE_TLB_VECTOR_START + sender);
while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
cpu_relax();
}
f->flush_mm = NULL;
f->flush_va = 0;
spin_unlock(&f->tlbstate_lock);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, unsigned long va)
{
if (is_uv_system()) {
unsigned int cpu;
cpu = get_cpu();
cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
if (cpumask)
flush_tlb_others_ipi(cpumask, mm, va);
put_cpu();
return;
}
flush_tlb_others_ipi(cpumask, mm, va);
}
static int __cpuinit init_smp_flush(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(flush_state); i++)
spin_lock_init(&flush_state[i].tlbstate_lock);
return 0;
}
core_initcall(init_smp_flush);
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
preempt_disable();
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
preempt_enable();
}
void flush_tlb_mm(struct mm_struct *mm)
{
preempt_disable();
if (current->active_mm == mm) {
if (current->mm)
local_flush_tlb();
else
leave_mm(smp_processor_id());
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
preempt_enable();
}
void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
{
struct mm_struct *mm = vma->vm_mm;
preempt_disable();
if (current->active_mm == mm) {
if (current->mm)
__flush_tlb_one(va);
else
leave_mm(smp_processor_id());
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, va);
preempt_enable();
}
static void do_flush_tlb_all(void *info)
{
unsigned long cpu = smp_processor_id();
__flush_tlb_all();
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}
void flush_tlb_all(void)
{
on_each_cpu(do_flush_tlb_all, NULL, 1);
}