add idl4k kernel firmware version 1.13.0.105

This commit is contained in:
Jaroslav Kysela
2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
#
# This Kconfig describes xen options
#
config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
depends on X86_CMPXCHG && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
config XEN_MAX_DOMAIN_MEMORY
int "Maximum allowed size of a domain in gigabytes"
default 8 if X86_32
default 32 if X86_64
depends on XEN
help
The pseudo-physical to machine address array is sized
according to the maximum possible memory size of a Xen
domain. This array uses 1 page per gigabyte, so there's no
need to be too stingy here.
config XEN_SAVE_RESTORE
bool
depends on XEN && PM
default y
config XEN_DEBUG_FS
bool "Enable Xen debug and tuning parameters in debugfs"
depends on XEN && DEBUG_FS
default n
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.

View File

@@ -0,0 +1,20 @@
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_spinlock.o = -pg
CFLAGS_REMOVE_time.o = -pg
CFLAGS_REMOVE_irq.o = -pg
endif
# Make sure early boot has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_enlighten.o := $(nostackp)
CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o

View File

@@ -0,0 +1,123 @@
#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include "debugfs.h"
static struct dentry *d_xen_debug;
/*
 * Return the top-level "xen" debugfs directory, creating it on first use.
 *
 * The dentry is cached in d_xen_debug.  If creation fails a warning is
 * logged and NULL is returned; the next call will try again.
 */
struct dentry * __init xen_init_debugfs(void)
{
	if (d_xen_debug)
		return d_xen_debug;

	d_xen_debug = debugfs_create_dir("xen", NULL);
	if (d_xen_debug == NULL)
		pr_warning("Could not create 'xen' debugfs directory\n");

	return d_xen_debug;
}
/*
 * Describes one array exported as a debugfs file.  An instance is stored
 * as the file's inode private data by xen_debugfs_create_u32_array() and
 * read back in u32_array_read().
 */
struct array_data
{
void *array;		/* start of the array (formatted as u32 values on read) */
unsigned elements;	/* number of entries in the array */
};
/*
 * Open handler: start with no formatted text attached to the file (it is
 * generated lazily by the read handler) and mark the file non-seekable.
 */
static int u32_array_open(struct inode *inode, struct file *file)
{
	file->private_data = NULL;

	return nonseekable_open(inode, file);
}
/*
 * Format @array_size entries of @array into @buf using @fmt for each value.
 * Values are separated by a single space; the last one is followed by a
 * newline, and the text is NUL-terminated.
 *
 * @buf may be NULL (with @bufsize 0), in which case nothing is written and
 * only the size is computed.
 *
 * Returns the number of bytes needed for the whole text, including the
 * terminating NUL.
 */
static size_t format_array(char *buf, size_t bufsize, const char *fmt,
			   u32 *array, unsigned array_size)
{
	size_t total = 0;
	unsigned idx = 0;

	while (idx < array_size) {
		size_t chunk;

		chunk = snprintf(buf, bufsize, fmt, array[idx]);
		chunk++;	/* room for the ' ' or '\n' separator */
		total += chunk;

		if (buf) {
			buf += chunk;
			bufsize -= chunk;
			/* overwrite the NUL snprintf left behind with the separator */
			buf[-1] = (idx + 1 == array_size) ? '\n' : ' ';
		}

		idx++;
	}

	if (buf)
		*buf = '\0';

	return total + 1;	/* + terminating \0 */
}
/*
 * Allocate a buffer of exactly the right size and format @array into it.
 *
 * Returns the kmalloc()ed, NUL-terminated text (the caller must kfree() it)
 * or NULL if the allocation failed.
 */
static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
{
	/* A first pass without a buffer only measures the output. */
	size_t needed = format_array(NULL, 0, fmt, array, array_size);
	char *text = kmalloc(needed, GFP_KERNEL);

	if (text != NULL)
		format_array(text, needed, fmt, array, array_size);

	return text;
}
/*
 * Read handler: a read at offset 0 (re)generates the text snapshot of the
 * array and caches it in file->private_data; every read is then served
 * from that cached text.  If no text is available the read returns data
 * of length 0.
 */
static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
			      loff_t *ppos)
{
	struct array_data *data = file->f_path.dentry->d_inode->i_private;
	size_t size = 0;

	if (*ppos == 0) {
		/* drop any snapshot left over from a previous pass */
		kfree(file->private_data);
		file->private_data = NULL;

		file->private_data = format_array_alloc("%u", data->array,
							data->elements);
	}

	if (file->private_data)
		size = strlen(file->private_data);

	return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
}
/* Release handler: free the text snapshot cached by u32_array_read(), if any. */
static int xen_array_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);

	return 0;
}
/*
 * File operations for debugfs files created by
 * xen_debugfs_create_u32_array().  Only open, read and release are
 * provided; there is no write handler.
 */
static const struct file_operations u32_array_fops = {
.owner = THIS_MODULE,
.open = u32_array_open,
.release= xen_array_release,
.read = u32_array_read,
};
/*
 * Create a debugfs file that shows a u32 array as space-separated decimal
 * text terminated by a newline.
 *
 * @name:     file name
 * @mode:     file permissions
 * @parent:   directory to create the file in
 * @array:    array to export; it is read each time the file is read from
 *            offset 0, so it must stay valid while the file exists
 * @elements: number of entries in @array
 *
 * Returns the new dentry, or NULL if either the bookkeeping allocation or
 * the file creation failed.
 */
struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
					    struct dentry *parent,
					    u32 *array, unsigned elements)
{
	struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
	struct dentry *file;

	if (data == NULL)
		return NULL;

	data->array = array;
	data->elements = elements;

	file = debugfs_create_file(name, mode, parent, data, &u32_array_fops);
	if (file == NULL)
		kfree(data);	/* no file refers to it, so don't leak it */

	return file;
}

View File

@@ -0,0 +1,10 @@
#ifndef _XEN_DEBUGFS_H
#define _XEN_DEBUGFS_H
/* Get (creating it if necessary) the top-level "xen" debugfs directory; NULL on failure. */
struct dentry * __init xen_init_debugfs(void);
/*
 * Create a debugfs file under @parent exposing @elements entries of @array
 * as text.  Returns the file's dentry, or NULL on failure.
 */
struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
struct dentry *parent,
u32 *array, unsigned elements);
#endif /* _XEN_DEBUGFS_H */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,91 @@
/******************************************************************************
* grant_table.c
* x86 specific part
*
* Granting foreign access to our memory reservation.
*
* Copyright (c) 2005-2006, Christopher Clark
* Copyright (c) 2004-2005, K A Fraser
* Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
* VA Linux Systems Japan. Split out x86 specific part.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
#include <asm/pgtable.h>
/*
 * apply_to_page_range() callback used by arch_gnttab_map_shared().
 *
 * @data points at the caller's "current frame" pointer.  The frame number
 * it designates is installed at @addr with PAGE_KERNEL protection and the
 * pointer is then advanced to the next frame.  Always returns 0.
 */
static int map_pte_fn(pte_t *pte, struct page *pmd_page,
		      unsigned long addr, void *data)
{
	unsigned long **cursor = data;
	unsigned long mfn = **cursor;

	set_pte_at(&init_mm, addr, pte, mfn_pte(mfn, PAGE_KERNEL));
	++*cursor;

	return 0;
}
/*
 * apply_to_page_range() callback used by arch_gnttab_unmap_shared():
 * replace the pte at @addr with an all-zero entry.  Always returns 0.
 */
static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
			unsigned long addr, void *data)
{
	set_pte_at(&init_mm, addr, pte, __pte(0));

	return 0;
}
/*
 * Map @nr_gframes grant-table frames into kernel virtual address space.
 *
 * @frames:         frame numbers to map, one per page
 * @nr_gframes:     number of entries of @frames to map
 * @max_nr_gframes: size, in pages, of the virtual area to reserve on the
 *                  first call
 * @__shared:       in/out; if *@__shared is NULL a new virtual area is
 *                  allocated and its address stored here, otherwise the
 *                  existing area is reused
 *
 * BUGs if the virtual area cannot be allocated.  Returns the result of
 * apply_to_page_range().
 */
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
			   unsigned long max_nr_gframes,
			   struct grant_entry **__shared)
{
	struct grant_entry *shared = *__shared;

	if (!shared) {
		struct vm_struct *area;

		area = xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes);
		BUG_ON(area == NULL);

		shared = area->addr;
		*__shared = shared;
	}

	/* map_pte_fn() walks through frames[] via the pointer we pass it */
	return apply_to_page_range(&init_mm, (unsigned long)shared,
				   PAGE_SIZE * nr_gframes,
				   map_pte_fn, &frames);
}
/*
 * Clear the ptes covering the first @nr_gframes pages of the grant-table
 * mapping at @shared.  The result of apply_to_page_range() is ignored.
 */
void arch_gnttab_unmap_shared(struct grant_entry *shared,
			      unsigned long nr_gframes)
{
	unsigned long start = (unsigned long)shared;

	apply_to_page_range(&init_mm, start, PAGE_SIZE * nr_gframes,
			    unmap_pte_fn, NULL);
}

133
kernel/arch/x86/xen/irq.c Normal file
View File

@@ -0,0 +1,133 @@
#include <linux/hardirq.h>
#include <asm/x86_init.h>
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/vcpu.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include "xen-ops.h"
/*
 * Force a proper event-channel callback from Xen after clearing the
 * callback mask. We do this in a very simple manner, by making a call
 * down into Xen. The pending flag will be checked by Xen on return.
 *
 * The hypercall used is xen_version; its return value is deliberately
 * ignored, only the trip through the hypervisor matters.
 */
void xen_force_evtchn_callback(void)
{
(void)HYPERVISOR_xen_version(0, NULL);
}
/*
 * Report this vcpu's event mask as an EFLAGS-style value: the result is
 * X86_EFLAGS_IF when evtchn_upcall_mask is zero, and 0 when the mask is
 * non-zero.
 */
static unsigned long xen_save_fl(void)
{
struct vcpu_info *vcpu;
unsigned long flags;
vcpu = percpu_read(xen_vcpu);
/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
/* convert to IF type flag
-0 -> 0x00000000
-1 -> 0xffffffff
*/
return (-flags) & X86_EFLAGS_IF;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
/*
 * Set this vcpu's event mask from an EFLAGS-style value: the mask becomes
 * 0 when X86_EFLAGS_IF is set in @flags and 1 otherwise.  When events end
 * up unmasked, check for a pending reschedule and force an event callback
 * if an upcall is already pending.
 */
static void xen_restore_fl(unsigned long flags)
{
struct vcpu_info *vcpu;
/* convert from IF type flag */
flags = !(flags & X86_EFLAGS_IF);
/* There's a one instruction preempt window here. We need to
make sure we don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();
/* Doesn't matter if we get preempted here, because any
pending event will get dealt with anyway. */
if (flags == 0) {
preempt_check_resched();
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
}
}
PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
/* Mask event delivery on this vcpu by setting evtchn_upcall_mask to 1. */
static void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
make sure we don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
/*
 * Unmask event delivery on this vcpu (evtchn_upcall_mask = 0) and, if an
 * upcall is already pending, force an event callback so it gets handled.
 */
static void xen_irq_enable(void)
{
struct vcpu_info *vcpu;
/* We don't need to worry about being preempted here, since
either a) interrupts are disabled, so no preemption, or b)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */
vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
/* Doesn't matter if we get preempted here, because any
pending event will get dealt with anyway. */
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
/*
 * Idle this vcpu by asking Xen to block it (SCHEDOP_block).  BUGs if the
 * hypercall returns non-zero.
 */
static void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
BUG();
}
/*
 * Halt this vcpu.  With interrupts enabled this simply blocks via
 * xen_safe_halt(); with interrupts disabled the vcpu is taken down with
 * VCPUOP_down instead.
 */
static void xen_halt(void)
{
	if (!irqs_disabled()) {
		xen_safe_halt();
		return;
	}

	HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
}
/*
 * Xen implementations of the paravirt interrupt operations.  This table is
 * __initdata: xen_init_irq_ops() copies it into pv_irq_ops by value.  The
 * four flag operations are installed through PV_CALLEE_SAVE() and pair with
 * the PV_CALLEE_SAVE_REGS_THUNK() declarations that follow each function.
 */
static const struct pv_irq_ops xen_irq_ops __initdata = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
.safe_halt = xen_safe_halt,
.halt = xen_halt,
#ifdef CONFIG_X86_64
.adjust_exception_frame = xen_adjust_exception_frame,
#endif
};
/*
 * Install the Xen interrupt operations: copy xen_irq_ops into pv_irq_ops
 * and make xen_init_IRQ the x86 interrupt-initialisation hook.
 */
void __init xen_init_irq_ops(void)
{
	pv_irq_ops = xen_irq_ops;
	x86_init.irqs.intr_init = xen_init_IRQ;
}

2027
kernel/arch/x86/xen/mmu.c Normal file

File diff suppressed because it is too large Load Diff

63
kernel/arch/x86/xen/mmu.h Normal file
View File

@@ -0,0 +1,63 @@
#ifndef _XEN_MMU_H
#define _XEN_MMU_H	/* complete the include guard so a second #include is harmless */

#include <linux/linkage.h>
#include <asm/page.h>

/* Identifies a page-table level (PGD, PUD, PMD or PTE). */
enum pt_level {
	PT_PGD,
	PT_PUD,
	PT_PMD,
	PT_PTE
};

bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
bool install_p2mtop_page(unsigned long pfn, unsigned long *p);

void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);

/* mm lifetime hooks */
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
void xen_exit_mmap(struct mm_struct *mm);

/* conversions between page-table entry types and their raw values */
pteval_t xen_pte_val(pte_t);
pmdval_t xen_pmd_val(pmd_t);
pgdval_t xen_pgd_val(pgd_t);

pte_t xen_make_pte(pteval_t);
pmd_t xen_make_pmd(pmdval_t);
pgd_t xen_make_pgd(pgdval_t);

/* page-table entry setters */
void xen_set_pte(pte_t *ptep, pte_t pteval);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval);

#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#endif	/* CONFIG_X86_PAE */

void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud(pud_t *ptr, pud_t val);
void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud_hyper(pud_t *ptr, pud_t val);

#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud);
pud_t xen_make_pud(pudval_t pudval);
void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
#endif

pgd_t *xen_get_user_pgd(pgd_t *pgd);

pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				  pte_t *ptep, pte_t pte);

unsigned long xen_read_cr2_direct(void);

extern void xen_init_mmu_ops(void);

#endif	/* _XEN_MMU_H */

View File

@@ -0,0 +1,283 @@
/*
* Xen hypercall batching.
*
* Xen allows multiple hypercalls to be issued at once, using the
* multicall interface. This allows the cost of trapping into the
* hypervisor to be amortized over several calls.
*
* This file implements a simple interface for multicalls. There's a
* per-cpu buffer of outstanding multicalls. When you want to queue a
* multicall for issuing, you can allocate a multicall slot for the
* call and its arguments, along with storage for space which is
* pointed to by the arguments (for passing pointers to structures,
* etc). When the multicall is actually issued, all the space for the
* commands and allocated memory is freed for reuse.
*
* Multicalls are flushed whenever any of the buffers get full, or
* when explicitly requested. There's no way to get per-multicall
* return results back. It will BUG if the multicall hypercall itself
* fails, and WARN if any of the individual multicalls fail.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/debugfs.h>
#include <asm/xen/hypercall.h>
#include "multicalls.h"
#include "debugfs.h"
/* Maximum number of multicall entries (and of callbacks) queued per cpu. */
#define MC_BATCH 32
/* Non-zero: keep a copy of each batch and its callers for failure reports. */
#define MC_DEBUG 1
/* Bytes of argument space per cpu. */
#define MC_ARGS (MC_BATCH * 16)
/* Per-cpu queue of pending multicalls, their argument space and callbacks. */
struct mc_buffer {
struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
/* snapshot of entries[] taken just before the batch is issued */
struct multicall_entry debug[MC_BATCH];
/* return address of the __xen_mc_entry() call that queued each entry */
void *caller[MC_BATCH];
#endif
unsigned char args[MC_ARGS];
/* functions to run after the batch has been issued */
struct callback {
void (*fn)(void *);
void *data;
} callbacks[MC_BATCH];
/* number of used entries[], bytes of args[], and callbacks[] respectively */
unsigned mcidx, argidx, cbidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
/* irq flags saved by xen_mc_batch() and restored by xen_mc_issue() */
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
/*
 * Why a flush was forced before a new item could be queued; used as the
 * index into the mc_stats.flush[] counters.
 */
enum flush_reasons
{
FL_SLOTS,	/* 0: all MC_BATCH multicall slots were in use */
FL_ARGS,	/* 1: not enough argument space left */
FL_CALLBACKS,	/* 2: all MC_BATCH callback slots were in use */
FL_N_REASONS	/* number of reasons, not a reason itself */
};
#ifdef CONFIG_XEN_DEBUG_FS

#define NHYPERCALLS 40		/* not really */

/* Multicall statistics; exported through debugfs by xen_mc_debugfs(). */
static struct {
	unsigned histo[MC_BATCH+1];		/* batches, indexed by batch size */
	unsigned issued;			/* number of flushes accounted */
	unsigned arg_total;			/* argument bytes used, summed */
	unsigned hypercalls;			/* multicall entries, summed */
	unsigned histo_hypercalls[NHYPERCALLS];	/* entries, indexed by op */
	unsigned flush[FL_N_REASONS];		/* forced flushes, by reason */
} mc_stats;

/* Written through debugfs; non-zero requests a reset of mc_stats. */
static u8 zero_stats;

/* Honour a pending reset request before any counter is touched. */
static inline void check_zero(void)
{
	if (unlikely(zero_stats)) {
		memset(&mc_stats, 0, sizeof(mc_stats));
		zero_stats = 0;
	}
}

/* Account one flush of @mc in the statistics. */
static void mc_add_stats(const struct mc_buffer *mc)
{
	unsigned n;

	check_zero();

	mc_stats.issued += 1;
	mc_stats.hypercalls += mc->mcidx;
	mc_stats.arg_total += mc->argidx;
	mc_stats.histo[mc->mcidx] += 1;

	for (n = 0; n < mc->mcidx; n++) {
		unsigned op = mc->entries[n].op;

		/* ops beyond the histogram are simply not counted */
		if (op >= NHYPERCALLS)
			continue;

		mc_stats.histo_hypercalls[op]++;
	}
}

/* Count one forced flush for reason @idx. */
static void mc_stats_flush(enum flush_reasons idx)
{
	check_zero();

	mc_stats.flush[idx] += 1;
}

#else  /* !CONFIG_XEN_DEBUG_FS */

/* Statistics are compiled out: both hooks do nothing. */
static inline void mc_add_stats(const struct mc_buffer *mc)
{
}

static inline void mc_stats_flush(enum flush_reasons idx)
{
}

#endif /* CONFIG_XEN_DEBUG_FS */
/*
 * Issue every multicall queued on this cpu, then run the queued callbacks.
 *
 * Must be called with preemption disabled.  BUGs if the multicall
 * hypercall itself fails, or if argument space is in use while no call is
 * queued.  Individual calls that return a negative result are counted
 * and, with MC_DEBUG, reported in detail; a WARN is raised at the end if
 * any failed.  The buffer indices are reset so the buffer can be reused.
 */
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
int ret = 0;	/* number of entries whose result was negative */
unsigned long flags;
int i;
BUG_ON(preemptible());
/* Disable interrupts in case someone comes in and queues
something in the middle */
local_irq_save(flags);
mc_add_stats(b);
if (b->mcidx) {
#if MC_DEBUG
/* keep the entries as queued, for the failure report below */
memcpy(b->debug, b->entries,
b->mcidx * sizeof(struct multicall_entry));
#endif
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG();
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
#if MC_DEBUG
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
dump_stack();
/* op and first argument come from the pre-issue snapshot,
the result from the entry as it is after the hypercall */
for (i = 0; i < b->mcidx; i++) {
printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
b->entries[i].result,
b->caller[i]);
}
}
#endif
b->mcidx = 0;
b->argidx = 0;
} else
BUG_ON(b->argidx != 0);
/* callbacks run even when no multicall was queued */
for (i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
(*cb->fn)(cb->data);
}
b->cbidx = 0;
local_irq_restore(flags);
WARN_ON(ret);
}
/*
 * Reserve the next multicall slot on this cpu together with @args bytes of
 * argument space.
 *
 * The argument space starts on a sizeof(u64) boundary.  If no slot is free
 * or the argument space would be exhausted, the pending batch is flushed
 * first.  Must be called with preemption disabled.
 *
 * Returns the slot (.mc) and the start of its argument space (.args).
 */
struct multicall_space __xen_mc_entry(size_t args)
{
	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
	struct multicall_space ret;
	unsigned argidx = roundup(b->argidx, sizeof(u64));

	BUG_ON(preemptible());
	BUG_ON(b->argidx >= MC_ARGS);

	if (b->mcidx == MC_BATCH ||
	    (argidx + args) >= MC_ARGS) {
		mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
		xen_mc_flush();
		/* the flush reset argidx; recompute our starting offset */
		argidx = roundup(b->argidx, sizeof(u64));
	}

	ret.mc = &b->entries[b->mcidx];
	/*
	 * MC_DEBUG is always defined (to 0 or 1), so it must be tested with
	 * #if, as is done where caller[] is declared; #ifdef would reference
	 * the missing field when MC_DEBUG is 0.
	 */
#if MC_DEBUG
	b->caller[b->mcidx] = __builtin_return_address(0);
#endif
	b->mcidx++;
	ret.args = &b->args[argidx];
	b->argidx = argidx + args;

	BUG_ON(b->argidx >= MC_ARGS);
	return ret;
}
/*
 * Try to grow the argument space of the most recently queued multicall by
 * @size bytes.
 *
 * This only succeeds when a multicall is queued, its op equals @op and the
 * extra bytes still fit in the argument buffer.  On success .mc points at
 * that multicall and .args at the newly added space; otherwise both are
 * NULL.  Must be called with preemption disabled.
 */
struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
{
	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
	struct multicall_space ret = { NULL, NULL };

	BUG_ON(preemptible());
	BUG_ON(b->argidx >= MC_ARGS);

	/* nothing queued, a different op, or no room left: refuse */
	if (b->mcidx == 0 ||
	    b->entries[b->mcidx - 1].op != op ||
	    (b->argidx + size) >= MC_ARGS)
		return ret;

	ret.mc = &b->entries[b->mcidx - 1];
	ret.args = &b->args[b->argidx];
	b->argidx += size;

	BUG_ON(b->argidx >= MC_ARGS);
	return ret;
}
/*
 * Queue @fn(@data) to be called the next time this cpu's multicall batch
 * is flushed.  If every callback slot is already taken, the batch is
 * flushed first to make room.
 */
void xen_mc_callback(void (*fn)(void *), void *data)
{
	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
	unsigned slot;

	if (b->cbidx == MC_BATCH) {
		mc_stats_flush(FL_CALLBACKS);
		xen_mc_flush();
	}

	slot = b->cbidx++;
	b->callbacks[slot].fn = fn;
	b->callbacks[slot].data = data;
}
#ifdef CONFIG_XEN_DEBUG_FS

static struct dentry *d_mc_debug;

/*
 * Create the "multicalls" directory below the xen debugfs directory and
 * populate it with the multicall statistics.
 *
 * Returns 0 on success, or -ENOMEM if either directory could not be
 * obtained.  Failures to create the individual files are not reported.
 */
static int __init xen_mc_debugfs(void)
{
	struct dentry *d_xen = xen_init_debugfs();

	if (d_xen == NULL)
		return -ENOMEM;

	d_mc_debug = debugfs_create_dir("multicalls", d_xen);
	/* with a NULL parent the files below would not land in our directory */
	if (d_mc_debug == NULL)
		return -ENOMEM;

	debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);

	debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
	debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
	debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);

	/*
	 * histo[] has MC_BATCH + 1 buckets (batch sizes 0..MC_BATCH); export
	 * all of them so that completely full batches are visible too.
	 */
	xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
				     mc_stats.histo, MC_BATCH + 1);
	xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
				     mc_stats.histo_hypercalls, NHYPERCALLS);
	xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
				     mc_stats.flush, FL_N_REASONS);

	return 0;
}
fs_initcall(xen_mc_debugfs);

#endif	/* CONFIG_XEN_DEBUG_FS */

View File

@@ -0,0 +1,62 @@
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
#include "xen-ops.h"
/* Multicalls */
/* What a caller gets back when it reserves or extends a multicall. */
struct multicall_space
{
struct multicall_entry *mc;	/* the multicall entry itself */
void *args;			/* argument space belonging to that entry */
};
/* Allocate room for a multicall and its args */
struct multicall_space __xen_mc_entry(size_t args);
DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
/*
 * Start a batch of one or more __xen_mc_entry() calls.  Interrupts are
 * disabled and the previous interrupt state is stashed in the per-cpu
 * xen_mc_irq_flags.  Must be paired with xen_mc_issue(), which restores
 * that state.
 */
static inline void xen_mc_batch(void)
{
	unsigned long irqflags;

	/* interrupts must stay off until this entry is complete */
	local_irq_save(irqflags);
	__get_cpu_var(xen_mc_irq_flags) = irqflags;
}
/*
 * Convenience wrapper: open a batch with xen_mc_batch() and reserve a
 * multicall with @args bytes of argument space in it.
 */
static inline struct multicall_space xen_mc_entry(size_t args)
{
	struct multicall_space space;

	xen_mc_batch();
	space = __xen_mc_entry(args);

	return space;
}
/* Flush all pending multicalls */
void xen_mc_flush(void);
/*
 * Finish a batch started with xen_mc_batch().  The pending multicalls are
 * flushed now unless the current paravirt lazy mode has one of the bits in
 * @mode set.  In either case the interrupt state saved by xen_mc_batch()
 * is restored.
 */
static inline void xen_mc_issue(unsigned mode)
{
	if (!(paravirt_get_lazy_mode() & mode))
		xen_mc_flush();

	/* undo the local_irq_save() done in xen_mc_batch() */
	local_irq_restore(percpu_read(xen_mc_irq_flags));
}
/* Set up a callback to be called when the current batch is flushed */
void xen_mc_callback(void (*fn)(void *), void *data);
/*
* Try to extend the arguments of the previous multicall command. The
* previous command's op must match. If it does, then it attempts to
* extend the argument space allocated to the multicall entry by
* arg_size bytes.
*
* The returned multicall_space will return with mc pointing to the
* command on success, or NULL on failure, and args pointing to the
* newly allocated space.
*/
struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
#endif /* _XEN_MULTICALLS_H */

194
kernel/arch/x86/xen/setup.c Normal file
View File

@@ -0,0 +1,194 @@
/*
* Machine specific setup for xen
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
/**
 * xen_memory_setup - build the e820 memory map for a Xen domain.
 *
 * The map is rebuilt from scratch: a single RAM region starting at 0 and
 * covering the domain's pages (as reported by xen_start_info->nr_pages,
 * capped at MAX_DOMAIN_PAGES), with the ISA range marked reserved.  The
 * range from the mfn_list up to pt_base is reserved early, and the map is
 * then sanitized.
 *
 * Returns the string "Xen".
 **/
char * __init xen_memory_setup(void)
{
unsigned long max_pfn = xen_start_info->nr_pages;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
e820.nr_map = 0;
e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
/*
* Even though this is normal, usable memory under Xen, reserve
* ISA memory anyway because too many things think they can poke
* about in there.
*/
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
/*
* Reserve Xen bits:
* - mfn_list
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
*/
reserve_early(__pa(xen_start_info->mfn_list),
__pa(xen_start_info->pt_base),
"XEN START INFO");
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
return "Xen";
}
/*
 * Idle routine installed as pm_idle by xen_arch_setup().  If a reschedule
 * is already needed it just re-enables interrupts and returns; otherwise
 * it halts via safe_halt() with TS_POLLING cleared for the duration.
 */
static void xen_idle(void)
{
/* check need_resched() with interrupts off */
local_irq_disable();
if (need_resched())
local_irq_enable();
else {
current_thread_info()->status &= ~TS_POLLING;
/* order the TS_POLLING clear before halting */
smp_mb__after_clear_bit();
safe_halt();
current_thread_info()->status |= TS_POLLING;
}
}
/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 *
 * The bit is set in the NOTE_MASK word of both 32-bit vdso images (the
 * int80 one and the sysenter one).  Without CONFIG_X86_32 this is a no-op.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	u32 *note_mask;

	note_mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
	*note_mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;

	note_mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
	*note_mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
static __cpuinit int register_callback(unsigned type, const void *func)
{
struct callback_register callback = {
.type = type,
.address = XEN_CALLBACK(__KERNEL_CS, func),
.flags = CALLBACKF_mask_events,
};
return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
/*
 * Register the sysenter callback with Xen if the boot cpu advertises the
 * relevant feature (X86_FEATURE_SEP on 32-bit, X86_FEATURE_SYSENTER32
 * otherwise).  If registration fails the feature bit is cleared.
 */
void __cpuinit xen_enable_sysenter(void)
{
#ifdef CONFIG_X86_32
	unsigned sysenter_feature = X86_FEATURE_SEP;
#else
	unsigned sysenter_feature = X86_FEATURE_SYSENTER32;
#endif
	int err;

	if (!boot_cpu_has(sysenter_feature))
		return;

	err = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (err != 0)
		setup_clear_cpu_cap(sysenter_feature);
}
/*
 * 64-bit only: register the syscall callback with Xen, and the 32-bit
 * syscall callback too when the boot cpu has X86_FEATURE_SYSCALL32.
 *
 * A failure to register the 64-bit callback is only logged; a failure to
 * register the 32-bit one clears X86_FEATURE_SYSCALL32.  Without
 * CONFIG_X86_64 this function does nothing.
 */
void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);

	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (!boot_cpu_has(X86_FEATURE_SYSCALL32))
		return;

	ret = register_callback(CALLBACKTYPE_syscall32, xen_syscall32_target);
	if (ret != 0)
		setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
#endif /* CONFIG_X86_64 */
}
/*
 * Xen-specific architecture setup: enable the VM assists we rely on,
 * register the event and failsafe callbacks (BUG on failure), enable the
 * sysenter/syscall callbacks, request IOPL 1, disable ACPI in domains
 * that are not the initial domain, take the kernel command line from
 * xen_start_info, install xen_idle as the idle routine, disable I/O space
 * and adjust the vdso.
 */
void __init xen_arch_setup(void)
{
struct physdev_set_iopl set_iopl;
int rc;
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_pae_extended_cr3);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
BUG();
xen_enable_sysenter();
xen_enable_syscall();
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
/* a failure here is only logged, not treated as fatal */
if (rc != 0)
printk(KERN_INFO "physdev_op failed %d\n", rc);
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
disable_acpi();
}
#endif
/*
 * Copy the smaller of the two buffer sizes.
 * NOTE(review): nothing here forces a terminating NUL if the source
 * fills the copied length -- confirm the source is always terminated.
 */
memcpy(boot_command_line, xen_start_info->cmd_line,
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
pm_idle = xen_idle;
paravirt_disable_iospace();
fiddle_vdso();
}

481
kernel/arch/x86/xen/smp.c Normal file
View File

@@ -0,0 +1,481 @@
/*
* Xen SMP support
*
* This file implements the Xen versions of smp_ops. SMP under Xen is
* very straightforward. Bringing a CPU up is simply a matter of
* loading its initial context and setting it running.
*
* IPIs are handled through the Xen event mechanism.
*
* Because virtual CPUs can be scheduled onto any real CPU, there's no
* useful topology information for the kernel to make use of. As a
* result, all CPUs are treated as if they're single-core and
* single-threaded.
*/
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
#include <xen/page.h>
#include <xen/events.h>
#include "xen-ops.h"
#include "mmu.h"
/* CPUs for which cpu_initialize_context() has already been run. */
cpumask_var_t xen_cpu_initialized_map;
/*
 * Per-cpu irq numbers, filled in by xen_smp_intr_init().
 * NOTE(review): only debug_irq has an explicit -1 initializer; the other
 * three have none, yet the failure path of xen_smp_intr_init() tests all
 * four with ">= 0" -- confirm this is intended.
 */
static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
static DEFINE_PER_CPU(int, callfuncsingle_irq);
static DEFINE_PER_CPU(int, debug_irq) = -1;
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
/*
 * Handler for the reschedule IPI.  Apart from bumping the statistics
 * counter there is nothing to do here: all the work is done
 * automatically when we return from the interrupt.
 */
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
	inc_irq_stat(irq_resched_count);

	return IRQ_HANDLED;
}
/*
 * Runs on a cpu that is coming up: initialise it, register its
 * sysenter/syscall callbacks, record its cpu info (forcing a single core
 * per package), set up its clock events, mark it online and finally
 * enable interrupts.  Preemption is disabled here and not re-enabled by
 * this function.
 */
static __cpuinit void cpu_bringup(void)
{
int cpu = smp_processor_id();
cpu_init();
touch_softlockup_watchdog();
preempt_disable();
xen_enable_sysenter();
xen_enable_syscall();
/* read the cpu number again now that preemption is disabled */
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
cpu_data(cpu).x86_max_cores = 1;
set_cpu_sibling_map(cpu);
xen_setup_cpu_clockevents();
cpu_set(cpu, cpu_online_map);
/* xen_cpu_up() on the initiating cpu polls cpu_state for CPU_ONLINE */
percpu_write(cpu_state, CPU_ONLINE);
wmb();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
wmb(); /* make sure everything is out */
}
/*
 * Entry point of a newly started vcpu (cpu_initialize_context() stores its
 * address as the initial instruction pointer): bring the cpu up, then
 * enter the idle loop.
 */
static __cpuinit void cpu_bringup_and_idle(void)
{
	cpu_bringup();

	cpu_idle();
}
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
const char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(resched_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(callfunc_irq, cpu) = rc;
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
debug_name, NULL);
if (rc < 0)
goto fail;
per_cpu(debug_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(callfuncsingle_irq, cpu) = rc;
return 0;
fail:
if (per_cpu(resched_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
if (per_cpu(callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
if (per_cpu(debug_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
if (per_cpu(callfuncsingle_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
return rc;
}
/*
 * Ask Xen about every cpu number below nr_cpu_ids.  Each one for which the
 * VCPUOP_is_up query does not return an error is marked possible and
 * counted in num_processors.
 */
static void __init xen_fill_possible_map(void)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL) < 0)
			continue;

		num_processors++;
		set_cpu_possible(cpu, true);
	}
}
/*
 * Boot-cpu preparation: run the native preparation, make the initial GDT
 * page writable again, and set up vcpu_info placement.  Must run on cpu 0
 * (BUGs otherwise).
 */
static void __init xen_smp_prepare_boot_cpu(void)
{
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(xen_initial_gdt);
xen_setup_vcpu_info_placement();
}
/*
 * Prepare for bringing up secondary cpus: finish setting up cpu 0 (lock
 * state, cpu info, interrupts), allocate xen_cpu_initialized_map with only
 * cpu 0 set, trim the possible map down to @max_cpus, and fork an idle
 * task for every remaining possible cpu, marking it present.
 *
 * BUGs or panics if any of the required setup steps fails.
 */
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
xen_init_lock_cpu(0);
smp_store_cpu_info(0);
cpu_data(0).x86_max_cores = 1;
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
BUG();
if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
panic("could not allocate xen_cpu_initialized_map\n");
cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
/* find the highest-numbered possible cpu and drop it */
for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
continue;
set_cpu_possible(cpu, false);
}
for_each_possible_cpu (cpu) {
struct task_struct *idle;
/* cpu 0 is the one we are running on */
if (cpu == 0)
continue;
idle = fork_idle(cpu);
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
set_cpu_present(cpu, true);
}
}
/*
 * Build the initial register/descriptor context for vcpu @cpu and hand it
 * to Xen with VCPUOP_initialise.  The vcpu is set up to start executing
 * cpu_bringup_and_idle() on the stack of @idle, using the kernel page
 * tables (swapper_pg_dir).
 *
 * Does nothing (and returns 0) if @cpu is already marked in
 * xen_cpu_initialized_map.  Returns -ENOMEM if the context structure
 * cannot be allocated; BUGs if the hypercall fails.
 */
static __cpuinit int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct desc_struct *gdt;
unsigned long gdt_mfn;
/* the bit is set here, before the context is actually built */
if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL)
return -ENOMEM;
gdt = get_cpu_gdt_table(cpu);
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
xen_copy_trap_info(ctxt->trap_ctxt);
ctxt->ldt_ents = 0;
/* the GDT must start on a page boundary */
BUG_ON((unsigned long)gdt & ~PAGE_MASK);
gdt_mfn = arbitrary_virt_to_mfn(gdt);
/* make both the GDT's own address and the lowmem alias of its frame read-only */
make_lowmem_page_readonly(gdt);
make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
ctxt->gdt_frames[0] = gdt_mfn;
ctxt->gdt_ents = GDT_ENTRIES;
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->failsafe_callback_cs = __KERNEL_CS;
#endif
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
/* the context structure is freed once the hypercall has returned */
kfree(ctxt);
return 0;
}
/*
 * Bring up vcpu @cpu: set up its per-cpu state (current task, stacks,
 * runstate info, timer, lock state), give Xen its initial context, bind
 * its interrupts, start it with VCPUOP_up and wait until it reports
 * CPU_ONLINE.
 *
 * Returns 0 on success, or the error from cpu_initialize_context() or
 * xen_smp_intr_init().  BUGs if VCPUOP_up fails.
 * NOTE(review): the early error returns do not undo the setup performed
 * before them (timer, lock state, ...) -- confirm callers cope with that.
 */
static int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
per_cpu(current_task, cpu) = idle;
#ifdef CONFIG_X86_32
irq_ctx_init(cpu);
#else
clear_tsk_thread_flag(idle, TIF_FORK);
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
rc = cpu_initialize_context(cpu, idle);
if (rc)
return rc;
/* going from one online cpu to two */
if (num_online_cpus() == 1)
alternatives_smp_switch(1);
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
/* yield to Xen until cpu_bringup() on the new cpu sets CPU_ONLINE */
while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
barrier();
}
return 0;
}
/* smp_ops.smp_cpus_done hook: intentionally does nothing. */
static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * smp_ops.cpu_disable hook, run on the cpu that is going offline.
 *
 * Returns -EBUSY for cpu 0, which is never taken down; otherwise runs
 * the common disable path, switches to swapper_pg_dir and returns 0.
 */
static int xen_cpu_disable(void)
{
	if (smp_processor_id() == 0)
		return -EBUSY;

	cpu_disable_common();

	/* stop referencing whatever page tables were loaded before */
	load_cr3(swapper_pg_dir);

	return 0;
}
/*
* smp_ops.cpu_die hook: finish taking @cpu offline from another cpu.
* Polls until the hypervisor reports the vcpu is no longer up, then
* releases the per-cpu interrupts, spinlock kicker and timer that
* belonged to it.
*/
static void xen_cpu_die(unsigned int cpu)
{
/* sleep for HZ/10 jiffies between polls of VCPUOP_is_up */
while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
/* mirror of the alternatives_smp_switch(1) done in xen_cpu_up() */
if (num_online_cpus() == 1)
alternatives_smp_switch(0);
}
/*
* smp_ops.play_dead hook, run on the cpu going offline: after the
* common teardown, ask the hypervisor to take this vcpu down.
*/
static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
/*
* NOTE(review): presumably execution continues here when the vcpu
* is later brought up again, hence the re-run of cpu_bringup() --
* confirm against the hypervisor's VCPUOP_down/VCPUOP_up semantics.
*/
cpu_bringup();
}
#else /* !CONFIG_HOTPLUG_CPU */
/* Without CPU hotplug a cpu can never be disabled: report -ENOSYS. */
static int xen_cpu_disable(void)
{
return -ENOSYS;
}
/* Must never be reached when CPU hotplug is compiled out. */
static void xen_cpu_die(unsigned int cpu)
{
BUG();
}
/* Must never be reached when CPU hotplug is compiled out. */
static void xen_play_dead(void)
{
BUG();
}
#endif
/*
* Cross-call target used by xen_stop_other_cpus(): take the calling
* vcpu down. @v is unused. Returning from VCPUOP_down is treated as
* a bug.
*/
static void stop_self(void *v)
{
int cpu = smp_processor_id();
/* make sure we're not pinning something down */
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
/*
* smp_ops.stop_other_cpus hook: run stop_self() via
* smp_call_function(); @wait is passed straight through as its wait
* argument.
*/
static void xen_stop_other_cpus(int wait)
{
smp_call_function(stop_self, NULL, wait);
}
/* smp_ops.smp_send_reschedule hook: send the reschedule IPI to @cpu. */
static void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
/*
* Send IPI @vector to every cpu that is both in @mask and currently
* online; cpus in @mask that are offline are silently skipped.
*/
static void xen_send_IPI_mask(const struct cpumask *mask,
enum ipi_vector vector)
{
unsigned cpu;
for_each_cpu_and(cpu, mask, cpu_online_mask)
xen_send_IPI_one(cpu, vector);
}
/*
 * smp_ops.send_call_func_ipi hook: send the call-function IPI to the
 * cpus in @mask, then yield once if any targeted vcpu is runnable but
 * has no real cpu to run on, so that it gets a chance to make progress.
 */
static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
{
	int target;

	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);

	/* Make sure other vcpus get a chance to run if they need to. */
	for_each_cpu(target, mask) {
		if (!xen_vcpu_stolen(target))
			continue;

		/* one yield is enough, whichever vcpu triggered it */
		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
		break;
	}
}
/*
* smp_ops.send_call_func_single_ipi hook: send the single-call-function
* IPI to @cpu (skipped by xen_send_IPI_mask() if @cpu is offline).
*/
static void xen_smp_send_call_function_single_ipi(int cpu)
{
xen_send_IPI_mask(cpumask_of(cpu),
XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
/*
* Interrupt handler for the call-function IPI: runs the generic
* call-function processing inside irq_enter()/irq_exit() and bumps the
* irq_call_count statistic. @irq and @dev_id are unused.
*/
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
}
/*
* Interrupt handler for the single-call-function IPI: same shape as
* xen_call_function_interrupt() but dispatches to the single-target
* generic handler. @irq and @dev_id are unused.
*/
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
}
/*
* Xen implementations of the SMP operations; copied over the global
* smp_ops by xen_smp_init().
*
* NOTE(review): this object is const but annotated __initdata; if the
* tree provides __initconst that may be the more appropriate section
* annotation for const init data -- confirm before changing.
*/
static const struct smp_ops xen_smp_ops __initdata = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
.smp_cpus_done = xen_smp_cpus_done,
.cpu_up = xen_cpu_up,
.cpu_die = xen_cpu_die,
.cpu_disable = xen_cpu_disable,
.play_dead = xen_play_dead,
.stop_other_cpus = xen_stop_other_cpus,
.smp_send_reschedule = xen_smp_send_reschedule,
.send_call_func_ipi = xen_smp_send_call_function_ipi,
.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
};
/*
* Install the Xen SMP operations (by copying xen_smp_ops into smp_ops),
* then fill in the possible-cpu map and hook up the Xen spinlock
* implementation.
*/
void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
xen_fill_possible_map();
xen_init_spinlocks();
}

View File

@@ -0,0 +1,436 @@
/*
* Split spinlock implementation out into its own file, so it can be
* compiled in a FTRACE-compatible way.
*/
#include <linux/kernel_stat.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <asm/paravirt.h>
#include <xen/interface/xen.h>
#include <xen/events.h>
#include "xen-ops.h"
#include "debugfs.h"
#ifdef CONFIG_XEN_DEBUG_FS
/*
* Lock statistics exported through debugfs by xen_spinlock_debugfs().
* Counters are bumped with ADD_STATS() and the spin_time_accum_*()
* helpers using plain, non-atomic additions.
*/
static struct xen_spinlock_stats
{
/* calls to __xen_spin_lock() */
u64 taken;
/* entries into the blocking part of xen_spin_lock_slow() */
u32 taken_slow;
/* ... of which this cpu was already waiting on another lock */
u32 taken_slow_nested;
/* lock obtained by the re-check trylock inside the slow path */
u32 taken_slow_pickup;
/* xen_poll_irq() returned without the kicker irq being pending */
u32 taken_slow_spurious;
/* slow-path iterations that blocked with interrupts enabled */
u32 taken_slow_irqenable;
/* calls to xen_spin_unlock() */
u64 released;
/* unlocks that saw a non-zero spinner count */
u32 released_slow;
/* unlocks that actually sent a kick IPI to a waiting cpu */
u32 released_slow_kicked;
/*
* Histograms bucketed by ilog2() of the measured time; the extra
* last slot collects everything at or above bucket HISTO_BUCKETS.
*/
#define HISTO_BUCKETS 30
u32 histo_spin_total[HISTO_BUCKETS+1];
u32 histo_spin_spinning[HISTO_BUCKETS+1];
u32 histo_spin_blocked[HISTO_BUCKETS+1];
/* running sums of the xen_clocksource_read() deltas */
u64 time_total;
u64 time_spinning;
u64 time_blocked;
} spinlock_stats;
/* debugfs "zero_stats": set non-zero to have check_zero() clear the stats */
static u8 zero_stats;
/*
* debugfs "timeout": number of fast-path spin iterations before
* __xen_spin_lock() tries the blocking slow path.
*/
static unsigned lock_timeout = 1 << 10;
#define TIMEOUT lock_timeout
/*
 * Honour a pending reset request: if zero_stats has been set, wipe all
 * statistics and clear the request flag again.
 */
static inline void check_zero(void)
{
	if (!unlikely(zero_stats))
		return;

	memset(&spinlock_stats, 0, sizeof(spinlock_stats));
	zero_stats = 0;
}
/*
* Add @val to spinlock_stats.@elem, first honouring any pending
* zero_stats reset request.
*/
#define ADD_STATS(elem, val) \
do { check_zero(); spinlock_stats.elem += (val); } while(0)
/*
* Timestamp the start of a measured interval; pass the result to one
* of the spin_time_accum_*() helpers when the interval ends.
*/
static inline u64 spin_time_start(void)
{
return xen_clocksource_read();
}
/*
 * Record one sample in a log2-bucketed histogram.
 *
 * @delta: measured interval
 * @array: histogram with HISTO_BUCKETS + 1 slots; slot i counts samples
 *         with ilog2(delta) == i, the last slot counts everything larger
 *
 * A zero-length sample is counted in the first bucket: ilog2(0) has no
 * defined result and previously ended up in the overflow slot.
 */
static void __spin_time_accum(u64 delta, u32 *array)
{
	unsigned index;

	check_zero();

	index = delta ? ilog2(delta) : 0;

	if (index < HISTO_BUCKETS)
		array[index]++;
	else
		array[HISTO_BUCKETS]++;
}
/*
 * Account the time since @start (a spin_time_start() value) as time
 * spent in the fast spin loop: one histogram sample plus the running
 * total.  The delta is kept at full 64-bit width so that long
 * intervals are not truncated before being added to the u64 total.
 */
static inline void spin_time_accum_spinning(u64 start)
{
	u64 delta = xen_clocksource_read() - start;

	__spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
	spinlock_stats.time_spinning += delta;
}
/*
 * Account the time since @start (a spin_time_start() value) as total
 * lock acquisition time: one histogram sample plus the running total.
 * The delta is kept at full 64-bit width so that long intervals are
 * not truncated before being added to the u64 total.
 */
static inline void spin_time_accum_total(u64 start)
{
	u64 delta = xen_clocksource_read() - start;

	__spin_time_accum(delta, spinlock_stats.histo_spin_total);
	spinlock_stats.time_total += delta;
}
/*
 * Account the time since @start (a spin_time_start() value) as time
 * spent in the blocking slow path: one histogram sample plus the
 * running total.  The delta is kept at full 64-bit width so that long
 * blocking periods are not truncated before being added to the u64
 * total.
 */
static inline void spin_time_accum_blocked(u64 start)
{
	u64 delta = xen_clocksource_read() - start;

	__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
	spinlock_stats.time_blocked += delta;
}
#else /* !CONFIG_XEN_DEBUG_FS */
/*
* Statistics compiled out: the spin timeout is a fixed constant and the
* accounting helpers below collapse to no-ops.
*/
#define TIMEOUT (1 << 10)
/* still evaluates @val, so its side effects (if any) are preserved */
#define ADD_STATS(elem, val) do { (void)(val); } while(0)
/* no timestamps are taken without statistics */
static inline u64 spin_time_start(void)
{
return 0;
}
/* no-op without statistics */
static inline void spin_time_accum_total(u64 start)
{
}
/* no-op without statistics */
static inline void spin_time_accum_spinning(u64 start)
{
}
/* no-op without statistics */
static inline void spin_time_accum_blocked(u64 start)
{
}
#endif /* CONFIG_XEN_DEBUG_FS */
/*
* Xen's view of a spinlock. The lock operations below obtain it by
* casting a struct raw_spinlock pointer.
* NOTE(review): that cast assumes this layout fits inside
* struct raw_spinlock -- confirm against its definition.
*/
struct xen_spinlock {
unsigned char lock; /* 0 -> free; 1 -> locked */
unsigned short spinners; /* count of waiting cpus */
};
/* pv_lock_ops.spin_is_locked: non-zero while the lock byte is set. */
static int xen_spin_is_locked(struct raw_spinlock *lock)
{
	return ((struct xen_spinlock *)lock)->lock != 0;
}
/*
 * pv_lock_ops.spin_is_contended: non-zero while at least one cpu is
 * registered as a spinner on the lock.
 */
static int xen_spin_is_contended(struct raw_spinlock *lock)
{
	/* Not strictly true; this is only the count of contended
	   lock-takers entering the slow path. */
	return ((struct xen_spinlock *)lock)->spinners != 0;
}
/*
* pv_lock_ops.spin_trylock: make a single attempt to take the lock.
* Exchanges 1 into the lock byte and returns non-zero if the previous
* value was 0, i.e. the lock was free and is now held by the caller.
*/
static int xen_spin_trylock(struct raw_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
u8 old = 1;
/* swap "locked" into the lock byte; old receives the previous state */
asm("xchgb %b0,%1"
: "+q" (old), "+m" (xl->lock) : : "memory");
return old == 0;
}
/* irq used to wake this cpu out of the slow path; -1 until it is bound */
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
/* lock this cpu is currently blocked on in the slow path, if any */
static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
/*
* Mark a cpu as interested in a lock. Returns the CPU's previous
* lock of interest, in case we got preempted by an interrupt.
*
* The per-cpu lock_spinners pointer is published before the lock's
* spinner count is incremented; xen_spin_unlock() checks the count and
* xen_spin_unlock_slow() then searches lock_spinners.
*/
static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
{
struct xen_spinlock *prev;
prev = __get_cpu_var(lock_spinners);
__get_cpu_var(lock_spinners) = xl;
wmb(); /* set lock of interest before count */
/* locked increment of the 16-bit spinner count */
asm(LOCK_PREFIX " incw %0"
: "+m" (xl->spinners) : : "memory");
return prev;
}
/*
* Mark a cpu as no longer interested in a lock. Restores previous
* lock of interest (NULL for none).
*
* Undoes spinning_lock() in the reverse order: the spinner count is
* dropped first, then the per-cpu pointer is restored to @prev.
*/
static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
{
/* locked decrement of the 16-bit spinner count */
asm(LOCK_PREFIX " decw %0"
: "+m" (xl->spinners) : : "memory");
wmb(); /* decrement count before restoring lock */
__get_cpu_var(lock_spinners) = prev;
}
/*
* Slow path of lock acquisition: instead of spinning, register as a
* waiter on @lock and block in the hypervisor until this cpu's kicker
* irq becomes pending.
*
* @irq_enable: block with interrupts enabled (the caller's saved flags
* had them enabled).
*
* Returns non-zero if the lock was acquired here (by the re-check
* trylock), or 0 if the caller has to go back to spinning -- either
* because the kicker irq is not set up yet or because we were kicked.
*/
static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
struct xen_spinlock *prev;
int irq = __get_cpu_var(lock_kicker_irq);
int ret;
u64 start;
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1)
return 0;
start = spin_time_start();
/* announce we're spinning */
prev = spinning_lock(xl);
ADD_STATS(taken_slow, 1);
ADD_STATS(taken_slow_nested, prev != NULL);
do {
unsigned long flags;
/* clear pending */
xen_clear_irq_pending(irq);
/* check again make sure it didn't become free while
we weren't looking */
ret = xen_spin_trylock(lock);
if (ret) {
ADD_STATS(taken_slow_pickup, 1);
/*
* If we interrupted another spinlock while it
* was blocking, make sure it doesn't block
* without rechecking the lock.
*/
if (prev != NULL)
xen_set_irq_pending(irq);
goto out;
}
flags = __raw_local_save_flags();
if (irq_enable) {
ADD_STATS(taken_slow_irqenable, 1);
raw_local_irq_enable();
}
/*
* Block until irq becomes pending. If we're
* interrupted at this point (after the trylock but
* before entering the block), then the nested lock
* handler guarantees that the irq will be left
* pending if there's any chance the lock became free;
* xen_poll_irq() returns immediately if the irq is
* pending.
*/
xen_poll_irq(irq);
raw_local_irq_restore(flags);
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
/* we were kicked: account the wakeup against the kicker irq */
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
out:
unspinning_lock(xl, prev);
spin_time_accum_blocked(start);
return ret;
}
/*
* Acquire @lock: spin for up to TIMEOUT iterations, then fall back to
* the blocking slow path, repeating until the lock is held.
*
* @irq_enable is passed on to xen_spin_lock_slow().
*/
static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
unsigned timeout;
u8 oldval;
u64 start_spin;
ADD_STATS(taken, 1);
start_spin = spin_time_start();
do {
u64 start_spin_fast = spin_time_start();
timeout = TIMEOUT;
/*
* 1: exchange 1 into the lock byte; a previous value of 0 means
* the lock is now ours, so leave through 3.
* 2: otherwise pause and watch the lock byte: retry the exchange
* at 1 as soon as it reads 0, else count down the timeout and
* fall out with oldval != 0 once it reaches zero.
*/
asm("1: xchgb %1,%0\n"
" testb %1,%1\n"
" jz 3f\n"
"2: rep;nop\n"
" cmpb $0,%0\n"
" je 1b\n"
" dec %2\n"
" jnz 2b\n"
"3:\n"
: "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
: "1" (1)
: "memory");
spin_time_accum_spinning(start_spin_fast);
/*
* Spin timed out: with TIMEOUT == ~0 never block, just spin again;
* otherwise try the slow path and loop if it did not get the lock.
*/
} while (unlikely(oldval != 0 &&
(TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
spin_time_accum_total(start_spin);
}
/*
* pv_lock_ops.spin_lock: acquire @lock; the slow path is told not to
* enable interrupts while blocking.
*/
static void xen_spin_lock(struct raw_spinlock *lock)
{
__xen_spin_lock(lock, false);
}
/*
* pv_lock_ops.spin_lock_flags: acquire @lock; the slow path may block
* with interrupts enabled if @flags say they were not disabled.
*/
static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
{
__xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
}
/*
 * Unlock slow path: find the first online cpu that is registered as
 * waiting on @xl and kick it with the spin-unlock IPI.  At most one
 * cpu is kicked per call.
 */
static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
{
	int waiter;

	ADD_STATS(released_slow, 1);

	for_each_online_cpu(waiter) {
		/* XXX should mix up next cpu selection */
		if (per_cpu(lock_spinners, waiter) != xl)
			continue;

		ADD_STATS(released_slow_kicked, 1);
		xen_send_IPI_one(waiter, XEN_SPIN_UNLOCK_VECTOR);
		break;
	}
}
/*
* pv_lock_ops.spin_unlock: release @lock and, if any cpu is registered
* as a spinner on it, kick one of them via xen_spin_unlock_slow().
*/
static void xen_spin_unlock(struct raw_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
ADD_STATS(released, 1);
smp_wmb(); /* make sure no writes get moved after unlock */
xl->lock = 0; /* release lock */
/*
* Make sure unlock happens before checking for waiting
* spinners. We need a strong barrier to enforce the
* write-read ordering to different memory locations, as the
* CPU makes no implied guarantees about their ordering.
*/
mb();
if (unlikely(xl->spinners))
xen_spin_unlock_slow(xl);
}
/*
* Handler registered for the spinlock kicker irq. The irq is disabled
* right after binding (see xen_init_lock_cpu()) and only polled, so
* actually being called here is treated as a bug.
*/
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
return IRQ_HANDLED;
}
/*
 * Bind the spinlock kicker IPI for @cpu and remember its irq in
 * lock_kicker_irq so the lock slow path can block on it.
 *
 * If binding fails, lock_kicker_irq is left untouched and the slow
 * path keeps spinning instead of blocking.
 */
void __cpuinit xen_init_lock_cpu(int cpu)
{
	int irq;
	const char *name;

	name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
	/* same fallback policy as xen_setup_timer(): never pass a NULL name */
	if (!name)
		name = "<spinlock kasprintf failed>";

	irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
				     cpu,
				     dummy_handler,
				     IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				     name,
				     NULL);

	if (irq >= 0) {
		disable_irq(irq); /* make sure it's never delivered */
		per_cpu(lock_kicker_irq, cpu) = irq;
	}

	printk("cpu %d spinlock event irq %d\n", cpu, irq);
}
/*
 * Release the spinlock kicker irq of @cpu, if one is bound.
 *
 * lock_kicker_irq is reset to -1 afterwards so that neither the slow
 * path nor a later call here can use an irq that is no longer bound.
 */
void xen_uninit_lock_cpu(int cpu)
{
	int irq = per_cpu(lock_kicker_irq, cpu);

	/* -1: binding never happened or failed, nothing to undo */
	if (irq == -1)
		return;

	unbind_from_irqhandler(irq, NULL);
	per_cpu(lock_kicker_irq, cpu) = -1;
}
/* Route every pv_lock_ops spinlock operation to its Xen implementation. */
void __init xen_init_spinlocks(void)
{
	/* acquire / release */
	pv_lock_ops.spin_lock = xen_spin_lock;
	pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
	pv_lock_ops.spin_trylock = xen_spin_trylock;
	pv_lock_ops.spin_unlock = xen_spin_unlock;

	/* state queries */
	pv_lock_ops.spin_is_locked = xen_spin_is_locked;
	pv_lock_ops.spin_is_contended = xen_spin_is_contended;
}
#ifdef CONFIG_XEN_DEBUG_FS
/* "spinlocks" directory below the Xen debugfs directory */
static struct dentry *d_spin_debug;

/*
 * Create the "spinlocks" debugfs directory and populate it with the
 * tunables (zero_stats, timeout) and the read-only lock statistics.
 *
 * Returns 0 on success or -ENOMEM if either the Xen debugfs directory
 * or the "spinlocks" directory is unavailable; in that case no files
 * are created, rather than creating them under a NULL parent.
 */
static int __init xen_spinlock_debugfs(void)
{
	struct dentry *d_xen = xen_init_debugfs();

	if (d_xen == NULL)
		return -ENOMEM;

	d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
	if (d_spin_debug == NULL)
		return -ENOMEM;

	/* writable controls */
	debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
	debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);

	/* acquisition counters */
	debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
	debugfs_create_u32("taken_slow", 0444, d_spin_debug,
			   &spinlock_stats.taken_slow);
	debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
			   &spinlock_stats.taken_slow_nested);
	debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
			   &spinlock_stats.taken_slow_pickup);
	debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
			   &spinlock_stats.taken_slow_spurious);
	debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
			   &spinlock_stats.taken_slow_irqenable);

	/* release counters */
	debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
	debugfs_create_u32("released_slow", 0444, d_spin_debug,
			   &spinlock_stats.released_slow);
	debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
			   &spinlock_stats.released_slow_kicked);

	/* accumulated times */
	debugfs_create_u64("time_spinning", 0444, d_spin_debug,
			   &spinlock_stats.time_spinning);
	debugfs_create_u64("time_blocked", 0444, d_spin_debug,
			   &spinlock_stats.time_blocked);
	debugfs_create_u64("time_total", 0444, d_spin_debug,
			   &spinlock_stats.time_total);

	/* log2 histograms, HISTO_BUCKETS regular slots plus one overflow slot */
	xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
				     spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
	xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
				     spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
	xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
				     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);

	return 0;
}
fs_initcall(xen_spinlock_debugfs);
#endif /* CONFIG_XEN_DEBUG_FS */

View File

@@ -0,0 +1,65 @@
#include <linux/types.h>
#include <linux/clockchips.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
#include <asm/fixmap.h>
#include "xen-ops.h"
#include "mmu.h"
/*
* Prepare for suspend: convert the xenstore and console frame numbers
* in xen_start_info from machine to pseudo-physical form, point
* HYPERVISOR_shared_info at the dummy shared info and clear the
* FIX_PARAVIRT_BOOTMAP mapping. Must be called with interrupts
* disabled; a failing hypercall is fatal.
*/
void xen_pre_suspend(void)
{
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
mfn_to_pfn(xen_start_info->console.domU.mfn);
BUG_ON(!irqs_disabled());
HYPERVISOR_shared_info = &xen_dummy_shared_info;
/* replace the fixmap entry with an empty pte */
if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
__pte_ma(0), 0))
BUG();
}
/*
* Undo xen_pre_suspend() after a resume or a cancelled suspend.
*
* @suspend_cancelled: non-zero if the suspend did not actually happen.
* In that case the frame numbers converted by xen_pre_suspend() are
* converted back; otherwise the set of initialised vcpus is reset to
* the online cpus and the vcpu state is restored.
*/
void xen_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
xen_setup_shared_info();
if (suspend_cancelled) {
xen_start_info->store_mfn =
pfn_to_mfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
pfn_to_mfn(xen_start_info->console.domU.mfn);
} else {
#ifdef CONFIG_SMP
BUG_ON(xen_cpu_initialized_map == NULL);
/* vcpus that are not online will go through cpu_initialize_context() again */
cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
#endif
xen_vcpu_restore();
}
}
/*
 * Per-cpu callback for xen_arch_resume(): forward the clockevents
 * notification reason packed into @data on every cpu except cpu 0.
 */
static void xen_vcpu_notify_restore(void *data)
{
	/* Boot processor notified via generic timekeeping_resume() */
	if (smp_processor_id() != 0)
		clockevents_notify((unsigned long)data, NULL);
}
/*
* Run xen_vcpu_notify_restore() on every cpu with
* CLOCK_EVT_NOTIFY_RESUME as the reason, waiting for completion.
*/
void xen_arch_resume(void)
{
on_each_cpu(xen_vcpu_notify_restore,
(void *)CLOCK_EVT_NOTIFY_RESUME, 1);
}

461
kernel/arch/x86/xen/time.c Normal file
View File

@@ -0,0 +1,461 @@
/*
* Xen time implementation.
*
* This is implemented in terms of a clocksource driver which uses
* the hypervisor clock as a nanosecond timebase, and a clockevent
* driver which uses the hypervisor's timer mechanism.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include "xen-ops.h"
/* shift used by xen_clocksource; its mult is 1 << XEN_SHIFT */
#define XEN_SHIFT 22
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
/* nanoseconds per tick, with HZ ticks per second */
#define NS_PER_TICK (1000000000LL / HZ)
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
u64 ret;
if (BITS_PER_LONG < 64) {
u32 *p32 = (u32 *)p;
u32 h, l;
/*
* Read high then low, and then make sure high is
* still the same; this will only loop if low wraps
* and carries into high.
* XXX some clean way to make this endian-proof?
*/
do {
h = p32[1];
barrier();
l = p32[0];
barrier();
} while (p32[1] != h);
ret = (((u64)h) << 32) | l;
} else
ret = *p;
return ret;
}
/*
* Runstate accounting
*/
/*
* Copy this cpu's runstate area into @res. The copy is retried until
* state_entry_time reads the same before and after it, so that @res
* is not a mix of two updates. Must be called with preemption
* disabled.
*/
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
u64 state_time;
struct vcpu_runstate_info *state;
BUG_ON(preemptible());
state = &__get_cpu_var(runstate);
/*
* The runstate info is always updated by the hypervisor on
* the current CPU, so there's no need to use anything
* stronger than a compiler barrier when fetching it.
*/
do {
state_time = get64(&state->state_entry_time);
barrier();
*res = *state;
barrier();
} while (get64(&state->state_entry_time) != state_time);
}
/*
* return true when a vcpu could run but has no real cpu to run on,
* i.e. its runstate area currently reports RUNSTATE_runnable
*/
bool xen_vcpu_stolen(int vcpu)
{
return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}
/*
* Register the per-cpu runstate structure of @cpu with the hypervisor
* as that vcpu's runstate memory area. Failure is fatal.
*/
void xen_setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
area.addr.v = &per_cpu(runstate, cpu);
if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
cpu, &area))
BUG();
}
/*
* Account the time this vcpu spent not running since the previous
* call: runnable and offline time is reported as stolen ticks, blocked
* time as idle ticks. Sub-tick remainders are carried over in the
* per-cpu residual_stolen / residual_blocked values.
*/
static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
s64 blocked, runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
snap = &__get_cpu_var(runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
/* the new snapshot becomes the baseline for the next call */
*snap = state;
/* Add the appropriate number of ticks of stolen time,
including any left-overs from last time. */
stolen = runnable + offline + __get_cpu_var(residual_stolen);
if (stolen < 0)
stolen = 0;
/* whole ticks are returned; the sub-tick remainder is left in stolen */
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
__get_cpu_var(residual_stolen) = stolen;
account_steal_ticks(ticks);
/* Add the appropriate number of ticks of blocked time,
including any left-overs from last time. */
blocked += __get_cpu_var(residual_blocked);
if (blocked < 0)
blocked = 0;
ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
__get_cpu_var(residual_blocked) = blocked;
account_idle_ticks(ticks);
}
/*
 * Get the TSC speed from Xen, derived from the time info that the
 * shared info page publishes for vcpu 0.
 */
unsigned long xen_tsc_khz(void)
{
	return pvclock_tsc_khz(&HYPERVISOR_shared_info->vcpu_info[0].time);
}
/*
* Read the current time from the pvclock time info of the vcpu we are
* running on. get_cpu_var()/put_cpu_var() bracket the read so the
* same vcpu's data is used throughout.
*/
cycle_t xen_clocksource_read(void)
{
struct pvclock_vcpu_time_info *src;
cycle_t ret;
src = &get_cpu_var(xen_vcpu)->time;
ret = pvclock_clocksource_read(src);
put_cpu_var(xen_vcpu);
return ret;
}
/* clocksource .read callback: adapter around xen_clocksource_read(); @cs is unused */
static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
{
return xen_clocksource_read();
}
/*
 * Fill @ts with the current wall-clock time, computed from the
 * wall-clock record in the shared info page and the time info of the
 * vcpu we are running on.
 */
static void xen_read_wallclock(struct timespec *ts)
{
	struct pvclock_wall_clock *wc = &HYPERVISOR_shared_info->wc;
	struct pvclock_vcpu_time_info *now;

	now = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wc, now, ts);
	put_cpu_var(xen_vcpu);
}
/* Return the seconds part of the current wall-clock time. */
unsigned long xen_get_wallclock(void)
{
struct timespec ts;
xen_read_wallclock(&ts);
return ts.tv_sec;
}
/* Setting the wall clock is not supported here: always returns -1. */
int xen_set_wallclock(unsigned long now)
{
/* do nothing for domU */
return -1;
}
/*
* Clocksource backed by xen_clocksource_read(); registered in
* xen_time_init().
*/
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
.read = xen_clocksource_get_cycles,
.mask = ~0,
/* mult == 1 << shift, so the mult/shift scaling is a factor of exactly 1 */
.mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
.shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
/*
Xen clockevent implementation
Xen has two clockevent implementations:
The old timer_op one works with all released versions of Xen prior
to version 3.0.4. This version of the hypervisor provides a
single-shot timer with nanosecond resolution. However, sharing the
same event channel is a 100Hz tick which is delivered while the
vcpu is running. We don't care about or use this tick, but it will
cause the core time code to think the timer fired too soon, and
will end up resetting it each time. It could be filtered, but
doing so has complications when the ktime clocksource is not yet
the xen clocksource (ie, at boot time).
The new vcpu_op-based timer interface allows the tick timer period
to be changed or turned off. The tick timer is not useful as a
periodic timer because events are only delivered to running vcpus.
The one-shot timer can report when a timeout is in the past, so
set_next_event is capable of returning -ETIME when appropriate.
This interface is used when available.
*/
/*
Get a hypervisor absolute time. In theory we could maintain an
offset between the kernel's time and the hypervisor's time, and
apply that to a kernel's absolute timeout. Unfortunately the
hypervisor and kernel times can drift even if the kernel is using
the Xen clocksource, because ntp can warp the kernel's clocksource.
So the absolute timeout is simply the current xen_clocksource_read()
value plus the relative @delta.
*/
static s64 get_abs_timeout(unsigned long delta)
{
return xen_clocksource_read() + delta;
}
/*
 * set_mode callback of the timer_op based clockevent.  Only one-shot
 * operation is supported: shutting down cancels any pending timeout,
 * one-shot/resume need no action and periodic mode triggers a warning.
 */
static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		/* nothing to set up */
		break;

	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;
	}
}
/*
* set_next_event callback of the timer_op based clockevent: program
* the timer to fire @delta after the current hypervisor time. Always
* returns 0; a failing hypercall is fatal.
*/
static int xen_timerop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
BUG();
/* We may have missed the deadline, but there's no real way of
knowing for sure. If the event was in the past, then we'll
get an immediate interrupt. */
return 0;
}
/*
* Template for the timer_op based clockevent; this is the default
* xen_clockevent and is copied per cpu by xen_setup_timer().
*/
static const struct clock_event_device xen_timerop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
/* shortest programmable delta equals the early-fire slop */
.min_delta_ns = TIMER_SLOP,
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_timerop_set_mode,
.set_next_event = xen_timerop_set_next_event,
};
/*
* set_mode callback of the vcpu_op based clockevent, acting on the
* current cpu. One-shot mode stops the periodic tick, shutdown stops
* both the single-shot and the periodic timer, periodic mode is
* unsupported. Failing hypercalls are fatal.
*/
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
WARN_ON(1); /* unsupported */
break;
case CLOCK_EVT_MODE_ONESHOT:
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
break;
case CLOCK_EVT_MODE_RESUME:
break;
}
}
/*
* set_next_event callback of the vcpu_op based clockevent: arm the
* current cpu's single-shot timer to fire @delta after the current
* hypervisor time.
*
* Returns 0 on success or -ETIME as reported by the hypercall; any
* other hypercall result is fatal.
*/
static int xen_vcpuop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
struct vcpu_set_singleshot_timer single;
int ret;
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
single.timeout_abs_ns = get_abs_timeout(delta);
single.flags = VCPU_SSHOTTMR_future;
ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
BUG_ON(ret != 0 && ret != -ETIME);
return ret;
}
/*
* Template for the vcpu_op based clockevent; selected in
* xen_time_init() when the periodic tick can be stopped.
*/
static const struct clock_event_device xen_vcpuop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
/* shortest programmable delta equals the early-fire slop */
.min_delta_ns = TIMER_SLOP,
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_vcpuop_set_mode,
.set_next_event = xen_vcpuop_set_next_event,
};
/* template in use; switched to the vcpu_op variant by xen_time_init() */
static const struct clock_event_device *xen_clockevent =
&xen_timerop_clockevent;
/* per-cpu copy of the template, filled in by xen_setup_timer() */
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
/*
 * Timer interrupt handler: run this cpu's clockevent handler, if one is
 * installed, and then update the stolen/blocked time accounting.
 *
 * Returns IRQ_HANDLED when an event handler was called, IRQ_NONE
 * otherwise.
 */
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret = IRQ_NONE;

	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	/* accounting runs whether or not a handler was installed */
	do_stolen_accounting();

	return ret;
}
/*
* Bind the timer virq for @cpu to xen_timer_interrupt() and initialise
* that cpu's clock_event_device from the currently selected template.
*/
void xen_setup_timer(int cpu)
{
const char *name;
struct clock_event_device *evt;
int irq;
printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
name = kasprintf(GFP_KERNEL, "timer%d", cpu);
if (!name)
name = "<timer kasprintf failed>";
/* NOTE(review): the returned irq is stored without a check for a bind failure */
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
name, NULL);
evt = &per_cpu(xen_clock_events, cpu);
memcpy(evt, xen_clockevent, sizeof(*evt));
evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
}
/*
 * Release the timer irq that xen_setup_timer() bound for @cpu.
 * Must never be called for cpu 0.
 */
void xen_teardown_timer(int cpu)
{
	BUG_ON(cpu == 0);

	unbind_from_irqhandler(per_cpu(xen_clock_events, cpu).irq, NULL);
}
/*
* Register the current cpu's clock_event_device with the clockevents
* core. Must be called with preemption disabled.
*/
void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
clockevents_register_device(&__get_cpu_var(xen_clock_events));
}
/*
 * Restore timer state after resume: resume pvclock and, when the
 * vcpu_op timer interface is in use, stop the periodic tick again on
 * every online cpu.  A failing hypercall is fatal.
 */
void xen_timer_resume(void)
{
	int cpu;

	pvclock_resume();

	/* the timer_op interface has no periodic tick to turn off */
	if (xen_clockevent == &xen_vcpuop_clockevent) {
		for_each_online_cpu(cpu) {
			if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
					       cpu, NULL))
				BUG();
		}
	}
}
/*
* Boot-time timer setup for the current cpu: register the Xen
* clocksource, pick the clockevent implementation, seed the system
* time from the Xen wall clock and set up runstate accounting, the
* timer interrupt and the clockevent device.
*/
__init void xen_time_init(void)
{
int cpu = smp_processor_id();
clocksource_register(&xen_clocksource);
/* probe: only hypervisors with the vcpu_op interface accept this call */
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
vcpuop-based timer interface */
printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
xen_clockevent = &xen_vcpuop_clockevent;
}
/* Set initial system time with full resolution */
xen_read_wallclock(&xtime);
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
setup_force_cpu_cap(X86_FEATURE_TSC);
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}

View File

@@ -0,0 +1,4 @@
/*
* Bit number used for the pseudo-hwcap for non-negative segments.
* Bit 1 is used to avoid bugs in some versions of glibc when bit 0 is
* used; the choice is otherwise arbitrary.
*/
#define VDSO_NOTE_NONEGSEG_BIT 1

View File

@@ -0,0 +1,142 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in percpu data) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/asm-offsets.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include "xen-asm.h"
/*
* Enable events. This clears the event mask and then tests the
* pending event status with a single AND-style test (testb). If
* there are pending events, then enter the hypervisor to get them
* handled.
*/
ENTRY(xen_irq_enable_direct)
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
* Preempt here doesn't matter because that will deal with any
* pending interrupts. The pending check may end up being run
* on the wrong CPU, but that doesn't hurt.
*/
/* Test for pending */
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
/* exported as xen_irq_enable_direct_reloc: one byte past label 2, i.e. into the call instruction */
RELOC(xen_irq_enable_direct, 2b+1)
/*
* Disabling events is simply a matter of making the event mask
* non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
/* no call in this sequence: xen_irq_disable_direct_reloc is 0 */
RELOC(xen_irq_disable_direct, 0)
/*
* (xen_)save_fl is used to get the current interrupt enable status.
* Callers expect the status to be in X86_EFLAGS_IF, and other bits
* may be set in the return value. We take advantage of this by
* making sure that X86_EFLAGS_IF has the right value (and other bits
* in that byte are 0), but other bits in the return value are
* undefined. We need to toggle the state of the bit, because Xen and
* x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
/* ZF set when the mask byte is 0, i.e. events are enabled */
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/* %ah = 1 if enabled, 0 if masked */
setz %ah
/*
* double it to 2 or 0 -- presumably 2 in %ah lines up with
* X86_EFLAGS_IF >> 8 as tested in xen_restore_fl_direct; confirm
*/
addb %ah, %ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
* In principle the caller should be passing us a value return from
* xen_save_fl_direct, but for robustness sake we test only the
* X86_EFLAGS_IF flag rather than the whole byte. After setting the
* interrupt mask state, it checks for unmasked pending events and
* enters the hypervisor to get them delivered if so.
*/
ENTRY(xen_restore_fl_direct)
#ifdef CONFIG_X86_64
testw $X86_EFLAGS_IF, %di
#else
testb $X86_EFLAGS_IF>>8, %ah
#endif
/* mask = 1 when IF was clear in the flags value, 0 when it was set */
setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
* Preempt here doesn't matter because that will deal with any
* pending interrupts. The pending check may end up being run
* on the wrong CPU, but that doesn't hurt.
*/
/*
* check for unmasked and pending
* NOTE(review): the 16-bit compare against 0x0001 presumably relies
* on the mask byte directly following the pending byte -- confirm
* against the vcpu_info layout.
*/
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
*
* The registers listed below are saved around the call to
* xen_force_evtchn_callback and restored in reverse order, so callers
* see them unchanged.
*/
check_events:
#ifdef CONFIG_X86_32
push %eax
push %ecx
push %edx
call xen_force_evtchn_callback
pop %edx
pop %ecx
pop %eax
#else
push %rax
push %rcx
push %rdx
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
call xen_force_evtchn_callback
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rdx
pop %rcx
pop %rax
#endif
ret

View File

@@ -0,0 +1,12 @@
#ifndef _XEN_XEN_ASM_H
#define _XEN_XEN_ASM_H
#include <linux/linkage.h>
/* define the global symbol <x>_reloc with value v */
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
/* define the global symbol <x>_end at the current assembly location */
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
#endif

View File

@@ -0,0 +1,228 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in pda) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#include "xen-asm.h"
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
*
* %eax, %ecx and %edx are saved around the call to
* xen_force_evtchn_callback and restored in reverse order.
*/
check_events:
push %eax
push %ecx
push %edx
call xen_force_evtchn_callback
pop %edx
pop %ecx
pop %eax
ret
/*
* We can't use sysexit directly, because we're not running in ring0.
* But we can easily fake it up using iret. Assuming xen_sysexit is
* jumped to with a standard stack frame, we can just strip it back to
* a standard iret frame and use iret.
*/
ENTRY(xen_sysexit)
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
/* force IF on in the saved eflags */
orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
/* point %esp at the saved eip so that eip/cs/eflags form the iret frame */
lea PT_EIP(%esp), %esp
jmp xen_iret
ENDPROC(xen_sysexit)
/*
* This is run where a normal iret would be run, with the same stack setup:
* 8: eflags
* 4: cs
* esp-> 0: eip
*
* This attempts to make sure that any pending events are dealt with
* on return to usermode, but there is a small window in which an
* event can happen just before entering usermode. If the nested
* interrupt ends up setting one of the TIF_WORK_MASK pending work
* flags, they will not be tested again before returning to
* usermode. This means that a process can end up with pending work,
* which will be unprocessed until the process enters and leaves the
* kernel again, which could be an unbounded amount of time. This
* means that a pending signal or reschedule event could be
* indefinitely delayed.
*
* The fix is to notice a nested interrupt in the critical window, and
* if one occurs, then fold the nested interrupt into the current
* interrupt stack frame, and re-process it iteratively rather than
* recursively. This means that it will exit via the normal path, and
* all pending work will be dealt with appropriately.
*
* Because the nested interrupt handler needs to deal with the current
* stack state in whatever form it's in, we keep things simple by only
* using a single register which is pushed/popped on the stack.
*/
ENTRY(xen_iret)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
jnz hyper_iret
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
/*
* Store vcpu_info pointer for easy access. Do it this way to
* avoid having to reload %fs
*/
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax), %eax
movl __per_cpu_offset(,%eax,4), %eax
mov per_cpu__xen_vcpu(%eax), %eax
#else
movl per_cpu__xen_vcpu, %eax
#endif
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
/*
* Maybe enable events. Once this happens we could get a
* recursive event, so the critical region starts immediately
* afterwards. However, if that happens we don't end up
* resuming the code, so we don't have to be worried about
* being preempted to another CPU.
*/
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
/*
* If there's something pending, mask events again so we can
* jump back into xen_hypervisor_callback
*/
sete XEN_vcpu_info_mask(%eax)
popl %eax
/*
* From this point on the registers are restored and the stack
* updated, so we don't need to worry about it if we're
* preempted
*/
iret_restore_end:
/*
* Jump to hypervisor_callback after fixing up the stack.
* Events are masked, so jumping out of the critical region is
* OK.
*/
je xen_hypervisor_callback
1: iret
xen_iret_end_crit:
/*
* Exception table entry pairing the iret at label 1 with iret_exc
* (presumably the fixup that runs if that iret faults -- confirm).
*/
.section __ex_table, "a"
.align 4
.long 1b, iret_exc
.previous
hyper_iret:
/*
* put this out of line since it's very rarely used; the target is
* the __HYPERVISOR_iret entry of the hypercall page, with 32 bytes
* per entry
*/
jmp hypercall_page + __HYPERVISOR_iret * 32
.globl xen_iret_start_crit, xen_iret_end_crit
/*
* This is called by xen_hypervisor_callback in entry.S when it sees
* that the EIP at the time of interrupt was between
* xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
* %eax so we can do a more refined determination of what to do.
*
* The stack format at this point is:
* ----------------
* ss : (ss/esp may be present if we came from usermode)
* esp :
* eflags } outer exception info
* cs }
* eip }
* ---------------- <- edi (copy dest)
* eax : outer eax if it hasn't been restored
* ----------------
* eflags } nested exception info
* cs } (no ss/esp because we're nested
* eip } from the same ring)
* orig_eax }<- esi (copy src)
* - - - - - - - -
* fs }
* es }
* ds } SAVE_ALL state
* eax }
* : :
* ebx }<- esp
* ----------------
*
* In order to deliver the nested exception properly, we need to shift
* everything from the return addr up to the error code so it sits
* just under the outer exception info. This means that when we
* handle the exception, we do it in the context of the outer
* exception rather than starting a new one.
*
* The only caveat is that if the outer eax hasn't been restored yet
* (ie, it's still on stack), we need to insert its value into the
* SAVE_ALL state before going on, since it's usermode state which we
* eventually need to restore.
*
* Registers written here: %eax, %ecx, %esi, %edi and %esp. The
* direction flag is set for the copy and cleared again before leaving.
* Exit is always a jump to xen_do_upcall.
*/
ENTRY(xen_iret_crit_fixup)
/*
* Paranoia: Make sure we're really coming from kernel space.
* One could imagine a case where userspace jumps into the
* critical range address, but just before the CPU delivers a
* GP, it decides to deliver an interrupt instead. Unlikely?
* Definitely. Easy to avoid? Yes. The Intel documents
* explicitly say that the reported EIP for a bad jump is the
* jump instruction itself, not the destination, but some
* virtual environments get this wrong.
*/
/* Isolate the RPL bits of the saved CS; a user RPL skips the fixup. */
movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
/*
* Both pointers start at the highest dword of their range because
* the copy below runs downwards (direction flag set).
*/
lea PT_ORIG_EAX(%esp), %esi
lea PT_EFLAGS(%esp), %edi
/*
* If eip is before iret_restore_end then stack
* hasn't been restored yet.
*/
cmp $iret_restore_end, %eax
jae 1f
movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
/* Put the outer eax into the EAX slot of the saved-register frame. */
movl %eax, PT_EAX(%esp)
lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
/* set up the copy */
1: std
mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
rep movsl
/* Clear the direction flag again now that the downward copy is done. */
cld
/*
* With DF set, %edi ends one dword below the last slot written, so
* +4 is the lowest dword of the relocated frame.
*/
lea 4(%edi), %esp /* point esp to new frame */
2: jmp xen_do_upcall

View File

@@ -0,0 +1,159 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in pda) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/errno.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#include "xen-asm.h"
/*
* Reload %rcx and %r11 from the two quadwords that sit directly above
* the return address, then return while discarding those 16 bytes
* (ret $16).
* NOTE(review): presumably these are the extra rcx/r11 slots that Xen
* places on the stack for a callback -- confirm against the caller.
*/
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp), %rcx
mov 8+8(%rsp), %r11
ret $16
/* Address of slot __HYPERVISOR_iret in hypercall_page (slots 32 bytes apart). */
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
* Xen64 iret frame:
*
* ss
* rsp
* rflags
* cs
* rip <-- standard iret frame
*
* flags
*
* rcx }
* r11 }<-- pushed by hypercall page
* rsp->rax }
*/
/*
* Entered with a standard iret frame on the stack: push a zero
* "flags" word beneath it and jump to the iret hypercall.
* NOTE(review): ENDPATCH/RELOC are defined in xen-asm.h (not visible
* here); RELOC's "1b+1" presumably marks the displacement field of the
* jmp so it can be fixed up when this code is copied inline -- confirm.
*/
ENTRY(xen_iret)
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
* sysexit is not used for 64-bit processes, so it's only ever used to
* return to 32-bit compat userspace.
*
* Builds a Xen64 iret frame by hand, highest address first:
* ss = __USER32_DS, rsp = %rcx, rflags = X86_EFLAGS_IF only,
* cs = __USER32_CS, rip = %rdx, then a zero "flags" word, and jumps to
* the iret hypercall.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
pushq %rcx
pushq $X86_EFLAGS_IF
pushq $__USER32_CS
pushq %rdx
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
/*
* Return to 64-bit userspace through the iret hypercall: switch to the
* per-cpu kernel stack, then build a Xen64 iret frame from
* ss = __USER_DS, rsp = the saved user %rsp, rflags = %r11,
* cs = __USER_CS, rip = %rcx, with VGCF_in_syscall as the flags word.
*/
ENTRY(xen_sysret64)
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
/* Park the user %rsp in old_rsp; it is pushed back as the frame's rsp. */
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER_CS
pushq %rcx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
/*
* Same sequence as xen_sysret64 in this file, but using the 32-bit
* user selectors (__USER32_DS / __USER32_CS) and a zero flags word.
*/
ENTRY(xen_sysret32)
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
/* Park the user %rsp in old_rsp; it is pushed back as the frame's rsp. */
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER32_DS
pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER32_CS
pushq %rcx
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs
* - kernel rsp
* - an iret-like stack frame on the stack (including rcx and r11):
* ss
* rsp
* rflags
* cs
* rip
* r11
* rsp->rcx
*
* In all the entrypoints, we undo all that to make it look like a
* CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
/*
* Slot indices below follow the frame drawn above: 0 = rcx, 1 = r11,
* 5 = the saved rsp. %rsp is loaded last because the two earlier loads
* are addressed relative to the callback frame.
*/
.macro undo_xen_syscall
mov 0*8(%rsp), %rcx
mov 1*8(%rsp), %r11
mov 5*8(%rsp), %rsp
.endm
/* Normal 64-bit system call target */
/*
* Restores %rcx, %r11 and %rsp from the Xen callback frame, then jumps
* to system_call_after_swapgs.
*/
ENTRY(xen_syscall_target)
undo_xen_syscall
jmp system_call_after_swapgs
ENDPROC(xen_syscall_target)
/*
* 32-bit entry points. With CONFIG_IA32_EMULATION they undo the Xen
* callback frame and jump to the ia32 entry code; without it both
* labels share one stub that returns -ENOSYS via the iret hypercall.
*/
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
undo_xen_syscall
jmp ia32_cstar_target
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
undo_xen_syscall
jmp ia32_sysenter_target
ENDPROC(xen_sysenter_target)
#else /* !CONFIG_IA32_EMULATION */
/* Both labels mark the same address; the stub below serves either entry. */
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
/* -ENOSYS in %rax is the value handed back to the caller. */
mov $-ENOSYS, %rax
/* Zero flags word beneath the remaining iret frame, as in xen_iret. */
pushq $0
jmp hypercall_iret
ENDPROC(xen_syscall32_target)
ENDPROC(xen_sysenter_target)
#endif /* CONFIG_IA32_EMULATION */

View File

@@ -0,0 +1,55 @@
/* Xen-specific pieces of head.S, intended to be included in the right
place in head.S */
#ifdef CONFIG_XEN
#include <linux/elfnote.h>
#include <linux/init.h>
#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page_types.h>
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>
/*
* Xen entry point (advertised through the XEN_ELFNOTE_ENTRY note in
* this file). Clears the direction flag, stores %esi/%rsi into
* xen_start_info, points the stack pointer at the top of
* init_thread_union and jumps to xen_start_kernel.
* NOTE(review): presumably Xen passes the start_info pointer in
* %esi/%rsi -- confirm against the Xen boot ABI.
*/
__INIT
ENTRY(startup_xen)
cld
#ifdef CONFIG_X86_32
mov %esi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%esp
#else
mov %rsi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%rsp
#endif
jmp xen_start_kernel
__FINIT
/*
* Reserve one page-aligned, page-sized hole in .text named
* hypercall_page (advertised via XEN_ELFNOTE_HYPERCALL_PAGE below).
* NOTE(review): presumably the hypervisor fills this page with the
* hypercall stubs at load time -- confirm.
*/
.pushsection .text
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip PAGE_SIZE_asm
.popsection
/*
* ELF notes in the "Xen" namespace describing this kernel image.
* NOTE(review): presumably consumed by the Xen domain builder when it
* loads the kernel -- see xen/interface/elfnote.h for each note's
* meaning.
*/
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
/* Virtual base differs by word size: __PAGE_OFFSET vs __START_KERNEL_map. */
#ifdef CONFIG_X86_32
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
#endif
/* Entry point and hypercall page are the symbols defined earlier in this file. */
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
#endif /*CONFIG_XEN */

View File

@@ -0,0 +1,104 @@
#ifndef XEN_OPS_H
#define XEN_OPS_H
#include <linux/init.h>
#include <linux/clocksource.h>
#include <linux/irqreturn.h>
#include <xen/xen-ops.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void *xen_initial_gdt;
/* Forward declaration only; the prototype below just needs a pointer to it. */
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
/* Per-cpu variables, declared here and defined elsewhere. */
DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
/* Globals; xen_start_info is written by startup_xen in xen-head.S. */
extern struct start_info *xen_start_info;
extern struct shared_info xen_dummy_shared_info;
extern struct shared_info *HYPERVISOR_shared_info;
/* Setup entry points; those marked __init are boot-time only. */
void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void __init xen_build_dynamic_phys_to_machine(void);
void xen_init_irq_ops(void);
/* Timer, clock and wallclock interfaces. */
void xen_setup_timer(int cpu);
void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
unsigned long xen_tsc_khz(void);
void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
/* Miscellaneous. */
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
bool xen_vcpu_stolen(int vcpu);
void xen_setup_vcpu_info_placement(void);
#ifndef CONFIG_SMP
/* Uniprocessor build: nothing to set up, so the call compiles to nothing. */
static inline void xen_smp_init(void)
{
}
#else
/* SMP build: the implementation and the cpumask are defined elsewhere. */
void xen_smp_init(void);
extern cpumask_var_t xen_cpu_initialized_map;
#endif
#ifndef CONFIG_PARAVIRT_SPINLOCKS
/* Paravirt spinlocks disabled: every hook is an empty inline stub. */
static inline void xen_init_spinlocks(void) {}
static inline void xen_init_lock_cpu(int cpu) {}
static inline void xen_uninit_lock_cpu(int cpu) {}
#else
/* Paravirt spinlocks enabled: the implementations are defined elsewhere. */
void __init xen_init_spinlocks(void);
__cpuinit void xen_init_lock_cpu(int cpu);
void xen_uninit_lock_cpu(int cpu);
#endif
/*
 * Declare an asm function, along with the symbols needed to make it
 * inlineable:
 *   name          - the function itself, with the given return type and
 *                   argument list
 *   name##_end    - marker symbol (char array) paired with the function
 *   name##_reloc  - marker symbol (char array) paired with the function
 *
 * The final line of the macro deliberately has no trailing backslash and
 * no semicolon: the user supplies the ';', and the definition must end
 * there so that the first use below is not swallowed into the macro body.
 */
#define DECL_ASM(ret, name, ...)		\
	ret name(__VA_ARGS__);			\
	extern char name##_end[];		\
	extern char name##_reloc[]

DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/*
 * These are not functions, and cannot be called normally.
 * NOTE(review): presumably they are given function type only so that C
 * code can take their addresses; the symbols themselves are the asm
 * entry points defined in the xen-asm_*.S files.
 */
void xen_iret(void);
void xen_sysexit(void);
void xen_sysret32(void);
void xen_sysret64(void);
void xen_adjust_exception_frame(void);
#endif /* XEN_OPS_H */