add idl4k kernel firmware version 1.13.0.105

Jaroslav Kysela
2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions

6
kernel/kernel/.gitignore vendored Normal file

@@ -0,0 +1,6 @@
#
# Generated files
#
config_data.h
config_data.gz
timeconst.h

2
kernel/kernel/Kconfig.freezer Normal file

@@ -0,0 +1,2 @@
config FREEZER
def_bool PM_SLEEP || CGROUP_FREEZER

58
kernel/kernel/Kconfig.hz Normal file

@@ -0,0 +1,58 @@
#
# Timer Interrupt Frequency Configuration
#
choice
prompt "Timer frequency"
default HZ_250
help
Allows the configuration of the timer frequency. It is customary
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
beneficial for servers and NUMA systems that do not need to have
a fast response for user interaction and that may experience bus
contention and cacheline bounces as a result of timer interrupts.
Note that the timer interrupt occurs on each processor in an SMP
environment, leading to NR_CPUS * HZ timer interrupts
per second.
config HZ_100
bool "100 HZ"
help
100 Hz is a typical choice for servers, SMP and NUMA systems
with lots of processors that may show reduced performance if
too many timer interrupts are occurring.
config HZ_250
bool "250 HZ"
help
250 Hz is a good compromise choice allowing server performance
while also showing good interactive responsiveness even
on SMP and NUMA systems. If you are going to be using NTSC video
or multimedia, select 300Hz instead.
config HZ_300
bool "300 HZ"
help
300 Hz is a good compromise choice allowing server performance
while also showing good interactive responsiveness even
on SMP and NUMA systems and exactly dividing by both PAL and
NTSC frame rates for video and multimedia work.
config HZ_1000
bool "1000 HZ"
help
1000 Hz is the preferred choice for desktop systems and other
systems requiring fast interactive responses to events.
endchoice
config HZ
int
default 100 if HZ_100
default 250 if HZ_250
default 300 if HZ_300
default 1000 if HZ_1000
config SCHED_HRTICK
def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)

54
kernel/kernel/Kconfig.preempt Normal file

@@ -0,0 +1,54 @@
choice
prompt "Preemption Model"
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
time, but there are no guarantees and occasional longer delays
are possible.
Select this option if you are building a kernel for a server or
scientific/computation system, or if you want to maximize the
raw processing power of the kernel, irrespective of scheduling
latencies.
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
preemption points have been selected to reduce the maximum
latency of rescheduling, providing faster application reactions,
at the cost of slightly lower throughput.
This allows reaction to interactive events by allowing a
low priority process to voluntarily preempt itself even if it
is in kernel mode executing a system call. This allows
applications to run more 'smoothly' even when the system is
under load.
Select this if you are building a kernel for a desktop system.
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
preemptible. This allows reaction to interactive events by
permitting a low priority process to be preempted involuntarily
even if it is in kernel mode executing a system call and would
otherwise not be about to reach a natural preemption point.
This allows applications to run more 'smoothly' even when the
system is under load, at the cost of slightly lower throughput
and a slight runtime overhead to kernel code.
Select this if you are building a kernel for a desktop or
embedded system with latency requirements in the milliseconds
range.
endchoice

129
kernel/kernel/Makefile Normal file

@@ -0,0 +1,129 @@
#
# Makefile for the linux kernel.
#
obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
async.o
obj-y += groups.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
CFLAGS_REMOVE_lockdep.o = -pg
CFLAGS_REMOVE_lockdep_proc.o = -pg
CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
endif
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_FREEZER) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += rcutree.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
# me. I suspect most platforms don't need this, but until we know that for sure
# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
# to get a correct value for the wait-channel (WCHAN in ps). --davidm
CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
endif
$(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
# Info from config_data can be extracted from /proc/config*
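# (e.g. "zcat /proc/config.gz" on a kernel built with CONFIG_IKCONFIG_PROC,
# or scripts/extract-ikconfig run against a built kernel image)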
targets += config_data.gz
$(obj)/config_data.gz: .config FORCE
$(call if_changed,gzip)
quiet_cmd_ikconfiggz = IKCFG $@
cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call if_changed,ikconfiggz)
$(obj)/time.o: $(obj)/timeconst.h
quiet_cmd_timeconst = TIMEC $@
cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
targets += timeconst.h
$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
$(call if_changed,timeconst)

684
kernel/kernel/acct.c Normal file

@@ -0,0 +1,684 @@
/*
* linux/kernel/acct.c
*
* BSD Process Accounting for Linux
*
* Author: Marco van Wieringen <mvw@planets.elm.net>
*
* Some code based on ideas and code from:
* Thomas K. Dyas <tdyas@eden.rutgers.edu>
*
* This file implements BSD-style process accounting. Whenever any
* process exits, an accounting record of type "struct acct" is
* written to the file specified with the acct() system call. It is
* up to user-level programs to do useful things with the accounting
* log. The kernel just provides the raw accounting information.
*
* (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
*
* Plugged two leaks. 1) It didn't return acct_file into the free_filps if
* the file happened to be read-only. 2) If the accounting was suspended
* due to the lack of space it happily allowed to reopen it and completely
* lost the old acct_file. 3/10/98, Al Viro.
*
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
* Fixed a nasty interaction with sys_umount(). If the accounting
* was suspended we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
* CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
* unless we are messing with the root. In that case we are getting a
* real mess with do_remount_sb(). 9/11/98, AV.
*
* Fixed a bunch of races (and pair of leaks). Probably not the best way,
* but this one obviously doesn't introduce deadlocks. Later. BTW, found
* one race (and leak) in BSD implementation.
* OK, that's better. ANOTHER race and leak in BSD variant. There always
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
* ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/tty.h>
#include <linux/security.h>
#include <linux/vfs.h>
#include <linux/jiffies.h>
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
/*
* These constants control the free space thresholds that suspend and
* resume the process accounting system, and the time delay between
* each check.
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
/*
* External references and all of the globals.
*/
static void do_acct_process(struct bsd_acct_struct *acct,
struct pid_namespace *ns, struct file *);
/*
* This structure is used so that all the data protected by lock
* can be placed in the same cache line as the lock. This primes
* the cache line to have the data after getting the lock.
*/
struct bsd_acct_struct {
volatile int active;
volatile int needcheck;
struct file *file;
struct pid_namespace *ns;
struct timer_list timer;
struct list_head list;
};
static DEFINE_SPINLOCK(acct_lock);
static LIST_HEAD(acct_list);
/*
* Called whenever the timer says to check the free space.
*/
static void acct_timeout(unsigned long x)
{
struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
acct->needcheck = 1;
}
/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
{
struct kstatfs sbuf;
int res;
int act;
sector_t resume;
sector_t suspend;
spin_lock(&acct_lock);
res = acct->active;
if (!file || !acct->needcheck)
goto out;
spin_unlock(&acct_lock);
/* May block */
if (vfs_statfs(file->f_path.dentry, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
sector_div(suspend, 100);
sector_div(resume, 100);
if (sbuf.f_bavail <= suspend)
act = -1;
else if (sbuf.f_bavail >= resume)
act = 1;
else
act = 0;
/*
* If some joker switched acct->file under us we'd better be
* silent and _not_ touch anything.
*/
spin_lock(&acct_lock);
if (file != acct->file) {
if (act)
res = act>0;
goto out;
}
if (acct->active) {
if (act < 0) {
acct->active = 0;
printk(KERN_INFO "Process accounting paused\n");
}
} else {
if (act > 0) {
acct->active = 1;
printk(KERN_INFO "Process accounting resumed\n");
}
}
del_timer(&acct->timer);
acct->needcheck = 0;
acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
add_timer(&acct->timer);
res = acct->active;
out:
spin_unlock(&acct_lock);
return res;
}
/*
* Close the old accounting file (if currently open) and then replace
* it with file (if non-NULL).
*
* NOTE: acct_lock MUST be held on entry and exit.
*/
static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
struct pid_namespace *ns)
{
struct file *old_acct = NULL;
struct pid_namespace *old_ns = NULL;
if (acct->file) {
old_acct = acct->file;
old_ns = acct->ns;
del_timer(&acct->timer);
acct->active = 0;
acct->needcheck = 0;
acct->file = NULL;
acct->ns = NULL;
list_del(&acct->list);
}
if (file) {
acct->file = file;
acct->ns = ns;
acct->needcheck = 0;
acct->active = 1;
list_add(&acct->list, &acct_list);
/* It's been deleted if it was used before so this is safe */
setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
add_timer(&acct->timer);
}
if (old_acct) {
mnt_unpin(old_acct->f_path.mnt);
spin_unlock(&acct_lock);
do_acct_process(acct, old_ns, old_acct);
filp_close(old_acct, NULL);
spin_lock(&acct_lock);
}
}
static int acct_on(char *name)
{
struct file *file;
struct vfsmount *mnt;
int error;
struct pid_namespace *ns;
struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);
if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
if (!file->f_op->write) {
filp_close(file, NULL);
return -EIO;
}
ns = task_active_pid_ns(current);
if (ns->bacct == NULL) {
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
if (acct == NULL) {
filp_close(file, NULL);
return -ENOMEM;
}
}
error = security_acct(file);
if (error) {
kfree(acct);
filp_close(file, NULL);
return error;
}
spin_lock(&acct_lock);
if (ns->bacct == NULL) {
ns->bacct = acct;
acct = NULL;
}
mnt = file->f_path.mnt;
mnt_pin(mnt);
acct_file_reopen(ns->bacct, file, ns);
spin_unlock(&acct_lock);
mntput(mnt); /* it's pinned, now give up active reference */
kfree(acct);
return 0;
}
/**
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
*
* Returns 0 for success or negative errno values for failure.
*
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
int error;
if (!capable(CAP_SYS_PACCT))
return -EPERM;
if (name) {
char *tmp = getname(name);
if (IS_ERR(tmp))
return (PTR_ERR(tmp));
error = acct_on(tmp);
putname(tmp);
} else {
struct bsd_acct_struct *acct;
acct = task_active_pid_ns(current)->bacct;
if (acct == NULL)
return 0;
error = security_acct(NULL);
if (!error) {
spin_lock(&acct_lock);
acct_file_reopen(acct, NULL, NULL);
spin_unlock(&acct_lock);
}
}
return error;
}
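/*
 * A minimal userspace sketch of the acct(2) interface implemented above
 * (illustrative only; the log path is an arbitrary example). Note that the
 * file must already exist, since acct_on() opens it without O_CREAT, and
 * the caller needs CAP_SYS_PACCT.
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		if (acct("/var/log/pacct") != 0) {	// enable accounting
 *			perror("acct");
 *			return 1;
 *		}
 *		// ... run workloads; every exiting process appends one
 *		// fixed-size struct acct record to the log ...
 *		acct(NULL);				// NULL name disables it
 *		return 0;
 *	}
 */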
/**
* acct_auto_close_mnt - turn off a filesystem's accounting if it is on
* @m: vfsmount being shut down
*
* If the accounting is turned on for a file in the subtree pointed
* to by m, turn accounting off. Done when m is about to die.
*/
void acct_auto_close_mnt(struct vfsmount *m)
{
struct bsd_acct_struct *acct;
spin_lock(&acct_lock);
restart:
list_for_each_entry(acct, &acct_list, list)
if (acct->file && acct->file->f_path.mnt == m) {
acct_file_reopen(acct, NULL, NULL);
goto restart;
}
spin_unlock(&acct_lock);
}
/**
* acct_auto_close - turn off a filesystem's accounting if it is on
* @sb: super block for the filesystem
*
* If the accounting is turned on for a file in the filesystem pointed
* to by sb, turn accounting off.
*/
void acct_auto_close(struct super_block *sb)
{
struct bsd_acct_struct *acct;
spin_lock(&acct_lock);
restart:
list_for_each_entry(acct, &acct_list, list)
if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
acct_file_reopen(acct, NULL, NULL);
goto restart;
}
spin_unlock(&acct_lock);
}
void acct_exit_ns(struct pid_namespace *ns)
{
struct bsd_acct_struct *acct;
spin_lock(&acct_lock);
acct = ns->bacct;
if (acct != NULL) {
if (acct->file != NULL)
acct_file_reopen(acct, NULL, NULL);
kfree(acct);
}
spin_unlock(&acct_lock);
}
/*
* encode an unsigned long into a comp_t
*
* This routine has been adapted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
* is a 13-bit fraction with a 3-bit (base 8) exponent.
*/
#define MANTSIZE 13 /* 13 bit mantissa. */
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
static comp_t encode_comp_t(unsigned long value)
{
int exp, rnd;
exp = rnd = 0;
while (value > MAXFRACT) {
rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
exp++;
}
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT)) {
value >>= EXPSIZE;
exp++;
}
/*
* Clean it up and polish it off.
*/
exp <<= MANTSIZE; /* Shift the exponent into place */
exp += value; /* and add on the mantissa. */
return exp;
}
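/*
 * Worked example: with MANTSIZE = 13, EXPSIZE = 3 and MAXFRACT = 8191,
 * encoding the value 10000 takes one pass through the loop (bit 2 of the
 * value is clear, so no rounding):
 *
 *	value = 10000 >> 3 = 1250, exp = 1
 *	comp_t = (1 << 13) + 1250 = 9442 = 0x24e2
 *
 * Decoding yields 1250 * 8^1 = 10000, so this value happens to survive
 * exactly; in general each extra shift costs up to 3 bits of precision.
 */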
#if ACCT_VERSION==1 || ACCT_VERSION==2
/*
* encode an u64 into a comp2_t (24 bits)
*
* Format: 5 bit base 2 exponent, 20 bits mantissa.
* The leading bit of the mantissa is not stored, but implied for
* non-zero exponents.
* Largest encodable value is 50 bits.
*/
#define MANTSIZE2 20 /* 20 bit mantissa. */
#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
{
int exp, rnd;
exp = (value > (MAXFRACT2>>1));
rnd = 0;
while (value > MAXFRACT2) {
rnd = value & 1;
value >>= 1;
exp++;
}
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT2)) {
value >>= 1;
exp++;
}
if (exp > MAXEXP2) {
/* Overflow. Return largest representable number instead. */
return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
} else {
return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
}
#endif
#if ACCT_VERSION==3
/*
* encode an u64 into a 32 bit IEEE float
*/
static u32 encode_float(u64 value)
{
unsigned exp = 190;
unsigned u;
if (value==0) return 0;
while ((s64)value > 0){
value <<= 1;
exp--;
}
u = (u32)(value >> 40) & 0x7fffffu;
return u | (exp << 23);
}
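/*
 * Worked example: encode_float(1) shifts the value left 63 times before
 * the sign bit becomes set, so exp ends at 190 - 63 = 127 (the IEEE 754
 * single-precision bias) and the stored mantissa bits are all zero,
 * giving 127 << 23 = 0x3f800000, i.e. 1.0f. The starting exponent of 190
 * is simply 127 + 63, one step per possible shift.
 */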
#endif
/*
* Write an accounting entry for an exiting process
*
* The acct_process() call is the workhorse of the process
* accounting system. The struct acct is built here and then written
* into the accounting file. This function should only be called from
* do_exit() or when switching to a different output file.
*/
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
static void do_acct_process(struct bsd_acct_struct *acct,
struct pid_namespace *ns, struct file *file)
{
struct pacct_struct *pacct = &current->signal->pacct;
acct_t ac;
mm_segment_t fs;
unsigned long flim;
u64 elapsed;
u64 run_time;
struct timespec uptime;
struct tty_struct *tty;
const struct cred *orig_cred;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free space to continue
* the process accounting system.
*/
if (!check_free_space(acct, file))
goto out;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
memset((caddr_t)&ac, 0, sizeof(acct_t));
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
/* calculate run_time in nsec*/
do_posix_clock_monotonic_gettime(&uptime);
run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
+ current->group_leader->start_time.tv_nsec;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION==3
ac.ac_etime = encode_float(elapsed);
#else
ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
(unsigned long) elapsed : (unsigned long) -1l);
#endif
#if ACCT_VERSION==1 || ACCT_VERSION==2
{
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
ac.ac_etime_hi = etime >> 16;
ac.ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
ac.ac_btime = get_seconds() - elapsed;
/* we really need to bite the bullet and change layout */
ac.ac_uid = orig_cred->uid;
ac.ac_gid = orig_cred->gid;
#if ACCT_VERSION==2
ac.ac_ahz = AHZ;
#endif
#if ACCT_VERSION==1 || ACCT_VERSION==2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
#endif
#if ACCT_VERSION==3
ac.ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
#endif
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
ac.ac_flag = pacct->ac_flag;
ac.ac_mem = encode_comp_t(pacct->ac_mem);
ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
ac.ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
ac.ac_swaps = encode_comp_t(0);
/*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
fs = get_fs();
set_fs(KERNEL_DS);
/*
* Accounting records are not subject to resource limits.
*/
flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
file->f_op->write(file, (char *)&ac,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
out:
revert_creds(orig_cred);
}
/**
* acct_init_pacct - initialize a new pacct_struct
* @pacct: per-process accounting info struct to initialize
*/
void acct_init_pacct(struct pacct_struct *pacct)
{
memset(pacct, 0, sizeof(struct pacct_struct));
pacct->ac_utime = pacct->ac_stime = cputime_zero;
}
/**
* acct_collect - collect accounting information into pacct_struct
* @exitcode: task exit code
* @group_dead: not 0, if this thread is the last one in the process.
*/
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
unsigned long vsize = 0;
if (group_dead && current->mm) {
struct vm_area_struct *vma;
down_read(&current->mm->mmap_sem);
vma = current->mm->mmap;
while (vma) {
vsize += vma->vm_end - vma->vm_start;
vma = vma->vm_next;
}
up_read(&current->mm->mmap_sem);
}
spin_lock_irq(&current->sighand->siglock);
if (group_dead)
pacct->ac_mem = vsize / 1024;
if (thread_group_leader(current)) {
pacct->ac_exitcode = exitcode;
if (current->flags & PF_FORKNOEXEC)
pacct->ac_flag |= AFORK;
}
if (current->flags & PF_SUPERPRIV)
pacct->ac_flag |= ASU;
if (current->flags & PF_DUMPCORE)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
}
static void acct_process_in_ns(struct pid_namespace *ns)
{
struct file *file = NULL;
struct bsd_acct_struct *acct;
acct = ns->bacct;
/*
* accelerate the common fastpath:
*/
if (!acct || !acct->file)
return;
spin_lock(&acct_lock);
file = acct->file;
if (unlikely(!file)) {
spin_unlock(&acct_lock);
return;
}
get_file(file);
spin_unlock(&acct_lock);
do_acct_process(acct, ns, file);
fput(file);
}
/**
* acct_process - now just a wrapper around acct_process_in_ns,
* which in turn is a wrapper around do_acct_process.
*
* handles process accounting for an exiting task
*/
void acct_process(void)
{
struct pid_namespace *ns;
/*
* This loop is safe lockless, since current is still
* alive and holds its namespace, which in turn holds
* its parent.
*/
for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
acct_process_in_ns(ns);
}

397
kernel/kernel/async.c Normal file

@@ -0,0 +1,397 @@
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
/*
Goals and Theory of Operation
The primary goal of this feature is to reduce the kernel boot time,
by performing various independent hardware delays and discovery operations
in a decoupled fashion rather than strictly serialized.
More specifically, the asynchronous function call concept allows
certain operations (primarily during system boot) to happen
asynchronously, out of order, while these operations still
have their externally visible parts happen sequentially and in-order.
(not unlike how out-of-order CPUs retire their instructions in order)
Key to the asynchronous function call implementation is the concept of
a "sequence cookie" (which, although it has an abstracted type, can be
thought of as a monotonically incrementing number).
The async core will assign each scheduled event such a sequence cookie and
pass this to the called functions.
The asynchronously called function should, before doing a globally visible
operation such as registering device numbers, call the
async_synchronize_cookie() function and pass in its own cookie. The
async_synchronize_cookie() function will make sure that all asynchronous
operations that were scheduled prior to the operation corresponding with the
cookie have completed.
Subsystem/driver initialization code that scheduled asynchronous probe
functions, but which shares global resources with other drivers/subsystems
that do not use the asynchronous call feature, needs to do a full
synchronization with the async_synchronize_full() function before returning
from its init function. This is to maintain strict ordering between the
asynchronous and synchronous parts of the kernel.
*/
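/*
 * A minimal usage sketch (illustrative only): my_probe(), my_slow_init(),
 * my_register_device() and struct my_device are hypothetical placeholders,
 * not functions defined in this kernel.
 *
 *	static void my_probe(void *data, async_cookie_t cookie)
 *	{
 *		struct my_device *dev = data;
 *
 *		my_slow_init(dev);	// independent work, may run out of order
 *
 *		// wait for everything scheduled before us, so the globally
 *		// visible registration still happens in submission order
 *		async_synchronize_cookie(cookie);
 *		my_register_device(dev);
 *	}
 *
 *	// caller, e.g. a bus scan loop:
 *	async_schedule(my_probe, dev);
 */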
#include <linux/async.h>
#include <linux/bug.h>
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <asm/atomic.h>
static async_cookie_t next_cookie = 1;
#define MAX_THREADS 256
#define MAX_WORK 32768
static LIST_HEAD(async_pending);
static LIST_HEAD(async_running);
static DEFINE_SPINLOCK(async_lock);
static int async_enabled = 0;
struct async_entry {
struct list_head list;
async_cookie_t cookie;
async_func_ptr *func;
void *data;
struct list_head *running;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static DECLARE_WAIT_QUEUE_HEAD(async_new);
static atomic_t entry_count;
static atomic_t thread_count;
extern int initcall_debug;
/*
* MUST be called with the lock held!
*/
static async_cookie_t __lowest_in_progress(struct list_head *running)
{
struct async_entry *entry;
if (!list_empty(running)) {
entry = list_first_entry(running,
struct async_entry, list);
return entry->cookie;
}
list_for_each_entry(entry, &async_pending, list)
if (entry->running == running)
return entry->cookie;
return next_cookie; /* "infinity" value */
}
static async_cookie_t lowest_in_progress(struct list_head *running)
{
unsigned long flags;
async_cookie_t ret;
spin_lock_irqsave(&async_lock, flags);
ret = __lowest_in_progress(running);
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
/*
* pick the first pending entry and run it
*/
static void run_one_entry(void)
{
unsigned long flags;
struct async_entry *entry;
ktime_t calltime, delta, rettime;
/* 1) pick one task from the pending queue */
spin_lock_irqsave(&async_lock, flags);
if (list_empty(&async_pending))
goto out;
entry = list_first_entry(&async_pending, struct async_entry, list);
/* 2) move it to the running queue */
list_move_tail(&entry->list, entry->running);
spin_unlock_irqrestore(&async_lock, flags);
/* 3) run it (and print duration)*/
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
printk("initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
}
/* 4) remove it from the running queue */
spin_lock_irqsave(&async_lock, flags);
list_del(&entry->list);
/* 5) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
/* 6) wake up any waiters. */
wake_up(&async_done);
return;
out:
spin_unlock_irqrestore(&async_lock, flags);
}
static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
/*
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
ptr(data, newcookie);
return newcookie;
}
entry->func = ptr;
entry->data = data;
entry->running = running;
spin_lock_irqsave(&async_lock, flags);
newcookie = entry->cookie = next_cookie++;
list_add_tail(&entry->list, &async_pending);
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
wake_up(&async_new);
return newcookie;
}
/**
* async_schedule - schedule a function for asynchronous execution
* @ptr: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
{
return __async_schedule(ptr, data, &async_running);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
* @ptr: function to execute asynchronously
* @data: data pointer to pass to the function
* @running: running list for the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
* @running may be used in the async_synchronize_*_domain() functions
* to wait within a certain synchronization domain rather than globally.
* A synchronization domain is specified via the running queue @running to use.
* Note: This function may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
struct list_head *running)
{
return __async_schedule(ptr, data, running);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
/**
* async_synchronize_full - synchronize all asynchronous function calls
*
* This function waits until all asynchronous function calls have been done.
*/
void async_synchronize_full(void)
{
do {
async_synchronize_cookie(next_cookie);
} while (!list_empty(&async_running) || !list_empty(&async_pending));
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
* async_synchronize_full_domain - synchronize all asynchronous function calls within a certain domain
* @list: running list to synchronize on
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by the running list @list have been done.
*/
void async_synchronize_full_domain(struct list_head *list)
{
async_synchronize_cookie_domain(next_cookie, list);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
* @running: running list to synchronize on
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by the running list @running submitted
* prior to @cookie have been done.
*/
void async_synchronize_cookie_domain(async_cookie_t cookie,
struct list_head *running)
{
ktime_t starttime, delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
wait_event(async_done, lowest_in_progress(running) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
printk("async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
/**
* async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
*
* This function waits until all asynchronous function calls prior to @cookie
* have been done.
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
async_synchronize_cookie_domain(cookie, &async_running);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
static int async_thread(void *unused)
{
DECLARE_WAITQUEUE(wq, current);
add_wait_queue(&async_new, &wq);
while (!kthread_should_stop()) {
int ret = HZ;
set_current_state(TASK_INTERRUPTIBLE);
/*
* check the list head without lock.. false positives
* are dealt with inside run_one_entry() while holding
* the lock.
*/
rmb();
if (!list_empty(&async_pending))
run_one_entry();
else
ret = schedule_timeout(HZ);
if (ret == 0) {
/*
* we timed out, which means we as a thread are redundant.
* we sign off and die, but to avoid any races there
* is a last-straw check to see if work snuck in.
*/
atomic_dec(&thread_count);
wmb(); /* manager must see our departure first */
if (list_empty(&async_pending))
break;
/*
* whoops, work came in between us timing out and us
* signing off; we need to stay alive and keep working.
*/
atomic_inc(&thread_count);
}
}
remove_wait_queue(&async_new, &wq);
return 0;
}
static int async_manager_thread(void *unused)
{
DECLARE_WAITQUEUE(wq, current);
add_wait_queue(&async_new, &wq);
while (!kthread_should_stop()) {
int tc, ec;
set_current_state(TASK_INTERRUPTIBLE);
tc = atomic_read(&thread_count);
rmb();
ec = atomic_read(&entry_count);
while (tc < ec && tc < MAX_THREADS) {
if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
tc))) {
msleep(100);
continue;
}
atomic_inc(&thread_count);
tc++;
}
schedule();
}
remove_wait_queue(&async_new, &wq);
return 0;
}
static int __init async_init(void)
{
async_enabled =
!IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
WARN_ON(!async_enabled);
return 0;
}
core_initcall(async_init);

1513
kernel/kernel/audit.c Normal file

File diff suppressed because it is too large

167
kernel/kernel/audit.h Normal file

@@ -0,0 +1,167 @@
/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
/* 0 = no checking
1 = put_count checking
2 = verbose put_count checking
*/
#define AUDIT_DEBUG 0
/* At task start time, the audit_state is set in the audit_context using
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
enum audit_state {
AUDIT_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
* but don't necessarily fill it in at
* syscall entry time (i.e., filter
* instead). */
AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
* and always fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
* should be recorded. */
AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
* always fill it in at syscall entry
* time, and always write out the audit
* record at syscall exit time. */
};
/* Rule lists */
struct audit_watch;
struct audit_tree;
struct audit_chunk;
struct audit_entry {
struct list_head list;
struct rcu_head rcu;
struct audit_krule rule;
};
#ifdef CONFIG_AUDIT
extern int audit_enabled;
extern int audit_ever_enabled;
#endif
extern int audit_pid;
#define AUDIT_INODE_BUCKETS 32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
static inline int audit_hash_ino(u32 ino)
{
return (ino & (AUDIT_INODE_BUCKETS-1));
}
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_compare_dname_path(const char *dname, const char *path,
int *dirlen);
extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
int done, int multi,
void *payload, int size);
extern void audit_send_reply(int pid, int seq, int type,
int done, int multi,
void *payload, int size);
extern void audit_panic(const char *message);
struct audit_netlink_list {
int pid;
struct sk_buff_head q;
};
int audit_send_list(void *);
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
/* audit watch functions */
extern unsigned long audit_watch_inode(struct audit_watch *watch);
extern dev_t audit_watch_dev(struct audit_watch *watch);
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
extern int audit_add_watch(struct audit_krule *krule);
extern void audit_remove_watch(struct audit_watch *watch);
extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
extern void audit_inotify_unregister(struct list_head *in_list);
extern char *audit_watch_path(struct audit_watch *watch);
extern struct list_head *audit_watch_rules(struct audit_watch *watch);
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
struct audit_watch *watch);
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
extern void audit_put_chunk(struct audit_chunk *);
extern int audit_tree_match(struct audit_chunk *, struct audit_tree *);
extern int audit_make_tree(struct audit_krule *, char *, u32);
extern int audit_add_tree_rule(struct audit_krule *);
extern int audit_remove_tree_rule(struct audit_krule *);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
extern const char *audit_tree_path(struct audit_tree *);
extern void audit_put_tree(struct audit_tree *);
extern void audit_kill_trees(struct list_head *);
#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
#define audit_trim_trees() (void)0
#define audit_put_tree(tree) (void)0
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(list) BUG()
#endif
extern char *audit_unpack_string(void **, size_t *, size_t);
extern pid_t audit_sig_pid;
extern uid_t audit_sig_uid;
extern u32 audit_sig_sid;
#ifdef CONFIG_AUDITSYSCALL
extern int __audit_signal_info(int sig, struct task_struct *t);
static inline int audit_signal_info(int sig, struct task_struct *t)
{
if (unlikely((audit_pid && t->tgid == audit_pid) ||
(audit_signals && !audit_dummy_context())))
return __audit_signal_info(sig, t);
return 0;
}
extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
extern struct list_head *audit_killed_trees(void);
#else
#define audit_signal_info(s,t) AUDIT_DISABLED
#define audit_filter_inodes(t,c) AUDIT_DISABLED
#endif
extern struct mutex audit_cmd_mutex;

966
kernel/kernel/audit_tree.c Normal file

@@ -0,0 +1,966 @@
#include "audit.h"
#include <linux/inotify.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/kthread.h>
struct audit_tree;
struct audit_chunk;
struct audit_tree {
atomic_t count;
int goner;
struct audit_chunk *root;
struct list_head chunks;
struct list_head rules;
struct list_head list;
struct list_head same_root;
struct rcu_head head;
char pathname[];
};
struct audit_chunk {
struct list_head hash;
struct inotify_watch watch;
struct list_head trees; /* with root here */
int dead;
int count;
atomic_long_t refs;
struct rcu_head head;
struct node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
} owners[];
};
static LIST_HEAD(tree_list);
static LIST_HEAD(prune_list);
/*
* One struct chunk is attached to each inode of interest.
* We replace struct chunk on tagging/untagging.
* Rules have pointer to struct audit_tree.
* Rules have struct list_head rlist forming a list of rules over
* the same tree.
* References to struct chunk are collected at audit_inode{,_child}()
* time and used in AUDIT_TREE rule matching.
* These references are dropped at the same time we are calling
* audit_free_names(), etc.
*
* Cyclic lists galore:
* tree.chunks anchors chunk.owners[].list hash_lock
* tree.rules anchors rule.rlist audit_filter_mutex
* chunk.trees anchors tree.same_root hash_lock
* chunk.hash is a hash with middle bits of watch.inode as
* a hash function. RCU, hash_lock
*
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
* chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
* of watch contributes 1 to .refs).
*
* node.index allows to get from node.list to containing chunk.
* MSB of that sucker is stolen to mark taggings that we might have to
* revert - several operations have very unpleasant cleanup logics and
* that makes a difference. Some.
*/
static struct inotify_handle *rtree_ih;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
if (tree) {
atomic_set(&tree->count, 1);
tree->goner = 0;
INIT_LIST_HEAD(&tree->chunks);
INIT_LIST_HEAD(&tree->rules);
INIT_LIST_HEAD(&tree->list);
INIT_LIST_HEAD(&tree->same_root);
tree->root = NULL;
strcpy(tree->pathname, s);
}
return tree;
}
static inline void get_tree(struct audit_tree *tree)
{
atomic_inc(&tree->count);
}
static void __put_tree(struct rcu_head *rcu)
{
struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
kfree(tree);
}
static inline void put_tree(struct audit_tree *tree)
{
if (atomic_dec_and_test(&tree->count))
call_rcu(&tree->head, __put_tree);
}
/* to avoid bringing the entire thing in audit.h */
const char *audit_tree_path(struct audit_tree *tree)
{
return tree->pathname;
}
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
size_t size;
int i;
size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
chunk = kzalloc(size, GFP_KERNEL);
if (!chunk)
return NULL;
INIT_LIST_HEAD(&chunk->hash);
INIT_LIST_HEAD(&chunk->trees);
chunk->count = count;
atomic_long_set(&chunk->refs, 1);
for (i = 0; i < count; i++) {
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
}
inotify_init_watch(&chunk->watch);
return chunk;
}
static void free_chunk(struct audit_chunk *chunk)
{
int i;
for (i = 0; i < chunk->count; i++) {
if (chunk->owners[i].owner)
put_tree(chunk->owners[i].owner);
}
kfree(chunk);
}
void audit_put_chunk(struct audit_chunk *chunk)
{
if (atomic_long_dec_and_test(&chunk->refs))
free_chunk(chunk);
}
static void __put_chunk(struct rcu_head *rcu)
{
struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
audit_put_chunk(chunk);
}
enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
static inline struct list_head *chunk_hash(const struct inode *inode)
{
unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
return chunk_hash_heads + n % HASH_SIZE;
}
/* hash_lock is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
struct list_head *list = chunk_hash(chunk->watch.inode);
list_add_rcu(&chunk->hash, list);
}
/* called under rcu_read_lock */
struct audit_chunk *audit_tree_lookup(const struct inode *inode)
{
struct list_head *list = chunk_hash(inode);
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
if (p->watch.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
}
return NULL;
}
int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
{
int n;
for (n = 0; n < chunk->count; n++)
if (chunk->owners[n].owner == tree)
return 1;
return 0;
}
/* tagging and untagging inodes with trees */
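/*
 * owners[] lives inline at the end of struct audit_chunk, so given a node
 * we can subtract its (masked) index from the node pointer to get back to
 * owners[0], and from there to the containing chunk. The MSB of ->index
 * is masked off because it is the "will prune" marker described above.
 */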
static struct audit_chunk *find_chunk(struct node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
return container_of(p, struct audit_chunk, owners[0]);
}
static void untag_chunk(struct node *p)
{
struct audit_chunk *chunk = find_chunk(p);
struct audit_chunk *new;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
if (!pin_inotify_watch(&chunk->watch)) {
/*
* Filesystem is shutting down; all watches are getting
* evicted, just take it off the node list for this
* tree and let the eviction logics take care of the
* rest.
*/
owner = p->owner;
if (owner->root == chunk) {
list_del_init(&owner->same_root);
owner->root = NULL;
}
list_del_init(&p->list);
p->owner = NULL;
put_tree(owner);
return;
}
spin_unlock(&hash_lock);
/*
* pin_inotify_watch() succeeded, so the watch won't go away
* from under us.
*/
mutex_lock(&chunk->watch.inode->inotify_mutex);
if (chunk->dead) {
mutex_unlock(&chunk->watch.inode->inotify_mutex);
goto out;
}
owner = p->owner;
if (!size) {
chunk->dead = 1;
spin_lock(&hash_lock);
list_del_init(&chunk->trees);
if (owner->root == chunk)
owner->root = NULL;
list_del_init(&p->list);
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
inotify_evict_watch(&chunk->watch);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
put_inotify_watch(&chunk->watch);
goto out;
}
new = alloc_chunk(size);
if (!new)
goto Fallback;
if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) {
free_chunk(new);
goto Fallback;
}
chunk->dead = 1;
spin_lock(&hash_lock);
list_replace_init(&chunk->trees, &new->trees);
if (owner->root == chunk) {
list_del_init(&owner->same_root);
owner->root = NULL;
}
for (i = j = 0; j <= size; i++, j++) {
struct audit_tree *s;
if (&chunk->owners[j] == p) {
list_del_init(&p->list);
i--;
continue;
}
s = chunk->owners[j].owner;
new->owners[i].owner = s;
new->owners[i].index = chunk->owners[j].index - j + i;
if (!s) /* result of earlier fallback */
continue;
get_tree(s);
list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
}
list_replace_rcu(&chunk->hash, &new->hash);
list_for_each_entry(owner, &new->trees, same_root)
owner->root = new;
spin_unlock(&hash_lock);
inotify_evict_watch(&chunk->watch);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
put_inotify_watch(&chunk->watch);
goto out;
Fallback:
// do the best we can
spin_lock(&hash_lock);
if (owner->root == chunk) {
list_del_init(&owner->same_root);
owner->root = NULL;
}
list_del_init(&p->list);
p->owner = NULL;
put_tree(owner);
spin_unlock(&hash_lock);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
out:
unpin_inotify_watch(&chunk->watch);
spin_lock(&hash_lock);
}
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk)
return -ENOMEM;
if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) {
free_chunk(chunk);
return -ENOSPC;
}
mutex_lock(&inode->inotify_mutex);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
inotify_evict_watch(&chunk->watch);
mutex_unlock(&inode->inotify_mutex);
put_inotify_watch(&chunk->watch);
return 0;
}
chunk->owners[0].index = (1U << 31);
chunk->owners[0].owner = tree;
get_tree(tree);
list_add(&chunk->owners[0].list, &tree->chunks);
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
insert_hash(chunk);
spin_unlock(&hash_lock);
mutex_unlock(&inode->inotify_mutex);
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct inotify_watch *watch;
struct audit_tree *owner;
struct audit_chunk *chunk, *old;
struct node *p;
int n;
if (inotify_find_watch(rtree_ih, inode, &watch) < 0)
return create_chunk(inode, tree);
old = container_of(watch, struct audit_chunk, watch);
/* are we already there? */
spin_lock(&hash_lock);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
put_inotify_watch(&old->watch);
return 0;
}
}
spin_unlock(&hash_lock);
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
put_inotify_watch(&old->watch);
return -ENOMEM;
}
mutex_lock(&inode->inotify_mutex);
if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
mutex_unlock(&inode->inotify_mutex);
put_inotify_watch(&old->watch);
free_chunk(chunk);
return -ENOSPC;
}
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
inotify_evict_watch(&chunk->watch);
mutex_unlock(&inode->inotify_mutex);
put_inotify_watch(&old->watch);
put_inotify_watch(&chunk->watch);
return 0;
}
list_replace_init(&old->trees, &chunk->trees);
for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
struct audit_tree *s = old->owners[n].owner;
p->owner = s;
p->index = old->owners[n].index;
if (!s) /* result of fallback in untag */
continue;
get_tree(s);
list_replace_init(&old->owners[n].list, &p->list);
}
p->index = (chunk->count - 1) | (1U<<31);
p->owner = tree;
get_tree(tree);
list_add(&p->list, &tree->chunks);
list_replace_rcu(&old->hash, &chunk->hash);
list_for_each_entry(owner, &chunk->trees, same_root)
owner->root = chunk;
old->dead = 1;
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
spin_unlock(&hash_lock);
inotify_evict_watch(&old->watch);
mutex_unlock(&inode->inotify_mutex);
put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
put_inotify_watch(&old->watch); /* and kill it */
return 0;
}
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
struct audit_buffer *ab;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, "op=");
audit_log_string(ab, "remove rule");
audit_log_format(ab, " dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
audit_log_end(ab);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
call_rcu(&entry->rcu, audit_free_rule_rcu);
}
}
}
/*
* finish killing struct audit_tree
*/
static void prune_one(struct audit_tree *victim)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct node *p;
p = list_entry(victim->chunks.next, struct node, list);
untag_chunk(p);
}
spin_unlock(&hash_lock);
put_tree(victim);
}
/* trim the uncommitted chunks from tree */
static void trim_marked(struct audit_tree *tree)
{
struct list_head *p, *q;
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
return;
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
struct node *node = list_entry(p, struct node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
list_add(p, &tree->chunks);
}
}
while (!list_empty(&tree->chunks)) {
struct node *node;
node = list_entry(tree->chunks.next, struct node, list);
/* have we run out of marked? */
if (!(node->index & (1U<<31)))
break;
untag_chunk(node);
}
if (!tree->root && !tree->goner) {
tree->goner = 1;
spin_unlock(&hash_lock);
mutex_lock(&audit_filter_mutex);
kill_rules(tree);
list_del_init(&tree->list);
mutex_unlock(&audit_filter_mutex);
prune_one(tree);
} else {
spin_unlock(&hash_lock);
}
}
static void audit_schedule_prune(void);
/* called with audit_filter_mutex */
int audit_remove_tree_rule(struct audit_krule *rule)
{
struct audit_tree *tree;
tree = rule->tree;
if (tree) {
spin_lock(&hash_lock);
list_del_init(&rule->rlist);
if (list_empty(&tree->rules) && !tree->goner) {
tree->root = NULL;
list_del_init(&tree->same_root);
tree->goner = 1;
list_move(&tree->list, &prune_list);
rule->tree = NULL;
spin_unlock(&hash_lock);
audit_schedule_prune();
return 1;
}
rule->tree = NULL;
spin_unlock(&hash_lock);
return 1;
}
return 0;
}
void audit_trim_trees(void)
{
struct list_head cursor;
mutex_lock(&audit_filter_mutex);
list_add(&cursor, &tree_list);
while (cursor.next != &tree_list) {
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
struct node *node;
struct list_head list;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
list_del(&cursor);
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
if (err)
goto skip_it;
root_mnt = collect_mounts(&path);
path_put(&path);
if (!root_mnt)
goto skip_it;
list_add_tail(&list, &root_mnt->mnt_list);
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
struct inode *inode = chunk->watch.inode;
struct vfsmount *mnt;
node->index |= 1U<<31;
list_for_each_entry(mnt, &list, mnt_list) {
if (mnt->mnt_root->d_inode == inode) {
node->index &= ~(1U<<31);
break;
}
}
}
spin_unlock(&hash_lock);
trim_marked(tree);
put_tree(tree);
list_del_init(&list);
drop_collected_mounts(root_mnt);
skip_it:
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
}
static int is_under(struct vfsmount *mnt, struct dentry *dentry,
struct path *path)
{
if (mnt != path->mnt) {
for (;;) {
if (mnt->mnt_parent == mnt)
return 0;
if (mnt->mnt_parent == path->mnt)
break;
mnt = mnt->mnt_parent;
}
dentry = mnt->mnt_mountpoint;
}
return is_subdir(dentry, path->dentry);
}
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
rule->listnr != AUDIT_FILTER_EXIT ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
rule->tree = alloc_tree(pathname);
if (!rule->tree)
return -ENOMEM;
return 0;
}
void audit_put_tree(struct audit_tree *tree)
{
put_tree(tree);
}
/* called with audit_filter_mutex */
int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
struct vfsmount *mnt, *p;
struct list_head list;
int err;
list_for_each_entry(tree, &tree_list, list) {
if (!strcmp(seed->pathname, tree->pathname)) {
put_tree(seed);
rule->tree = tree;
list_add(&rule->rlist, &tree->rules);
return 0;
}
}
tree = seed;
list_add(&tree->list, &tree_list);
list_add(&rule->rlist, &tree->rules);
/* do not set rule->tree yet */
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
mnt = collect_mounts(&path);
path_put(&path);
if (!mnt) {
err = -ENOMEM;
goto Err;
}
list_add_tail(&list, &mnt->mnt_list);
get_tree(tree);
list_for_each_entry(p, &list, mnt_list) {
err = tag_chunk(p->mnt_root->d_inode, tree);
if (err)
break;
}
list_del(&list);
drop_collected_mounts(mnt);
if (!err) {
struct node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
spin_unlock(&hash_lock);
} else {
trim_marked(tree);
goto Err;
}
mutex_lock(&audit_filter_mutex);
if (list_empty(&rule->rlist)) {
put_tree(tree);
return -ENOENT;
}
rule->tree = tree;
put_tree(tree);
return 0;
Err:
mutex_lock(&audit_filter_mutex);
list_del_init(&tree->list);
list_del_init(&tree->rules);
put_tree(tree);
return err;
}
int audit_tag_tree(char *old, char *new)
{
struct list_head cursor, barrier;
int failed = 0;
struct path path;
struct vfsmount *tagged;
struct list_head list;
struct vfsmount *mnt;
struct dentry *dentry;
int err;
err = kern_path(new, 0, &path);
if (err)
return err;
tagged = collect_mounts(&path);
path_put(&path);
if (!tagged)
return -ENOMEM;
err = kern_path(old, 0, &path);
if (err) {
drop_collected_mounts(tagged);
return err;
}
mnt = mntget(path.mnt);
dentry = dget(path.dentry);
path_put(&path);
list_add_tail(&list, &tagged->mnt_list);
mutex_lock(&audit_filter_mutex);
list_add(&barrier, &tree_list);
list_add(&cursor, &barrier);
while (cursor.next != &tree_list) {
struct audit_tree *tree;
struct vfsmount *p;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
list_del(&cursor);
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
if (err) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
spin_lock(&vfsmount_lock);
if (!is_under(mnt, dentry, &path)) {
spin_unlock(&vfsmount_lock);
path_put(&path);
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
spin_unlock(&vfsmount_lock);
path_put(&path);
list_for_each_entry(p, &list, mnt_list) {
failed = tag_chunk(p->mnt_root->d_inode, tree);
if (failed)
break;
}
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
break;
}
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
if (!tree->goner) {
list_del(&tree->list);
list_add(&tree->list, &tree_list);
}
spin_unlock(&hash_lock);
put_tree(tree);
}
while (barrier.prev != &tree_list) {
struct audit_tree *tree;
tree = container_of(barrier.prev, struct audit_tree, list);
get_tree(tree);
list_del(&tree->list);
list_add(&tree->list, &barrier);
mutex_unlock(&audit_filter_mutex);
if (!failed) {
struct node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
spin_unlock(&hash_lock);
} else {
trim_marked(tree);
}
put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&barrier);
list_del(&cursor);
list_del(&list);
mutex_unlock(&audit_filter_mutex);
dput(dentry);
mntput(mnt);
drop_collected_mounts(tagged);
return failed;
}
/*
* That gets run when evict_chunk() ends up needing to kill audit_tree.
* Runs from a separate thread.
*/
static int prune_tree_thread(void *unused)
{
mutex_lock(&audit_cmd_mutex);
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
struct audit_tree *victim;
victim = list_entry(prune_list.next, struct audit_tree, list);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
prune_one(victim);
mutex_lock(&audit_filter_mutex);
}
mutex_unlock(&audit_filter_mutex);
mutex_unlock(&audit_cmd_mutex);
return 0;
}
static void audit_schedule_prune(void)
{
kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
}
/*
* ... and that one is done if evict_chunk() decides to delay until the end
* of syscall. Runs synchronously.
*/
void audit_kill_trees(struct list_head *list)
{
mutex_lock(&audit_cmd_mutex);
mutex_lock(&audit_filter_mutex);
while (!list_empty(list)) {
struct audit_tree *victim;
victim = list_entry(list->next, struct audit_tree, list);
kill_rules(victim);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
prune_one(victim);
mutex_lock(&audit_filter_mutex);
}
mutex_unlock(&audit_filter_mutex);
mutex_unlock(&audit_cmd_mutex);
}
/*
* Here comes the stuff asynchronous to auditctl operations
*/
/* inode->inotify_mutex is locked */
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
struct list_head *postponed = audit_killed_trees();
int need_prune = 0;
int n;
if (chunk->dead)
return;
chunk->dead = 1;
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
while (!list_empty(&chunk->trees)) {
owner = list_entry(chunk->trees.next,
struct audit_tree, same_root);
owner->goner = 1;
owner->root = NULL;
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
if (!postponed) {
kill_rules(owner);
list_move(&owner->list, &prune_list);
need_prune = 1;
} else {
list_move(&owner->list, postponed);
}
spin_lock(&hash_lock);
}
list_del_rcu(&chunk->hash);
for (n = 0; n < chunk->count; n++)
list_del_init(&chunk->owners[n].list);
spin_unlock(&hash_lock);
if (need_prune)
audit_schedule_prune();
mutex_unlock(&audit_filter_mutex);
}
static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
u32 cookie, const char *dname, struct inode *inode)
{
struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
if (mask & IN_IGNORED) {
evict_chunk(chunk);
put_inotify_watch(watch);
}
}
static void destroy_watch(struct inotify_watch *watch)
{
struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
call_rcu(&chunk->head, __put_chunk);
}
static const struct inotify_operations rtree_inotify_ops = {
.handle_event = handle_event,
.destroy_watch = destroy_watch,
};
static int __init audit_tree_init(void)
{
int i;
rtree_ih = inotify_init(&rtree_inotify_ops);
if (IS_ERR(rtree_ih))
audit_panic("cannot initialize inotify handle for rectree watches");
for (i = 0; i < HASH_SIZE; i++)
INIT_LIST_HEAD(&chunk_hash_heads[i]);
return 0;
}
__initcall(audit_tree_init);

543
kernel/kernel/audit_watch.c Normal file
View File

@@ -0,0 +1,543 @@
/* audit_watch.c -- watching inodes
*
* Copyright 2003-2009 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
#include <linux/inotify.h>
#include <linux/security.h>
#include "audit.h"
/*
* Reference counting:
*
* audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
* event. Each audit_watch holds a reference to its associated parent.
*
* audit_watch: if added to lists, lifetime is from audit_init_watch() to
* audit_remove_watch(). Additionally, an audit_watch may exist
* temporarily to assist in searching existing filter data. Each
* audit_krule holds a reference to its associated watch.
*/
struct audit_watch {
atomic_t count; /* reference count */
dev_t dev; /* associated superblock device */
char *path; /* insertion path */
unsigned long ino; /* associated inode number */
struct audit_parent *parent; /* associated parent */
struct list_head wlist; /* entry in parent->watches list */
struct list_head rules; /* associated rules */
};
struct audit_parent {
struct list_head ilist; /* entry in inotify registration list */
struct list_head watches; /* associated watches */
struct inotify_watch wdata; /* inotify watch data */
unsigned flags; /* status flags */
};
/* Inotify handle. */
struct inotify_handle *audit_ih;
/*
* audit_parent status flags:
*
* AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
* a filesystem event to ensure we're adding audit watches to a valid parent.
* Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
* receive them while we have nameidata, but must be used for IN_MOVE_SELF which
* we can receive while holding nameidata.
*/
#define AUDIT_PARENT_INVALID 0x001
/* Inotify events we care about. */
#define AUDIT_IN_WATCH (IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF)
static void audit_free_parent(struct inotify_watch *i_watch)
{
struct audit_parent *parent;
parent = container_of(i_watch, struct audit_parent, wdata);
WARN_ON(!list_empty(&parent->watches));
kfree(parent);
}
void audit_get_watch(struct audit_watch *watch)
{
atomic_inc(&watch->count);
}
void audit_put_watch(struct audit_watch *watch)
{
if (atomic_dec_and_test(&watch->count)) {
WARN_ON(watch->parent);
WARN_ON(!list_empty(&watch->rules));
kfree(watch->path);
kfree(watch);
}
}
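/*
 * Illustrative userspace sketch (not from the kernel sources): the
 * get/put pair above is a plain reference count -- whoever drops the
 * last reference frees the object.  A self-contained C11 analogue;
 * the type and function names are invented for the example.
 */
#if 0   /* example only, never built as part of this file */
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct obj {
        atomic_int count;
        char *path;
};

static struct obj *obj_new(const char *path)
{
        struct obj *o = malloc(sizeof(*o));

        if (!o)
                return NULL;
        atomic_init(&o->count, 1);      /* initial reference */
        o->path = strdup(path);
        return o;
}

static void obj_get(struct obj *o)
{
        atomic_fetch_add(&o->count, 1);
}

static void obj_put(struct obj *o)
{
        /* free only when the last reference is dropped */
        if (atomic_fetch_sub(&o->count, 1) == 1) {
                free(o->path);
                free(o);
        }
}

int main(void)
{
        struct obj *o = obj_new("/etc");

        if (!o)
                return 1;
        obj_get(o);     /* second user, e.g. a rule holding the watch */
        obj_put(o);     /* rule goes away */
        obj_put(o);     /* last reference: object is freed here */
        return 0;
}
#endif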
void audit_remove_watch(struct audit_watch *watch)
{
list_del(&watch->wlist);
put_inotify_watch(&watch->parent->wdata);
watch->parent = NULL;
audit_put_watch(watch); /* match initial get */
}
char *audit_watch_path(struct audit_watch *watch)
{
return watch->path;
}
struct list_head *audit_watch_rules(struct audit_watch *watch)
{
return &watch->rules;
}
unsigned long audit_watch_inode(struct audit_watch *watch)
{
return watch->ino;
}
dev_t audit_watch_dev(struct audit_watch *watch)
{
return watch->dev;
}
/* Initialize a parent watch entry. */
static struct audit_parent *audit_init_parent(struct nameidata *ndp)
{
struct audit_parent *parent;
s32 wd;
parent = kzalloc(sizeof(*parent), GFP_KERNEL);
if (unlikely(!parent))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&parent->watches);
parent->flags = 0;
inotify_init_watch(&parent->wdata);
/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
get_inotify_watch(&parent->wdata);
wd = inotify_add_watch(audit_ih, &parent->wdata,
ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
if (wd < 0) {
audit_free_parent(&parent->wdata);
return ERR_PTR(wd);
}
return parent;
}
/* Initialize a watch entry. */
static struct audit_watch *audit_init_watch(char *path)
{
struct audit_watch *watch;
watch = kzalloc(sizeof(*watch), GFP_KERNEL);
if (unlikely(!watch))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&watch->rules);
atomic_set(&watch->count, 1);
watch->path = path;
watch->dev = (dev_t)-1;
watch->ino = (unsigned long)-1;
return watch;
}
/* Translate a watch string to kernel representation. */
int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
{
struct audit_watch *watch;
if (!audit_ih)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
krule->listnr != AUDIT_FILTER_EXIT ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
watch = audit_init_watch(path);
if (IS_ERR(watch))
return PTR_ERR(watch);
audit_get_watch(watch);
krule->watch = watch;
return 0;
}
/* Duplicate the given audit watch. The new watch's rules list is initialized
* to an empty list and wlist is undefined. */
static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
{
char *path;
struct audit_watch *new;
path = kstrdup(old->path, GFP_KERNEL);
if (unlikely(!path))
return ERR_PTR(-ENOMEM);
new = audit_init_watch(path);
if (IS_ERR(new)) {
kfree(path);
goto out;
}
new->dev = old->dev;
new->ino = old->ino;
get_inotify_watch(&old->parent->wdata);
new->parent = old->parent;
out:
return new;
}
static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
{
if (audit_enabled) {
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, "auid=%u ses=%u op=",
audit_get_loginuid(current),
audit_get_sessionid(current));
audit_log_string(ab, op);
audit_log_format(ab, " path=");
audit_log_untrustedstring(ab, w->path);
audit_log_key(ab, r->filterkey);
audit_log_format(ab, " list=%d res=1", r->listnr);
audit_log_end(ab);
}
}
/* Update inode info in audit rules based on filesystem event. */
static void audit_update_watch(struct audit_parent *parent,
const char *dname, dev_t dev,
unsigned long ino, unsigned invalidating)
{
struct audit_watch *owatch, *nwatch, *nextw;
struct audit_krule *r, *nextr;
struct audit_entry *oentry, *nentry;
mutex_lock(&audit_filter_mutex);
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
if (audit_compare_dname_path(dname, owatch->path, NULL))
continue;
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
if (invalidating && current->audit_context)
audit_filter_inodes(current, current->audit_context);
nwatch = audit_dupe_watch(owatch);
if (IS_ERR(nwatch)) {
mutex_unlock(&audit_filter_mutex);
audit_panic("error updating watch, skipping");
return;
}
nwatch->dev = dev;
nwatch->ino = ino;
list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
oentry = container_of(r, struct audit_entry, rule);
list_del(&oentry->rule.rlist);
list_del_rcu(&oentry->list);
nentry = audit_dupe_rule(&oentry->rule, nwatch);
if (IS_ERR(nentry)) {
list_del(&oentry->rule.list);
audit_panic("error updating watch, removing");
} else {
int h = audit_hash_ino((u32)ino);
list_add(&nentry->rule.rlist, &nwatch->rules);
list_add_rcu(&nentry->list, &audit_inode_hash[h]);
list_replace(&oentry->rule.list,
&nentry->rule.list);
}
audit_watch_log_rule_change(r, owatch, "updated rules");
call_rcu(&oentry->rcu, audit_free_rule_rcu);
}
audit_remove_watch(owatch);
goto add_watch_to_parent; /* event applies to a single watch */
}
mutex_unlock(&audit_filter_mutex);
return;
add_watch_to_parent:
list_add(&nwatch->wlist, &parent->watches);
mutex_unlock(&audit_filter_mutex);
return;
}
/* Remove all watches & rules associated with a parent that is going away. */
static void audit_remove_parent_watches(struct audit_parent *parent)
{
struct audit_watch *w, *nextw;
struct audit_krule *r, *nextr;
struct audit_entry *e;
mutex_lock(&audit_filter_mutex);
parent->flags |= AUDIT_PARENT_INVALID;
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
audit_watch_log_rule_change(r, w, "remove rule");
list_del(&r->rlist);
list_del(&r->list);
list_del_rcu(&e->list);
call_rcu(&e->rcu, audit_free_rule_rcu);
}
audit_remove_watch(w);
}
mutex_unlock(&audit_filter_mutex);
}
/* Unregister inotify watches for parents on in_list.
* Generates an IN_IGNORED event. */
void audit_inotify_unregister(struct list_head *in_list)
{
struct audit_parent *p, *n;
list_for_each_entry_safe(p, n, in_list, ilist) {
list_del(&p->ilist);
inotify_rm_watch(audit_ih, &p->wdata);
/* the unpin matching the pin in audit_do_del_rule() */
unpin_inotify_watch(&p->wdata);
}
}
/* Get path information necessary for adding watches. */
static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
{
struct nameidata *ndparent, *ndwatch;
int err;
ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
if (unlikely(!ndparent))
return -ENOMEM;
ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
if (unlikely(!ndwatch)) {
kfree(ndparent);
return -ENOMEM;
}
err = path_lookup(path, LOOKUP_PARENT, ndparent);
if (err) {
kfree(ndparent);
kfree(ndwatch);
return err;
}
err = path_lookup(path, 0, ndwatch);
if (err) {
kfree(ndwatch);
ndwatch = NULL;
}
*ndp = ndparent;
*ndw = ndwatch;
return 0;
}
/* Release resources used for watch path information. */
static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
{
if (ndp) {
path_put(&ndp->path);
kfree(ndp);
}
if (ndw) {
path_put(&ndw->path);
kfree(ndw);
}
}
/* Associate the given rule with an existing parent inotify_watch.
* Caller must hold audit_filter_mutex. */
static void audit_add_to_parent(struct audit_krule *krule,
struct audit_parent *parent)
{
struct audit_watch *w, *watch = krule->watch;
int watch_found = 0;
list_for_each_entry(w, &parent->watches, wlist) {
if (strcmp(watch->path, w->path))
continue;
watch_found = 1;
/* put krule's and initial refs to temporary watch */
audit_put_watch(watch);
audit_put_watch(watch);
audit_get_watch(w);
krule->watch = watch = w;
break;
}
if (!watch_found) {
get_inotify_watch(&parent->wdata);
watch->parent = parent;
list_add(&watch->wlist, &parent->watches);
}
list_add(&krule->rlist, &watch->rules);
}
/* Find a matching watch entry, or add this one.
* Caller must hold audit_filter_mutex. */
int audit_add_watch(struct audit_krule *krule)
{
struct audit_watch *watch = krule->watch;
struct inotify_watch *i_watch;
struct audit_parent *parent;
struct nameidata *ndp = NULL, *ndw = NULL;
int ret = 0;
mutex_unlock(&audit_filter_mutex);
/* Avoid calling path_lookup under audit_filter_mutex. */
ret = audit_get_nd(watch->path, &ndp, &ndw);
if (ret) {
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
goto error;
}
/* update watch filter fields */
if (ndw) {
watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
watch->ino = ndw->path.dentry->d_inode->i_ino;
}
/* The audit_filter_mutex must not be held during inotify calls because
* we hold it during inotify event callback processing. If an existing
* inotify watch is found, inotify_find_watch() grabs a reference before
* returning.
*/
if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
&i_watch) < 0) {
parent = audit_init_parent(ndp);
if (IS_ERR(parent)) {
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
ret = PTR_ERR(parent);
goto error;
}
} else
parent = container_of(i_watch, struct audit_parent, wdata);
mutex_lock(&audit_filter_mutex);
/* parent was moved before we took audit_filter_mutex */
if (parent->flags & AUDIT_PARENT_INVALID)
ret = -ENOENT;
else
audit_add_to_parent(krule, parent);
/* match get in audit_init_parent or inotify_find_watch */
put_inotify_watch(&parent->wdata);
error:
audit_put_nd(ndp, ndw); /* NULL args OK */
return ret;
}
void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent = watch->parent;
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
audit_remove_watch(watch);
if (list_empty(&parent->watches)) {
/* Put parent on the inotify un-registration
* list. Grab a reference before releasing
* audit_filter_mutex, to be released in
* audit_inotify_unregister().
* If filesystem is going away, just leave
* the sucker alone, eviction will take
* care of it. */
if (pin_inotify_watch(&parent->wdata))
list_add(&parent->ilist, list);
}
}
}
/* Update watch data in audit rules based on inotify events. */
static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
u32 cookie, const char *dname, struct inode *inode)
{
struct audit_parent *parent;
parent = container_of(i_watch, struct audit_parent, wdata);
if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev,
inode->i_ino, 0);
else if (mask & (IN_DELETE|IN_MOVED_FROM))
audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
/* inotify automatically removes the watch and sends IN_IGNORED */
else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
audit_remove_parent_watches(parent);
/* inotify does not remove the watch, so remove it manually */
else if (mask & IN_MOVE_SELF) {
audit_remove_parent_watches(parent);
inotify_remove_watch_locked(audit_ih, i_watch);
} else if (mask & IN_IGNORED)
put_inotify_watch(i_watch);
}
static const struct inotify_operations audit_inotify_ops = {
.handle_event = audit_handle_ievent,
.destroy_watch = audit_free_parent,
};
static int __init audit_watch_init(void)
{
audit_ih = inotify_init(&audit_inotify_ops);
if (IS_ERR(audit_ih))
audit_panic("cannot initialize inotify handle");
return 0;
}
subsys_initcall(audit_watch_init);
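/*
 * Illustrative userspace sketch (not from the kernel sources): the
 * parent watch above subscribes to the same event mask a userspace
 * program would hand to inotify_add_watch(2).  Minimal monitor for a
 * directory given on the command line; error handling is trimmed.
 */
#if 0   /* example only, never built as part of this file */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(int argc, char **argv)
{
        char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
        ssize_t len;
        int fd = inotify_init1(IN_CLOEXEC);

        if (fd < 0 || argc < 2)
                return 1;
        /* the same events AUDIT_IN_WATCH requests on the parent directory */
        if (inotify_add_watch(fd, argv[1],
                              IN_MOVE | IN_CREATE | IN_DELETE |
                              IN_DELETE_SELF | IN_MOVE_SELF) < 0)
                return 1;
        while ((len = read(fd, buf, sizeof(buf))) > 0) {
                char *p = buf;

                while (p < buf + len) {
                        struct inotify_event *ev = (struct inotify_event *)p;

                        printf("mask=0x%x name=%s\n", ev->mask,
                               ev->len ? ev->name : "(watched dir itself)");
                        p += sizeof(*ev) + ev->len;
                }
        }
        return 0;
}
#endif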

1381
kernel/kernel/auditfilter.c Normal file

File diff suppressed because it is too large Load Diff

2538
kernel/kernel/auditsc.c Normal file

File diff suppressed because it is too large Load Diff

91
kernel/kernel/backtracetest.c Normal file
View File

@@ -0,0 +1,91 @@
/*
* Simple stack backtrace regression test module
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>
static void backtrace_test_normal(void)
{
printk("Testing a backtrace from process context.\n");
printk("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
static DECLARE_COMPLETION(backtrace_work);
static void backtrace_test_irq_callback(unsigned long data)
{
dump_stack();
complete(&backtrace_work);
}
static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
static void backtrace_test_irq(void)
{
printk("Testing a backtrace from irq context.\n");
printk("The following trace is a kernel self test and not a bug!\n");
init_completion(&backtrace_work);
tasklet_schedule(&backtrace_tasklet);
wait_for_completion(&backtrace_work);
}
#ifdef CONFIG_STACKTRACE
static void backtrace_test_saved(void)
{
struct stack_trace trace;
unsigned long entries[8];
printk("Testing a saved backtrace.\n");
printk("The following trace is a kernel self test and not a bug!\n");
trace.nr_entries = 0;
trace.max_entries = ARRAY_SIZE(entries);
trace.entries = entries;
trace.skip = 0;
save_stack_trace(&trace);
print_stack_trace(&trace, 0);
}
#else
static void backtrace_test_saved(void)
{
printk("Saved backtrace test skipped.\n");
}
#endif
static int backtrace_regression_test(void)
{
printk("====[ backtrace testing ]===========\n");
backtrace_test_normal();
backtrace_test_irq();
backtrace_test_saved();
printk("====[ end of backtrace testing ]====\n");
return 0;
}
static void exitf(void)
{
}
module_init(backtrace_regression_test);
module_exit(exitf);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");

19
kernel/kernel/bounds.c Normal file
View File

@@ -0,0 +1,19 @@
/*
* Generate definitions needed by the preprocessor.
* This code generates raw asm output which is post-processed
* to extract and format the required data.
*/
#define __GENERATING_BOUNDS_H
/* Include headers that define the enum constants of interest */
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
void foo(void)
{
/* The enum constants to put into include/linux/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
/* End of constants */
}

314
kernel/kernel/capability.c Normal file
View File

@@ -0,0 +1,314 @@
/*
* linux/kernel/capability.c
*
* Copyright (C) 1997 Andrew Main <zefram@fysh.org>
*
* Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <asm/uaccess.h>
/*
* Leveraged for setting/resetting capabilities
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
const kernel_cap_t __cap_full_set = CAP_FULL_SET;
const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
EXPORT_SYMBOL(__cap_empty_set);
EXPORT_SYMBOL(__cap_full_set);
EXPORT_SYMBOL(__cap_init_eff_set);
#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
{
file_caps_enabled = 0;
return 1;
}
__setup("no_file_caps", file_caps_disable);
#endif
/*
* More recent versions of libcap are available from:
*
* http://www.kernel.org/pub/linux/libs/security/linux-privs/
*/
static void warn_legacy_capability_use(void)
{
static int warned;
if (!warned) {
char name[sizeof(current->comm)];
printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
" (legacy support in use)\n",
get_task_comm(name, current));
warned = 1;
}
}
/*
* Version 2 capabilities worked fine, but the linux/capability.h file
* that accompanied their introduction encouraged their use without
* the necessary user-space source code changes. As such, we have
* created a version 3 with equivalent functionality to version 2, but
* with a header change to protect legacy source code from using
* version 2 when it wanted to use version 1. If your system has code
* that trips the following warning, it is using version 2 specific
* capabilities and may be doing so insecurely.
*
* The remedy is to either upgrade your version of libcap (to 2.10+,
* if the application is linked against it), or recompile your
* application with modern kernel headers and this warning will go
* away.
*/
static void warn_deprecated_v2(void)
{
static int warned;
if (!warned) {
char name[sizeof(current->comm)];
printk(KERN_INFO "warning: `%s' uses deprecated v2"
" capabilities in a way that may be insecure.\n",
get_task_comm(name, current));
warned = 1;
}
}
/*
* Version check. Return the number of u32s in each capability flag
* array, or a negative value on error.
*/
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
__u32 version;
if (get_user(version, &header->version))
return -EFAULT;
switch (version) {
case _LINUX_CAPABILITY_VERSION_1:
warn_legacy_capability_use();
*tocopy = _LINUX_CAPABILITY_U32S_1;
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
/*
* fall through - v3 is otherwise equivalent to v2.
*/
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
default:
if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
return -EFAULT;
return -EINVAL;
}
return 0;
}
/*
* The only thing that can change the capabilities of the current
* process is the current process. As such, we can't be in this code
* at the same time as we are in the process of setting capabilities
* in this process. The net result is that we can limit our use of
* locks to when we are reading the caps of another process.
*/
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
kernel_cap_t *pIp, kernel_cap_t *pPp)
{
int ret;
if (pid && (pid != task_pid_vnr(current))) {
struct task_struct *target;
read_lock(&tasklist_lock);
target = find_task_by_vpid(pid);
if (!target)
ret = -ESRCH;
else
ret = security_capget(target, pEp, pIp, pPp);
read_unlock(&tasklist_lock);
} else
ret = security_capget(current, pEp, pIp, pPp);
return ret;
}
/**
* sys_capget - get the capabilities of a given process.
* @header: pointer to struct that contains capability version and
* target pid data
* @dataptr: pointer to struct that contains the effective, permitted,
* and inheritable capabilities that are returned
*
* Returns 0 on success and < 0 on error.
*/
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
int ret = 0;
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
ret = cap_validate_magic(header, &tocopy);
if (ret != 0)
return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
if (pid < 0)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
if (!ret) {
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i;
for (i = 0; i < tocopy; i++) {
kdata[i].effective = pE.cap[i];
kdata[i].permitted = pP.cap[i];
kdata[i].inheritable = pI.cap[i];
}
/*
* Note: in the case tocopy < _KERNEL_CAPABILITY_U32S,
* we silently drop the upper capabilities here. This
* has the effect of making older libcap
* implementations implicitly drop upper capability
* bits when they perform a: capget/modify/capset
* sequence.
*
* This behavior is considered fail-safe
* behavior. Upgrading the application to a newer
* version of libcap will enable access to the newer
* capabilities.
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
* unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
if (copy_to_user(dataptr, kdata, tocopy
* sizeof(struct __user_cap_data_struct))) {
return -EFAULT;
}
}
return ret;
}
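/*
 * Illustrative userspace sketch (not from the kernel sources): the
 * default: branch of cap_validate_magic() writes the kernel's
 * preferred version back into the header, so userspace can probe it
 * by first calling capget(2) with an invalid version.  Raw-syscall
 * probe without libcap; error handling is trimmed.
 */
#if 0   /* example only, never built as part of this file */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
        struct __user_cap_header_struct hdr = { .version = 0, .pid = 0 };
        struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

        /* invalid version: the call fails with EINVAL but fills in hdr.version */
        syscall(SYS_capget, &hdr, NULL);
        printf("kernel prefers capability version 0x%x\n", hdr.version);

        /* fetch this process' capability sets using the advertised version */
        hdr.pid = 0;    /* 0 means "the current process" */
        if (syscall(SYS_capget, &hdr, data) == 0)
                printf("effective[0]=0x%x permitted[0]=0x%x\n",
                       data[0].effective, data[0].permitted);
        return 0;
}
#endif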
/**
* sys_capset - set capabilities for a process or (*) a group of processes
* @header: pointer to struct that contains capability version and
* target pid data
* @data: pointer to struct that contains the effective, permitted,
* and inheritable capabilities
*
* Set capabilities for the current process only. The ability to set
* capabilities of any other process(es) has been deprecated and removed.
*
* The restrictions on setting capabilities are specified as:
*
* I: any raised capabilities must be a subset of the old permitted
* P: any raised capabilities must be a subset of the old permitted
* E: must be set to a subset of new permitted
*
* Returns 0 on success and < 0 on error.
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i, tocopy;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
pid_t pid;
ret = cap_validate_magic(header, &tocopy);
if (ret != 0)
return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
/* may only affect current now */
if (pid != 0 && pid != task_pid_vnr(current))
return -EPERM;
if (copy_from_user(&kdata, data,
tocopy * sizeof(struct __user_cap_data_struct)))
return -EFAULT;
for (i = 0; i < tocopy; i++) {
effective.cap[i] = kdata[i].effective;
permitted.cap[i] = kdata[i].permitted;
inheritable.cap[i] = kdata[i].inheritable;
}
while (i < _KERNEL_CAPABILITY_U32S) {
effective.cap[i] = 0;
permitted.cap[i] = 0;
inheritable.cap[i] = 0;
i++;
}
new = prepare_creds();
if (!new)
return -ENOMEM;
ret = security_capset(new, current_cred(),
&effective, &inheritable, &permitted);
if (ret < 0)
goto error;
audit_log_capset(pid, new, current_cred());
return commit_creds(new);
error:
abort_creds(new);
return ret;
}
/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
int capable(int cap)
{
if (unlikely(!cap_valid(cap))) {
printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
BUG();
}
if (security_capable(cap) == 0) {
current->flags |= PF_SUPERPRIV;
return 1;
}
return 0;
}
EXPORT_SYMBOL(capable);

4196
kernel/kernel/cgroup.c Normal file

File diff suppressed because it is too large Load Diff

395
kernel/kernel/cgroup_freezer.c Normal file
View File

@@ -0,0 +1,395 @@
/*
* cgroup_freezer.c - control group freezer subsystem
*
* Copyright IBM Corporation, 2007
*
* Author : Cedric Le Goater <clg@fr.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
enum freezer_state {
CGROUP_THAWED = 0,
CGROUP_FREEZING,
CGROUP_FROZEN,
};
struct freezer {
struct cgroup_subsys_state css;
enum freezer_state state;
spinlock_t lock; /* protects _writes_ to state */
};
static inline struct freezer *cgroup_freezer(
struct cgroup *cgroup)
{
return container_of(
cgroup_subsys_state(cgroup, freezer_subsys_id),
struct freezer, css);
}
static inline struct freezer *task_freezer(struct task_struct *task)
{
return container_of(task_subsys_state(task, freezer_subsys_id),
struct freezer, css);
}
int cgroup_freezing_or_frozen(struct task_struct *task)
{
struct freezer *freezer;
enum freezer_state state;
task_lock(task);
freezer = task_freezer(task);
if (!freezer->css.cgroup->parent)
state = CGROUP_THAWED; /* root cgroup can't be frozen */
else
state = freezer->state;
task_unlock(task);
return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
}
/*
* cgroups_write_string() limits the size of freezer state strings to
* CGROUP_LOCAL_BUFFER_SIZE
*/
static const char *freezer_state_strs[] = {
"THAWED",
"FREEZING",
"FROZEN",
};
/*
* State diagram
* Transitions are caused by userspace writes to the freezer.state file.
* The values in parentheses are state labels. The rest are edge labels.
*
* (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
*    ^ ^                    |                     |
*    | \_______THAWED_______/                     |
*    \__________________________THAWED____________/
*/
struct cgroup_subsys freezer_subsys;
/* Locks taken and their ordering
* ------------------------------
* css_set_lock
* cgroup_mutex (AKA cgroup_lock)
* task->alloc_lock (AKA task_lock)
* freezer->lock
* task->sighand->siglock
*
* cgroup code forces css_set_lock to be taken before task->alloc_lock
*
* freezer_create(), freezer_destroy():
* cgroup_mutex [ by cgroup core ]
*
* can_attach():
* cgroup_mutex
*
* cgroup_frozen():
* task->alloc_lock (to get task's cgroup)
*
* freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
* task->alloc_lock (to get task's cgroup)
* freezer->lock
* sighand->siglock (if the cgroup is freezing)
*
* freezer_read():
* cgroup_mutex
* freezer->lock
* read_lock css_set_lock (cgroup iterator start)
*
* freezer_write() (freeze):
* cgroup_mutex
* freezer->lock
* read_lock css_set_lock (cgroup iterator start)
* sighand->siglock
*
* freezer_write() (unfreeze):
* cgroup_mutex
* freezer->lock
* read_lock css_set_lock (cgroup iterator start)
* task->alloc_lock (to prevent races with freeze_task())
* sighand->siglock
*/
static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
struct cgroup *cgroup)
{
struct freezer *freezer;
freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
if (!freezer)
return ERR_PTR(-ENOMEM);
spin_lock_init(&freezer->lock);
freezer->state = CGROUP_THAWED;
return &freezer->css;
}
static void freezer_destroy(struct cgroup_subsys *ss,
struct cgroup *cgroup)
{
kfree(cgroup_freezer(cgroup));
}
/* Task is frozen or will freeze immediately when next it gets woken */
static bool is_task_frozen_enough(struct task_struct *task)
{
return frozen(task) ||
(task_is_stopped_or_traced(task) && freezing(task));
}
/*
* The call to cgroup_lock() in the freezer.state write method prevents
* a write to that file racing against an attach, and hence the
* can_attach() result will remain valid until the attach completes.
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
struct task_struct *task, bool threadgroup)
{
struct freezer *freezer;
/*
* Anything frozen can't move or be moved to/from.
*
* Since orig_freezer->state == FROZEN means that @task has already
* been frozen, it is sufficient to check the latter condition.
*/
if (is_task_frozen_enough(task))
return -EBUSY;
freezer = cgroup_freezer(new_cgroup);
if (freezer->state == CGROUP_FROZEN)
return -EBUSY;
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
if (is_task_frozen_enough(c)) {
rcu_read_unlock();
return -EBUSY;
}
}
rcu_read_unlock();
}
return 0;
}
static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
struct freezer *freezer;
/*
* No lock is needed, since the task isn't on tasklist yet,
* so it can't be moved to another cgroup, which means the
* freezer won't be removed and will be valid during this
* function call.
*/
freezer = task_freezer(task);
/*
* The root cgroup is non-freezable, so we can skip the
* following check.
*/
if (!freezer->css.cgroup->parent)
return;
spin_lock_irq(&freezer->lock);
BUG_ON(freezer->state == CGROUP_FROZEN);
/* Locking avoids race with FREEZING -> THAWED transitions. */
if (freezer->state == CGROUP_FREEZING)
freeze_task(task, true);
spin_unlock_irq(&freezer->lock);
}
/*
* caller must hold freezer->lock
*/
static void update_freezer_state(struct cgroup *cgroup,
struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
unsigned int nfrozen = 0, ntotal = 0;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
ntotal++;
if (is_task_frozen_enough(task))
nfrozen++;
}
/*
* Transition to FROZEN when no new tasks can be added ensures
* that we never exist in the FROZEN state while there are unfrozen
* tasks.
*/
if (nfrozen == ntotal)
freezer->state = CGROUP_FROZEN;
else if (nfrozen > 0)
freezer->state = CGROUP_FREEZING;
else
freezer->state = CGROUP_THAWED;
cgroup_iter_end(cgroup, &it);
}
static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
struct seq_file *m)
{
struct freezer *freezer;
enum freezer_state state;
if (!cgroup_lock_live_group(cgroup))
return -ENODEV;
freezer = cgroup_freezer(cgroup);
spin_lock_irq(&freezer->lock);
state = freezer->state;
if (state == CGROUP_FREEZING) {
/* We change from FREEZING to FROZEN lazily if the cgroup was
* only partially frozen when we exited write. */
update_freezer_state(cgroup, freezer);
state = freezer->state;
}
spin_unlock_irq(&freezer->lock);
cgroup_unlock();
seq_puts(m, freezer_state_strs[state]);
seq_putc(m, '\n');
return 0;
}
static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
unsigned int num_cant_freeze_now = 0;
freezer->state = CGROUP_FREEZING;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
if (!freeze_task(task, true))
continue;
if (is_task_frozen_enough(task))
continue;
if (!freezing(task) && !freezer_should_skip(task))
num_cant_freeze_now++;
}
cgroup_iter_end(cgroup, &it);
return num_cant_freeze_now ? -EBUSY : 0;
}
static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
thaw_process(task);
}
cgroup_iter_end(cgroup, &it);
freezer->state = CGROUP_THAWED;
}
static int freezer_change_state(struct cgroup *cgroup,
enum freezer_state goal_state)
{
struct freezer *freezer;
int retval = 0;
freezer = cgroup_freezer(cgroup);
spin_lock_irq(&freezer->lock);
update_freezer_state(cgroup, freezer);
if (goal_state == freezer->state)
goto out;
switch (goal_state) {
case CGROUP_THAWED:
unfreeze_cgroup(cgroup, freezer);
break;
case CGROUP_FROZEN:
retval = try_to_freeze_cgroup(cgroup, freezer);
break;
default:
BUG();
}
out:
spin_unlock_irq(&freezer->lock);
return retval;
}
static int freezer_write(struct cgroup *cgroup,
struct cftype *cft,
const char *buffer)
{
int retval;
enum freezer_state goal_state;
if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
goal_state = CGROUP_THAWED;
else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
goal_state = CGROUP_FROZEN;
else
return -EINVAL;
if (!cgroup_lock_live_group(cgroup))
return -ENODEV;
retval = freezer_change_state(cgroup, goal_state);
cgroup_unlock();
return retval;
}
static struct cftype files[] = {
{
.name = "state",
.read_seq_string = freezer_read,
.write_string = freezer_write,
},
};
static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
{
if (!cgroup->parent)
return 0;
return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
}
struct cgroup_subsys freezer_subsys = {
.name = "freezer",
.create = freezer_create,
.destroy = freezer_destroy,
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
.attach = NULL,
.fork = freezer_fork,
.exit = NULL,
};
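/*
 * Illustrative userspace sketch (not from the kernel sources): with
 * the freezer subsystem mounted, the state machine above is driven by
 * writing "FROZEN" or "THAWED" into a group's freezer.state file.
 * The mount point used below is an assumption about a common layout,
 * not something this file defines, and the "demo" group must already
 * exist.
 */
#if 0   /* example only, never built as part of this file */
#include <stdio.h>

static int set_freezer_state(const char *group, const char *state)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/fs/cgroup/freezer/%s/freezer.state", group);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fputs(state, f);
        return fclose(f);
}

int main(void)
{
        if (set_freezer_state("demo", "FROZEN"))        /* freeze every task in the group */
                perror("freeze");
        if (set_freezer_state("demo", "THAWED"))        /* and thaw them again */
                perror("thaw");
        return 0;
}
#endif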

1160
kernel/kernel/compat.c Normal file

File diff suppressed because it is too large Load Diff

98
kernel/kernel/configs.c Normal file
View File

@@ -0,0 +1,98 @@
/*
* kernel/configs.c
* Echo the kernel .config file used to build the kernel
*
* Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
* Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
* Copyright (C) 2002 Hewlett-Packard Company
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <asm/uaccess.h>
/**************************************************/
/* the actual current config file */
/*
* Define kernel_config_data and kernel_config_data_size, which contains the
* wrapped and compressed configuration file. The file is first compressed
* with gzip and then bounded by two eight byte magic numbers to allow
* extraction from a binary kernel image:
*
* IKCFG_ST
* <image>
* IKCFG_ED
*/
#define MAGIC_START "IKCFG_ST"
#define MAGIC_END "IKCFG_ED"
#include "config_data.h"
#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
#define kernel_config_data_size \
(sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
#ifdef CONFIG_IKCONFIG_PROC
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
return simple_read_from_buffer(buf, len, offset,
kernel_config_data + MAGIC_SIZE,
kernel_config_data_size);
}
static const struct file_operations ikconfig_file_ops = {
.owner = THIS_MODULE,
.read = ikconfig_read_current,
};
static int __init ikconfig_init(void)
{
struct proc_dir_entry *entry;
/* create the current config file */
entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
&ikconfig_file_ops);
if (!entry)
return -ENOMEM;
entry->size = kernel_config_data_size;
return 0;
}
static void __exit ikconfig_cleanup(void)
{
remove_proc_entry("config.gz", NULL);
}
module_init(ikconfig_init);
module_exit(ikconfig_cleanup);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Randy Dunlap");
MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
#endif /* CONFIG_IKCONFIG_PROC */
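/*
 * Illustrative userspace sketch (not from the kernel sources): when
 * CONFIG_IKCONFIG_PROC is enabled, the gzip'd .config exported above
 * can be read back with zlib (link with -lz); error handling trimmed.
 */
#if 0   /* example only, never built as part of this file */
#include <stdio.h>
#include <zlib.h>

int main(void)
{
        char buf[4096];
        int n;
        gzFile f = gzopen("/proc/config.gz", "rb");

        if (!f)
                return 1;
        while ((n = gzread(f, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);      /* prints the running kernel's .config */
        gzclose(f);
        return 0;
}
#endif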

562
kernel/kernel/cpu.c Normal file
View File

@@ -0,0 +1,562 @@
/* CPU control.
* (C) 2001, 2002, 2003, 2004 Rusty Russell
*
* This code is licenced under the GPL.
*/
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
static struct {
struct task_struct *active_writer;
struct mutex lock; /* Synchronizes accesses to refcount, */
/*
* Also blocks the new readers during
* an ongoing cpu hotplug operation.
*/
int refcount;
} cpu_hotplug = {
.active_writer = NULL,
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
.refcount = 0,
};
#ifdef CONFIG_HOTPLUG_CPU
void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
void put_online_cpus(void)
{
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
}
EXPORT_SYMBOL_GPL(put_online_cpus);
#endif /* CONFIG_HOTPLUG_CPU */
/*
* The following two APIs must be used when attempting
* to serialize the updates to cpu_online_mask, cpu_present_mask.
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
/*
* This ensures that the hotplug operation can begin only when the
* refcount goes to zero.
*
* Note that during a cpu-hotplug operation, the new readers, if any,
* will be blocked by the cpu_hotplug.lock
*
* Since cpu_hotplug_begin() is always called after invoking
* cpu_maps_update_begin(), we can be sure that only one writer is active.
*
* Note that theoretically, there is a possibility of a livelock:
* - Refcount goes to zero, last reader wakes up the sleeping
* writer.
* - Last reader unlocks the cpu_hotplug.lock.
* - A new reader arrives at this moment, bumps up the refcount.
* - The writer acquires the cpu_hotplug.lock finds the refcount
* non zero and goes to sleep again.
*
* However, this is very difficult to achieve in practice since
* get_online_cpus() is not an API that is called all that often.
*
*/
static void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
for (;;) {
mutex_lock(&cpu_hotplug.lock);
if (likely(!cpu_hotplug.refcount))
break;
__set_current_state(TASK_UNINTERRUPTIBLE);
mutex_unlock(&cpu_hotplug.lock);
schedule();
}
}
static void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
}
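/*
 * Illustrative userspace sketch (not from the kernel sources): the
 * get_online_cpus()/cpu_hotplug_begin() pair above is a
 * reference-counted gate -- many readers bump a count and the single
 * writer waits for it to drain to zero.  A pthread analogue that uses
 * a condition variable where the kernel simply sleeps and retries
 * (which is why the comment above can describe a theoretical
 * livelock); the names are invented for the example.
 */
#if 0   /* example only, never built as part of this file */
#include <pthread.h>

static pthread_mutex_t gate_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gate_idle = PTHREAD_COND_INITIALIZER;
static int gate_readers;

static void reader_enter(void)          /* ~ get_online_cpus() */
{
        pthread_mutex_lock(&gate_lock);
        gate_readers++;
        pthread_mutex_unlock(&gate_lock);
}

static void reader_exit(void)           /* ~ put_online_cpus() */
{
        pthread_mutex_lock(&gate_lock);
        if (--gate_readers == 0)
                pthread_cond_signal(&gate_idle);        /* wake a waiting writer */
        pthread_mutex_unlock(&gate_lock);
}

static void writer_begin(void)          /* ~ cpu_hotplug_begin() */
{
        pthread_mutex_lock(&gate_lock);
        while (gate_readers)
                pthread_cond_wait(&gate_idle, &gate_lock);
        /* returns with gate_lock held and no readers active */
}

static void writer_end(void)            /* ~ cpu_hotplug_done() */
{
        pthread_mutex_unlock(&gate_lock);
}

int main(void)
{
        reader_enter();
        reader_exit();
        writer_begin();         /* exclusive section, e.g. _cpu_down() */
        writer_end();
        return 0;
}
#endif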
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
int ret;
cpu_maps_update_begin();
ret = raw_notifier_chain_register(&cpu_chain, nb);
cpu_maps_update_done();
return ret;
}
#ifdef CONFIG_HOTPLUG_CPU
EXPORT_SYMBOL(register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
cpu_maps_update_begin();
raw_notifier_chain_unregister(&cpu_chain, nb);
cpu_maps_update_done();
}
EXPORT_SYMBOL(unregister_cpu_notifier);
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
(!cputime_eq(p->utime, cputime_zero) ||
!cputime_eq(p->stime, cputime_zero)))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
(state = %ld, flags = %x) \n",
p->comm, task_pid_nr(p), cpu,
p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
}
struct take_cpu_down_param {
struct task_struct *caller;
unsigned long mod;
void *hcpu;
};
/* Take this CPU down. */
static int __ref take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
unsigned int cpu = (unsigned long)param->hcpu;
int err;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
if (task_cpu(param->caller) == cpu)
move_task_off_dead_cpu(cpu, param->caller);
/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */
sched_idle_next();
return 0;
}
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
.caller = current,
.mod = mod,
.hcpu = hcpu,
};
if (num_online_cpus() == 1)
return -EBUSY;
if (!cpu_online(cpu))
return -EINVAL;
cpu_hotplug_begin();
set_cpu_active(cpu, false);
err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
hcpu, -1, &nr_calls);
if (err == NOTIFY_BAD) {
set_cpu_active(cpu, true);
nr_calls--;
__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
hcpu, nr_calls, NULL);
printk("%s: attempt to take down CPU %u failed\n",
__func__, cpu);
err = -EINVAL;
goto out_release;
}
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
set_cpu_active(cpu, true);
/* CPU didn't die: tell everyone. Can't complain. */
if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
hcpu) == NOTIFY_BAD)
BUG();
goto out_release;
}
BUG_ON(cpu_online(cpu));
/* Wait for it to sleep (leaving idle task). */
while (!idle_cpu(cpu))
yield();
/* This actually kills the CPU. */
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
hcpu) == NOTIFY_BAD)
BUG();
check_for_tasks(cpu);
out_release:
cpu_hotplug_done();
if (!err) {
if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
hcpu) == NOTIFY_BAD)
BUG();
}
return err;
}
int __ref cpu_down(unsigned int cpu)
{
int err;
err = stop_machine_create();
if (err)
return err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
err = -EBUSY;
goto out;
}
err = _cpu_down(cpu, 0);
out:
cpu_maps_update_done();
stop_machine_destroy();
return err;
}
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
{
int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
cpu_hotplug_begin();
ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
-1, &nr_calls);
if (ret == NOTIFY_BAD) {
nr_calls--;
printk("%s: attempt to bring up CPU %u failed\n",
__func__, cpu);
ret = -EINVAL;
goto out_notify;
}
/* Arch-specific enabling code. */
ret = __cpu_up(cpu);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
set_cpu_active(cpu, true);
/* Now call notifier in preparation. */
raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
__raw_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
cpu_hotplug_done();
return ret;
}
int __cpuinit cpu_up(unsigned int cpu)
{
int err = 0;
if (!cpu_possible(cpu)) {
printk(KERN_ERR "can't online cpu %d because it is not "
"configured as may-hotadd at boot time\n", cpu);
#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
printk(KERN_ERR "please check additional_cpus= boot "
"parameter\n");
#endif
return -EINVAL;
}
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
err = -EBUSY;
goto out;
}
err = _cpu_up(cpu, 0);
out:
cpu_maps_update_done();
return err;
}
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error;
error = stop_machine_create();
if (error)
return error;
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
/*
* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
printk("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
error = _cpu_down(cpu, 1);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
printk(KERN_ERR "Error taking CPU%d down: %d\n",
cpu, error);
break;
}
}
if (!error) {
BUG_ON(num_online_cpus() > 1);
/* Make sure the CPUs won't be enabled by someone else */
cpu_hotplug_disabled = 1;
} else {
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
stop_machine_destroy();
return error;
}
void __weak arch_enable_nonboot_cpus_begin(void)
{
}
void __weak arch_enable_nonboot_cpus_end(void)
{
}
void __ref enable_nonboot_cpus(void)
{
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
cpu_hotplug_disabled = 0;
if (cpumask_empty(frozen_cpus))
goto out;
printk("Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
error = _cpu_up(cpu, 1);
if (!error) {
printk("CPU%d is up\n", cpu);
continue;
}
printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
}
arch_enable_nonboot_cpus_end();
cpumask_clear(frozen_cpus);
out:
cpu_maps_update_done();
}
static int alloc_frozen_cpus(void)
{
if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
return -ENOMEM;
return 0;
}
core_initcall(alloc_frozen_cpus);
#endif /* CONFIG_PM_SLEEP_SMP */
/**
* notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
* @cpu: cpu that just started
*
* This function calls the cpu_chain notifiers with CPU_STARTING.
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
void __cpuinit notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
#ifdef CONFIG_PM_SLEEP_SMP
if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
}
#endif /* CONFIG_SMP */
/*
* cpu_bit_bitmap[] is a special, "compressed" data structure that
* represents, for each bit number nr, the NR_CPUS-bit value 1 << nr.
*
* It is used by cpumask_of() to get a constant address to a CPU
* mask value that has a single bit set only.
*/
/* cpu_bit_bitmap[0] is empty - so we can back into it */
#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
MASK_DECLARE_8(0), MASK_DECLARE_8(8),
MASK_DECLARE_8(16), MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
MASK_DECLARE_8(32), MASK_DECLARE_8(40),
MASK_DECLARE_8(48), MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
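/*
 * Illustrative userspace sketch (not from the kernel sources): the
 * table above stores only one non-zero word per row, yet cpumask_of()
 * can hand out a pointer to a full NR_CPUS-bit mask with exactly bit
 * `cpu' set by backing the pointer up cpu/BITS_PER_LONG words into the
 * all-zero tail of the preceding rows.  Self-contained check of that
 * pointer trick (it relies on the 2-D array being contiguous, just as
 * the kernel version does); the names are invented for the example.
 */
#if 0   /* example only, never built as part of this file */
#include <assert.h>
#include <stdio.h>

#define NR_CPUS_DEMO 128
#define BPL (sizeof(unsigned long) * 8)                 /* bits per long */
#define LONGS ((NR_CPUS_DEMO + BPL - 1) / BPL)

/* row 0 stays all-zero; row x+1 has only bit x of word 0 set */
static unsigned long bitmap[BPL + 1][LONGS];

int main(void)
{
        unsigned int x, cpu;

        for (x = 0; x < BPL; x++)
                bitmap[x + 1][0] = 1UL << x;

        for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++) {
                /* the pointer arithmetic cpumask_of() relies on */
                const unsigned long *mask = bitmap[1 + cpu % BPL] - cpu / BPL;
                unsigned int word, bit;

                /* exactly one bit -- bit `cpu' -- is set in the mask */
                for (word = 0; word < LONGS; word++)
                        for (bit = 0; bit < BPL; bit++)
                                assert(!!(mask[word] & (1UL << bit)) ==
                                       (word * BPL + bit == cpu));
        }
        printf("all %d single-cpu masks verified\n", NR_CPUS_DEMO);
        return 0;
}
#endif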
const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
= CPU_BITS_ALL;
#else
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
#endif
const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
EXPORT_SYMBOL(cpu_possible_mask);
static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
EXPORT_SYMBOL(cpu_online_mask);
static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
EXPORT_SYMBOL(cpu_present_mask);
static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
EXPORT_SYMBOL(cpu_active_mask);
void set_cpu_possible(unsigned int cpu, bool possible)
{
if (possible)
cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
else
cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
}
void set_cpu_present(unsigned int cpu, bool present)
{
if (present)
cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
else
cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
}
void set_cpu_online(unsigned int cpu, bool online)
{
if (online)
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
else
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
}
void set_cpu_active(unsigned int cpu, bool active)
{
if (active)
cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
else
cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
}
void init_cpu_present(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_present_bits), src);
}
void init_cpu_possible(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_possible_bits), src);
}
void init_cpu_online(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_online_bits), src);
}

2581
kernel/kernel/cpuset.c Normal file

File diff suppressed because it is too large Load Diff

916
kernel/kernel/cred.c Normal file
View File

@@ -0,0 +1,916 @@
/* Task credentials management - see Documentation/credentials.txt
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public Licence
* as published by the Free Software Foundation; either version
* 2 of the Licence, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/cred.h>
#include <linux/sched.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/cn_proc.h>
#if 0
#define kdebug(FMT, ...) \
printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#else
static inline __attribute__((format(printf, 1, 2)))
void no_printk(const char *fmt, ...)
{
}
#define kdebug(FMT, ...) \
no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#endif
static struct kmem_cache *cred_jar;
/*
* The common credentials for the initial task's thread group
*/
#ifdef CONFIG_KEYS
static struct thread_group_cred init_tgcred = {
.usage = ATOMIC_INIT(2),
.tgid = 0,
.lock = SPIN_LOCK_UNLOCKED,
};
#endif
/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
#ifdef CONFIG_DEBUG_CREDENTIALS
.subscribers = ATOMIC_INIT(2),
.magic = CRED_MAGIC,
#endif
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_INIT_INH_SET,
.cap_permitted = CAP_FULL_SET,
.cap_effective = CAP_INIT_EFF_SET,
.cap_bset = CAP_INIT_BSET,
.user = INIT_USER,
.group_info = &init_groups,
#ifdef CONFIG_KEYS
.tgcred = &init_tgcred,
#endif
};
static inline void set_cred_subscribers(struct cred *cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
atomic_set(&cred->subscribers, n);
#endif
}
static inline int read_cred_subscribers(const struct cred *cred)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
return atomic_read(&cred->subscribers);
#else
return 0;
#endif
}
static inline void alter_cred_subscribers(const struct cred *_cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
struct cred *cred = (struct cred *) _cred;
atomic_add(n, &cred->subscribers);
#endif
}
/*
* Dispose of the shared task group credentials
*/
#ifdef CONFIG_KEYS
static void release_tgcred_rcu(struct rcu_head *rcu)
{
struct thread_group_cred *tgcred =
container_of(rcu, struct thread_group_cred, rcu);
BUG_ON(atomic_read(&tgcred->usage) != 0);
key_put(tgcred->session_keyring);
key_put(tgcred->process_keyring);
kfree(tgcred);
}
#endif
/*
* Release a set of thread group credentials.
*/
static void release_tgcred(struct cred *cred)
{
#ifdef CONFIG_KEYS
struct thread_group_cred *tgcred = cred->tgcred;
if (atomic_dec_and_test(&tgcred->usage))
call_rcu(&tgcred->rcu, release_tgcred_rcu);
#endif
}
/*
* The RCU callback to actually dispose of a set of credentials
*/
static void put_cred_rcu(struct rcu_head *rcu)
{
struct cred *cred = container_of(rcu, struct cred, rcu);
kdebug("put_cred_rcu(%p)", cred);
#ifdef CONFIG_DEBUG_CREDENTIALS
if (cred->magic != CRED_MAGIC_DEAD ||
atomic_read(&cred->usage) != 0 ||
read_cred_subscribers(cred) != 0)
panic("CRED: put_cred_rcu() sees %p with"
" mag %x, put %p, usage %d, subscr %d\n",
cred, cred->magic, cred->put_addr,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
#else
if (atomic_read(&cred->usage) != 0)
panic("CRED: put_cred_rcu() sees %p with usage %d\n",
cred, atomic_read(&cred->usage));
#endif
security_cred_free(cred);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
release_tgcred(cred);
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
kmem_cache_free(cred_jar, cred);
}
/**
* __put_cred - Destroy a set of credentials
* @cred: The record to release
*
* Destroy a set of credentials on which no references remain.
*/
void __put_cred(struct cred *cred)
{
kdebug("__put_cred(%p{%d,%d})", cred,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
BUG_ON(atomic_read(&cred->usage) != 0);
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(cred) != 0);
cred->magic = CRED_MAGIC_DEAD;
cred->put_addr = __builtin_return_address(0);
#endif
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
/*
* Clean up a task's credentials when it exits
*/
void exit_creds(struct task_struct *tsk)
{
struct cred *cred;
kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
atomic_read(&tsk->cred->usage),
read_cred_subscribers(tsk->cred));
cred = (struct cred *) tsk->real_cred;
tsk->real_cred = NULL;
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
cred = (struct cred *) tsk->cred;
tsk->cred = NULL;
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
cred = (struct cred *) tsk->replacement_session_keyring;
if (cred) {
tsk->replacement_session_keyring = NULL;
validate_creds(cred);
put_cred(cred);
}
}
/**
* get_task_cred - Get another task's objective credentials
* @task: The task to query
*
* Get the objective credentials of a task, pinning them so that they can't go
* away. Accessing a task's credentials directly is not permitted.
*
* The caller must also make sure task doesn't get deleted, either by holding a
* ref on task or by holding tasklist_lock to prevent it from being unlinked.
*/
const struct cred *get_task_cred(struct task_struct *task)
{
const struct cred *cred;
rcu_read_lock();
do {
cred = __task_cred((task));
BUG_ON(!cred);
} while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
rcu_read_unlock();
return cred;
}
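/*
 * Typical caller pattern, shown as a minimal sketch (the helper name and the
 * field read are illustrative): pin the credentials, read what is needed,
 * then drop the reference again with put_cred().
 */
static uid_t example_task_euid(struct task_struct *task)
{
        const struct cred *cred = get_task_cred(task);
        uid_t euid = cred->euid;

        put_cred(cred);
        return euid;
}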
/*
* Allocate blank credentials, such that the credentials can be filled in at a
* later date without risk of ENOMEM.
*/
struct cred *cred_alloc_blank(void)
{
struct cred *new;
new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
#ifdef CONFIG_KEYS
new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
if (!new->tgcred) {
kmem_cache_free(cred_jar, new);
return NULL;
}
atomic_set(&new->tgcred->usage, 1);
#endif
atomic_set(&new->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
goto error;
return new;
error:
abort_creds(new);
return NULL;
}
/**
* prepare_creds - Prepare a new set of credentials for modification
*
* Prepare a new set of task credentials for modification. A task's creds
* shouldn't generally be modified directly, therefore this function is used to
* prepare a new copy, which the caller then modifies and then commits by
* calling commit_creds().
*
* Preparation involves making a copy of the objective creds for modification.
*
* Returns a pointer to the new creds-to-be if successful, NULL otherwise.
*
* Call commit_creds() or abort_creds() to clean up.
*/
struct cred *prepare_creds(void)
{
struct task_struct *task = current;
const struct cred *old;
struct cred *new;
validate_process_creds();
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_creds() alloc %p", new);
old = task->cred;
memcpy(new, old, sizeof(struct cred));
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
#ifdef CONFIG_KEYS
key_get(new->thread_keyring);
key_get(new->request_key_auth);
atomic_inc(&new->tgcred->usage);
#endif
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
validate_creds(new);
return new;
error:
abort_creds(new);
return NULL;
}
EXPORT_SYMBOL(prepare_creds);
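/*
 * The intended usage pattern from the comment above, as a minimal sketch
 * (the helper name and the fsuid change are illustrative): copy the creds,
 * modify the copy, then either commit_creds() or abort_creds() it.
 */
static int example_change_fsuid(uid_t fsuid)
{
        struct cred *new = prepare_creds();

        if (!new)
                return -ENOMEM;
        new->fsuid = fsuid;
        return commit_creds(new);
}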
/*
* Prepare credentials for current to perform an execve()
* - The caller must hold current->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
struct thread_group_cred *tgcred = NULL;
struct cred *new;
#ifdef CONFIG_KEYS
tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
if (!tgcred)
return NULL;
#endif
new = prepare_creds();
if (!new) {
kfree(tgcred);
return new;
}
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
/* create a new set of per-thread-group creds for this set of threads to
* share */
memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
atomic_set(&tgcred->usage, 1);
spin_lock_init(&tgcred->lock);
/* inherit the session keyring; new process keyring */
key_get(tgcred->session_keyring);
tgcred->process_keyring = NULL;
release_tgcred(new);
new->tgcred = tgcred;
#endif
return new;
}
/*
* prepare new credentials for the usermode helper dispatcher
*/
struct cred *prepare_usermodehelper_creds(void)
{
#ifdef CONFIG_KEYS
struct thread_group_cred *tgcred = NULL;
#endif
struct cred *new;
#ifdef CONFIG_KEYS
tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
if (!tgcred)
return NULL;
#endif
new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
if (!new)
return NULL;
kdebug("prepare_usermodehelper_creds() alloc %p", new);
memcpy(new, &init_cred, sizeof(struct cred));
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
#ifdef CONFIG_KEYS
new->thread_keyring = NULL;
new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
atomic_set(&tgcred->usage, 1);
spin_lock_init(&tgcred->lock);
new->tgcred = tgcred;
#endif
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
goto error;
validate_creds(new);
BUG_ON(atomic_read(&new->usage) != 1);
return new;
error:
put_cred(new);
return NULL;
}
/*
* Copy credentials for the new process created by fork()
*
* We share if we can, but under some circumstances we have to generate a new
* set.
*
* The new process gets the current process's subjective credentials as its
* objective and subjective credentials
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
#ifdef CONFIG_KEYS
struct thread_group_cred *tgcred;
#endif
struct cred *new;
int ret;
mutex_init(&p->cred_guard_mutex);
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
#endif
clone_flags & CLONE_THREAD
) {
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
alter_cred_subscribers(p->cred, 2);
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
atomic_inc(&p->cred->user->processes);
return 0;
}
new = prepare_creds();
if (!new)
return -ENOMEM;
if (clone_flags & CLONE_NEWUSER) {
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
}
#ifdef CONFIG_KEYS
/* new threads get their own thread keyrings if their parent already
* had one */
if (new->thread_keyring) {
key_put(new->thread_keyring);
new->thread_keyring = NULL;
if (clone_flags & CLONE_THREAD)
install_thread_keyring_to_cred(new);
}
/* we share the process and session keyrings between all the threads in
* a process - this is slightly icky as we violate COW credentials a
* bit */
if (!(clone_flags & CLONE_THREAD)) {
tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
if (!tgcred) {
ret = -ENOMEM;
goto error_put;
}
atomic_set(&tgcred->usage, 1);
spin_lock_init(&tgcred->lock);
tgcred->process_keyring = NULL;
tgcred->session_keyring = key_get(new->tgcred->session_keyring);
release_tgcred(new);
new->tgcred = tgcred;
}
#endif
atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
alter_cred_subscribers(new, 2);
validate_creds(new);
return 0;
error_put:
put_cred(new);
return ret;
}
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
*
* Install a new set of credentials to the current task, using RCU to replace
* the old set. Both the objective and the subjective credentials pointers are
* updated. This function may not be called if the subjective credentials are
* in an overridden state.
*
* This function eats the caller's reference to the new credentials.
*
* Always returns 0 thus allowing this function to be tail-called at the end
* of, say, sys_setgid().
*/
int commit_creds(struct cred *new)
{
struct task_struct *task = current;
const struct cred *old = task->real_cred;
kdebug("commit_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
BUG_ON(task->cred != old);
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(old) < 2);
validate_creds(old);
validate_creds(new);
#endif
BUG_ON(atomic_read(&new->usage) < 1);
security_commit_creds(new, old);
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
if (old->euid != new->euid ||
old->egid != new->egid ||
old->fsuid != new->fsuid ||
old->fsgid != new->fsgid ||
!cap_issubset(new->cap_permitted, old->cap_permitted)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
smp_wmb();
}
/* alter the thread keyring */
if (new->fsuid != old->fsuid)
key_fsuid_changed(task);
if (new->fsgid != old->fsgid)
key_fsgid_changed(task);
/* do it
* - What if a process setreuid()'s and this brings the
* new uid over his NPROC rlimit? We can check this now
* cheaply with the new uid cache, so if it matters
* we should be checking for it. -DaveM
*/
alter_cred_subscribers(new, 2);
if (new->user != old->user)
atomic_inc(&new->user->processes);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
atomic_dec(&old->user->processes);
alter_cred_subscribers(old, -2);
/* send notifications */
if (new->uid != old->uid ||
new->euid != old->euid ||
new->suid != old->suid ||
new->fsuid != old->fsuid)
proc_id_connector(task, PROC_EVENT_UID);
if (new->gid != old->gid ||
new->egid != old->egid ||
new->sgid != old->sgid ||
new->fsgid != old->fsgid)
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
put_cred(old);
put_cred(old);
return 0;
}
EXPORT_SYMBOL(commit_creds);
/**
* abort_creds - Discard a set of credentials and unlock the current task
* @new: The credentials that were going to be applied
*
* Discard a set of credentials that were under construction and unlock the
* current task.
*/
void abort_creds(struct cred *new)
{
kdebug("abort_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(new) != 0);
#endif
BUG_ON(atomic_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
/**
* override_creds - Override the current process's subjective credentials
* @new: The credentials to be assigned
*
* Install a set of temporary override subjective credentials on the current
* process, returning the old set for later reversion.
*/
const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
kdebug("override_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
validate_creds(old);
validate_creds(new);
get_cred(new);
alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
alter_cred_subscribers(old, -1);
kdebug("override_creds() = %p{%d,%d}", old,
atomic_read(&old->usage),
read_cred_subscribers(old));
return old;
}
EXPORT_SYMBOL(override_creds);
/**
* revert_creds - Revert a temporary subjective credentials override
* @old: The credentials to be restored
*
* Revert a temporary set of override subjective credentials to an old set,
* discarding the override set.
*/
void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
kdebug("revert_creds(%p{%d,%d})", old,
atomic_read(&old->usage),
read_cred_subscribers(old));
validate_creds(old);
validate_creds(override);
alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
/*
* initialise the credentials stuff
*/
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
}
/**
* prepare_kernel_cred - Prepare a set of credentials for a kernel service
* @daemon: A userspace daemon to be used as a reference
*
* Prepare a set of credentials for a kernel service. This can then be used to
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
* @daemon is used to provide a base for the security record, but can be NULL.
* If @daemon is supplied, then the security data will be derived from that;
 * otherwise the UIDs and GIDs are set to 0, with no groups, full capabilities and no keys.
*
* The caller may change these controls afterwards if desired.
*
* Returns the new credentials or NULL if out of memory.
*
* Does not take, and does not return holding current->cred_replace_mutex.
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
const struct cred *old;
struct cred *new;
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
if (daemon)
old = get_task_cred(daemon);
else
old = get_cred(&init_cred);
validate_creds(old);
*new = *old;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
atomic_inc(&init_tgcred.usage);
new->tgcred = &init_tgcred;
new->request_key_auth = NULL;
new->thread_keyring = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
put_cred(old);
validate_creds(new);
return new;
error:
put_cred(new);
put_cred(old);
return NULL;
}
EXPORT_SYMBOL(prepare_kernel_cred);
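/*
 * A minimal sketch of the override pattern described above (the helper name
 * is illustrative and the "privileged work" is a placeholder): build kernel
 * credentials, temporarily act with them, then restore the caller's own set.
 */
static int example_act_as_kernel_service(void)
{
        const struct cred *old;
        struct cred *kcred = prepare_kernel_cred(NULL);

        if (!kcred)
                return -ENOMEM;
        old = override_creds(kcred);
        /* ... do the privileged work here ... */
        revert_creds(old);
        put_cred(kcred);
        return 0;
}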
/**
* set_security_override - Set the security ID in a set of credentials
* @new: The credentials to alter
* @secid: The LSM security ID to set
*
* Set the LSM security ID in a set of credentials so that the subjective
* security is overridden when an alternative set of credentials is used.
*/
int set_security_override(struct cred *new, u32 secid)
{
return security_kernel_act_as(new, secid);
}
EXPORT_SYMBOL(set_security_override);
/**
* set_security_override_from_ctx - Set the security ID in a set of credentials
* @new: The credentials to alter
* @secctx: The LSM security context to generate the security ID from.
*
* Set the LSM security ID in a set of credentials so that the subjective
* security is overridden when an alternative set of credentials is used. The
* security ID is specified in string form as a security context to be
* interpreted by the LSM.
*/
int set_security_override_from_ctx(struct cred *new, const char *secctx)
{
u32 secid;
int ret;
ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
if (ret < 0)
return ret;
return set_security_override(new, secid);
}
EXPORT_SYMBOL(set_security_override_from_ctx);
/**
* set_create_files_as - Set the LSM file create context in a set of credentials
* @new: The credentials to alter
* @inode: The inode to take the context from
*
* Change the LSM file creation context in a set of credentials to be the same
* as the object context of the specified inode, so that the new inodes have
* the same MAC context as that inode.
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
#ifdef CONFIG_DEBUG_CREDENTIALS
bool creds_are_invalid(const struct cred *cred)
{
if (cred->magic != CRED_MAGIC)
return true;
#ifdef CONFIG_SECURITY_SELINUX
/*
* cred->security == NULL if security_cred_alloc_blank() or
* security_prepare_creds() returned an error.
*/
if (selinux_is_enabled() && cred->security) {
if ((unsigned long) cred->security < PAGE_SIZE)
return true;
if ((*(u32 *)cred->security & 0xffffff00) ==
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
return true;
}
#endif
return false;
}
EXPORT_SYMBOL(creds_are_invalid);
/*
* dump invalid credentials
*/
static void dump_invalid_creds(const struct cred *cred, const char *label,
const struct task_struct *tsk)
{
printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
label, cred,
cred == &init_cred ? "[init]" : "",
cred == tsk->real_cred ? "[real]" : "",
cred == tsk->cred ? "[eff]" : "");
printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
cred->magic, cred->put_addr);
printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
atomic_read(&cred->usage),
read_cred_subscribers(cred));
printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
cred->uid, cred->euid, cred->suid, cred->fsuid);
printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
cred->gid, cred->egid, cred->sgid, cred->fsgid);
#ifdef CONFIG_SECURITY
printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
(((unsigned long) cred->security & 0xffffff00) !=
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
printk(KERN_ERR "CRED: ->security {%x, %x}\n",
((u32*)cred->security)[0],
((u32*)cred->security)[1]);
#endif
}
/*
* report use of invalid credentials
*/
void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
printk(KERN_ERR "CRED: Invalid credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
dump_invalid_creds(cred, "Specified", current);
BUG();
}
EXPORT_SYMBOL(__invalid_creds);
/*
* check the credentials on a process
*/
void __validate_process_creds(struct task_struct *tsk,
const char *file, unsigned line)
{
if (tsk->cred == tsk->real_cred) {
if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
creds_are_invalid(tsk->cred)))
goto invalid_creds;
} else {
if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
read_cred_subscribers(tsk->cred) < 1 ||
creds_are_invalid(tsk->real_cred) ||
creds_are_invalid(tsk->cred)))
goto invalid_creds;
}
return;
invalid_creds:
printk(KERN_ERR "CRED: Invalid process credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
dump_invalid_creds(tsk->real_cred, "Real", tsk);
if (tsk->cred != tsk->real_cred)
dump_invalid_creds(tsk->cred, "Effective", tsk);
else
printk(KERN_ERR "CRED: Effective creds == Real creds\n");
BUG();
}
EXPORT_SYMBOL(__validate_process_creds);
/*
* check creds for do_exit()
*/
void validate_creds_for_do_exit(struct task_struct *tsk)
{
kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
tsk->real_cred, tsk->cred,
atomic_read(&tsk->cred->usage),
read_cred_subscribers(tsk->cred));
__validate_process_creds(tsk, __FILE__, __LINE__);
}
#endif /* CONFIG_DEBUG_CREDENTIALS */

184
kernel/kernel/delayacct.c Normal file
View File

@@ -0,0 +1,184 @@
/* delayacct.c - per-task delay accounting
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License for more details.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
struct kmem_cache *delayacct_cache;
static int __init delayacct_setup_disable(char *str)
{
delayacct_on = 0;
return 1;
}
__setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
delayacct_tsk_init(&init_task);
}
void __delayacct_tsk_init(struct task_struct *tsk)
{
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
if (tsk->delays)
spin_lock_init(&tsk->delays->lock);
}
/*
* Start accounting for a delay statistic using
* its starting timestamp (@start)
*/
static inline void delayacct_start(struct timespec *start)
{
do_posix_clock_monotonic_gettime(start);
}
/*
* Finish delay accounting for a statistic using
 * its timestamps (@start, @end), accumulator (@total) and @count
*/
static void delayacct_end(struct timespec *start, struct timespec *end,
u64 *total, u32 *count)
{
struct timespec ts;
s64 ns;
unsigned long flags;
do_posix_clock_monotonic_gettime(end);
ts = timespec_sub(*end, *start);
ns = timespec_to_ns(&ts);
if (ns < 0)
return;
spin_lock_irqsave(&current->delays->lock, flags);
*total += ns;
(*count)++;
spin_unlock_irqrestore(&current->delays->lock, flags);
}
void __delayacct_blkio_start(void)
{
delayacct_start(&current->delays->blkio_start);
}
void __delayacct_blkio_end(void)
{
if (current->delays->flags & DELAYACCT_PF_SWAPIN)
/* Swapin block I/O */
delayacct_end(&current->delays->blkio_start,
&current->delays->blkio_end,
&current->delays->swapin_delay,
&current->delays->swapin_count);
else /* Other block I/O */
delayacct_end(&current->delays->blkio_start,
&current->delays->blkio_end,
&current->delays->blkio_delay,
&current->delays->blkio_count);
}
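/*
 * The caller side, as a hedged sketch: io_schedule()-style paths bracket the
 * wait with the delayacct_blkio_start()/delayacct_blkio_end() wrappers from
 * <linux/delayacct.h>, which only fall through to the __delayacct_* helpers
 * above when delay accounting is enabled for the task. The helper name below
 * is illustrative and the actual sleep/wake-up plumbing is elided.
 */
static void example_wait_for_block_io(void)
{
        delayacct_blkio_start();
        io_schedule();          /* block until the request completes */
        delayacct_blkio_end();
}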
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
s64 tmp;
unsigned long t1;
unsigned long long t2, t3;
unsigned long flags;
struct timespec ts;
/* Though tsk->delays is accessed later, an early exit avoids
 * unnecessarily returning the other data
*/
if (!tsk->delays)
goto done;
tmp = (s64)d->cpu_run_real_total;
cputime_to_timespec(tsk->utime + tsk->stime, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
tmp = (s64)d->cpu_scaled_run_real_total;
cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
/*
* No locking available for sched_info (and too expensive to add one)
* Mitigate by taking snapshot of values
*/
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->se.sum_exec_runtime;
d->cpu_count += t1;
tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
tmp = (s64)d->cpu_run_virtual_total + t3;
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
spin_lock_irqsave(&tsk->delays->lock, flags);
tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
done:
return 0;
}
__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
{
__u64 ret;
unsigned long flags;
spin_lock_irqsave(&tsk->delays->lock, flags);
ret = nsec_to_clock_t(tsk->delays->blkio_delay +
tsk->delays->swapin_delay);
spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
void __delayacct_freepages_start(void)
{
delayacct_start(&current->delays->freepages_start);
}
void __delayacct_freepages_end(void)
{
delayacct_end(&current->delays->freepages_start,
&current->delays->freepages_end,
&current->delays->freepages_delay,
&current->delays->freepages_count);
}

161
kernel/kernel/dma.c Normal file
View File

@@ -0,0 +1,161 @@
/*
* linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
*
* Written by Hennus Bergman, 1992.
*
* 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
* In the previous version the reported device could end up being wrong,
* if a device requested a DMA channel that was already in use.
* [It also happened to remove the sizeof(char *) == sizeof(int)
* assumption introduced because of those /proc/dma patches. -- Hennus]
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <asm/dma.h>
#include <asm/system.h>
/* A note on resource allocation:
*
* All drivers needing DMA channels, should allocate and release them
* through the public routines `request_dma()' and `free_dma()'.
*
* In order to avoid problems, all processes should allocate resources in
* the same sequence and release them in the reverse order.
*
* So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
* When releasing them, first release the DMA, then release the IRQ.
* If you don't, you may cause allocation requests to fail unnecessarily.
* This doesn't really matter now, but it will once we get real semaphores
* in the kernel.
*/
DEFINE_SPINLOCK(dma_spin_lock);
/*
* If our port doesn't define this it has no PC like DMA
*/
#ifdef MAX_DMA_CHANNELS
/* Channel n is busy iff dma_chan_busy[n].lock != 0.
* DMA0 used to be reserved for DRAM refresh, but apparently not any more...
* DMA4 is reserved for cascading.
*/
struct dma_chan {
int lock;
const char *device_id;
};
static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
[4] = { 1, "cascade" },
};
/**
* request_dma - request and reserve a system DMA channel
* @dmanr: DMA channel number
* @device_id: reserving device ID string, used in /proc/dma
*/
int request_dma(unsigned int dmanr, const char * device_id)
{
if (dmanr >= MAX_DMA_CHANNELS)
return -EINVAL;
if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
return -EBUSY;
dma_chan_busy[dmanr].device_id = device_id;
/* old flag was 0, now contains 1 to indicate busy */
return 0;
} /* request_dma */
/**
* free_dma - free a reserved system DMA channel
* @dmanr: DMA channel number
*/
void free_dma(unsigned int dmanr)
{
if (dmanr >= MAX_DMA_CHANNELS) {
printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
return;
}
if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
return;
}
} /* free_dma */
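/*
 * A minimal sketch of the ordering spelled out in the note at the top of
 * this file: grab the IRQ before the DMA channel, and release them in the
 * reverse order. The IRQ/DMA numbers, the "example" label and the handler
 * are placeholders, and the IRQ calls assume <linux/interrupt.h>.
 */
static irqreturn_t example_isr(int irq, void *dev_id)
{
        return IRQ_HANDLED;
}

static int example_probe(void)
{
        int err = request_irq(7, example_isr, 0, "example", NULL);

        if (err)
                return err;
        err = request_dma(1, "example");
        if (err) {
                free_irq(7, NULL);
                return err;
        }
        return 0;
}

static void example_remove(void)
{
        free_dma(1);
        free_irq(7, NULL);
}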
#else
int request_dma(unsigned int dmanr, const char *device_id)
{
return -EINVAL;
}
void free_dma(unsigned int dmanr)
{
}
#endif
#ifdef CONFIG_PROC_FS
#ifdef MAX_DMA_CHANNELS
static int proc_dma_show(struct seq_file *m, void *v)
{
int i;
for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
if (dma_chan_busy[i].lock) {
seq_printf(m, "%2d: %s\n", i,
dma_chan_busy[i].device_id);
}
}
return 0;
}
#else
static int proc_dma_show(struct seq_file *m, void *v)
{
seq_puts(m, "No DMA\n");
return 0;
}
#endif /* MAX_DMA_CHANNELS */
static int proc_dma_open(struct inode *inode, struct file *file)
{
return single_open(file, proc_dma_show, NULL);
}
static const struct file_operations proc_dma_operations = {
.open = proc_dma_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init proc_dma_init(void)
{
proc_create("dma", 0, NULL, &proc_dma_operations);
return 0;
}
__initcall(proc_dma_init);
#endif
EXPORT_SYMBOL(request_dma);
EXPORT_SYMBOL(free_dma);
EXPORT_SYMBOL(dma_spin_lock);

207
kernel/kernel/exec_domain.c Normal file
View File

@@ -0,0 +1,207 @@
/*
* Handling of different ABIs (personalities).
*
* We group personalities into execution domains which have their
* own handlers for kernel entry points, signal mapping, etc...
*
* 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
*/
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/fs_struct.h>
static void default_handler(int, struct pt_regs *);
static struct exec_domain *exec_domains = &default_exec_domain;
static DEFINE_RWLOCK(exec_domains_lock);
static u_long ident_map[32] = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31
};
struct exec_domain default_exec_domain = {
.name = "Linux", /* name */
.handler = default_handler, /* lcall7 causes a seg fault. */
.pers_low = 0, /* PER_LINUX personality. */
.pers_high = 0, /* PER_LINUX personality. */
.signal_map = ident_map, /* Identity map signals. */
.signal_invmap = ident_map, /* - both ways. */
};
static void
default_handler(int segment, struct pt_regs *regp)
{
set_personality(0);
if (current_thread_info()->exec_domain->handler != default_handler)
current_thread_info()->exec_domain->handler(segment, regp);
else
send_sig(SIGSEGV, current, 1);
}
static struct exec_domain *
lookup_exec_domain(u_long personality)
{
struct exec_domain * ep;
u_long pers = personality(personality);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
if (pers >= ep->pers_low && pers <= ep->pers_high)
if (try_module_get(ep->module))
goto out;
}
#ifdef CONFIG_MODULES
read_unlock(&exec_domains_lock);
request_module("personality-%ld", pers);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
if (pers >= ep->pers_low && pers <= ep->pers_high)
if (try_module_get(ep->module))
goto out;
}
#endif
ep = &default_exec_domain;
out:
read_unlock(&exec_domains_lock);
return (ep);
}
int
register_exec_domain(struct exec_domain *ep)
{
struct exec_domain *tmp;
int err = -EBUSY;
if (ep == NULL)
return -EINVAL;
if (ep->next != NULL)
return -EBUSY;
write_lock(&exec_domains_lock);
for (tmp = exec_domains; tmp; tmp = tmp->next) {
if (tmp == ep)
goto out;
}
ep->next = exec_domains;
exec_domains = ep;
err = 0;
out:
write_unlock(&exec_domains_lock);
return (err);
}
int
unregister_exec_domain(struct exec_domain *ep)
{
struct exec_domain **epp;
epp = &exec_domains;
write_lock(&exec_domains_lock);
for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
if (ep == *epp)
goto unregister;
}
write_unlock(&exec_domains_lock);
return -EINVAL;
unregister:
*epp = ep->next;
ep->next = NULL;
write_unlock(&exec_domains_lock);
return 0;
}
int
__set_personality(u_long personality)
{
struct exec_domain *ep, *oep;
ep = lookup_exec_domain(personality);
if (ep == current_thread_info()->exec_domain) {
current->personality = personality;
module_put(ep->module);
return 0;
}
current->personality = personality;
oep = current_thread_info()->exec_domain;
current_thread_info()->exec_domain = ep;
module_put(oep->module);
return 0;
}
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
{
struct exec_domain *ep;
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next)
seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
ep->pers_low, ep->pers_high, ep->name,
module_name(ep->module));
read_unlock(&exec_domains_lock);
return 0;
}
static int execdomains_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, execdomains_proc_show, NULL);
}
static const struct file_operations execdomains_proc_fops = {
.open = execdomains_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init proc_execdomains_init(void)
{
proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
return 0;
}
module_init(proc_execdomains_init);
#endif
SYSCALL_DEFINE1(personality, u_long, personality)
{
u_long old = current->personality;
if (personality != 0xffffffff) {
set_personality(personality);
if (current->personality != personality)
return -EINVAL;
}
return (long)old;
}
EXPORT_SYMBOL(register_exec_domain);
EXPORT_SYMBOL(unregister_exec_domain);
EXPORT_SYMBOL(__set_personality);
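/*
 * A hedged sketch of how a personality module would register itself with the
 * API above; every value here is a placeholder, the handler only mirrors the
 * default behaviour, and a real module would wire the init function up via
 * module_init().
 */
static void example_handler(int segment, struct pt_regs *regs)
{
        send_sig(SIGSEGV, current, 1);
}

static struct exec_domain example_exec_domain = {
        .name           = "example",
        .handler        = example_handler,
        .pers_low       = PER_BSD & PER_MASK,   /* placeholder personality range */
        .pers_high      = PER_BSD & PER_MASK,
        .signal_map     = ident_map,            /* identity signal mapping */
        .signal_invmap  = ident_map,
        .module         = THIS_MODULE,
};

static int __init example_domain_init(void)
{
        return register_exec_domain(&example_exec_domain);
}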

1787
kernel/kernel/exit.c Normal file

File diff suppressed because it is too large

115
kernel/kernel/extable.c Normal file
View File

@@ -0,0 +1,115 @@
/* Rewritten by Rusty Russell, on the backs of many others...
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
/*
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
sort_extable(__start___ex_table, __stop___ex_table);
}
/* Given an address, look for it in the exception tables. */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
if (!e)
e = search_module_extables(addr);
return e;
}
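/*
 * The usual consumer, as an architecture-flavoured sketch: a fault handler
 * looks up the faulting instruction and, if a fixup entry exists, resumes
 * execution there instead of oopsing. The pt_regs field and the exception
 * table entry layout follow x86 and are illustrative only.
 */
static int example_fixup_exception(struct pt_regs *regs)
{
        const struct exception_table_entry *fixup;

        fixup = search_exception_tables(regs->ip);
        if (!fixup)
                return 0;
        regs->ip = fixup->fixup;
        return 1;
}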
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
addr <= (unsigned long)_einittext)
return 1;
return 0;
}
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr <= (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
init_kernel_text(addr))
return 1;
return 0;
}
int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
if (is_module_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
* backtraces (such as lockdep traces).
*
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
if (init_kernel_text(addr))
return 1;
return 0;
}
int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return is_module_text_address(addr);
}
/*
* On some architectures (PPC64, IA64) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
addr = (unsigned long) dereference_function_descriptor(ptr);
if (core_kernel_text(addr))
return 1;
return is_module_text_address(addr);
}

1756
kernel/kernel/fork.c Normal file

File diff suppressed because it is too large

161
kernel/kernel/freezer.c Normal file
View File

@@ -0,0 +1,161 @@
/*
* kernel/freezer.c - Function to freeze a process
*
* Originally from kernel/power/process.c
*/
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
/*
* freezing is complete, mark current process as frozen
*/
static inline void frozen_process(void)
{
if (!unlikely(current->flags & PF_NOFREEZE)) {
current->flags |= PF_FROZEN;
wmb();
}
clear_freeze_flag(current);
}
/* The refrigerator is the place where frozen processes are stored :-). */
void refrigerator(void)
{
/* Hmm, should we be allowed to suspend when there are realtime
processes around? */
long save;
task_lock(current);
if (freezing(current)) {
frozen_process();
task_unlock(current);
} else {
task_unlock(current);
return;
}
save = current->state;
pr_debug("%s entered refrigerator\n", current->comm);
spin_lock_irq(&current->sighand->siglock);
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
/* prevent accounting of that task to load */
current->flags |= PF_FREEZING;
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!frozen(current))
break;
schedule();
}
/* Remove the accounting blocker */
current->flags &= ~PF_FREEZING;
pr_debug("%s left refrigerator\n", current->comm);
__set_current_state(save);
}
EXPORT_SYMBOL(refrigerator);
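/*
 * The other side of the contract, as a minimal sketch: a freezable kernel
 * thread periodically calls try_to_freeze() (from <linux/freezer.h>), which
 * enters refrigerator() once TIF_FREEZE has been set for it. The work loop,
 * the timeout and the kthread helpers (<linux/kthread.h>) are placeholders.
 */
static int example_freezable_kthread(void *data)
{
        set_freezable();
        while (!kthread_should_stop()) {
                try_to_freeze();
                /* ... do one unit of work ... */
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}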
static void fake_signal_wake_up(struct task_struct *p)
{
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
signal_wake_up(p, 0);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
/**
* freeze_task - send a freeze request to given task
* @p: task to send the request to
* @sig_only: if set, the request will only be sent if the task has the
* PF_FREEZER_NOSIG flag unset
 * Return value: 'false' if @sig_only is set and the task has
 * PF_FREEZER_NOSIG set or the task is frozen; 'true' otherwise
*
 * The freeze request is sent by setting the task's TIF_FREEZE flag and
* either sending a fake signal to it or waking it up, depending on whether
* or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
* has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
* TIF_FREEZE flag will not be set.
*/
bool freeze_task(struct task_struct *p, bool sig_only)
{
/*
* We first check if the task is freezing and next if it has already
* been frozen to avoid the race with frozen_process() which first marks
* the task as frozen and next clears its TIF_FREEZE.
*/
if (!freezing(p)) {
rmb();
if (frozen(p))
return false;
if (!sig_only || should_send_signal(p))
set_freeze_flag(p);
else
return false;
}
if (should_send_signal(p)) {
if (!signal_pending(p))
fake_signal_wake_up(p);
} else if (sig_only) {
return false;
} else {
wake_up_state(p, TASK_INTERRUPTIBLE);
}
return true;
}
void cancel_freezing(struct task_struct *p)
{
unsigned long flags;
if (freezing(p)) {
pr_debug(" clean up: %s\n", p->comm);
clear_freeze_flag(p);
spin_lock_irqsave(&p->sighand->siglock, flags);
recalc_sigpending_and_wake(p);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
}
static int __thaw_process(struct task_struct *p)
{
if (frozen(p)) {
p->flags &= ~PF_FROZEN;
return 1;
}
clear_freeze_flag(p);
return 0;
}
/*
* Wake up a frozen process
*
* task_lock() is needed to prevent the race with refrigerator() which may
* occur if the freezing of tasks fails. Namely, without the lock, if the
* freezing of tasks failed, thaw_tasks() might have run before a task in
* refrigerator() could call frozen_process(), in which case the task would be
* frozen and no one would thaw it.
*/
int thaw_process(struct task_struct *p)
{
task_lock(p);
if (__thaw_process(p) == 1) {
task_unlock(p);
wake_up_process(p);
return 1;
}
task_unlock(p);
return 0;
}
EXPORT_SYMBOL(thaw_process);

2665
kernel/kernel/futex.c Normal file

File diff suppressed because it is too large

200
kernel/kernel/futex_compat.c Normal file
View File

@@ -0,0 +1,200 @@
/*
* linux/kernel/futex_compat.c
*
 * Futex compatibility routines.
*
* Copyright 2006, Red Hat, Inc., Ingo Molnar
*/
#include <linux/linkage.h>
#include <linux/compat.h>
#include <linux/nsproxy.h>
#include <linux/futex.h>
#include <asm/uaccess.h>
/*
* Fetch a robust-list pointer. Bit 0 signals PI futexes:
*/
static inline int
fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
compat_uptr_t __user *head, int *pi)
{
if (get_user(*uentry, head))
return -EFAULT;
*entry = compat_ptr((*uentry) & ~1);
*pi = (unsigned int)(*uentry) & 1;
return 0;
}
static void __user *futex_uaddr(struct robust_list __user *entry,
compat_long_t futex_offset)
{
compat_uptr_t base = ptr_to_compat(entry);
void __user *uaddr = compat_ptr(base + futex_offset);
return uaddr;
}
/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
* We silently return on any sign of list-walking problem.
*/
void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
if (!futex_cmpxchg_enabled)
return;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
return;
/*
* Fetch the relative futex offset:
*/
if (get_user(futex_offset, &head->futex_offset))
return;
/*
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
if (fetch_robust_entry(&upending, &pending,
&head->list_op_pending, &pip))
return;
next_entry = NULL; /* avoid warning with gcc */
while (entry != (struct robust_list __user *) &head->list) {
/*
* Fetch the next entry in the list before calling
* handle_futex_death:
*/
rc = fetch_robust_entry(&next_uentry, &next_entry,
(compat_uptr_t __user *)&entry->next, &next_pi);
/*
* A pending lock might already be on the list, so
 * don't process it twice:
*/
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
if (handle_futex_death(uaddr, curr, pi))
return;
}
if (rc)
return;
uentry = next_uentry;
entry = next_entry;
pi = next_pi;
/*
* Avoid excessively long or circular lists:
*/
if (!--limit)
break;
cond_resched();
}
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
handle_futex_death(uaddr, curr, pip);
}
}
asmlinkage long
compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
compat_size_t len)
{
if (!futex_cmpxchg_enabled)
return -ENOSYS;
if (unlikely(len != sizeof(*head)))
return -EINVAL;
current->compat_robust_list = head;
return 0;
}
asmlinkage long
compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
compat_size_t __user *len_ptr)
{
struct compat_robust_list_head __user *head;
unsigned long ret;
const struct cred *cred = current_cred(), *pcred;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
if (!pid)
head = current->compat_robust_list;
else {
struct task_struct *p;
ret = -ESRCH;
read_lock(&tasklist_lock);
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
ret = -EPERM;
pcred = __task_cred(p);
if (cred->euid != pcred->euid &&
cred->euid != pcred->uid &&
!capable(CAP_SYS_PTRACE))
goto err_unlock;
head = p->compat_robust_list;
read_unlock(&tasklist_lock);
}
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
err_unlock:
read_unlock(&tasklist_lock);
return ret;
}
asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
struct compat_timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
struct timespec ts;
ktime_t t, *tp = NULL;
int val2 = 0;
int cmd = op & FUTEX_CMD_MASK;
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (get_compat_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
return -EINVAL;
t = timespec_to_ktime(ts);
if (cmd == FUTEX_WAIT)
t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}

48
kernel/kernel/gcov/Kconfig Normal file
View File

@@ -0,0 +1,48 @@
menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS && CONSTRUCTORS
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
measurements).
If unsure, say N.
Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
for the entire kernel. To enable profiling for specific files or
directories, add a line similar to the following to the respective
Makefile:
For a single file (e.g. main.o):
GCOV_PROFILE_main.o := y
For all files in one directory:
GCOV_PROFILE := y
To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
is specified, use:
GCOV_PROFILE_main.o := n
and:
GCOV_PROFILE := n
Note that the debugfs filesystem has to be mounted to access
profiling data.
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.
If unsure, say N.
Note that a kernel compiled with profiling flags will be significantly
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
endmenu

3
kernel/kernel/gcov/Makefile Normal file
View File

@@ -0,0 +1,3 @@
EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o

148
kernel/kernel/gcov/base.c Normal file
View File

@@ -0,0 +1,148 @@
/*
* This code maintains a list of active profiling data structures.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
* Based on the gcov-kernel patch by:
* Hubertus Franke <frankeh@us.ibm.com>
* Nigel Hinds <nhinds@us.ibm.com>
* Rajan Ravindran <rajancr@us.ibm.com>
* Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
* Paul Larson
*/
#define pr_fmt(fmt) "gcov: " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include "gcov.h"
static struct gcov_info *gcov_info_head;
static int gcov_events_enabled;
static DEFINE_MUTEX(gcov_lock);
/*
* __gcov_init is called by gcc-generated constructor code for each object
* file compiled with -fprofile-arcs.
*/
void __gcov_init(struct gcov_info *info)
{
static unsigned int gcov_version;
mutex_lock(&gcov_lock);
if (gcov_version == 0) {
gcov_version = info->version;
/*
* Printing gcc's version magic may prove useful for debugging
* incompatibility reports.
*/
pr_info("version magic: 0x%x\n", gcov_version);
}
/*
* Add new profiling data structure to list and inform event
* listener.
*/
info->next = gcov_info_head;
gcov_info_head = info;
if (gcov_events_enabled)
gcov_event(GCOV_ADD, info);
mutex_unlock(&gcov_lock);
}
EXPORT_SYMBOL(__gcov_init);
/*
* These functions may be referenced by gcc-generated profiling code but serve
* no function for kernel profiling.
*/
void __gcov_flush(void)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_flush);
void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_add);
void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_single);
void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_delta);
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
* Turn on reporting of profiling data load/unload-events through the
* gcov_event() callback. Also replay all previous events once. This function
* is needed because some events are potentially generated too early for the
* callback implementation to handle them initially.
*/
void gcov_enable_events(void)
{
struct gcov_info *info;
mutex_lock(&gcov_lock);
gcov_events_enabled = 1;
/* Perform event callback for previously registered entries. */
for (info = gcov_info_head; info; info = info->next)
gcov_event(GCOV_ADD, info);
mutex_unlock(&gcov_lock);
}
#ifdef CONFIG_MODULES
static inline int within(void *addr, void *start, unsigned long size)
{
return ((addr >= start) && (addr < start + size));
}
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
{
struct module *mod = data;
struct gcov_info *info;
struct gcov_info *prev;
if (event != MODULE_STATE_GOING)
return NOTIFY_OK;
mutex_lock(&gcov_lock);
prev = NULL;
/* Remove entries located in module from linked list. */
for (info = gcov_info_head; info; info = info->next) {
if (within(info, mod->module_core, mod->core_size)) {
if (prev)
prev->next = info->next;
else
gcov_info_head = info->next;
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
} else
prev = info;
}
mutex_unlock(&gcov_lock);
return NOTIFY_OK;
}
static struct notifier_block gcov_nb = {
.notifier_call = gcov_module_notifier,
};
static int __init gcov_init(void)
{
return register_module_notifier(&gcov_nb);
}
device_initcall(gcov_init);
#endif /* CONFIG_MODULES */

789
kernel/kernel/gcov/fs.c Normal file
View File

@@ -0,0 +1,789 @@
/*
* This code exports profiling data as debugfs files to userspace.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
* Based on the gcov-kernel patch by:
* Hubertus Franke <frankeh@us.ibm.com>
* Nigel Hinds <nhinds@us.ibm.com>
* Rajan Ravindran <rajancr@us.ibm.com>
* Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
* Paul Larson
* Yi CDL Yang
*/
#define pr_fmt(fmt) "gcov: " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#include "gcov.h"
/**
* struct gcov_node - represents a debugfs entry
* @list: list head for child node list
* @children: child nodes
* @all: list head for list of all nodes
* @parent: parent node
* @loaded_info: array of pointers to profiling data sets for loaded object
* files.
* @num_loaded: number of profiling data sets for loaded object files.
* @unloaded_info: accumulated copy of profiling data sets for unloaded
* object files. Used only when gcov_persist=1.
* @dentry: main debugfs entry, either a directory or data file
* @links: associated symbolic links
* @name: data file basename
*
* struct gcov_node represents an entity within the gcov/ subdirectory
* of debugfs. There are directory and data file nodes. The latter represent
* the actual synthesized data file plus any associated symbolic links which
* are needed by the gcov tool to work correctly.
*/
struct gcov_node {
struct list_head list;
struct list_head children;
struct list_head all;
struct gcov_node *parent;
struct gcov_info **loaded_info;
struct gcov_info *unloaded_info;
struct dentry *dentry;
struct dentry **links;
int num_loaded;
char name[0];
};
static const char objtree[] = OBJTREE;
static const char srctree[] = SRCTREE;
static struct gcov_node root_node;
static struct dentry *reset_dentry;
static LIST_HEAD(all_head);
static DEFINE_MUTEX(node_lock);
/* If non-zero, keep copies of profiling data for unloaded modules. */
static int gcov_persist = 1;
static int __init gcov_persist_setup(char *str)
{
unsigned long val;
if (strict_strtoul(str, 0, &val)) {
pr_warning("invalid gcov_persist parameter '%s'\n", str);
return 0;
}
gcov_persist = val;
pr_info("setting gcov_persist to %d\n", gcov_persist);
return 1;
}
__setup("gcov_persist=", gcov_persist_setup);
/*
* seq_file.start() implementation for gcov data files. Note that the
* gcov_iterator interface is designed to be more restrictive than seq_file
* (no start from arbitrary position, etc.), to simplify the iterator
* implementation.
*/
static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
{
loff_t i;
gcov_iter_start(seq->private);
for (i = 0; i < *pos; i++) {
if (gcov_iter_next(seq->private))
return NULL;
}
return seq->private;
}
/* seq_file.next() implementation for gcov data files. */
static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
{
struct gcov_iterator *iter = data;
if (gcov_iter_next(iter))
return NULL;
(*pos)++;
return iter;
}
/* seq_file.show() implementation for gcov data files. */
static int gcov_seq_show(struct seq_file *seq, void *data)
{
struct gcov_iterator *iter = data;
if (gcov_iter_write(iter, seq))
return -EINVAL;
return 0;
}
static void gcov_seq_stop(struct seq_file *seq, void *data)
{
/* Unused. */
}
static const struct seq_operations gcov_seq_ops = {
.start = gcov_seq_start,
.next = gcov_seq_next,
.show = gcov_seq_show,
.stop = gcov_seq_stop,
};
/*
* Return a profiling data set associated with the given node. This is
* either a data set for a loaded object file or a data set copy in case
* all associated object files have been unloaded.
*/
static struct gcov_info *get_node_info(struct gcov_node *node)
{
if (node->num_loaded > 0)
return node->loaded_info[0];
return node->unloaded_info;
}
/*
* Return a newly allocated profiling data set which contains the sum of
* all profiling data associated with the given node.
*/
static struct gcov_info *get_accumulated_info(struct gcov_node *node)
{
struct gcov_info *info;
int i = 0;
if (node->unloaded_info)
info = gcov_info_dup(node->unloaded_info);
else
info = gcov_info_dup(node->loaded_info[i++]);
if (!info)
return NULL;
for (; i < node->num_loaded; i++)
gcov_info_add(info, node->loaded_info[i]);
return info;
}
/*
* open() implementation for gcov data files. Create a copy of the profiling
* data set and initialize the iterator and seq_file interface.
*/
static int gcov_seq_open(struct inode *inode, struct file *file)
{
struct gcov_node *node = inode->i_private;
struct gcov_iterator *iter;
struct seq_file *seq;
struct gcov_info *info;
int rc = -ENOMEM;
mutex_lock(&node_lock);
/*
* Read from a profiling data copy to minimize reference tracking
* complexity and concurrent access and to keep accumulating multiple
* profiling data sets associated with one node simple.
*/
info = get_accumulated_info(node);
if (!info)
goto out_unlock;
iter = gcov_iter_new(info);
if (!iter)
goto err_free_info;
rc = seq_open(file, &gcov_seq_ops);
if (rc)
goto err_free_iter_info;
seq = file->private_data;
seq->private = iter;
out_unlock:
mutex_unlock(&node_lock);
return rc;
err_free_iter_info:
gcov_iter_free(iter);
err_free_info:
gcov_info_free(info);
goto out_unlock;
}
/*
* release() implementation for gcov data files. Release resources allocated
* by open().
*/
static int gcov_seq_release(struct inode *inode, struct file *file)
{
struct gcov_iterator *iter;
struct gcov_info *info;
struct seq_file *seq;
seq = file->private_data;
iter = seq->private;
info = gcov_iter_get_info(iter);
gcov_iter_free(iter);
gcov_info_free(info);
seq_release(inode, file);
return 0;
}
/*
* Find a node by the associated data file name. Needs to be called with
* node_lock held.
*/
static struct gcov_node *get_node_by_name(const char *name)
{
struct gcov_node *node;
struct gcov_info *info;
list_for_each_entry(node, &all_head, all) {
info = get_node_info(node);
if (info && (strcmp(info->filename, name) == 0))
return node;
}
return NULL;
}
/*
* Reset all profiling data associated with the specified node.
*/
static void reset_node(struct gcov_node *node)
{
int i;
if (node->unloaded_info)
gcov_info_reset(node->unloaded_info);
for (i = 0; i < node->num_loaded; i++)
gcov_info_reset(node->loaded_info[i]);
}
static void remove_node(struct gcov_node *node);
/*
* write() implementation for gcov data files. Reset profiling data for the
* corresponding file. If all associated object files have been unloaded,
* remove the debugfs node as well.
*/
static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
size_t len, loff_t *pos)
{
struct seq_file *seq;
struct gcov_info *info;
struct gcov_node *node;
seq = file->private_data;
info = gcov_iter_get_info(seq->private);
mutex_lock(&node_lock);
node = get_node_by_name(info->filename);
if (node) {
/* Reset counts or remove node for unloaded modules. */
if (node->num_loaded == 0)
remove_node(node);
else
reset_node(node);
}
/* Reset counts for open file. */
gcov_info_reset(info);
mutex_unlock(&node_lock);
return len;
}
/*
* Given a string <path> representing a file path of format:
* path/to/file.gcda
* construct and return a new string:
* <dir/>path/to/file.<ext>
*/
static char *link_target(const char *dir, const char *path, const char *ext)
{
char *target;
char *old_ext;
char *copy;
copy = kstrdup(path, GFP_KERNEL);
if (!copy)
return NULL;
old_ext = strrchr(copy, '.');
if (old_ext)
*old_ext = '\0';
if (dir)
target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
else
target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
kfree(copy);
return target;
}
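/*
 * Example (illustrative): link_target("/usr/src/linux", "kernel/gcov/fs.gcda",
 * "gcno") returns the newly allocated string
 * "/usr/src/linux/kernel/gcov/fs.gcno"; with dir == NULL the result is
 * "kernel/gcov/fs.gcno".
 */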
/*
* Construct a string representing the symbolic link target for the given
* gcov data file name and link type. Depending on the link type and the
* location of the data file, the link target can either point to a
* subdirectory of srctree, objtree or in an external location.
*/
static char *get_link_target(const char *filename, const struct gcov_link *ext)
{
const char *rel;
char *result;
if (strncmp(filename, objtree, strlen(objtree)) == 0) {
rel = filename + strlen(objtree) + 1;
if (ext->dir == SRC_TREE)
result = link_target(srctree, rel, ext->ext);
else
result = link_target(objtree, rel, ext->ext);
} else {
/* External compilation. */
result = link_target(NULL, filename, ext->ext);
}
return result;
}
#define SKEW_PREFIX ".tmp_"
/*
* For a filename .tmp_filename.ext return filename.ext. Needed to compensate
* for filename skewing caused by the mod-versioning mechanism.
*/
static const char *deskew(const char *basename)
{
if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
return basename + sizeof(SKEW_PREFIX) - 1;
return basename;
}
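/*
 * Example (illustrative): deskew(".tmp_mymod.gcda") returns a pointer past
 * the ".tmp_" prefix, i.e. "mymod.gcda"; basenames without the prefix are
 * returned unchanged.
 */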
/*
* Create links to additional files (usually .c and .gcno files) which the
* gcov tool expects to find in the same directory as the gcov data file.
*/
static void add_links(struct gcov_node *node, struct dentry *parent)
{
char *basename;
char *target;
int num;
int i;
for (num = 0; gcov_link[num].ext; num++)
/* Nothing. */;
node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
if (!node->links)
return;
for (i = 0; i < num; i++) {
target = get_link_target(get_node_info(node)->filename,
&gcov_link[i]);
if (!target)
goto out_err;
basename = strrchr(target, '/');
if (!basename)
goto out_err;
basename++;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
if (!node->links[i])
goto out_err;
kfree(target);
}
return;
out_err:
kfree(target);
while (i-- > 0)
debugfs_remove(node->links[i]);
kfree(node->links);
node->links = NULL;
}
static const struct file_operations gcov_data_fops = {
.open = gcov_seq_open,
.release = gcov_seq_release,
.read = seq_read,
.llseek = seq_lseek,
.write = gcov_seq_write,
};
/* Basic initialization of a new node. */
static void init_node(struct gcov_node *node, struct gcov_info *info,
const char *name, struct gcov_node *parent)
{
INIT_LIST_HEAD(&node->list);
INIT_LIST_HEAD(&node->children);
INIT_LIST_HEAD(&node->all);
if (node->loaded_info) {
node->loaded_info[0] = info;
node->num_loaded = 1;
}
node->parent = parent;
if (name)
strcpy(node->name, name);
}
/*
* Create a new node and associated debugfs entry. Needs to be called with
* node_lock held.
*/
static struct gcov_node *new_node(struct gcov_node *parent,
struct gcov_info *info, const char *name)
{
struct gcov_node *node;
node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
if (!node)
goto err_nomem;
if (info) {
node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
GFP_KERNEL);
if (!node->loaded_info)
goto err_nomem;
}
init_node(node, info, name, parent);
/* Differentiate between gcov data file nodes and directory nodes. */
if (info) {
node->dentry = debugfs_create_file(deskew(node->name), 0600,
parent->dentry, node, &gcov_data_fops);
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
if (!node->dentry) {
pr_warning("could not create file\n");
kfree(node);
return NULL;
}
if (info)
add_links(node, parent->dentry);
list_add(&node->list, &parent->children);
list_add(&node->all, &all_head);
return node;
err_nomem:
kfree(node);
pr_warning("out of memory\n");
return NULL;
}
/* Remove symbolic links associated with node. */
static void remove_links(struct gcov_node *node)
{
int i;
if (!node->links)
return;
for (i = 0; gcov_link[i].ext; i++)
debugfs_remove(node->links[i]);
kfree(node->links);
node->links = NULL;
}
/*
* Remove node from all lists and debugfs and release associated resources.
* Needs to be called with node_lock held.
*/
static void release_node(struct gcov_node *node)
{
list_del(&node->list);
list_del(&node->all);
debugfs_remove(node->dentry);
remove_links(node);
kfree(node->loaded_info);
if (node->unloaded_info)
gcov_info_free(node->unloaded_info);
kfree(node);
}
/* Release node and empty parents. Needs to be called with node_lock held. */
static void remove_node(struct gcov_node *node)
{
struct gcov_node *parent;
while ((node != &root_node) && list_empty(&node->children)) {
parent = node->parent;
release_node(node);
node = parent;
}
}
/*
* Find child node with given basename. Needs to be called with node_lock
* held.
*/
static struct gcov_node *get_child_by_name(struct gcov_node *parent,
const char *name)
{
struct gcov_node *node;
list_for_each_entry(node, &parent->children, list) {
if (strcmp(node->name, name) == 0)
return node;
}
return NULL;
}
/*
* write() implementation for reset file. Reset all profiling data to zero
* and remove nodes for which all associated object files are unloaded.
*/
static ssize_t reset_write(struct file *file, const char __user *addr,
size_t len, loff_t *pos)
{
struct gcov_node *node;
mutex_lock(&node_lock);
restart:
list_for_each_entry(node, &all_head, all) {
if (node->num_loaded > 0)
reset_node(node);
else if (list_empty(&node->children)) {
remove_node(node);
/* Several nodes may have gone - restart loop. */
goto restart;
}
}
mutex_unlock(&node_lock);
return len;
}
/* read() implementation for reset file. Unused. */
static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
loff_t *pos)
{
/* Allow read operation so that a recursive copy won't fail. */
return 0;
}
static const struct file_operations gcov_reset_fops = {
.write = reset_write,
.read = reset_read,
};
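/*
 * Usage example (illustrative, assuming debugfs is mounted at
 * /sys/kernel/debug): all profiling counters can be cleared from user space
 * with
 *   echo 0 > /sys/kernel/debug/gcov/reset
 */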
/*
* Create a node for a given profiling data set and add it to all lists and
* debugfs. Needs to be called with node_lock held.
*/
static void add_node(struct gcov_info *info)
{
char *filename;
char *curr;
char *next;
struct gcov_node *parent;
struct gcov_node *node;
filename = kstrdup(info->filename, GFP_KERNEL);
if (!filename)
return;
parent = &root_node;
/* Create directory nodes along the path. */
for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
if (curr == next)
continue;
*next = 0;
if (strcmp(curr, ".") == 0)
continue;
if (strcmp(curr, "..") == 0) {
if (!parent->parent)
goto err_remove;
parent = parent->parent;
continue;
}
node = get_child_by_name(parent, curr);
if (!node) {
node = new_node(parent, NULL, curr);
if (!node)
goto err_remove;
}
parent = node;
}
/* Create file node. */
node = new_node(parent, info, curr);
if (!node)
goto err_remove;
out:
kfree(filename);
return;
err_remove:
remove_node(parent);
goto out;
}
/*
* Associate a profiling data set with an existing node. Needs to be called
* with node_lock held.
*/
static void add_info(struct gcov_node *node, struct gcov_info *info)
{
struct gcov_info **loaded_info;
int num = node->num_loaded;
/*
* Prepare new array. This is done first to simplify cleanup in
* case the new data set is incompatible, the node contains only
* unloaded data sets, and there's not enough memory for the array.
*/
loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
if (!loaded_info) {
pr_warning("could not add '%s' (out of memory)\n",
info->filename);
return;
}
memcpy(loaded_info, node->loaded_info,
num * sizeof(struct gcov_info *));
loaded_info[num] = info;
/* Check if the new data set is compatible. */
if (num == 0) {
/*
* A module was unloaded, modified and reloaded. The new
* data set replaces the copy of the last one.
*/
if (!gcov_info_is_compatible(node->unloaded_info, info)) {
pr_warning("discarding saved data for %s "
"(incompatible version)\n", info->filename);
gcov_info_free(node->unloaded_info);
node->unloaded_info = NULL;
}
} else {
/*
* Two different versions of the same object file are loaded.
* The initial one takes precedence.
*/
if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
pr_warning("could not add '%s' (incompatible "
"version)\n", info->filename);
kfree(loaded_info);
return;
}
}
/* Overwrite previous array. */
kfree(node->loaded_info);
node->loaded_info = loaded_info;
node->num_loaded = num + 1;
}
/*
* Return the index of a profiling data set associated with a node.
*/
static int get_info_index(struct gcov_node *node, struct gcov_info *info)
{
int i;
for (i = 0; i < node->num_loaded; i++) {
if (node->loaded_info[i] == info)
return i;
}
return -ENOENT;
}
/*
* Save the data of a profiling data set which is being unloaded.
*/
static void save_info(struct gcov_node *node, struct gcov_info *info)
{
if (node->unloaded_info)
gcov_info_add(node->unloaded_info, info);
else {
node->unloaded_info = gcov_info_dup(info);
if (!node->unloaded_info) {
pr_warning("could not save data for '%s' "
"(out of memory)\n", info->filename);
}
}
}
/*
* Disassociate a profiling data set from a node. Needs to be called with
* node_lock held.
*/
static void remove_info(struct gcov_node *node, struct gcov_info *info)
{
int i;
i = get_info_index(node, info);
if (i < 0) {
pr_warning("could not remove '%s' (not found)\n",
info->filename);
return;
}
if (gcov_persist)
save_info(node, info);
/* Shrink array. */
node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
node->num_loaded--;
if (node->num_loaded > 0)
return;
/* Last loaded data set was removed. */
kfree(node->loaded_info);
node->loaded_info = NULL;
node->num_loaded = 0;
if (!node->unloaded_info)
remove_node(node);
}
/*
* Callback to create/remove profiling files when code compiled with
* -fprofile-arcs is loaded/unloaded.
*/
void gcov_event(enum gcov_action action, struct gcov_info *info)
{
struct gcov_node *node;
mutex_lock(&node_lock);
node = get_node_by_name(info->filename);
switch (action) {
case GCOV_ADD:
if (node)
add_info(node, info);
else
add_node(info);
break;
case GCOV_REMOVE:
if (node)
remove_info(node, info);
else {
pr_warning("could not remove '%s' (not found)\n",
info->filename);
}
break;
}
mutex_unlock(&node_lock);
}
/* Create debugfs entries. */
static __init int gcov_fs_init(void)
{
int rc = -EIO;
init_node(&root_node, NULL, NULL, NULL);
/*
* /sys/kernel/debug/gcov will be parent for the reset control file
* and all profiling files.
*/
root_node.dentry = debugfs_create_dir("gcov", NULL);
if (!root_node.dentry)
goto err_remove;
/*
* Create reset file which resets all profiling counts when written
* to.
*/
reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
NULL, &gcov_reset_fops);
if (!reset_dentry)
goto err_remove;
/* Replay previous events to get our fs hierarchy up-to-date. */
gcov_enable_events();
return 0;
err_remove:
pr_err("init failed\n");
if (root_node.dentry)
debugfs_remove(root_node.dentry);
return rc;
}
device_initcall(gcov_fs_init);

View File

@@ -0,0 +1,447 @@
/*
* This code provides functions to handle gcc's profiling data format
* introduced with gcc 3.4. Future versions of gcc may change the gcov
* format (as happened before), so all format-specific information needs
* to be kept modular and easily exchangeable.
*
* This file is based on gcc-internal definitions. Functions and data
* structures are defined to be compatible with gcc counterparts.
* For a better understanding, refer to gcc source: gcc/gcov-io.h.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
*/
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include "gcov.h"
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
{ 0, NULL},
};
/*
* Determine whether a counter is active. Based on gcc magic. Doesn't change
* at run-time.
*/
static int counter_active(struct gcov_info *info, unsigned int type)
{
return (1 << type) & info->ctr_mask;
}
/* Determine number of active counters. Based on gcc magic. */
static unsigned int num_counter_active(struct gcov_info *info)
{
unsigned int i;
unsigned int result = 0;
for (i = 0; i < GCOV_COUNTERS; i++) {
if (counter_active(info, i))
result++;
}
return result;
}
/**
* gcov_info_reset - reset profiling data to zero
* @info: profiling data set
*/
void gcov_info_reset(struct gcov_info *info)
{
unsigned int active = num_counter_active(info);
unsigned int i;
for (i = 0; i < active; i++) {
memset(info->counts[i].values, 0,
info->counts[i].num * sizeof(gcov_type));
}
}
/**
* gcov_info_is_compatible - check if profiling data can be added
* @info1: first profiling data set
* @info2: second profiling data set
*
* Returns non-zero if profiling data can be added, zero otherwise.
*/
int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
{
return (info1->stamp == info2->stamp);
}
/**
* gcov_info_add - add up profiling data
* @dest: profiling data set to which data is added
* @source: profiling data set which is added
*
* Adds profiling counts of @source to @dest.
*/
void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
{
unsigned int i;
unsigned int j;
for (i = 0; i < num_counter_active(dest); i++) {
for (j = 0; j < dest->counts[i].num; j++) {
dest->counts[i].values[j] +=
source->counts[i].values[j];
}
}
}
/* Get size of function info entry. Based on gcc magic. */
static size_t get_fn_size(struct gcov_info *info)
{
size_t size;
size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
sizeof(unsigned int);
if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
size = ALIGN(size, __alignof__(struct gcov_fn_info));
return size;
}
/* Get address of function info entry. Based on gcc magic. */
static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
{
return (struct gcov_fn_info *)
((char *) info->functions + fn * get_fn_size(info));
}
/**
* gcov_info_dup - duplicate profiling data set
* @info: profiling data set to duplicate
*
* Return newly allocated duplicate on success, %NULL on error.
*/
struct gcov_info *gcov_info_dup(struct gcov_info *info)
{
struct gcov_info *dup;
unsigned int i;
unsigned int active;
/* Duplicate gcov_info. */
active = num_counter_active(info);
dup = kzalloc(sizeof(struct gcov_info) +
sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
if (!dup)
return NULL;
dup->version = info->version;
dup->stamp = info->stamp;
dup->n_functions = info->n_functions;
dup->ctr_mask = info->ctr_mask;
/* Duplicate filename. */
dup->filename = kstrdup(info->filename, GFP_KERNEL);
if (!dup->filename)
goto err_free;
/* Duplicate table of functions. */
dup->functions = kmemdup(info->functions, info->n_functions *
get_fn_size(info), GFP_KERNEL);
if (!dup->functions)
goto err_free;
/* Duplicate counter arrays. */
for (i = 0; i < active ; i++) {
struct gcov_ctr_info *ctr = &info->counts[i];
size_t size = ctr->num * sizeof(gcov_type);
dup->counts[i].num = ctr->num;
dup->counts[i].merge = ctr->merge;
dup->counts[i].values = vmalloc(size);
if (!dup->counts[i].values)
goto err_free;
memcpy(dup->counts[i].values, ctr->values, size);
}
return dup;
err_free:
gcov_info_free(dup);
return NULL;
}
/**
* gcov_info_free - release memory for profiling data set duplicate
* @info: profiling data set duplicate to free
*/
void gcov_info_free(struct gcov_info *info)
{
unsigned int active = num_counter_active(info);
unsigned int i;
for (i = 0; i < active ; i++)
vfree(info->counts[i].values);
kfree(info->functions);
kfree(info->filename);
kfree(info);
}
/**
* struct type_info - iterator helper array
* @ctr_type: counter type
* @offset: index of the first value of the current function for this type
*
* This array is needed to convert the in-memory data format into the in-file
* data format:
*
* In-memory:
* for each counter type
* for each function
* values
*
* In-file:
* for each function
* for each counter type
* values
*
* See gcc source gcc/gcov-io.h for more information on data organization.
*/
struct type_info {
int ctr_type;
unsigned int offset;
};
/**
* struct gcov_iterator - specifies current file position in logical records
* @info: associated profiling data
* @record: record type
* @function: function number
* @type: counter type
* @count: index into values array
* @num_types: number of counter types
* @type_info: helper array to get values-array offset for current function
*/
struct gcov_iterator {
struct gcov_info *info;
int record;
unsigned int function;
unsigned int type;
unsigned int count;
int num_types;
struct type_info type_info[0];
};
static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
{
return get_fn_info(iter->info, iter->function);
}
static struct type_info *get_type(struct gcov_iterator *iter)
{
return &iter->type_info[iter->type];
}
/**
* gcov_iter_new - allocate and initialize profiling data iterator
* @info: profiling data set to be iterated
*
* Return file iterator on success, %NULL otherwise.
*/
struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
{
struct gcov_iterator *iter;
iter = kzalloc(sizeof(struct gcov_iterator) +
num_counter_active(info) * sizeof(struct type_info),
GFP_KERNEL);
if (iter)
iter->info = info;
return iter;
}
/**
* gcov_iter_free - release memory for iterator
* @iter: file iterator to free
*/
void gcov_iter_free(struct gcov_iterator *iter)
{
kfree(iter);
}
/**
* gcov_iter_get_info - return profiling data set for given file iterator
* @iter: file iterator
*/
struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
{
return iter->info;
}
/**
* gcov_iter_start - reset file iterator to starting position
* @iter: file iterator
*/
void gcov_iter_start(struct gcov_iterator *iter)
{
int i;
iter->record = 0;
iter->function = 0;
iter->type = 0;
iter->count = 0;
iter->num_types = 0;
for (i = 0; i < GCOV_COUNTERS; i++) {
if (counter_active(iter->info, i)) {
iter->type_info[iter->num_types].ctr_type = i;
iter->type_info[iter->num_types++].offset = 0;
}
}
}
/* Mapping of logical record number to actual file content. */
#define RECORD_FILE_MAGIC 0
#define RECORD_GCOV_VERSION 1
#define RECORD_TIME_STAMP 2
#define RECORD_FUNCTION_TAG 3
#define RECORD_FUNCTON_TAG_LEN 4
#define RECORD_FUNCTION_IDENT 5
#define RECORD_FUNCTION_CHECK 6
#define RECORD_COUNT_TAG 7
#define RECORD_COUNT_LEN 8
#define RECORD_COUNT 9
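/*
 * Illustrative record sequence for a data file with one function and one
 * active counter type: RECORD_FILE_MAGIC, RECORD_GCOV_VERSION,
 * RECORD_TIME_STAMP, then RECORD_FUNCTION_TAG, RECORD_FUNCTON_TAG_LEN,
 * RECORD_FUNCTION_IDENT, RECORD_FUNCTION_CHECK, RECORD_COUNT_TAG,
 * RECORD_COUNT_LEN and one RECORD_COUNT per counter value, after which
 * gcov_iter_next() reports EOF.
 */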
/**
* gcov_iter_next - advance file iterator to next logical record
* @iter: file iterator
*
* Return zero if new position is valid, non-zero if iterator has reached end.
*/
int gcov_iter_next(struct gcov_iterator *iter)
{
switch (iter->record) {
case RECORD_FILE_MAGIC:
case RECORD_GCOV_VERSION:
case RECORD_FUNCTION_TAG:
case RECORD_FUNCTON_TAG_LEN:
case RECORD_FUNCTION_IDENT:
case RECORD_COUNT_TAG:
/* Advance to next record */
iter->record++;
break;
case RECORD_COUNT:
/* Advance to next count */
iter->count++;
/* fall through */
case RECORD_COUNT_LEN:
if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
iter->record = RECORD_COUNT;
break;
}
/* Advance to next counter type */
get_type(iter)->offset += iter->count;
iter->count = 0;
iter->type++;
/* fall through */
case RECORD_FUNCTION_CHECK:
if (iter->type < iter->num_types) {
iter->record = RECORD_COUNT_TAG;
break;
}
/* Advance to next function */
iter->type = 0;
iter->function++;
/* fall through */
case RECORD_TIME_STAMP:
if (iter->function < iter->info->n_functions)
iter->record = RECORD_FUNCTION_TAG;
else
iter->record = -1;
break;
}
/* Check for EOF. */
if (iter->record == -1)
return -EINVAL;
else
return 0;
}
/**
* seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
* @seq: seq_file handle
* @v: value to be stored
*
* Number format defined by gcc: numbers are recorded in the 32 bit
* unsigned binary form of the endianness of the machine generating the
* file.
*/
static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
{
return seq_write(seq, &v, sizeof(v));
}
/**
* seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
* @seq: seq_file handle
* @v: value to be stored
*
* Number format defined by gcc: numbers are recorded in the 32 bit
* unsigned binary form of the endianness of the machine generating the
* file. 64 bit numbers are stored as two 32 bit numbers, the low part
* first.
*/
static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
{
u32 data[2];
data[0] = (v & 0xffffffffUL);
data[1] = (v >> 32);
return seq_write(seq, data, sizeof(data));
}
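/*
 * Example (illustrative): the 64 bit value 0x0000000100000002 is emitted as
 * the 32 bit word 0x00000002 (low part) followed by 0x00000001 (high part),
 * each in the machine's native byte order.
 */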
/**
* gcov_iter_write - write data for current pos to seq_file
* @iter: file iterator
* @seq: seq_file handle
*
* Return zero on success, non-zero otherwise.
*/
int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
{
int rc = -EINVAL;
switch (iter->record) {
case RECORD_FILE_MAGIC:
rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
break;
case RECORD_GCOV_VERSION:
rc = seq_write_gcov_u32(seq, iter->info->version);
break;
case RECORD_TIME_STAMP:
rc = seq_write_gcov_u32(seq, iter->info->stamp);
break;
case RECORD_FUNCTION_TAG:
rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
break;
case RECORD_FUNCTON_TAG_LEN:
rc = seq_write_gcov_u32(seq, 2);
break;
case RECORD_FUNCTION_IDENT:
rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
break;
case RECORD_FUNCTION_CHECK:
rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
break;
case RECORD_COUNT_TAG:
rc = seq_write_gcov_u32(seq,
GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
break;
case RECORD_COUNT_LEN:
rc = seq_write_gcov_u32(seq,
get_func(iter)->n_ctrs[iter->type] * 2);
break;
case RECORD_COUNT:
rc = seq_write_gcov_u64(seq,
iter->info->counts[iter->type].
values[iter->count + get_type(iter)->offset]);
break;
}
return rc;
}

128
kernel/kernel/gcov/gcov.h Normal file
View File

@@ -0,0 +1,128 @@
/*
* Profiling infrastructure declarations.
*
* This file is based on gcc-internal definitions. Data structures are
* defined to be compatible with gcc counterparts. For a better
* understanding, refer to gcc source: gcc/gcov-io.h.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
*/
#ifndef GCOV_H
#define GCOV_H GCOV_H
#include <linux/types.h>
/*
* Profiling data types used for gcc 3.4 and above - these are defined by
* gcc and need to be kept as close to the original definition as possible to
* remain compatible.
*/
#define GCOV_COUNTERS 5
#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
#define GCOV_TAG_FOR_COUNTER(count) \
(GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
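/*
 * Example (illustrative): GCOV_TAG_FOR_COUNTER(0) evaluates to 0x01a10000
 * and GCOV_TAG_FOR_COUNTER(1) to 0x01a30000, since the counter type is
 * shifted left by 17 bits before being added to the base tag.
 */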
#if BITS_PER_LONG >= 64
typedef long gcov_type;
#else
typedef long long gcov_type;
#endif
/**
* struct gcov_fn_info - profiling meta data per function
* @ident: object file-unique function identifier
* @checksum: function checksum
* @n_ctrs: number of values per counter type belonging to this function
*
* This data is generated by gcc during compilation and doesn't change
* at run-time.
*/
struct gcov_fn_info {
unsigned int ident;
unsigned int checksum;
unsigned int n_ctrs[0];
};
/**
* struct gcov_ctr_info - profiling data per counter type
* @num: number of counter values for this type
* @values: array of counter values for this type
* @merge: merge function for counter values of this type (unused)
*
* This data is generated by gcc during compilation and doesn't change
* at run-time with the exception of the values array.
*/
struct gcov_ctr_info {
unsigned int num;
gcov_type *values;
void (*merge)(gcov_type *, unsigned int);
};
/**
* struct gcov_info - profiling data per object file
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: time stamp
* @filename: name of the associated gcov data file
* @n_functions: number of instrumented functions
* @functions: function data
* @ctr_mask: mask specifying which counter types are active
* @counts: counter data per counter type
*
* This data is generated by gcc during compilation and doesn't change
* at run-time with the exception of the next pointer.
*/
struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
const char *filename;
unsigned int n_functions;
const struct gcov_fn_info *functions;
unsigned int ctr_mask;
struct gcov_ctr_info counts[0];
};
/* Base interface. */
enum gcov_action {
GCOV_ADD,
GCOV_REMOVE,
};
void gcov_event(enum gcov_action action, struct gcov_info *info);
void gcov_enable_events(void);
/* Iterator control. */
struct seq_file;
struct gcov_iterator;
struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
void gcov_iter_free(struct gcov_iterator *iter);
void gcov_iter_start(struct gcov_iterator *iter);
int gcov_iter_next(struct gcov_iterator *iter);
int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
/* gcov_info control. */
void gcov_info_reset(struct gcov_info *info);
int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
struct gcov_info *gcov_info_dup(struct gcov_info *info);
void gcov_info_free(struct gcov_info *info);
struct gcov_link {
enum {
OBJ_TREE,
SRC_TREE,
} dir;
const char *ext;
};
extern const struct gcov_link gcov_link[];
#endif /* GCOV_H */

287
kernel/kernel/groups.c Normal file
View File

@@ -0,0 +1,287 @@
/*
* Supplementary group IDs
*/
#include <linux/cred.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
/* init to 2 - one for init_task, one to ensure it is never freed */
struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
struct group_info *groups_alloc(int gidsetsize)
{
struct group_info *group_info;
int nblocks;
int i;
nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
/* Make sure we always allocate at least one indirect block pointer */
nblocks = nblocks ? : 1;
group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
if (!group_info)
return NULL;
group_info->ngroups = gidsetsize;
group_info->nblocks = nblocks;
atomic_set(&group_info->usage, 1);
if (gidsetsize <= NGROUPS_SMALL)
group_info->blocks[0] = group_info->small_block;
else {
for (i = 0; i < nblocks; i++) {
gid_t *b;
b = (void *)__get_free_page(GFP_USER);
if (!b)
goto out_undo_partial_alloc;
group_info->blocks[i] = b;
}
}
return group_info;
out_undo_partial_alloc:
while (--i >= 0) {
free_page((unsigned long)group_info->blocks[i]);
}
kfree(group_info);
return NULL;
}
EXPORT_SYMBOL(groups_alloc);
void groups_free(struct group_info *group_info)
{
if (group_info->blocks[0] != group_info->small_block) {
int i;
for (i = 0; i < group_info->nblocks; i++)
free_page((unsigned long)group_info->blocks[i]);
}
kfree(group_info);
}
EXPORT_SYMBOL(groups_free);
/* export the group_info to a user-space array */
static int groups_to_user(gid_t __user *grouplist,
const struct group_info *group_info)
{
int i;
unsigned int count = group_info->ngroups;
for (i = 0; i < group_info->nblocks; i++) {
unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
unsigned int len = cp_count * sizeof(*grouplist);
if (copy_to_user(grouplist, group_info->blocks[i], len))
return -EFAULT;
grouplist += NGROUPS_PER_BLOCK;
count -= cp_count;
}
return 0;
}
/* fill a group_info from a user-space array - it must be allocated already */
static int groups_from_user(struct group_info *group_info,
gid_t __user *grouplist)
{
int i;
unsigned int count = group_info->ngroups;
for (i = 0; i < group_info->nblocks; i++) {
unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
unsigned int len = cp_count * sizeof(*grouplist);
if (copy_from_user(group_info->blocks[i], grouplist, len))
return -EFAULT;
grouplist += NGROUPS_PER_BLOCK;
count -= cp_count;
}
return 0;
}
/* a simple Shell sort */
static void groups_sort(struct group_info *group_info)
{
int base, max, stride;
int gidsetsize = group_info->ngroups;
for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
; /* nothing */
stride /= 3;
while (stride) {
max = gidsetsize - stride;
for (base = 0; base < max; base++) {
int left = base;
int right = left + stride;
gid_t tmp = GROUP_AT(group_info, right);
while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
GROUP_AT(group_info, right) =
GROUP_AT(group_info, left);
right = left;
left -= stride;
}
GROUP_AT(group_info, right) = tmp;
}
stride /= 3;
}
}
/* a simple bsearch */
int groups_search(const struct group_info *group_info, gid_t grp)
{
unsigned int left, right;
if (!group_info)
return 0;
left = 0;
right = group_info->ngroups;
while (left < right) {
unsigned int mid = (left+right)/2;
if (grp > GROUP_AT(group_info, mid))
left = mid + 1;
else if (grp < GROUP_AT(group_info, mid))
right = mid;
else
return 1;
}
return 0;
}
/**
* set_groups - Change a group subscription in a set of credentials
* @new: The newly prepared set of credentials to alter
* @group_info: The group list to install
*
* Validate a group subscription and, if valid, insert it into a set
* of credentials.
*/
int set_groups(struct cred *new, struct group_info *group_info)
{
int retval;
retval = security_task_setgroups(group_info);
if (retval)
return retval;
put_group_info(new->group_info);
groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
return 0;
}
EXPORT_SYMBOL(set_groups);
/**
* set_current_groups - Change current's group subscription
* @group_info: The group list to impose
*
* Validate a group subscription and, if valid, impose it upon current's task
* security record.
*/
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
int ret;
new = prepare_creds();
if (!new)
return -ENOMEM;
ret = set_groups(new, group_info);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
}
EXPORT_SYMBOL(set_current_groups);
SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
{
const struct cred *cred = current_cred();
int i;
if (gidsetsize < 0)
return -EINVAL;
/* no need to grab task_lock here; it cannot change */
i = cred->group_info->ngroups;
if (gidsetsize) {
if (i > gidsetsize) {
i = -EINVAL;
goto out;
}
if (groups_to_user(grouplist, cred->group_info)) {
i = -EFAULT;
goto out;
}
}
out:
return i;
}
/*
* SMP: Our groups are copy-on-write. We can set them safely
* without another task interfering.
*/
SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
{
struct group_info *group_info;
int retval;
if (!capable(CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
group_info = groups_alloc(gidsetsize);
if (!group_info)
return -ENOMEM;
retval = groups_from_user(group_info, grouplist);
if (retval) {
put_group_info(group_info);
return retval;
}
retval = set_current_groups(group_info);
put_group_info(group_info);
return retval;
}
/*
* Check whether we're fsgid/egid or in the supplemental group list.
*/
int in_group_p(gid_t grp)
{
const struct cred *cred = current_cred();
int retval = 1;
if (grp != cred->fsgid)
retval = groups_search(cred->group_info, grp);
return retval;
}
EXPORT_SYMBOL(in_group_p);
int in_egroup_p(gid_t grp)
{
const struct cred *cred = current_cred();
int retval = 1;
if (grp != cred->egid)
retval = groups_search(cred->group_info, grp);
return retval;
}
EXPORT_SYMBOL(in_egroup_p);

1843
kernel/kernel/hrtimer.c Normal file

File diff suppressed because it is too large

217
kernel/kernel/hung_task.c Normal file
View File

@@ -0,0 +1,217 @@
/*
* Detect Hung Task
*
* kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
*
*/
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/module.h>
#include <linux/sysctl.h>
/*
* The number of tasks checked:
*/
unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Limit number of tasks checked in a batch.
*
* This value controls the preemptibility of khungtaskd since preemption
* is disabled during the critical section. It also controls the size of
* the RCU grace period. So it needs to be upper-bounded.
*/
#define HUNG_TASK_BATCHING 1024
/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
unsigned long __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static struct task_struct *watchdog_task;
/*
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
*/
unsigned int __read_mostly sysctl_hung_task_panic =
CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
static int __init hung_task_panic_setup(char *str)
{
sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
return 1;
}
__setup("hung_task_panic=", hung_task_panic_setup);
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
did_panic = 1;
return NOTIFY_DONE;
}
static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
/*
* Ensure the task is not frozen.
* Also, a freshly created task that has been scheduled once and changed
* its state to TASK_UNINTERRUPTIBLE without ever having been
* switched out must not be checked.
*/
if (unlikely(t->flags & PF_FROZEN || !switch_count))
return;
if (switch_count != t->last_switch_count) {
t->last_switch_count = switch_count;
return;
}
if (!sysctl_hung_task_warnings)
return;
sysctl_hung_task_warnings--;
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
printk(KERN_ERR "INFO: task %s:%d blocked for more than "
"%ld seconds.\n", t->comm, t->pid, timeout);
printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
__debug_show_held_locks(t);
touch_nmi_watchdog();
if (sysctl_hung_task_panic)
panic("hung_task: blocked tasks");
}
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
*
* For preemptible RCU it is sufficient to call rcu_read_unlock in order
* to exit the grace period. For classic RCU, a reschedule is required.
*/
static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
get_task_struct(g);
get_task_struct(t);
rcu_read_unlock();
cond_resched();
rcu_read_lock();
put_task_struct(t);
put_task_struct(g);
}
/*
* Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
* a really long time (120 seconds). If that happens, print out
* a warning.
*/
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
int batch_count = HUNG_TASK_BATCHING;
struct task_struct *g, *t;
/*
* If the system crashed already then all bets are off,
* do not report extra hung tasks:
*/
if (test_taint(TAINT_DIE) || did_panic)
return;
rcu_read_lock();
do_each_thread(g, t) {
if (!--max_count)
goto unlock;
if (!--batch_count) {
batch_count = HUNG_TASK_BATCHING;
rcu_lock_break(g, t);
/* Exit if t or g was unhashed during refresh. */
if (t->state == TASK_DEAD || g->state == TASK_DEAD)
goto unlock;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
} while_each_thread(g, t);
unlock:
rcu_read_unlock();
}
static unsigned long timeout_jiffies(unsigned long timeout)
{
/* timeout of 0 will disable the watchdog */
return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
}
/*
* Process updating of timeout sysctl
*/
int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto out;
wake_up_process(watchdog_task);
out:
return ret;
}
/*
* kthread which checks for tasks stuck in D state
*/
static int watchdog(void *dummy)
{
set_user_nice(current, 0);
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
timeout = sysctl_hung_task_timeout_secs;
check_hung_uninterruptible_tasks(timeout);
}
return 0;
}
static int __init hung_task_init(void)
{
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
return 0;
}
module_init(hung_task_init);

View File

@@ -0,0 +1,7 @@
obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
obj-$(CONFIG_PM_SLEEP) += pm.o

View File

@@ -0,0 +1,196 @@
/*
* linux/kernel/irq/autoprobe.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the interrupt probing code and driver APIs.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/async.h>
#include "internals.h"
/*
* Autodetection depends on the fact that any interrupt that
* comes in on an unassigned handler will get stuck with
* "IRQ_WAITING" cleared and the interrupt disabled.
*/
static DEFINE_MUTEX(probing_active);
/**
* probe_irq_on - begin an interrupt autodetect
*
* Commence probing for an interrupt. The interrupts are scanned
* and a mask of potential interrupt lines is returned.
*
*/
unsigned long probe_irq_on(void)
{
struct irq_desc *desc;
unsigned long mask = 0;
unsigned int status;
int i;
/*
* quiesce the kernel, or at least the asynchronous portion
*/
async_synchronize_full();
mutex_lock(&probing_active);
/*
* something may have generated an irq long ago and we want to
* flush such a longstanding irq before considering it as spurious.
*/
for_each_irq_desc_reverse(i, desc) {
spin_lock_irq(&desc->lock);
if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
/*
* An old-style architecture might still have
* the handle_bad_irq handler there:
*/
compat_irq_chip_set_default_handler(desc);
/*
* Some chips need to know about probing in
* progress:
*/
if (desc->chip->set_type)
desc->chip->set_type(i, IRQ_TYPE_PROBE);
desc->chip->startup(i);
}
spin_unlock_irq(&desc->lock);
}
/* Wait for longstanding interrupts to trigger. */
msleep(20);
/*
* enable any unassigned irqs
* (we must startup again here because if a longstanding irq
* happened in the previous stage, it may have masked itself)
*/
for_each_irq_desc_reverse(i, desc) {
spin_lock_irq(&desc->lock);
if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
if (desc->chip->startup(i))
desc->status |= IRQ_PENDING;
}
spin_unlock_irq(&desc->lock);
}
/*
* Wait for spurious interrupts to trigger
*/
msleep(100);
/*
* Now filter out any obviously spurious interrupts
*/
for_each_irq_desc(i, desc) {
spin_lock_irq(&desc->lock);
status = desc->status;
if (status & IRQ_AUTODETECT) {
/* It triggered already - consider it spurious. */
if (!(status & IRQ_WAITING)) {
desc->status = status & ~IRQ_AUTODETECT;
desc->chip->shutdown(i);
} else
if (i < 32)
mask |= 1 << i;
}
spin_unlock_irq(&desc->lock);
}
return mask;
}
EXPORT_SYMBOL(probe_irq_on);
/**
* probe_irq_mask - scan a bitmap of interrupt lines
* @val: mask of interrupts to consider
*
* Scan the interrupt lines and return a bitmap of active
* autodetect interrupts. The interrupt probe logic state
* is then returned to its previous value.
*
* Note: we need to scan all the irqs even though we will
* only return autodetect irq numbers - just so that we reset
* them all to a known state.
*/
unsigned int probe_irq_mask(unsigned long val)
{
unsigned int status, mask = 0;
struct irq_desc *desc;
int i;
for_each_irq_desc(i, desc) {
spin_lock_irq(&desc->lock);
status = desc->status;
if (status & IRQ_AUTODETECT) {
if (i < 16 && !(status & IRQ_WAITING))
mask |= 1 << i;
desc->status = status & ~IRQ_AUTODETECT;
desc->chip->shutdown(i);
}
spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
return mask & val;
}
EXPORT_SYMBOL(probe_irq_mask);
/**
* probe_irq_off - end an interrupt autodetect
* @val: mask of potential interrupts (unused)
*
* Scans the unused interrupt lines and returns the line which
* appears to have triggered the interrupt. If no interrupt was
* found then zero is returned. If more than one interrupt is
* found then minus the first candidate is returned to indicate
* there is doubt.
*
* The interrupt probe logic state is returned to its previous
* value.
*
* BUGS: When used in a module (which arguably shouldn't happen)
* nothing prevents two IRQ probe callers from overlapping. The
* results of this are non-optimal.
*/
int probe_irq_off(unsigned long val)
{
int i, irq_found = 0, nr_of_irqs = 0;
struct irq_desc *desc;
unsigned int status;
for_each_irq_desc(i, desc) {
spin_lock_irq(&desc->lock);
status = desc->status;
if (status & IRQ_AUTODETECT) {
if (!(status & IRQ_WAITING)) {
if (!nr_of_irqs)
irq_found = i;
nr_of_irqs++;
}
desc->status = status & ~IRQ_AUTODETECT;
desc->chip->shutdown(i);
}
spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
if (nr_of_irqs > 1)
irq_found = -irq_found;
return irq_found;
}
EXPORT_SYMBOL(probe_irq_off);

747
kernel/kernel/irq/chip.c Normal file
View File

@@ -0,0 +1,747 @@
/*
* linux/kernel/irq/chip.c
*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
* This file contains the core interrupt handling code, for irq-chip
* based architectures.
*
* Detailed information is available in Documentation/DocBook/genericirq
*/
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include "internals.h"
static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
{
struct irq_desc *desc;
unsigned long flags;
desc = irq_to_desc(irq);
if (!desc) {
WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
return;
}
/* Ensure we don't have left over values from a previous use of this irq */
spin_lock_irqsave(&desc->lock, flags);
desc->status = IRQ_DISABLED;
desc->chip = &no_irq_chip;
desc->handle_irq = handle_bad_irq;
desc->depth = 1;
desc->msi_desc = NULL;
desc->handler_data = NULL;
if (!keep_chip_data)
desc->chip_data = NULL;
desc->action = NULL;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
#ifdef CONFIG_SMP
cpumask_setall(desc->affinity);
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_clear(desc->pending_mask);
#endif
#endif
spin_unlock_irqrestore(&desc->lock, flags);
}
/**
* dynamic_irq_init - initialize a dynamically allocated irq
* @irq: irq number to initialize
*/
void dynamic_irq_init(unsigned int irq)
{
dynamic_irq_init_x(irq, false);
}
/**
* dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
* @irq: irq number to initialize
*
* does not set irq_to_desc(irq)->chip_data to NULL
*/
void dynamic_irq_init_keep_chip_data(unsigned int irq)
{
dynamic_irq_init_x(irq, true);
}
static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc) {
WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
return;
}
spin_lock_irqsave(&desc->lock, flags);
if (desc->action) {
spin_unlock_irqrestore(&desc->lock, flags);
WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
irq);
return;
}
desc->msi_desc = NULL;
desc->handler_data = NULL;
if (!keep_chip_data)
desc->chip_data = NULL;
desc->handle_irq = handle_bad_irq;
desc->chip = &no_irq_chip;
desc->name = NULL;
clear_kstat_irqs(desc);
spin_unlock_irqrestore(&desc->lock, flags);
}
/**
* dynamic_irq_cleanup - cleanup a dynamically allocated irq
* @irq: irq number to initialize
*/
void dynamic_irq_cleanup(unsigned int irq)
{
dynamic_irq_cleanup_x(irq, false);
}
/**
* dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
* @irq: irq number to initialize
*
* does not set irq_to_desc(irq)->chip_data to NULL
*/
void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
{
dynamic_irq_cleanup_x(irq, true);
}
/**
* set_irq_chip - set the irq chip for an irq
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
int set_irq_chip(unsigned int irq, struct irq_chip *chip)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc) {
WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
return -EINVAL;
}
if (!chip)
chip = &no_irq_chip;
spin_lock_irqsave(&desc->lock, flags);
irq_chip_set_defaults(chip);
desc->chip = chip;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
EXPORT_SYMBOL(set_irq_chip);
/**
* set_irq_type - set the irq trigger type for an irq
* @irq: irq number
* @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
int set_irq_type(unsigned int irq, unsigned int type)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
int ret = -ENXIO;
if (!desc) {
printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
return -ENODEV;
}
type &= IRQ_TYPE_SENSE_MASK;
if (type == IRQ_TYPE_NONE)
return 0;
spin_lock_irqsave(&desc->lock, flags);
ret = __irq_set_trigger(desc, irq, type);
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
EXPORT_SYMBOL(set_irq_type);
/**
* set_irq_data - set irq type data for an irq
* @irq: Interrupt number
* @data: Pointer to interrupt specific data
*
* Set the hardware irq controller data for an irq
*/
int set_irq_data(unsigned int irq, void *data)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc) {
printk(KERN_ERR
"Trying to install controller data for IRQ%d\n", irq);
return -EINVAL;
}
spin_lock_irqsave(&desc->lock, flags);
desc->handler_data = data;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
EXPORT_SYMBOL(set_irq_data);
/**
* set_irq_msi - set MSI descriptor data for an irq
* @irq: Interrupt number
* @entry: Pointer to MSI descriptor data
*
* Set the MSI descriptor entry for an irq
*/
int set_irq_msi(unsigned int irq, struct msi_desc *entry)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc) {
printk(KERN_ERR
"Trying to install msi data for IRQ%d\n", irq);
return -EINVAL;
}
spin_lock_irqsave(&desc->lock, flags);
desc->msi_desc = entry;
if (entry)
entry->irq = irq;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
/**
* set_irq_chip_data - set irq chip data for an irq
* @irq: Interrupt number
* @data: Pointer to chip specific data
*
* Set the hardware irq chip data for an irq
*/
int set_irq_chip_data(unsigned int irq, void *data)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc) {
printk(KERN_ERR
"Trying to install chip data for IRQ%d\n", irq);
return -EINVAL;
}
if (!desc->chip) {
printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
return -EINVAL;
}
spin_lock_irqsave(&desc->lock, flags);
desc->chip_data = data;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
EXPORT_SYMBOL(set_irq_chip_data);
/**
* set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
*
* @irq: Interrupt number
* @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
*
* The IRQ_NESTED_THREAD flag indicates that on
* request_threaded_irq() no separate interrupt thread should be
* created for the irq as the handler are called nested in the
* context of a demultiplexing interrupt handler thread.
*/
void set_irq_nested_thread(unsigned int irq, int nest)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc)
return;
spin_lock_irqsave(&desc->lock, flags);
if (nest)
desc->status |= IRQ_NESTED_THREAD;
else
desc->status &= ~IRQ_NESTED_THREAD;
spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL_GPL(set_irq_nested_thread);
/*
* default enable function
*/
static void default_enable(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
desc->chip->unmask(irq);
desc->status &= ~IRQ_MASKED;
}
/*
* default disable function
*/
static void default_disable(unsigned int irq)
{
}
/*
* default startup function
*/
static unsigned int default_startup(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
desc->chip->enable(irq);
return 0;
}
/*
* default shutdown function
*/
static void default_shutdown(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
desc->chip->mask(irq);
desc->status |= IRQ_MASKED;
}
/*
* Fixup enable/disable function pointers
*/
void irq_chip_set_defaults(struct irq_chip *chip)
{
if (!chip->enable)
chip->enable = default_enable;
if (!chip->disable)
chip->disable = default_disable;
if (!chip->startup)
chip->startup = default_startup;
/*
* We use chip->disable when the user provided their own. When
* chip->disable is left as default_disable, we need
* to use default_shutdown instead, otherwise the irq line is not
* disabled on free_irq():
*/
if (!chip->shutdown)
chip->shutdown = chip->disable != default_disable ?
chip->disable : default_shutdown;
if (!chip->name)
chip->name = chip->typename;
if (!chip->end)
chip->end = dummy_irq_chip.end;
}
static inline void mask_ack_irq(struct irq_desc *desc, int irq)
{
if (desc->chip->mask_ack)
desc->chip->mask_ack(irq);
else {
desc->chip->mask(irq);
if (desc->chip->ack)
desc->chip->ack(irq);
}
}
/*
* handle_nested_irq - Handle a nested irq from an irq thread
* @irq: the interrupt number
*
* Handle interrupts which are nested into a threaded interrupt
* handler. The handler function is called inside the calling
* thread's context.
*/
void handle_nested_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
irqreturn_t action_ret;
might_sleep();
spin_lock_irq(&desc->lock);
kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
if (unlikely(!action || (desc->status & IRQ_DISABLED)))
goto out_unlock;
desc->status |= IRQ_INPROGRESS;
spin_unlock_irq(&desc->lock);
action_ret = action->thread_fn(action->irq, action->dev_id);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
spin_lock_irq(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
out_unlock:
spin_unlock_irq(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Simple interrupts are either sent from a demultiplexing interrupt
* handler or come from hardware, where no interrupt hardware control
* is necessary.
*
* Note: The caller is expected to handle the ack, clear, mask and
* unmask issues if necessary.
*/
void
handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
struct irqaction *action;
irqreturn_t action_ret;
spin_lock(&desc->lock);
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out_unlock;
desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
if (unlikely(!action || (desc->status & IRQ_DISABLED)))
goto out_unlock;
desc->status |= IRQ_INPROGRESS;
spin_unlock(&desc->lock);
action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
out_unlock:
spin_unlock(&desc->lock);
}
/**
* handle_level_irq - Level type irq handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Level type interrupts are active as long as the hardware line has
* the active level. This may require masking the interrupt and unmasking
* it after the associated handler has acknowledged the device, so that the
* interrupt line goes back to inactive.
*/
void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
struct irqaction *action;
irqreturn_t action_ret;
spin_lock(&desc->lock);
mask_ack_irq(desc, irq);
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out_unlock;
desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
/*
* If it's disabled or no action is available,
* keep it masked and get out of here
*/
action = desc->action;
if (unlikely(!action || (desc->status & IRQ_DISABLED)))
goto out_unlock;
desc->status |= IRQ_INPROGRESS;
spin_unlock(&desc->lock);
action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
if (unlikely(desc->status & IRQ_ONESHOT))
desc->status |= IRQ_MASKED;
else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
desc->chip->unmask(irq);
out_unlock:
spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);
/**
* handle_fasteoi_irq - irq handler for transparent controllers
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Only a single callback will be issued to the chip: an ->eoi()
* call when the interrupt has been serviced. This enables support
* for modern forms of interrupt handlers, which handle the flow
* details in hardware, transparently.
*/
void
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
struct irqaction *action;
irqreturn_t action_ret;
spin_lock(&desc->lock);
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out;
desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
/*
* If it's disabled or no action is available,
* then mask it and get out of here:
*/
action = desc->action;
if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
desc->status |= IRQ_PENDING;
if (desc->chip->mask)
desc->chip->mask(irq);
goto out;
}
desc->status |= IRQ_INPROGRESS;
desc->status &= ~IRQ_PENDING;
spin_unlock(&desc->lock);
action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
out:
desc->chip->eoi(irq);
spin_unlock(&desc->lock);
}
/**
* handle_edge_irq - edge type IRQ handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* The interrupt occurs on the falling and/or rising edge of a hardware
* signal. The occurrence is latched into the irq controller hardware
* and must be acked in order to be re-enabled. After the ack another
* interrupt can happen on the same source even before the first one
* is handled by the associated event handler. If this happens it
* might be necessary to disable (mask) the interrupt depending on the
* controller hardware. This requires re-enabling the interrupt inside
* the loop which handles the interrupts that have arrived while
* the handler was running. Once all pending interrupts are handled, the
* loop is left.
*/
void
handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
spin_lock(&desc->lock);
desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
/*
* If we're currently running this IRQ, or it's disabled,
* we shouldn't process the IRQ. Mark it pending, handle
* the necessary masking and get out.
*/
if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
!desc->action)) {
desc->status |= (IRQ_PENDING | IRQ_MASKED);
mask_ack_irq(desc, irq);
goto out_unlock;
}
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
if (desc->chip->ack)
desc->chip->ack(irq);
/* Mark the IRQ currently in progress.*/
desc->status |= IRQ_INPROGRESS;
do {
struct irqaction *action = desc->action;
irqreturn_t action_ret;
if (unlikely(!action)) {
desc->chip->mask(irq);
goto out_unlock;
}
/*
* When another irq arrived while we were handling
* one, we could have masked the irq.
* Re-enable it, if it was not disabled in the meantime.
*/
if (unlikely((desc->status &
(IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
(IRQ_PENDING | IRQ_MASKED))) {
desc->chip->unmask(irq);
desc->status &= ~IRQ_MASKED;
}
desc->status &= ~IRQ_PENDING;
spin_unlock(&desc->lock);
action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
desc->status &= ~IRQ_INPROGRESS;
out_unlock:
spin_unlock(&desc->lock);
}
/**
* handle_percpu_irq - Per CPU local irq handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Per CPU interrupts on SMP machines without locking requirements
*/
void
handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
irqreturn_t action_ret;
kstat_incr_irqs_this_cpu(irq, desc);
if (desc->chip->ack)
desc->chip->ack(irq);
action_ret = handle_IRQ_event(irq, desc->action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
if (desc->chip->eoi)
desc->chip->eoi(irq);
}
void
__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
const char *name)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc) {
printk(KERN_ERR
"Trying to install type control for IRQ%d\n", irq);
return;
}
if (!handle)
handle = handle_bad_irq;
else if (desc->chip == &no_irq_chip) {
printk(KERN_WARNING "Trying to install %sinterrupt handler "
"for IRQ%d\n", is_chained ? "chained " : "", irq);
/*
* Some ARM implementations install a handler for really dumb
* interrupt hardware without setting an irq_chip. This worked
* with the ARM no_irq_chip but the check in setup_irq would
* prevent us from setting up the interrupt at all. Switch it to
* dummy_irq_chip for easy transition.
*/
desc->chip = &dummy_irq_chip;
}
chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
/* Uninstall? */
if (handle == handle_bad_irq) {
if (desc->chip != &no_irq_chip)
mask_ack_irq(desc, irq);
desc->status |= IRQ_DISABLED;
desc->depth = 1;
}
desc->handle_irq = handle;
desc->name = name;
if (handle != handle_bad_irq && is_chained) {
desc->status &= ~IRQ_DISABLED;
desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
desc->depth = 0;
desc->chip->startup(irq);
}
spin_unlock_irqrestore(&desc->lock, flags);
chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL_GPL(__set_irq_handler);
void
set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
irq_flow_handler_t handle)
{
set_irq_chip(irq, chip);
__set_irq_handler(irq, handle, 0, NULL);
}
void
set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
set_irq_chip(irq, chip);
__set_irq_handler(irq, handle, 0, name);
}
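/*
 * Illustrative sketch only (hypothetical driver code, not part of this
 * file): an interrupt-controller driver would typically pick one of the
 * flow handlers above per line type.  "my_chip", MY_LEVEL_IRQ and
 * MY_EDGE_IRQ are assumed names used purely for the example.
 */
#define MY_LEVEL_IRQ	16	/* hypothetical line numbers */
#define MY_EDGE_IRQ	17

static struct irq_chip my_chip;		/* .mask/.unmask/.ack filled in */

static void __init my_board_init_irqs(void)
{
	/* level-triggered line: kept masked while the handler runs */
	set_irq_chip_and_handler_name(MY_LEVEL_IRQ, &my_chip,
				      handle_level_irq, "level");
	/* edge-triggered line: acked early, pending edges are replayed */
	set_irq_chip_and_handler_name(MY_EDGE_IRQ, &my_chip,
				      handle_edge_irq, "edge");
}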
void __init set_irq_noprobe(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc) {
printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
return;
}
spin_lock_irqsave(&desc->lock, flags);
desc->status |= IRQ_NOPROBE;
spin_unlock_irqrestore(&desc->lock, flags);
}
void __init set_irq_probe(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
if (!desc) {
printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
return;
}
spin_lock_irqsave(&desc->lock, flags);
desc->status &= ~IRQ_NOPROBE;
spin_unlock_irqrestore(&desc->lock, flags);
}

View File

@@ -0,0 +1,94 @@
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/device.h>
#include <linux/gfp.h>
/*
* Device resource management aware IRQ request/free implementation.
*/
struct irq_devres {
unsigned int irq;
void *dev_id;
};
static void devm_irq_release(struct device *dev, void *res)
{
struct irq_devres *this = res;
free_irq(this->irq, this->dev_id);
}
static int devm_irq_match(struct device *dev, void *res, void *data)
{
struct irq_devres *this = res, *match = data;
return this->irq == match->irq && this->dev_id == match->dev_id;
}
/**
* devm_request_threaded_irq - allocate an interrupt line for a managed device
* @dev: device to request interrupt for
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
* @thread_fn: function to be called in a threaded interrupt context. NULL
* for devices which handle everything in @handler
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
*
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as
* request_irq(). IRQs requested with this function will be
* automatically freed on driver detach.
*
* If an IRQ allocated with this function needs to be freed
* separately, devm_free_irq() must be used.
*/
int devm_request_threaded_irq(struct device *dev, unsigned int irq,
irq_handler_t handler, irq_handler_t thread_fn,
unsigned long irqflags, const char *devname,
void *dev_id)
{
struct irq_devres *dr;
int rc;
dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
GFP_KERNEL);
if (!dr)
return -ENOMEM;
rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
dev_id);
if (rc) {
devres_free(dr);
return rc;
}
dr->irq = irq;
dr->dev_id = dev_id;
devres_add(dev, dr);
return 0;
}
EXPORT_SYMBOL(devm_request_threaded_irq);
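/*
 * Illustrative sketch only (hypothetical driver code, not part of this
 * file): a probe() routine using the managed request above.  The names
 * "my_irq_handler", "my-device" and my_data are assumptions; the line is
 * released automatically when the device is detached, so no explicit
 * free_irq() is needed (devm_free_irq() below can release it early).
 */
static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/* acknowledge the hypothetical device here */
	return IRQ_HANDLED;
}

static int my_probe(struct device *dev, unsigned int irq, void *my_data)
{
	return devm_request_threaded_irq(dev, irq, my_irq_handler, NULL,
					 0, "my-device", my_data);
}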
/**
* devm_free_irq - free an interrupt
* @dev: device to free interrupt for
* @irq: Interrupt line to free
* @dev_id: Device identity to free
*
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as free_irq().
* This function, instead of free_irq(), should be used to manually
* free IRQs allocated with devm_request_irq().
*/
void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
{
struct irq_devres match_data = { irq, dev_id };
free_irq(irq, dev_id);
WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
&match_data));
}
EXPORT_SYMBOL(devm_free_irq);

561
kernel/kernel/irq/handle.c Normal file
View File

@@ -0,0 +1,561 @@
/*
* linux/kernel/irq/handle.c
*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
* This file contains the core interrupt handling code.
*
* Detailed information is available in Documentation/DocBook/genericirq
*
*/
#include <linux/irq.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/rculist.h>
#include <linux/hash.h>
#include <linux/bootmem.h>
#include <trace/events/irq.h>
#include "internals.h"
/*
* lockdep: we want to handle all irq_desc locks as a single lock-class:
*/
struct lock_class_key irq_desc_lock_class;
/**
* handle_bad_irq - handle spurious and unhandled irqs
* @irq: the interrupt number
* @desc: description of the interrupt
*
* Handles spurious and unhandled IRQs. It also prints a debug message.
*/
void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
{
print_irq_desc(irq, desc);
kstat_incr_irqs_this_cpu(irq, desc);
ack_bad_irq(irq);
}
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
static void __init init_irq_default_affinity(void)
{
alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
cpumask_setall(irq_default_affinity);
}
#else
static void __init init_irq_default_affinity(void)
{
}
#endif
/*
* Linux has a controller-independent interrupt architecture.
* Every controller has a 'controller-template' that is used
* by the main code to do the right thing. Each driver-visible
* interrupt source is transparently wired to the appropriate
* controller. Thus drivers need not be aware of the
* interrupt-controller.
*
* The code is designed to be easily extended with new/different
* interrupt controllers, without having to do assembly magic or
* having to touch the generic code.
*
* Controller mappings for all interrupt sources:
*/
int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
#ifdef CONFIG_SPARSE_IRQ
static struct irq_desc irq_desc_init = {
.irq = -1,
.status = IRQ_DISABLED,
.chip = &no_irq_chip,
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
};
void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
{
void *ptr;
if (slab_is_available())
ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
GFP_ATOMIC, node);
else
ptr = alloc_bootmem_node(NODE_DATA(node),
nr * sizeof(*desc->kstat_irqs));
/*
* Don't overwrite if we can not get a new one;
* init_copy_kstat_irqs() could still use the old one.
*/
if (ptr) {
printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
desc->kstat_irqs = ptr;
}
}
static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
{
memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
spin_lock_init(&desc->lock);
desc->irq = irq;
#ifdef CONFIG_SMP
desc->node = node;
#endif
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
init_kstat_irqs(desc, node, nr_cpu_ids);
if (!desc->kstat_irqs) {
printk(KERN_ERR "can not alloc kstat_irqs\n");
BUG_ON(1);
}
if (!alloc_desc_masks(desc, node, false)) {
printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
BUG_ON(1);
}
init_desc_masks(desc);
arch_init_chip_data(desc, node);
}
/*
* Protect the sparse_irqs:
*/
DEFINE_SPINLOCK(sparse_irq_lock);
struct irq_desc **irq_desc_ptrs __read_mostly;
static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS_LEGACY-1] = {
.irq = -1,
.status = IRQ_DISABLED,
.chip = &no_irq_chip,
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
}
};
static unsigned int *kstat_irqs_legacy;
int __init early_irq_init(void)
{
struct irq_desc *desc;
int legacy_count;
int node;
int i;
init_irq_default_affinity();
/* initialize nr_irqs based on nr_cpu_ids */
arch_probe_nr_irqs();
printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
desc = irq_desc_legacy;
legacy_count = ARRAY_SIZE(irq_desc_legacy);
node = first_online_node;
/* allocate irq_desc_ptrs array based on nr_irqs */
irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
/* allocate based on nr_cpu_ids */
kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
sizeof(int), GFP_NOWAIT, node);
for (i = 0; i < legacy_count; i++) {
desc[i].irq = i;
#ifdef CONFIG_SMP
desc[i].node = node;
#endif
desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
alloc_desc_masks(&desc[i], node, true);
init_desc_masks(&desc[i]);
irq_desc_ptrs[i] = desc + i;
}
for (i = legacy_count; i < nr_irqs; i++)
irq_desc_ptrs[i] = NULL;
return arch_early_irq_init();
}
struct irq_desc *irq_to_desc(unsigned int irq)
{
if (irq_desc_ptrs && irq < nr_irqs)
return irq_desc_ptrs[irq];
return NULL;
}
struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
{
struct irq_desc *desc;
unsigned long flags;
if (irq >= nr_irqs) {
WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
irq, nr_irqs);
return NULL;
}
desc = irq_desc_ptrs[irq];
if (desc)
return desc;
spin_lock_irqsave(&sparse_irq_lock, flags);
/* We have to check it to avoid races with another CPU */
desc = irq_desc_ptrs[irq];
if (desc)
goto out_unlock;
if (slab_is_available())
desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
else
desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
if (!desc) {
printk(KERN_ERR "can not alloc irq_desc\n");
BUG_ON(1);
}
init_one_irq_desc(irq, desc, node);
irq_desc_ptrs[irq] = desc;
out_unlock:
spin_unlock_irqrestore(&sparse_irq_lock, flags);
return desc;
}
#else /* !CONFIG_SPARSE_IRQ */
struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS-1] = {
.status = IRQ_DISABLED,
.chip = &no_irq_chip,
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
}
};
static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
int __init early_irq_init(void)
{
struct irq_desc *desc;
int count;
int i;
init_irq_default_affinity();
printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
for (i = 0; i < count; i++) {
desc[i].irq = i;
alloc_desc_masks(&desc[i], 0, true);
init_desc_masks(&desc[i]);
desc[i].kstat_irqs = kstat_irqs_all[i];
}
return arch_early_irq_init();
}
struct irq_desc *irq_to_desc(unsigned int irq)
{
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
{
return irq_to_desc(irq);
}
#endif /* !CONFIG_SPARSE_IRQ */
void clear_kstat_irqs(struct irq_desc *desc)
{
memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
}
/*
* What should we do if we get a hw irq event on an illegal vector?
* Each architecture has to answer this for itself.
*/
static void ack_bad(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
print_irq_desc(irq, desc);
ack_bad_irq(irq);
}
/*
* NOP functions
*/
static void noop(unsigned int irq)
{
}
static unsigned int noop_ret(unsigned int irq)
{
return 0;
}
/*
* Generic no controller implementation
*/
struct irq_chip no_irq_chip = {
.name = "none",
.startup = noop_ret,
.shutdown = noop,
.enable = noop,
.disable = noop,
.ack = ack_bad,
.end = noop,
};
/*
* Generic dummy implementation which can be used for
* real dumb interrupt sources
*/
struct irq_chip dummy_irq_chip = {
.name = "dummy",
.startup = noop_ret,
.shutdown = noop,
.enable = noop,
.disable = noop,
.ack = noop,
.mask = noop,
.unmask = noop,
.end = noop,
};
/*
* Special, empty irq handler:
*/
irqreturn_t no_action(int cpl, void *dev_id)
{
return IRQ_NONE;
}
static void warn_no_thread(unsigned int irq, struct irqaction *action)
{
if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
return;
printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
"but no thread function available.", irq, action->name);
}
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
* @action: the interrupt action chain for this irq
*
* Handles the action chain of an irq event
*/
irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
{
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
if (!(action->flags & IRQF_DISABLED))
local_irq_enable_in_hardirq();
do {
trace_irq_handler_entry(irq, action);
ret = action->handler(irq, action->dev_id);
trace_irq_handler_exit(irq, action, ret);
switch (ret) {
case IRQ_WAKE_THREAD:
/*
* Set result to handled so the spurious check
* does not trigger.
*/
ret = IRQ_HANDLED;
/*
* Catch drivers which return WAKE_THREAD but
* did not set up a thread function
*/
if (unlikely(!action->thread_fn)) {
warn_no_thread(irq, action);
break;
}
/*
* Wake up the handler thread for this
* action. In case the thread crashed and was
* killed we just pretend that we handled the
* interrupt. The hardirq handler above has
* disabled the device interrupt, so no irq
* storm is lurking.
*/
if (likely(!test_bit(IRQTF_DIED,
&action->thread_flags))) {
set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
wake_up_process(action->thread);
}
/* Fall through to add to randomness */
case IRQ_HANDLED:
status |= action->flags;
break;
default:
break;
}
retval |= ret;
action = action->next;
} while (action);
if (status & IRQF_SAMPLE_RANDOM)
add_interrupt_randomness(irq);
local_irq_disable();
return retval;
}
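/*
 * Illustrative sketch only (hypothetical driver code, not part of this
 * file): the IRQ_WAKE_THREAD case above is what callers of
 * request_threaded_irq() rely on - the hard handler quiets the device and
 * defers the real work to the thread function.  All names are assumed.
 */
static irqreturn_t my_quick_check(int irq, void *dev_id)
{
	/* silence the (hypothetical) device, then defer to the thread */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* sleepable context: talk to the device over a slow bus, etc. */
	return IRQ_HANDLED;
}

static int my_setup_irq(unsigned int irq, void *dev_id)
{
	return request_threaded_irq(irq, my_quick_check, my_thread_fn,
				    0, "my-device", dev_id);
}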
#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
#ifdef CONFIG_ENABLE_WARN_DEPRECATED
# warning __do_IRQ is deprecated. Please convert to proper flow handlers
#endif
/**
* __do_IRQ - original all in one highlevel IRQ handler
* @irq: the interrupt number
*
* __do_IRQ handles all normal device IRQs (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
*
* This is the original x86 implementation which is used for every
* interrupt type.
*/
unsigned int __do_IRQ(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
unsigned int status;
kstat_incr_irqs_this_cpu(irq, desc);
if (CHECK_IRQ_PER_CPU(desc->status)) {
irqreturn_t action_ret;
/*
* No locking required for CPU-local interrupts:
*/
if (desc->chip->ack)
desc->chip->ack(irq);
if (likely(!(desc->status & IRQ_DISABLED))) {
action_ret = handle_IRQ_event(irq, desc->action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
}
desc->chip->end(irq);
return 1;
}
spin_lock(&desc->lock);
if (desc->chip->ack)
desc->chip->ack(irq);
/*
* REPLAY is when Linux resends an IRQ that was dropped earlier
* WAITING is used by probe to mark irqs that are being tested
*/
status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
status |= IRQ_PENDING; /* we _want_ to handle it */
/*
* If the IRQ is disabled for whatever reason, we cannot
* use the action we have.
*/
action = NULL;
if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
action = desc->action;
status &= ~IRQ_PENDING; /* we commit to handling */
status |= IRQ_INPROGRESS; /* we are handling it */
}
desc->status = status;
/*
* If there is no IRQ handler or it was disabled, exit early.
* Since we set PENDING, if another processor is handling
* a different instance of this same irq, the other processor
* will take care of it.
*/
if (unlikely(!action))
goto out;
/*
* Edge triggered interrupts need to remember
* pending events.
* This applies to any hw interrupts that allow a second
* instance of the same irq to arrive while we are in do_IRQ
* or in the handler. But the code here only handles the _second_
* instance of the irq, not the third or fourth. So it is mostly
* useful for irq hardware that does not mask cleanly in an
* SMP environment.
*/
for (;;) {
irqreturn_t action_ret;
spin_unlock(&desc->lock);
action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
if (likely(!(desc->status & IRQ_PENDING)))
break;
desc->status &= ~IRQ_PENDING;
}
desc->status &= ~IRQ_INPROGRESS;
out:
/*
* The ->end() handler has to deal with interrupts which got
* disabled while the handler was running.
*/
desc->chip->end(irq);
spin_unlock(&desc->lock);
return 1;
}
#endif
void early_init_irq_lock_class(void)
{
struct irq_desc *desc;
int i;
for_each_irq_desc(i, desc) {
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
}
}
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
return desc ? desc->kstat_irqs[cpu] : 0;
}
EXPORT_SYMBOL(kstat_irqs_cpu);
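/*
 * Illustrative sketch only (not part of this file): summing the per-CPU
 * counts exported above, the way /proc/interrupts style code does.
 */
static unsigned int example_total_irq_count(unsigned int irq)
{
	unsigned int sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += kstat_irqs_cpu(irq, cpu);
	return sum;
}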

View File

@@ -0,0 +1,99 @@
/*
* IRQ subsystem internal functions and variables:
*/
extern int noirqdebug;
/* Set default functions for irq_chip structures: */
extern void irq_chip_set_defaults(struct irq_chip *chip);
/* Set default handler: */
extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
extern struct lock_class_key irq_desc_lock_class;
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
extern void clear_kstat_irqs(struct irq_desc *desc);
extern spinlock_t sparse_irq_lock;
#ifdef CONFIG_SPARSE_IRQ
/* irq_desc_ptrs allocated at boot time */
extern struct irq_desc **irq_desc_ptrs;
#else
/* irq_desc_ptrs is a fixed size array */
extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
#endif
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
#else
static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
static inline void register_handler_proc(unsigned int irq,
struct irqaction *action) { }
static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
extern int irq_select_affinity_usr(unsigned int irq);
extern void irq_set_thread_affinity(struct irq_desc *desc);
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
{
if (unlikely(desc->chip->bus_lock))
desc->chip->bus_lock(irq);
}
static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
{
if (unlikely(desc->chip->bus_sync_unlock))
desc->chip->bus_sync_unlock(irq);
}
/*
* Debugging printout:
*/
#include <linux/kallsyms.h>
#define P(f) if (desc->status & f) printk("%14s set\n", #f)
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
printk("->handle_irq(): %p, ", desc->handle_irq);
print_symbol("%s\n", (unsigned long)desc->handle_irq);
printk("->chip(): %p, ", desc->chip);
print_symbol("%s\n", (unsigned long)desc->chip);
printk("->action(): %p\n", desc->action);
if (desc->action) {
printk("->action->handler(): %p, ", desc->action->handler);
print_symbol("%s\n", (unsigned long)desc->action->handler);
}
P(IRQ_INPROGRESS);
P(IRQ_DISABLED);
P(IRQ_PENDING);
P(IRQ_REPLAY);
P(IRQ_AUTODETECT);
P(IRQ_WAITING);
P(IRQ_LEVEL);
P(IRQ_MASKED);
#ifdef CONFIG_IRQ_PER_CPU
P(IRQ_PER_CPU);
#endif
P(IRQ_NOPROBE);
P(IRQ_NOREQUEST);
P(IRQ_NOAUTOEN);
}
#undef P

1112
kernel/kernel/irq/manage.c Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,68 @@
#include <linux/irq.h>
#include <linux/interrupt.h>
#include "internals.h"
void move_masked_irq(int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
if (CHECK_IRQ_PER_CPU(desc->status)) {
WARN_ON(1);
return;
}
desc->status &= ~IRQ_MOVE_PENDING;
if (unlikely(cpumask_empty(desc->pending_mask)))
return;
if (!desc->chip->set_affinity)
return;
assert_spin_locked(&desc->lock);
/*
* If there was a valid mask to work with, please
* do the disable, re-program, enable sequence.
* This is *not* particularly important for level-triggered
* interrupts, but in an edge-triggered case we might be setting
* the RTE when an active trigger is coming in. This could
* cause some ioapics to malfunction.
* Being paranoid, I guess!
*
* For correct operation this depends on the caller
* masking the irqs.
*/
if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
< nr_cpu_ids))
if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
cpumask_copy(desc->affinity, desc->pending_mask);
irq_set_thread_affinity(desc);
}
cpumask_clear(desc->pending_mask);
}
void move_native_irq(int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
if (unlikely(desc->status & IRQ_DISABLED))
return;
desc->chip->mask(irq);
move_masked_irq(irq);
desc->chip->unmask(irq);
}

View File

@@ -0,0 +1,119 @@
/*
* NUMA irq-desc migration code
*
* Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
* the new "home node" of the IRQ.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include "internals.h"
static void init_copy_kstat_irqs(struct irq_desc *old_desc,
struct irq_desc *desc,
int node, int nr)
{
init_kstat_irqs(desc, node, nr);
if (desc->kstat_irqs != old_desc->kstat_irqs)
memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
nr * sizeof(*desc->kstat_irqs));
}
static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
{
if (old_desc->kstat_irqs == desc->kstat_irqs)
return;
kfree(old_desc->kstat_irqs);
old_desc->kstat_irqs = NULL;
}
static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
struct irq_desc *desc, int node)
{
memcpy(desc, old_desc, sizeof(struct irq_desc));
if (!alloc_desc_masks(desc, node, false)) {
printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
"for migration.\n", irq);
return false;
}
spin_lock_init(&desc->lock);
desc->node = node;
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
init_copy_desc_masks(old_desc, desc);
arch_init_copy_chip_data(old_desc, desc, node);
return true;
}
static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
{
free_kstat_irqs(old_desc, desc);
free_desc_masks(old_desc, desc);
arch_free_chip_data(old_desc, desc);
}
static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
int node)
{
struct irq_desc *desc;
unsigned int irq;
unsigned long flags;
irq = old_desc->irq;
spin_lock_irqsave(&sparse_irq_lock, flags);
/* We have to check it to avoid races with another CPU */
desc = irq_desc_ptrs[irq];
if (desc && old_desc != desc)
goto out_unlock;
desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
if (!desc) {
printk(KERN_ERR "irq %d: can not get new irq_desc "
"for migration.\n", irq);
/* still use old one */
desc = old_desc;
goto out_unlock;
}
if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
/* still use old one */
kfree(desc);
desc = old_desc;
goto out_unlock;
}
irq_desc_ptrs[irq] = desc;
spin_unlock_irqrestore(&sparse_irq_lock, flags);
/* free the old one */
free_one_irq_desc(old_desc, desc);
kfree(old_desc);
return desc;
out_unlock:
spin_unlock_irqrestore(&sparse_irq_lock, flags);
return desc;
}
struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
{
/* Descriptors that are statically allocated, or whose target node is -1, are not moved */
if (desc->irq < NR_IRQS_LEGACY || node == -1)
return desc;
if (desc->node != node)
desc = __real_move_irq_desc(desc, node);
return desc;
}

76
kernel/kernel/irq/pm.c Normal file
View File

@@ -0,0 +1,76 @@
/*
* linux/kernel/irq/pm.c
*
* Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file contains power management functions related to interrupts.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include "internals.h"
/**
* suspend_device_irqs - disable all currently enabled interrupt lines
*
* During system-wide suspend or hibernation, device drivers need to be prevented
* from receiving interrupts and this function is provided for this purpose.
* It marks all interrupt lines in use, except for the timer ones, as disabled
* and sets the IRQ_SUSPENDED flag for each of them.
*/
void suspend_device_irqs(void)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc) {
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
__disable_irq(desc, irq, true);
spin_unlock_irqrestore(&desc->lock, flags);
}
for_each_irq_desc(irq, desc)
if (desc->status & IRQ_SUSPENDED)
synchronize_irq(irq);
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
/**
* resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
*
* Enable all interrupt lines previously disabled by suspend_device_irqs() that
* have the IRQ_SUSPENDED flag set.
*/
void resume_device_irqs(void)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc) {
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, true);
spin_unlock_irqrestore(&desc->lock, flags);
}
}
EXPORT_SYMBOL_GPL(resume_device_irqs);
/**
* check_wakeup_irqs - check if any wake-up interrupts are pending
*/
int check_wakeup_irqs(void)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc)
if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
return -EBUSY;
return 0;
}
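/*
 * Illustrative sketch only (not part of this file): the suspend core is
 * expected to call the three helpers above in this order;
 * enter_target_state() is a hypothetical placeholder for the
 * platform-specific suspend step.
 */
extern int enter_target_state(void);	/* hypothetical platform hook */

static int example_suspend_sequence(void)
{
	int error;

	suspend_device_irqs();		/* mark and disable device IRQs */
	if (check_wakeup_irqs())
		error = -EBUSY;		/* a wake-up IRQ is already pending */
	else
		error = enter_target_state();
	resume_device_irqs();		/* re-enable what was disabled */
	return error;
}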

272
kernel/kernel/irq/proc.c Normal file
View File

@@ -0,0 +1,272 @@
/*
* linux/kernel/irq/proc.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the /proc/irq/ handling code.
*/
#include <linux/irq.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include "internals.h"
static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
const struct cpumask *mask = desc->affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PENDING)
mask = desc->pending_mask;
#endif
seq_cpumask(m, mask);
seq_putc(m, '\n');
return 0;
}
#ifndef is_affinity_mask_valid
#define is_affinity_mask_valid(val) 1
#endif
int no_irq_affinity;
static ssize_t irq_affinity_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
cpumask_var_t new_value;
int err;
if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
irq_balancing_disabled(irq))
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto free_cpumask;
if (!is_affinity_mask_valid(new_value)) {
err = -EINVAL;
goto free_cpumask;
}
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
* one online CPU still has to be targeted.
*/
if (!cpumask_intersects(new_value, cpu_online_mask)) {
/* Special case for empty set - allow the architecture
code to set default SMP affinity. */
err = irq_select_affinity_usr(irq) ? -EINVAL : count;
} else {
irq_set_affinity(irq, new_value);
err = count;
}
free_cpumask:
free_cpumask_var(new_value);
return err;
}
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}
static const struct file_operations irq_affinity_proc_fops = {
.open = irq_affinity_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
.write = irq_affinity_proc_write,
};
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
seq_putc(m, '\n');
return 0;
}
static ssize_t default_affinity_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
cpumask_var_t new_value;
int err;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto out;
if (!is_affinity_mask_valid(new_value)) {
err = -EINVAL;
goto out;
}
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
* one online CPU still has to be targeted.
*/
if (!cpumask_intersects(new_value, cpu_online_mask)) {
err = -EINVAL;
goto out;
}
cpumask_copy(irq_default_affinity, new_value);
err = count;
out:
free_cpumask_var(new_value);
return err;
}
static int default_affinity_open(struct inode *inode, struct file *file)
{
return single_open(file, default_affinity_show, NULL);
}
static const struct file_operations default_affinity_proc_fops = {
.open = default_affinity_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
.write = default_affinity_write,
};
#endif
static int irq_spurious_read(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
struct irq_desc *desc = irq_to_desc((long) data);
return sprintf(page, "count %u\n"
"unhandled %u\n"
"last_unhandled %u ms\n",
desc->irq_count,
desc->irqs_unhandled,
jiffies_to_msecs(desc->last_unhandled));
}
#define MAX_NAMELEN 128
static int name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
unsigned long flags;
int ret = 1;
spin_lock_irqsave(&desc->lock, flags);
for (action = desc->action ; action; action = action->next) {
if ((action != new_action) && action->name &&
!strcmp(new_action->name, action->name)) {
ret = 0;
break;
}
}
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
{
char name [MAX_NAMELEN];
struct irq_desc *desc = irq_to_desc(irq);
if (!desc->dir || action->dir || !action->name ||
!name_unique(irq, action))
return;
memset(name, 0, MAX_NAMELEN);
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
action->dir = proc_mkdir(name, desc->dir);
}
#undef MAX_NAMELEN
#define MAX_NAMELEN 10
void register_irq_proc(unsigned int irq, struct irq_desc *desc)
{
char name [MAX_NAMELEN];
struct proc_dir_entry *entry;
if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
return;
memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
desc->dir = proc_mkdir(name, root_irq_dir);
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
proc_create_data("smp_affinity", 0600, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
#endif
entry = create_proc_entry("spurious", 0444, desc->dir);
if (entry) {
entry->data = (void *)(long)irq;
entry->read_proc = irq_spurious_read;
}
}
#undef MAX_NAMELEN
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
{
if (action->dir) {
struct irq_desc *desc = irq_to_desc(irq);
remove_proc_entry(action->dir->name, desc->dir);
}
}
static void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
proc_create("irq/default_smp_affinity", 0600, NULL,
&default_affinity_proc_fops);
#endif
}
void init_irq_proc(void)
{
unsigned int irq;
struct irq_desc *desc;
/* create /proc/irq */
root_irq_dir = proc_mkdir("irq", NULL);
if (!root_irq_dir)
return;
register_default_affinity_proc();
/*
* Create entries for all existing IRQs.
*/
for_each_irq_desc(irq, desc) {
if (!desc)
continue;
register_irq_proc(irq, desc);
}
}

View File

@@ -0,0 +1,81 @@
/*
* linux/kernel/irq/resend.c
*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner
*
* This file contains the IRQ-resend code
*
* If the interrupt is waiting to be processed, we try to re-run it.
* We can't directly run it from here since the caller might be in an
* interrupt-protected region. Not all irq controller chips can
* retrigger interrupts at the hardware level, so in those cases
* we allow the resending of IRQs via a tasklet.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include "internals.h"
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Bitmap to handle software resend of interrupts: */
static DECLARE_BITMAP(irqs_resend, NR_IRQS);
/*
* Run software resends of IRQs
*/
static void resend_irqs(unsigned long arg)
{
struct irq_desc *desc;
int irq;
while (!bitmap_empty(irqs_resend, nr_irqs)) {
irq = find_first_bit(irqs_resend, nr_irqs);
clear_bit(irq, irqs_resend);
desc = irq_to_desc(irq);
local_irq_disable();
desc->handle_irq(irq, desc);
local_irq_enable();
}
}
/* Tasklet to handle resend: */
static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
#endif
/*
* IRQ resend
*
* Is called with interrupts disabled and desc->lock held.
*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq)
{
unsigned int status = desc->status;
/*
* Make sure the interrupt is enabled, before resending it:
*/
desc->chip->enable(irq);
/*
* We do not resend level type interrupts. Level type
* interrupts are resent by hardware when they are still
* active.
*/
if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Set it pending and activate the softirq: */
set_bit(irq, irqs_resend);
tasklet_schedule(&resend_tasklet);
#endif
}
}
}

View File

@@ -0,0 +1,313 @@
/*
* linux/kernel/irq/spurious.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains spurious interrupt handling.
*/
#include <linux/jiffies.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/moduleparam.h>
#include <linux/timer.h>
static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(unsigned long dummy);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
/*
* Recovery handler for misrouted interrupts.
*/
static int try_one_irq(int irq, struct irq_desc *desc)
{
struct irqaction *action;
int ok = 0, work = 0;
spin_lock(&desc->lock);
/* Already running on another processor */
if (desc->status & IRQ_INPROGRESS) {
/*
* Already running: If it is shared get the other
* CPU to go looking for our mystery interrupt too
*/
if (desc->action && (desc->action->flags & IRQF_SHARED))
desc->status |= IRQ_PENDING;
spin_unlock(&desc->lock);
return ok;
}
/* Honour the normal IRQ locking */
desc->status |= IRQ_INPROGRESS;
action = desc->action;
spin_unlock(&desc->lock);
while (action) {
/* Only shared IRQ handlers are safe to call */
if (action->flags & IRQF_SHARED) {
if (action->handler(irq, action->dev_id) ==
IRQ_HANDLED)
ok = 1;
}
action = action->next;
}
local_irq_disable();
/* Now clean up the flags */
spin_lock(&desc->lock);
action = desc->action;
/*
* While we were looking for a fixup someone queued a real
* IRQ clashing with our walk:
*/
while ((desc->status & IRQ_PENDING) && action) {
/*
* Perform real IRQ processing for the IRQ we deferred
*/
work = 1;
spin_unlock(&desc->lock);
handle_IRQ_event(irq, action);
spin_lock(&desc->lock);
desc->status &= ~IRQ_PENDING;
}
desc->status &= ~IRQ_INPROGRESS;
/*
* If we did actual work for the real IRQ line we must let the
* IRQ controller clean up too
*/
if (work && desc->chip && desc->chip->end)
desc->chip->end(irq);
spin_unlock(&desc->lock);
return ok;
}
static int misrouted_irq(int irq)
{
struct irq_desc *desc;
int i, ok = 0;
for_each_irq_desc(i, desc) {
if (!i)
continue;
if (i == irq) /* Already tried */
continue;
if (try_one_irq(i, desc))
ok = 1;
}
/* So the caller can adjust the irq error counts */
return ok;
}
static void poll_all_shared_irqs(void)
{
struct irq_desc *desc;
int i;
for_each_irq_desc(i, desc) {
unsigned int status;
if (!i)
continue;
/* Racy but it doesn't matter */
status = desc->status;
barrier();
if (!(status & IRQ_SPURIOUS_DISABLED))
continue;
local_irq_disable();
try_one_irq(i, desc);
local_irq_enable();
}
}
static void poll_spurious_irqs(unsigned long dummy)
{
poll_all_shared_irqs();
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
#ifdef CONFIG_DEBUG_SHIRQ
void debug_poll_all_shared_irqs(void)
{
poll_all_shared_irqs();
}
#endif
/*
* If 99,900 of the previous 100,000 interrupts have not been handled
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
* and try to turn the IRQ off.
*
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
*
* Called under desc->lock
*/
static void
__report_bad_irq(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
struct irqaction *action;
if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
printk(KERN_ERR "irq event %d: bogus return value %x\n",
irq, action_ret);
} else {
printk(KERN_ERR "irq %d: nobody cared (try booting with "
"the \"irqpoll\" option)\n", irq);
}
dump_stack();
printk(KERN_ERR "handlers:\n");
action = desc->action;
while (action) {
printk(KERN_ERR "[<%p>]", action->handler);
print_symbol(" (%s)",
(unsigned long)action->handler);
printk("\n");
action = action->next;
}
}
static void
report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
{
static int count = 100;
if (count > 0) {
count--;
__report_bad_irq(irq, desc, action_ret);
}
}
static inline int
try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
struct irqaction *action;
if (!irqfixup)
return 0;
/* We didn't actually handle the IRQ - see if it was misrouted? */
if (action_ret == IRQ_NONE)
return 1;
/*
* But for 'irqfixup == 2' we also do it for handled interrupts if
* they are marked as IRQF_IRQPOLL (or for irq zero, which is the
* traditional PC timer interrupt - a legacy case)
*/
if (irqfixup < 2)
return 0;
if (!irq)
return 1;
/*
* Since we don't get the descriptor lock, "action" can
* change under us. We don't really care, but we don't
* want to follow a NULL pointer. So tell the compiler to
* just load it once by using a barrier.
*/
action = desc->action;
barrier();
return action && (action->flags & IRQF_IRQPOLL);
}
void note_interrupt(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
if (unlikely(action_ret != IRQ_HANDLED)) {
/*
* If we are seeing only the odd spurious IRQ caused by
* bus asynchronicity then don't eventually trigger an error,
* otherwise the counter becomes a doomsday timer for otherwise
* working systems.
*/
if (time_after(jiffies, desc->last_unhandled + HZ/10))
desc->irqs_unhandled = 1;
else
desc->irqs_unhandled++;
desc->last_unhandled = jiffies;
if (unlikely(action_ret != IRQ_NONE))
report_bad_irq(irq, desc, action_ret);
}
if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
desc->irqs_unhandled -= ok;
}
desc->irq_count++;
if (likely(desc->irq_count < 100000))
return;
desc->irq_count = 0;
if (unlikely(desc->irqs_unhandled > 99900)) {
/*
* The interrupt is stuck
*/
__report_bad_irq(irq, desc, action_ret);
/*
* Now kill the IRQ
*/
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
desc->depth++;
desc->chip->disable(irq);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
desc->irqs_unhandled = 0;
}
int noirqdebug __read_mostly;
int noirqdebug_setup(char *str)
{
noirqdebug = 1;
printk(KERN_INFO "IRQ lockup detection disabled\n");
return 1;
}
__setup("noirqdebug", noirqdebug_setup);
module_param(noirqdebug, bool, 0644);
MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
return 1;
}
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
printk(KERN_WARNING "This may significantly impact system "
"performance\n");
return 1;
}
__setup("irqpoll", irqpoll_setup);

295
kernel/kernel/itimer.c Normal file
View File

@@ -0,0 +1,295 @@
/*
* linux/kernel/itimer.c
*
* Copyright (C) 1992 Darren Senn
*/
/* These are all the functions necessary to implement itimers */
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/syscalls.h>
#include <linux/time.h>
#include <linux/posix-timers.h>
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
#include <asm/uaccess.h>
/**
* itimer_get_remtime - get remaining time for the timer
*
* @timer: the timer to read
*
* Returns the delta between the expiry time and now, which can be
* less than zero, or 1 usec for a pending expired timer
*/
static struct timeval itimer_get_remtime(struct hrtimer *timer)
{
ktime_t rem = hrtimer_get_remaining(timer);
/*
* Racy but safe: if the itimer expires after the above
* hrtimer_get_remaining() call but before this condition
* then we return 0 - which is correct.
*/
if (hrtimer_active(timer)) {
if (rem.tv64 <= 0)
rem.tv64 = NSEC_PER_USEC;
} else
rem.tv64 = 0;
return ktime_to_timeval(rem);
}
static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
struct itimerval *const value)
{
cputime_t cval, cinterval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
spin_lock_irq(&tsk->sighand->siglock);
cval = it->expires;
cinterval = it->incr;
if (!cputime_eq(cval, cputime_zero)) {
struct task_cputime cputime;
cputime_t t;
thread_group_cputimer(tsk, &cputime);
if (clock_id == CPUCLOCK_PROF)
t = cputime_add(cputime.utime, cputime.stime);
else
/* CPUCLOCK_VIRT */
t = cputime.utime;
if (cputime_le(cval, t))
/* about to fire */
cval = cputime_one_jiffy;
else
cval = cputime_sub(cval, t);
}
spin_unlock_irq(&tsk->sighand->siglock);
cputime_to_timeval(cval, &value->it_value);
cputime_to_timeval(cinterval, &value->it_interval);
}
int do_getitimer(int which, struct itimerval *value)
{
struct task_struct *tsk = current;
switch (which) {
case ITIMER_REAL:
spin_lock_irq(&tsk->sighand->siglock);
value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
value->it_interval =
ktime_to_timeval(tsk->signal->it_real_incr);
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
break;
case ITIMER_PROF:
get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
break;
default:
return(-EINVAL);
}
return 0;
}
SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
{
int error = -EFAULT;
struct itimerval get_buffer;
if (value) {
error = do_getitimer(which, &get_buffer);
if (!error &&
copy_to_user(value, &get_buffer, sizeof(get_buffer)))
error = -EFAULT;
}
return error;
}
/*
* The timer is automagically restarted, when interval != 0
*/
enum hrtimer_restart it_real_fn(struct hrtimer *timer)
{
struct signal_struct *sig =
container_of(timer, struct signal_struct, real_timer);
trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
return HRTIMER_NORESTART;
}
static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
{
struct timespec ts;
s64 cpu_ns;
cputime_to_timespec(ct, &ts);
cpu_ns = timespec_to_ns(&ts);
return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
}
static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
const struct itimerval *const value,
struct itimerval *const ovalue)
{
cputime_t cval, nval, cinterval, ninterval;
s64 ns_ninterval, ns_nval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
nval = timeval_to_cputime(&value->it_value);
ns_nval = timeval_to_ns(&value->it_value);
ninterval = timeval_to_cputime(&value->it_interval);
ns_ninterval = timeval_to_ns(&value->it_interval);
it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
it->error = cputime_sub_ns(nval, ns_nval);
spin_lock_irq(&tsk->sighand->siglock);
cval = it->expires;
cinterval = it->incr;
if (!cputime_eq(cval, cputime_zero) ||
!cputime_eq(nval, cputime_zero)) {
if (cputime_gt(nval, cputime_zero))
nval = cputime_add(nval, cputime_one_jiffy);
set_process_cpu_timer(tsk, clock_id, &nval, &cval);
}
it->expires = nval;
it->incr = ninterval;
trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
cputime_to_timeval(cval, &ovalue->it_value);
cputime_to_timeval(cinterval, &ovalue->it_interval);
}
}
/*
* Returns true if the timeval is in canonical form
*/
#define timeval_valid(t) \
(((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
{
struct task_struct *tsk = current;
struct hrtimer *timer;
ktime_t expires;
/*
* Validate the timevals in value.
*/
if (!timeval_valid(&value->it_value) ||
!timeval_valid(&value->it_interval))
return -EINVAL;
switch (which) {
case ITIMER_REAL:
again:
spin_lock_irq(&tsk->sighand->siglock);
timer = &tsk->signal->real_timer;
if (ovalue) {
ovalue->it_value = itimer_get_remtime(timer);
ovalue->it_interval
= ktime_to_timeval(tsk->signal->it_real_incr);
}
/* We are sharing ->siglock with it_real_fn() */
if (hrtimer_try_to_cancel(timer) < 0) {
spin_unlock_irq(&tsk->sighand->siglock);
goto again;
}
expires = timeval_to_ktime(value->it_value);
if (expires.tv64 != 0) {
tsk->signal->it_real_incr =
timeval_to_ktime(value->it_interval);
hrtimer_start(timer, expires, HRTIMER_MODE_REL);
} else
tsk->signal->it_real_incr.tv64 = 0;
trace_itimer_state(ITIMER_REAL, value, 0);
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
break;
case ITIMER_PROF:
set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
break;
default:
return -EINVAL;
}
return 0;
}
/**
* alarm_setitimer - set alarm in seconds
*
* @seconds: number of seconds until alarm
* 0 disables the alarm
*
* Returns the remaining time in seconds of a pending timer or 0 when
* the timer is not active.
*
* On 32 bit machines the seconds value is limited to INT_MAX to avoid
* negative timeval settings which would cause immediate expiry.
*/
unsigned int alarm_setitimer(unsigned int seconds)
{
struct itimerval it_new, it_old;
#if BITS_PER_LONG < 64
if (seconds > INT_MAX)
seconds = INT_MAX;
#endif
it_new.it_value.tv_sec = seconds;
it_new.it_value.tv_usec = 0;
it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
do_setitimer(ITIMER_REAL, &it_new, &it_old);
/*
* We can't return 0 if we have an alarm pending ... And we'd
* better return too much than too little anyway
*/
if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
it_old.it_value.tv_usec >= 500000)
it_old.it_value.tv_sec++;
return it_old.it_value.tv_sec;
}
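/*
 * Minimal sketch (not part of this file): the alarm(2) system call,
 * implemented elsewhere in the tree, is essentially a thin wrapper
 * around the helper above.
 */
SYSCALL_DEFINE1(alarm, unsigned int, seconds)
{
	return alarm_setitimer(seconds);
}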
SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
struct itimerval __user *, ovalue)
{
struct itimerval set_buffer, get_buffer;
int error;
if (value) {
if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
return -EFAULT;
} else
memset((char *) &set_buffer, 0, sizeof(set_buffer));
error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
if (error || !ovalue)
return error;
if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
return -EFAULT;
return 0;
}
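/*
 * Illustrative userspace sketch (not kernel code, not part of this file):
 * arming a repeating 250 ms ITIMER_REAL, which ends up in do_setitimer()
 * above via the setitimer system call.
 */
#include <signal.h>
#include <sys/time.h>

static void on_alarm(int sig)
{
	/* runs every 250 ms once the timer is armed */
}

static void example_arm_itimer(void)
{
	struct itimerval val = {
		.it_interval = { .tv_sec = 0, .tv_usec = 250000 },
		.it_value    = { .tv_sec = 0, .tv_usec = 250000 },
	};

	signal(SIGALRM, on_alarm);
	setitimer(ITIMER_REAL, &val, NULL);
}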

530
kernel/kernel/kallsyms.c Normal file
View File

@@ -0,0 +1,530 @@
/*
* kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
*
* Rewritten and vastly simplified by Rusty Russell for in-kernel
* module loader:
* Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
*
* ChangeLog:
*
* (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
* Changed the compression method from stem compression to "table lookup"
* compression (see scripts/kallsyms.c for a more complete description)
*/
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/err.h>
#include <linux/proc_fs.h>
#include <linux/sched.h> /* for cond_resched */
#include <linux/mm.h>
#include <linux/ctype.h>
#include <asm/sections.h>
#ifdef CONFIG_KALLSYMS_ALL
#define all_var 1
#else
#define all_var 0
#endif
/*
* These will be re-linked against their real values
* during the second link stage.
*/
extern const unsigned long kallsyms_addresses[] __attribute__((weak));
extern const u8 kallsyms_names[] __attribute__((weak));
/*
* Tell the compiler that the count isn't in the small data section if the arch
* has one (eg: FRV).
*/
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
extern const u8 kallsyms_token_table[] __attribute__((weak));
extern const u16 kallsyms_token_index[] __attribute__((weak));
extern const unsigned long kallsyms_markers[] __attribute__((weak));
static inline int is_kernel_inittext(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext
&& addr <= (unsigned long)_einittext)
return 1;
return 0;
}
static inline int is_kernel_text(unsigned long addr)
{
if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
arch_is_kernel_text(addr))
return 1;
return in_gate_area_no_task(addr);
}
static inline int is_kernel(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
return 1;
return in_gate_area_no_task(addr);
}
static int is_ksym_addr(unsigned long addr)
{
if (all_var)
return is_kernel(addr);
return is_kernel_text(addr) || is_kernel_inittext(addr);
}
/*
* Expand a compressed symbol data into the resulting uncompressed string,
* given the offset to where the symbol is in the compressed stream.
*/
static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
{
int len, skipped_first = 0;
const u8 *tptr, *data;
/* Get the compressed symbol length from the first symbol byte. */
data = &kallsyms_names[off];
len = *data;
data++;
/*
* Update the offset to return the offset for the next symbol on
* the compressed stream.
*/
off += len + 1;
/*
* For every byte on the compressed symbol data, copy the table
* entry for that byte.
*/
while (len) {
tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
data++;
len--;
while (*tptr) {
if (skipped_first) {
*result = *tptr;
result++;
} else
skipped_first = 1;
tptr++;
}
}
*result = '\0';
/* Return to offset to the next symbol. */
return off;
}
/*
* Get symbol type information. This is encoded as a single char at the
* beginning of the symbol name.
*/
static char kallsyms_get_symbol_type(unsigned int off)
{
/*
* Get just the first code, look it up in the token table,
* and return the first char from this token.
*/
return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
}
/*
* Find the offset on the compressed stream given and index in the
* kallsyms array.
*/
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
int i;
/*
* Use the closest marker we have. We have markers every 256 positions,
* so that should be close enough.
*/
name = &kallsyms_names[kallsyms_markers[pos >> 8]];
/*
* Sequentially scan all the symbols up to the point we're searching
* for. Every symbol is stored in a [<len>][<len> bytes of data] format,
* so we just need to add the len to the current pointer for every
* symbol we wish to skip.
*/
for (i = 0; i < (pos & 0xFF); i++)
name = name + (*name) + 1;
return name - kallsyms_names;
}
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
char namebuf[KSYM_NAME_LEN];
unsigned long i;
unsigned int off;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf);
if (strcmp(namebuf, name) == 0)
return kallsyms_addresses[i];
}
return module_kallsyms_lookup_name(name);
}
EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
unsigned long i;
unsigned int off;
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf);
ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
if (ret != 0)
return ret;
}
return module_kallsyms_on_each_symbol(fn, data);
}
EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
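/*
 * Illustrative sketch only (not part of this file): walking every core
 * and module symbol with the iterator above; returning non-zero from the
 * callback stops the walk.
 */
static int example_count_one(void *data, const char *name,
			     struct module *mod, unsigned long addr)
{
	unsigned long *count = data;

	(*count)++;
	return 0;	/* keep iterating */
}

static unsigned long example_count_symbols(void)
{
	unsigned long count = 0;

	kallsyms_on_each_symbol(example_count_one, &count);
	return count;
}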
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset)
{
unsigned long symbol_start = 0, symbol_end = 0;
unsigned long i, low, high, mid;
/* This kernel should never have been booted. */
BUG_ON(!kallsyms_addresses);
/* Do a binary search on the sorted kallsyms_addresses array. */
low = 0;
high = kallsyms_num_syms;
while (high - low > 1) {
mid = low + (high - low) / 2;
if (kallsyms_addresses[mid] <= addr)
low = mid;
else
high = mid;
}
/*
* Search for the first aliased symbol. Aliased
* symbols are symbols with the same address.
*/
while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
--low;
symbol_start = kallsyms_addresses[low];
/* Search for next non-aliased symbol. */
for (i = low + 1; i < kallsyms_num_syms; i++) {
if (kallsyms_addresses[i] > symbol_start) {
symbol_end = kallsyms_addresses[i];
break;
}
}
/* If we found no next symbol, we use the end of the section. */
if (!symbol_end) {
if (is_kernel_inittext(addr))
symbol_end = (unsigned long)_einittext;
else if (all_var)
symbol_end = (unsigned long)_end;
else
symbol_end = (unsigned long)_etext;
}
if (symbolsize)
*symbolsize = symbol_end - symbol_start;
if (offset)
*offset = addr - symbol_start;
return low;
}
/*
* Lookup an address but don't bother to find any names.
*/
int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
unsigned long *offset)
{
char namebuf[KSYM_NAME_LEN];
if (is_ksym_addr(addr))
return !!get_symbol_pos(addr, symbolsize, offset);
return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
}
/*
* Lookup an address
* - modname is set to NULL if it's in the kernel.
* - We guarantee that the returned name is valid until we reschedule even if
*   it resides in a module.
* - We also guarantee that modname will be valid until rescheduled.
*/
const char *kallsyms_lookup(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset,
char **modname, char *namebuf)
{
namebuf[KSYM_NAME_LEN - 1] = 0;
namebuf[0] = 0;
if (is_ksym_addr(addr)) {
unsigned long pos;
pos = get_symbol_pos(addr, symbolsize, offset);
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
if (modname)
*modname = NULL;
return namebuf;
}
/* See if it's in a module. */
return module_address_lookup(addr, symbolsize, offset, modname,
namebuf);
}
int lookup_symbol_name(unsigned long addr, char *symname)
{
symname[0] = '\0';
symname[KSYM_NAME_LEN - 1] = '\0';
if (is_ksym_addr(addr)) {
unsigned long pos;
pos = get_symbol_pos(addr, NULL, NULL);
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos), symname);
return 0;
}
/* See if it's in a module. */
return lookup_module_symbol_name(addr, symname);
}
int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
unsigned long *offset, char *modname, char *name)
{
name[0] = '\0';
name[KSYM_NAME_LEN - 1] = '\0';
if (is_ksym_addr(addr)) {
unsigned long pos;
pos = get_symbol_pos(addr, size, offset);
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos), name);
modname[0] = '\0';
return 0;
}
/* See if it's in a module. */
return lookup_module_symbol_attrs(addr, size, offset, modname, name);
}
/* Look up a kernel symbol and return it in a text buffer. */
int sprint_symbol(char *buffer, unsigned long address)
{
char *modname;
const char *name;
unsigned long offset, size;
int len;
name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
return sprintf(buffer, "0x%lx", address);
if (name != buffer)
strcpy(buffer, name);
len = strlen(buffer);
buffer += len;
if (modname)
len += sprintf(buffer, "+%#lx/%#lx [%s]",
offset, size, modname);
else
len += sprintf(buffer, "+%#lx/%#lx", offset, size);
return len;
}
EXPORT_SYMBOL_GPL(sprint_symbol);
/* Look up a kernel symbol and print it to the kernel messages. */
void __print_symbol(const char *fmt, unsigned long address)
{
char buffer[KSYM_SYMBOL_LEN];
sprint_symbol(buffer, address);
printk(fmt, buffer);
}
EXPORT_SYMBOL(__print_symbol);
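/*
 * Illustrative sketch, not part of the original file: formatting an
 * arbitrary text address with sprint_symbol(), as a driver might do when
 * logging where it was called from. The "demo_" name is invented.
 */
static void demo_report_caller(void)
{
	char sym[KSYM_SYMBOL_LEN];

	sprint_symbol(sym, (unsigned long)__builtin_return_address(0));
	printk(KERN_DEBUG "called from %s\n", sym);
}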
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
int exported;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
&iter->type, iter->name, iter->module_name,
&iter->exported) < 0)
return 0;
return 1;
}
/* Returns space to next name. */
static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
unsigned off = iter->nameoff;
iter->module_name[0] = '\0';
iter->value = kallsyms_addresses[iter->pos];
iter->type = kallsyms_get_symbol_type(off);
off = kallsyms_expand_symbol(off, iter->name);
return off - iter->nameoff;
}
static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
{
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
}
/* Returns false if pos at or past end of file. */
static int update_iter(struct kallsym_iter *iter, loff_t pos)
{
/* Module symbols can be accessed randomly. */
if (pos >= kallsyms_num_syms) {
iter->pos = pos;
return get_ksymbol_mod(iter);
}
/* If we're not on the desired position, reset to new position. */
if (pos != iter->pos)
reset_iter(iter, pos);
iter->nameoff += get_ksymbol_core(iter);
iter->pos++;
return 1;
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
(*pos)++;
if (!update_iter(m->private, *pos))
return NULL;
return p;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
if (!update_iter(m->private, *pos))
return NULL;
return m->private;
}
static void s_stop(struct seq_file *m, void *p)
{
}
static int s_show(struct seq_file *m, void *p)
{
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
if (iter->module_name[0]) {
char type;
/*
* Label it "global" if it is exported,
* "local" if not exported.
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
seq_printf(m, "%0*lx %c %s\t[%s]\n",
(int)(2 * sizeof(void *)),
iter->value, type, iter->name, iter->module_name);
} else
seq_printf(m, "%0*lx %c %s\n",
(int)(2 * sizeof(void *)),
iter->value, iter->type, iter->name);
return 0;
}
static const struct seq_operations kallsyms_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show
};
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
* We keep iterator in m->private, since normal case is to
	 * s_start from where we left off, so we avoid
	 * using get_symbol_offset for every symbol.
*/
struct kallsym_iter *iter;
int ret;
iter = kmalloc(sizeof(*iter), GFP_KERNEL);
if (!iter)
return -ENOMEM;
reset_iter(iter, 0);
ret = seq_open(file, &kallsyms_op);
if (ret == 0)
((struct seq_file *)file->private_data)->private = iter;
else
kfree(iter);
return ret;
}
static const struct file_operations kallsyms_operations = {
.open = kallsyms_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
static int __init kallsyms_init(void)
{
proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
return 0;
}
device_initcall(kallsyms_init);

1504
kernel/kernel/kexec.c Normal file

File diff suppressed because it is too large Load Diff

197
kernel/kernel/kfifo.c Normal file
View File

@@ -0,0 +1,197 @@
/*
* A simple kernel FIFO implementation.
*
* Copyright (C) 2004 Stelian Pop <stelian@popies.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kfifo.h>
#include <linux/log2.h>
/**
* kfifo_init - allocates a new FIFO using a preallocated buffer
* @buffer: the preallocated buffer to be used.
 * @size: the size of the internal buffer; this has to be a power of 2.
* @gfp_mask: get_free_pages mask, passed to kmalloc()
* @lock: the lock to be used to protect the fifo buffer
*
* Do NOT pass the kfifo to kfifo_free() after use! Simply free the
* &struct kfifo with kfree().
*/
struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
gfp_t gfp_mask, spinlock_t *lock)
{
struct kfifo *fifo;
/* size must be a power of 2 */
BUG_ON(!is_power_of_2(size));
fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
if (!fifo)
return ERR_PTR(-ENOMEM);
fifo->buffer = buffer;
fifo->size = size;
fifo->in = fifo->out = 0;
fifo->lock = lock;
return fifo;
}
EXPORT_SYMBOL(kfifo_init);
/**
* kfifo_alloc - allocates a new FIFO and its internal buffer
* @size: the size of the internal buffer to be allocated.
* @gfp_mask: get_free_pages mask, passed to kmalloc()
* @lock: the lock to be used to protect the fifo buffer
*
* The size will be rounded-up to a power of 2.
*/
struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
{
unsigned char *buffer;
struct kfifo *ret;
/*
* round up to the next power of 2, since our 'let the indices
* wrap' technique works only in this case.
*/
if (!is_power_of_2(size)) {
BUG_ON(size > 0x80000000);
size = roundup_pow_of_two(size);
}
buffer = kmalloc(size, gfp_mask);
if (!buffer)
return ERR_PTR(-ENOMEM);
ret = kfifo_init(buffer, size, gfp_mask, lock);
if (IS_ERR(ret))
kfree(buffer);
return ret;
}
EXPORT_SYMBOL(kfifo_alloc);
/**
* kfifo_free - frees the FIFO
* @fifo: the fifo to be freed.
*/
void kfifo_free(struct kfifo *fifo)
{
kfree(fifo->buffer);
kfree(fifo);
}
EXPORT_SYMBOL(kfifo_free);
/**
* __kfifo_put - puts some data into the FIFO, no locking version
* @fifo: the fifo to be used.
* @buffer: the data to be added.
* @len: the length of the data to be added.
*
* This function copies at most @len bytes from the @buffer into
* the FIFO depending on the free space, and returns the number of
* bytes copied.
*
* Note that with only one concurrent reader and one concurrent
* writer, you don't need extra locking to use these functions.
*/
unsigned int __kfifo_put(struct kfifo *fifo,
const unsigned char *buffer, unsigned int len)
{
unsigned int l;
len = min(len, fifo->size - fifo->in + fifo->out);
/*
* Ensure that we sample the fifo->out index -before- we
* start putting bytes into the kfifo.
*/
smp_mb();
/* first put the data starting from fifo->in to buffer end */
l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
/* then put the rest (if any) at the beginning of the buffer */
memcpy(fifo->buffer, buffer + l, len - l);
/*
* Ensure that we add the bytes to the kfifo -before-
* we update the fifo->in index.
*/
smp_wmb();
fifo->in += len;
return len;
}
EXPORT_SYMBOL(__kfifo_put);
/**
* __kfifo_get - gets some data from the FIFO, no locking version
* @fifo: the fifo to be used.
* @buffer: where the data must be copied.
* @len: the size of the destination buffer.
*
* This function copies at most @len bytes from the FIFO into the
* @buffer and returns the number of copied bytes.
*
* Note that with only one concurrent reader and one concurrent
* writer, you don't need extra locking to use these functions.
*/
unsigned int __kfifo_get(struct kfifo *fifo,
unsigned char *buffer, unsigned int len)
{
unsigned int l;
len = min(len, fifo->in - fifo->out);
/*
* Ensure that we sample the fifo->in index -before- we
* start removing bytes from the kfifo.
*/
smp_rmb();
/* first get the data from fifo->out until the end of the buffer */
l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
/* then get the rest (if any) from the beginning of the buffer */
memcpy(buffer + l, fifo->buffer, len - l);
/*
* Ensure that we remove the bytes from the kfifo -before-
* we update the fifo->out index.
*/
smp_mb();
fifo->out += len;
return len;
}
EXPORT_SYMBOL(__kfifo_get);
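/*
 * Illustrative sketch, not part of the original file: a single-producer,
 * single-consumer byte queue built on the helpers above. As the comments
 * note, one reader and one writer need no extra locking, so the spinlock
 * handed to kfifo_alloc() is only there for the locked wrappers in
 * <linux/kfifo.h>. All "demo_" names are invented for the example.
 */
static DEFINE_SPINLOCK(demo_fifo_lock);
static struct kfifo *demo_fifo;

static int demo_fifo_setup(void)
{
	demo_fifo = kfifo_alloc(128, GFP_KERNEL, &demo_fifo_lock);
	return IS_ERR(demo_fifo) ? PTR_ERR(demo_fifo) : 0;
}

static void demo_fifo_roundtrip(void)
{
	unsigned char out[4];
	unsigned int copied;

	__kfifo_put(demo_fifo, (const unsigned char *)"ping", 4);
	copied = __kfifo_get(demo_fifo, out, sizeof(out));
	printk(KERN_DEBUG "demo fifo returned %u bytes\n", copied);
}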

1733
kernel/kernel/kgdb.c Normal file

File diff suppressed because it is too large Load Diff

538
kernel/kernel/kmod.c Normal file
View File

@@ -0,0 +1,538 @@
/*
kmod, the new module loader (replaces kerneld)
Kirk Petersen
Reorganized not to be a daemon by Adam Richter, with guidance
from Greg Zornetzer.
Modified to avoid chroot and file sharing problems.
Mikael Pettersson
Limit the concurrent number of kmod modprobes to catch loops from
"modprobe needs a service that is in a module".
Keith Owens <kaos@ocs.com.au> December 1999
Unblock all signals when we exec a usermode process.
Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
call_usermodehelper wait flag, and remove exec_usermodehelper.
Rusty Russell <rusty@rustcorp.com.au> Jan 2003
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/resource.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
extern int max_threads;
static struct workqueue_struct *khelper_wq;
#ifdef CONFIG_MODULES
/*
modprobe_path is set via /proc/sys.
*/
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
/**
* __request_module - try to load a kernel module
* @wait: wait (or not) for the operation to complete
* @fmt: printf style format string for the name of the module
* @...: arguments as specified in the format string
*
* Load a module using the user mode module loader. The function returns
* zero on success or a negative errno code on failure. Note that a
* successful module load does not mean the module did not then unload
* and exit on an error of its own. Callers must check that the service
 * they requested is now available, not blindly invoke it.
*
* If module auto-loading support is disabled then this function
* becomes a no-operation.
*/
int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
unsigned int max_modprobes;
int ret;
char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
static char *envp[] = { "HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL };
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
ret = security_kernel_module_request();
if (ret)
return ret;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
if (ret >= MODULE_NAME_LEN)
return -ENAMETOOLONG;
/* If modprobe needs a service that is in a module, we get a recursive
* loop. Limit the number of running kmod threads to max_threads/2 or
* MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
* would be to run the parents of this process, counting how many times
* kmod was invoked. That would mean accessing the internals of the
* process tables to get the command line, proc_pid_cmdline is static
* and it is not worth changing the proc code just to handle this case.
* KAO.
*
* "trace the ppid" is simple, but will fail if someone's
* parent exits. I think this is as good as it gets. --RR
*/
max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
atomic_inc(&kmod_concurrent);
if (atomic_read(&kmod_concurrent) > max_modprobes) {
/* We may be blaming an innocent here, but unlikely */
if (kmod_loop_msg++ < 5)
printk(KERN_ERR
"request_module: runaway loop modprobe %s\n",
module_name);
atomic_dec(&kmod_concurrent);
return -ENOMEM;
}
trace_module_request(module_name, wait, _RET_IP_);
ret = call_usermodehelper(modprobe_path, argv, envp,
wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
return ret;
}
EXPORT_SYMBOL(__request_module);
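/*
 * Illustrative sketch, not part of the original file: the common calling
 * pattern through the request_module() wrapper from <linux/kmod.h>, which
 * expands to __request_module(true, ...). The "demo-proto-%d" module name
 * is invented for the example.
 */
static int demo_load_protocol(int id)
{
	int err = request_module("demo-proto-%d", id);

	if (err)
		return err;
	/*
	 * Success here only means modprobe completed; as stressed above,
	 * the caller must still look the service up in its own registry
	 * before using it.
	 */
	return 0;
}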
#endif /* CONFIG_MODULES */
struct subprocess_info {
struct work_struct work;
struct completion *complete;
struct cred *cred;
char *path;
char **argv;
char **envp;
enum umh_wait wait;
int retval;
struct file *stdin;
void (*cleanup)(char **argv, char **envp);
};
/*
* This is the task which runs the usermode application
*/
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
int retval;
BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
/* Unblock all signals */
spin_lock_irq(&current->sighand->siglock);
flush_signal_handlers(current, 1);
sigemptyset(&current->blocked);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
/* Install the credentials */
commit_creds(sub_info->cred);
sub_info->cred = NULL;
/* Install input pipe when needed */
if (sub_info->stdin) {
struct files_struct *f = current->files;
struct fdtable *fdt;
/* no races because files should be private here */
sys_close(0);
fd_install(0, sub_info->stdin);
spin_lock(&f->file_lock);
fdt = files_fdtable(f);
FD_SET(0, fdt->open_fds);
FD_CLR(0, fdt->close_on_exec);
spin_unlock(&f->file_lock);
/* and disallow core files too */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
}
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed_ptr(current, cpu_all_mask);
/*
* Our parent is keventd, which runs with elevated scheduling priority.
* Avoid propagating that into the userspace child.
*/
set_user_nice(current, 0);
retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
/* Exec failed? */
sub_info->retval = retval;
do_exit(0);
}
void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
if (info->cleanup)
(*info->cleanup)(info->argv, info->envp);
if (info->cred)
put_cred(info->cred);
kfree(info);
}
EXPORT_SYMBOL(call_usermodehelper_freeinfo);
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
/* Install a handler: if SIGCLD isn't handled sys_wait4 won't
* populate the status, but will return -ECHILD. */
allow_signal(SIGCHLD);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
} else {
int ret;
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
* But wait_for_helper() always runs as keventd, and put_user()
* to a kernel address works OK for kernel threads, due to their
* having an mm_segment_t which spans the entire address space.
*
* Thus the __user pointer cast is valid here.
*/
sys_wait4(pid, (int __user *)&ret, 0, NULL);
/*
* If ret is 0, either ____call_usermodehelper failed and the
* real error code is already in sub_info->retval or
* sub_info->retval is 0 anyway, so don't mess with it then.
*/
if (ret)
sub_info->retval = ret;
}
if (sub_info->wait == UMH_NO_WAIT)
call_usermodehelper_freeinfo(sub_info);
else
complete(sub_info->complete);
return 0;
}
/* This is run by khelper thread */
static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
pid_t pid;
enum umh_wait wait = sub_info->wait;
BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
	/* CLONE_VFORK: wait until the usermode helper has execve'd
	 * successfully. We need the data structures to stay around
	 * until that is done. */
if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
else
pid = kernel_thread(____call_usermodehelper, sub_info,
CLONE_VFORK | SIGCHLD);
switch (wait) {
case UMH_NO_WAIT:
break;
case UMH_WAIT_PROC:
if (pid > 0)
break;
sub_info->retval = pid;
/* FALLTHROUGH */
case UMH_WAIT_EXEC:
complete(sub_info->complete);
}
}
#ifdef CONFIG_PM_SLEEP
/*
* If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
* (used for preventing user land processes from being created after the user
* land has been frozen during a system-wide hibernation or suspend operation).
*/
static int usermodehelper_disabled;
/* Number of helpers running */
static atomic_t running_helpers = ATOMIC_INIT(0);
/*
* Wait queue head used by usermodehelper_pm_callback() to wait for all running
* helpers to finish.
*/
static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
/*
* Time to wait for running_helpers to become zero before the setting of
* usermodehelper_disabled in usermodehelper_pm_callback() fails
*/
#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
/**
* usermodehelper_disable - prevent new helpers from being started
*/
int usermodehelper_disable(void)
{
long retval;
usermodehelper_disabled = 1;
smp_mb();
/*
* From now on call_usermodehelper_exec() won't start any new
* helpers, so it is sufficient if running_helpers turns out to
* be zero at one point (it may be increased later, but that
* doesn't matter).
*/
retval = wait_event_timeout(running_helpers_waitq,
atomic_read(&running_helpers) == 0,
RUNNING_HELPERS_TIMEOUT);
if (retval)
return 0;
usermodehelper_disabled = 0;
return -EAGAIN;
}
/**
* usermodehelper_enable - allow new helpers to be started again
*/
void usermodehelper_enable(void)
{
usermodehelper_disabled = 0;
}
static void helper_lock(void)
{
atomic_inc(&running_helpers);
smp_mb__after_atomic_inc();
}
static void helper_unlock(void)
{
if (atomic_dec_and_test(&running_helpers))
wake_up(&running_helpers_waitq);
}
#else /* CONFIG_PM_SLEEP */
#define usermodehelper_disabled 0
static inline void helper_lock(void) {}
static inline void helper_unlock(void) {}
#endif /* CONFIG_PM_SLEEP */
/**
* call_usermodehelper_setup - prepare to call a usermode helper
* @path: path to usermode executable
* @argv: arg vector for process
* @envp: environment for process
* @gfp_mask: gfp mask for memory allocation
*
* Returns either %NULL on allocation failure, or a subprocess_info
* structure. This should be passed to call_usermodehelper_exec to
* exec the process and free the structure.
*/
struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
char **envp, gfp_t gfp_mask)
{
struct subprocess_info *sub_info;
sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
if (!sub_info)
goto out;
INIT_WORK(&sub_info->work, __call_usermodehelper);
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
sub_info->cred = prepare_usermodehelper_creds();
if (!sub_info->cred) {
kfree(sub_info);
return NULL;
}
out:
return sub_info;
}
EXPORT_SYMBOL(call_usermodehelper_setup);
/**
* call_usermodehelper_setkeys - set the session keys for usermode helper
* @info: a subprocess_info returned by call_usermodehelper_setup
* @session_keyring: the session keyring for the process
*/
void call_usermodehelper_setkeys(struct subprocess_info *info,
struct key *session_keyring)
{
#ifdef CONFIG_KEYS
struct thread_group_cred *tgcred = info->cred->tgcred;
key_put(tgcred->session_keyring);
tgcred->session_keyring = key_get(session_keyring);
#else
BUG();
#endif
}
EXPORT_SYMBOL(call_usermodehelper_setkeys);
/**
* call_usermodehelper_setcleanup - set a cleanup function
* @info: a subprocess_info returned by call_usermodehelper_setup
* @cleanup: a cleanup function
*
 * The cleanup function is called just before the subprocess_info is
 * freed. This can be used for freeing the argv and envp. The
 * function must be runnable in either a process context or the
 * context in which call_usermodehelper_exec is called.
*/
void call_usermodehelper_setcleanup(struct subprocess_info *info,
void (*cleanup)(char **argv, char **envp))
{
info->cleanup = cleanup;
}
EXPORT_SYMBOL(call_usermodehelper_setcleanup);
/**
* call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
* @sub_info: a subprocess_info returned by call_usermodehelper_setup
* @filp: set to the write-end of a pipe
*
* This constructs a pipe, and sets the read end to be the stdin of the
* subprocess, and returns the write-end in *@filp.
*/
int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
struct file **filp)
{
struct file *f;
f = create_write_pipe(0);
if (IS_ERR(f))
return PTR_ERR(f);
*filp = f;
f = create_read_pipe(f, 0);
if (IS_ERR(f)) {
free_write_pipe(*filp);
return PTR_ERR(f);
}
sub_info->stdin = f;
return 0;
}
EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
/**
* call_usermodehelper_exec - start a usermode application
 * @sub_info: information about the subprocess
* @wait: wait for the application to finish and return status.
 * when UMH_NO_WAIT, don't wait at all, but you get no useful error back when
* the program couldn't be exec'ed. This makes it safe to call
* from interrupt context.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of keventd.
* (ie. it runs with full root capabilities).
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info,
enum umh_wait wait)
{
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
validate_creds(sub_info->cred);
helper_lock();
if (sub_info->path[0] == '\0')
goto out;
if (!khelper_wq || usermodehelper_disabled) {
retval = -EBUSY;
goto out;
}
sub_info->complete = &done;
sub_info->wait = wait;
queue_work(khelper_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
wait_for_completion(&done);
retval = sub_info->retval;
out:
call_usermodehelper_freeinfo(sub_info);
unlock:
helper_unlock();
return retval;
}
EXPORT_SYMBOL(call_usermodehelper_exec);
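/*
 * Illustrative sketch, not part of the original file: running a helper
 * with the setup/exec pair defined above and waiting for it to exit.
 * "/sbin/demo-helper" and its argument are made up for the example.
 */
static int demo_run_helper(void)
{
	struct subprocess_info *info;
	char *argv[] = { "/sbin/demo-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
	if (!info)
		return -ENOMEM;
	/* exec() frees info itself for every wait mode except UMH_NO_WAIT */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}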
/**
* call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
* @path: path to usermode executable
* @argv: arg vector for process
* @envp: environment for process
* @filp: set to the write-end of a pipe
*
* This is a simple wrapper which executes a usermode-helper function
* with a pipe as stdin. It is implemented entirely in terms of
* lower-level call_usermodehelper_* functions.
*/
int call_usermodehelper_pipe(char *path, char **argv, char **envp,
struct file **filp)
{
struct subprocess_info *sub_info;
int ret;
sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
if (sub_info == NULL)
return -ENOMEM;
ret = call_usermodehelper_stdinpipe(sub_info, filp);
if (ret < 0)
goto out;
return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
out:
call_usermodehelper_freeinfo(sub_info);
return ret;
}
EXPORT_SYMBOL(call_usermodehelper_pipe);
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
BUG_ON(!khelper_wq);
}

1555
kernel/kernel/kprobes.c Normal file

File diff suppressed because it is too large Load Diff

189
kernel/kernel/ksysfs.c Normal file
View File

@@ -0,0 +1,189 @@
/*
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
* are not related to any other subsystem
*
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*
 * This file is released under the GPLv2
*
*/
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/profile.h>
#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
#if defined(CONFIG_HOTPLUG)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (count+1 > UEVENT_HELPER_PATH_LEN)
return -ENOENT;
memcpy(uevent_helper, buf, count);
uevent_helper[count] = '\0';
if (count && uevent_helper[count-1] == '\n')
uevent_helper[count-1] = '\0';
return count;
}
KERNEL_ATTR_RW(uevent_helper);
#endif
#ifdef CONFIG_PROFILING
static ssize_t profiling_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", prof_on);
}
static ssize_t profiling_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int ret;
if (prof_on)
return -EEXIST;
/*
* This eventually calls into get_option() which
* has a ton of callers and is not const. It is
* easiest to cast it away here.
*/
profile_setup((char *)buf);
ret = profile_init();
if (ret)
return ret;
ret = create_proc_profile();
if (ret)
return ret;
return count;
}
KERNEL_ATTR_RW(profiling);
#endif
#ifdef CONFIG_KEXEC
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", !!kexec_crash_image);
}
KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lx %x\n",
paddr_vmcoreinfo_note(),
(unsigned int)vmcoreinfo_max_size);
}
KERNEL_ATTR_RO(vmcoreinfo);
#endif /* CONFIG_KEXEC */
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
extern const void __start_notes __attribute__((weak));
extern const void __stop_notes __attribute__((weak));
#define notes_size (&__stop_notes - &__start_notes)
static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
memcpy(buf, &__start_notes + off, count);
return count;
}
static struct bin_attribute notes_attr = {
.attr = {
.name = "notes",
.mode = S_IRUGO,
},
.read = &notes_read,
};
struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
#if defined(CONFIG_HOTPLUG)
&uevent_seqnum_attr.attr,
&uevent_helper_attr.attr,
#endif
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
#ifdef CONFIG_KEXEC
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&vmcoreinfo_attr.attr,
#endif
NULL
};
static struct attribute_group kernel_attr_group = {
.attrs = kernel_attrs,
};
static int __init ksysfs_init(void)
{
int error;
kernel_kobj = kobject_create_and_add("kernel", NULL);
if (!kernel_kobj) {
error = -ENOMEM;
goto exit;
}
error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
if (error)
goto kset_exit;
if (notes_size > 0) {
notes_attr.size = notes_size;
error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
if (error)
goto group_exit;
}
return 0;
group_exit:
sysfs_remove_group(kernel_kobj, &kernel_attr_group);
kset_exit:
kobject_put(kernel_kobj);
exit:
return error;
}
core_initcall(ksysfs_init);
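/*
 * Illustrative sketch, not part of the original file: the show/store pair
 * that the KERNEL_ATTR_RW() macro above expects. The "demo" attribute and
 * its backing variable are invented; to actually appear under /sys/kernel
 * the resulting demo_attr.attr would also have to be added to kernel_attrs[].
 */
static int demo_value;

static ssize_t demo_show(struct kobject *kobj,
			 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", demo_value);
}

static ssize_t demo_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	demo_value = simple_strtol(buf, NULL, 10);
	return count;
}
KERNEL_ATTR_RW(demo);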

226
kernel/kernel/kthread.c Normal file
View File

@@ -0,0 +1,226 @@
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
* Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
struct completion done;
struct list_head list;
};
struct kthread {
int should_stop;
struct completion exited;
};
#define to_kthread(tsk) \
container_of((tsk)->vfork_done, struct kthread, exited)
/**
* kthread_should_stop - should this kthread return now?
*
* When someone calls kthread_stop() on your kthread, it will be woken
* and this will return true. You should then return, and your return
* value will be passed through to kthread_stop().
*/
int kthread_should_stop(void)
{
return to_kthread(current)->should_stop;
}
EXPORT_SYMBOL(kthread_should_stop);
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
void *data = create->data;
struct kthread self;
int ret;
self.should_stop = 0;
init_completion(&self.exited);
current->vfork_done = &self.exited;
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
complete(&create->done);
schedule();
ret = -EINTR;
if (!self.should_stop)
ret = threadfn(data);
/* we can't just return, we must preserve "self" on stack */
do_exit(ret);
}
static void create_kthread(struct kthread_create_info *create)
{
int pid;
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
create->result = ERR_PTR(pid);
complete(&create->done);
}
}
/**
* kthread_create - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
* it. See also kthread_run(), kthread_create_on_cpu().
*
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
* or a negative error number; it will be passed to kthread_stop().
*
* Returns a task_struct or ERR_PTR(-ENOMEM).
*/
struct task_struct *kthread_create(int (*threadfn)(void *data),
void *data,
const char namefmt[],
...)
{
struct kthread_create_info create;
create.threadfn = threadfn;
create.data = data;
init_completion(&create.done);
spin_lock(&kthread_create_lock);
list_add_tail(&create.list, &kthread_create_list);
spin_unlock(&kthread_create_lock);
wake_up_process(kthreadd_task);
wait_for_completion(&create.done);
if (!IS_ERR(create.result)) {
struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);
vsnprintf(create.result->comm, sizeof(create.result->comm),
namefmt, args);
va_end(args);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
set_cpus_allowed_ptr(create.result, cpu_all_mask);
}
return create.result;
}
EXPORT_SYMBOL(kthread_create);
/**
* kthread_stop - stop a thread created by kthread_create().
* @k: thread created by kthread_create().
*
* Sets kthread_should_stop() for @k to return true, wakes it, and
* waits for it to exit. This can also be called after kthread_create()
* instead of calling wake_up_process(): the thread will exit without
* calling threadfn().
*
* If threadfn() may call do_exit() itself, the caller must ensure
* task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
* was never called.
*/
int kthread_stop(struct task_struct *k)
{
struct kthread *kthread;
int ret;
trace_sched_kthread_stop(k);
get_task_struct(k);
kthread = to_kthread(k);
barrier(); /* it might have exited */
if (k->vfork_done != NULL) {
kthread->should_stop = 1;
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
ret = k->exit_code;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
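/*
 * Illustrative sketch, not part of the original file: the usual lifecycle
 * built on the helpers above. demo_worker() and its one-second pacing are
 * invented; the create/wake/stop pattern is the documented one.
 */
static struct task_struct *demo_task;

static int demo_worker(void *data)
{
	while (!kthread_should_stop()) {
		/* do one unit of work here, then sleep politely */
		schedule_timeout_interruptible(HZ);
	}
	return 0;	/* handed back to kthread_stop() */
}

static int demo_worker_start(void)
{
	demo_task = kthread_create(demo_worker, NULL, "demo-worker");
	if (IS_ERR(demo_task))
		return PTR_ERR(demo_task);
	wake_up_process(demo_task);
	return 0;
}

static void demo_worker_stop(void)
{
	kthread_stop(demo_task);
}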
int kthreadd(void *unused)
{
struct task_struct *tsk = current;
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
set_mems_allowed(node_states[N_HIGH_MEMORY]);
current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&kthread_create_list))
schedule();
__set_current_state(TASK_RUNNING);
spin_lock(&kthread_create_lock);
while (!list_empty(&kthread_create_list)) {
struct kthread_create_info *create;
create = list_entry(kthread_create_list.next,
struct kthread_create_info, list);
list_del_init(&create->list);
spin_unlock(&kthread_create_lock);
create_kthread(create);
spin_lock(&kthread_create_lock);
}
spin_unlock(&kthread_create_lock);
}
return 0;
}

297
kernel/kernel/latencytop.c Normal file
View File

@@ -0,0 +1,297 @@
/*
* latencytop.c: Latency display infrastructure
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
/*
* CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
* used by the "latencytop" userspace tool. The latency that is tracked is not
* the 'traditional' interrupt latency (which is primarily caused by something
* else consuming CPU), but instead, it is the latency an application encounters
* because the kernel sleeps on its behalf for various reasons.
*
* This code tracks 2 levels of statistics:
* 1) System level latency
* 2) Per process latency
*
* The latency is stored in fixed sized data structures in an accumulated form;
* if the "same" latency cause is hit twice, this will be tracked as one entry
* in the data structure. Both the count, total accumulated latency and maximum
* latency are tracked in this data structure. When the fixed size structure is
* full, no new causes are tracked until the buffer is flushed by writing to
* the /proc file; the userspace tool does this on a regular basis.
*
* A latency cause is identified by a stringified backtrace at the point that
* the scheduler gets invoked. The userland tool will use this string to
* identify the cause of the latency in human readable form.
*
* The information is exported via /proc/latency_stats and /proc/<pid>/latency.
* These files look like this:
*
* Latency Top version : v0.1
* 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
* | | | |
* | | | +----> the stringified backtrace
* | | +---------> The maximum latency for this entry in microseconds
* | +--------------> The accumulated latency for this entry (microseconds)
* +-------------------> The number of times this entry is hit
*
* (note: the average latency is the accumulated latency divided by the number
* of times)
*/
#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
static DEFINE_SPINLOCK(latency_lock);
#define MAXLR 128
static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
void clear_all_latency_tracing(struct task_struct *p)
{
unsigned long flags;
if (!latencytop_enabled)
return;
spin_lock_irqsave(&latency_lock, flags);
memset(&p->latency_record, 0, sizeof(p->latency_record));
p->latency_record_count = 0;
spin_unlock_irqrestore(&latency_lock, flags);
}
static void clear_global_latency_tracing(void)
{
unsigned long flags;
spin_lock_irqsave(&latency_lock, flags);
memset(&latency_record, 0, sizeof(latency_record));
spin_unlock_irqrestore(&latency_lock, flags);
}
static void __sched
account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
{
int firstnonnull = MAXLR + 1;
int i;
if (!latencytop_enabled)
return;
/* skip kernel threads for now */
if (!tsk->mm)
return;
for (i = 0; i < MAXLR; i++) {
int q, same = 1;
/* Nothing stored: */
if (!latency_record[i].backtrace[0]) {
if (firstnonnull > i)
firstnonnull = i;
continue;
}
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long record = lat->backtrace[q];
if (latency_record[i].backtrace[q] != record) {
same = 0;
break;
}
/* 0 and ULONG_MAX entries mean end of backtrace: */
if (record == 0 || record == ULONG_MAX)
break;
}
if (same) {
latency_record[i].count++;
latency_record[i].time += lat->time;
if (lat->time > latency_record[i].max)
latency_record[i].max = lat->time;
return;
}
}
i = firstnonnull;
if (i >= MAXLR - 1)
return;
	/* Allocated a new one: */
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
/*
* Iterator to store a backtrace into a latency record entry
*/
static inline void store_stacktrace(struct task_struct *tsk,
struct latency_record *lat)
{
struct stack_trace trace;
memset(&trace, 0, sizeof(trace));
trace.max_entries = LT_BACKTRACEDEPTH;
trace.entries = &lat->backtrace[0];
save_stack_trace_tsk(tsk, &trace);
}
/**
 * __account_scheduler_latency - record a latency that just occurred
* @tsk - the task struct of the task hitting the latency
* @usecs - the duration of the latency in microseconds
* @inter - 1 if the sleep was interruptible, 0 if uninterruptible
*
* This function is the main entry point for recording latency entries
* as called by the scheduler.
*
* This function has a few special cases to deal with normal 'non-latency'
* sleeps: specifically, interruptible sleep longer than 5 msec is skipped
* since this usually is caused by waiting for events via select() and co.
*
* Negative latencies (caused by time going backwards) are also explicitly
* skipped.
*/
void __sched
__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
{
unsigned long flags;
int i, q;
struct latency_record lat;
/* Long interruptible waits are generally user requested... */
if (inter && usecs > 5000)
return;
/* Negative sleeps are time going backwards */
/* Zero-time sleeps are non-interesting */
if (usecs <= 0)
return;
memset(&lat, 0, sizeof(lat));
lat.count = 1;
lat.time = usecs;
lat.max = usecs;
store_stacktrace(tsk, &lat);
spin_lock_irqsave(&latency_lock, flags);
account_global_scheduler_latency(tsk, &lat);
for (i = 0; i < tsk->latency_record_count; i++) {
struct latency_record *mylat;
int same = 1;
mylat = &tsk->latency_record[i];
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long record = lat.backtrace[q];
if (mylat->backtrace[q] != record) {
same = 0;
break;
}
/* 0 and ULONG_MAX entries mean end of backtrace: */
if (record == 0 || record == ULONG_MAX)
break;
}
if (same) {
mylat->count++;
mylat->time += lat.time;
if (lat.time > mylat->max)
mylat->max = lat.time;
goto out_unlock;
}
}
/*
	 * short term hack; if we're > 32 we stop; in the future we recycle:
*/
if (tsk->latency_record_count >= LT_SAVECOUNT)
goto out_unlock;
/* Allocated a new one: */
i = tsk->latency_record_count++;
memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
out_unlock:
spin_unlock_irqrestore(&latency_lock, flags);
}
static int lstats_show(struct seq_file *m, void *v)
{
int i;
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < MAXLR; i++) {
if (latency_record[i].backtrace[0]) {
int q;
seq_printf(m, "%i %lu %lu ",
latency_record[i].count,
latency_record[i].time,
latency_record[i].max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
char sym[KSYM_SYMBOL_LEN];
char *c;
if (!latency_record[i].backtrace[q])
break;
if (latency_record[i].backtrace[q] == ULONG_MAX)
break;
sprint_symbol(sym, latency_record[i].backtrace[q]);
c = strchr(sym, '+');
if (c)
*c = 0;
seq_printf(m, "%s ", sym);
}
seq_printf(m, "\n");
}
}
return 0;
}
static ssize_t
lstats_write(struct file *file, const char __user *buf, size_t count,
loff_t *offs)
{
clear_global_latency_tracing();
return count;
}
static int lstats_open(struct inode *inode, struct file *filp)
{
return single_open(filp, lstats_show, NULL);
}
static const struct file_operations lstats_fops = {
.open = lstats_open,
.read = seq_read,
.write = lstats_write,
.llseek = seq_lseek,
.release = single_release,
};
static int __init init_lstats_procfs(void)
{
proc_create("latency_stats", 0644, NULL, &lstats_fops);
return 0;
}
device_initcall(init_lstats_procfs);

3802
kernel/kernel/lockdep.c Normal file

File diff suppressed because it is too large Load Diff

140
kernel/kernel/lockdep_internals.h Normal file
View File

@@ -0,0 +1,140 @@
/*
* kernel/lockdep_internals.h
*
* Runtime locking correctness validator
*
* lockdep subsystem internal functions and variables.
*/
/*
* Lock-class usage-state bits:
*/
enum lock_usage_bit {
#define LOCKDEP_STATE(__STATE) \
LOCK_USED_IN_##__STATE, \
LOCK_USED_IN_##__STATE##_READ, \
LOCK_ENABLED_##__STATE, \
LOCK_ENABLED_##__STATE##_READ,
#include "lockdep_states.h"
#undef LOCKDEP_STATE
LOCK_USED,
LOCK_USAGE_STATES
};
/*
* Usage-state bitmasks:
*/
#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE),
enum {
#define LOCKDEP_STATE(__STATE) \
__LOCKF(USED_IN_##__STATE) \
__LOCKF(USED_IN_##__STATE##_READ) \
__LOCKF(ENABLED_##__STATE) \
__LOCKF(ENABLED_##__STATE##_READ)
#include "lockdep_states.h"
#undef LOCKDEP_STATE
__LOCKF(USED)
};
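/*
 * Worked example, not part of the original file: for the HARDIRQ entry in
 * lockdep_states.h the two macro blocks above expand to
 *
 *	LOCK_USED_IN_HARDIRQ, LOCK_USED_IN_HARDIRQ_READ,
 *	LOCK_ENABLED_HARDIRQ, LOCK_ENABLED_HARDIRQ_READ
 *
 * in enum lock_usage_bit, and to the matching one-bit LOCKF_* masks
 * (LOCKF_USED_IN_HARDIRQ = 1 << LOCK_USED_IN_HARDIRQ, and so on) in the
 * bitmask enum, which is what the LOCKF_*_IRQ composites below rely on.
 */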
#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
#define LOCKF_ENABLED_IRQ_READ \
(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
#define LOCKF_USED_IN_IRQ_READ \
(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
/*
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
* we track.
*
 * We use the per-lock dependency maps in two ways: we grow them by adding
 * every to-be-taken lock to every currently held lock's own dependency
 * table (if it's not there yet), and we check them for lock order
 * conflicts and deadlocks.
*/
#define MAX_LOCKDEP_ENTRIES 16384UL
#define MAX_LOCKDEP_CHAINS_BITS 15
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
#define MAX_STACK_TRACE_ENTRIES 262144UL
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
extern unsigned long nr_lock_classes;
extern unsigned long nr_list_entries;
extern unsigned long nr_lock_chains;
extern int nr_chain_hlocks;
extern unsigned long nr_stack_trace_entries;
extern unsigned int nr_hardirq_chains;
extern unsigned int nr_softirq_chains;
extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
extern unsigned int max_recursion_depth;
extern unsigned int max_bfs_queue_depth;
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
#else
static inline unsigned long
lockdep_count_forward_deps(struct lock_class *class)
{
return 0;
}
static inline unsigned long
lockdep_count_backward_deps(struct lock_class *class)
{
return 0;
}
#endif
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* Various lockdep statistics:
*/
extern atomic_t chain_lookup_hits;
extern atomic_t chain_lookup_misses;
extern atomic_t hardirqs_on_events;
extern atomic_t hardirqs_off_events;
extern atomic_t redundant_hardirqs_on;
extern atomic_t redundant_hardirqs_off;
extern atomic_t softirqs_on_events;
extern atomic_t softirqs_off_events;
extern atomic_t redundant_softirqs_on;
extern atomic_t redundant_softirqs_off;
extern atomic_t nr_unused_locks;
extern atomic_t nr_cyclic_checks;
extern atomic_t nr_cyclic_check_recursions;
extern atomic_t nr_find_usage_forwards_checks;
extern atomic_t nr_find_usage_forwards_recursions;
extern atomic_t nr_find_usage_backwards_checks;
extern atomic_t nr_find_usage_backwards_recursions;
# define debug_atomic_inc(ptr) atomic_inc(ptr)
# define debug_atomic_dec(ptr) atomic_dec(ptr)
# define debug_atomic_read(ptr) atomic_read(ptr)
#else
# define debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_dec(ptr) do { } while (0)
# define debug_atomic_read(ptr) 0
#endif

691
kernel/kernel/lockdep_proc.c Normal file
View File

@@ -0,0 +1,691 @@
/*
* kernel/lockdep_proc.c
*
* Runtime locking correctness validator
*
* Started by Ingo Molnar:
*
* Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
* Code for /proc/lockdep and /proc/lockdep_stats:
*
*/
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/vmalloc.h>
#include <linux/sort.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include "lockdep_internals.h"
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
return seq_list_next(v, &all_lock_classes, pos);
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
return seq_list_start_head(&all_lock_classes, *pos);
}
static void l_stop(struct seq_file *m, void *v)
{
}
static void print_name(struct seq_file *m, struct lock_class *class)
{
char str[128];
const char *name = class->name;
if (!name) {
name = __get_key_name(class->key, str);
seq_printf(m, "%s", name);
	} else {
seq_printf(m, "%s", name);
if (class->name_version > 1)
seq_printf(m, "#%d", class->name_version);
if (class->subclass)
seq_printf(m, "/%d", class->subclass);
}
}
static int l_show(struct seq_file *m, void *v)
{
struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
if (v == &all_lock_classes) {
seq_printf(m, "all lock classes:\n");
return 0;
}
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
seq_printf(m, " OPS:%8ld", class->ops);
#endif
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
#endif
get_usage_chars(class, usage);
seq_printf(m, " %s", usage);
seq_printf(m, ": ");
print_name(m, class);
seq_puts(m, "\n");
list_for_each_entry(entry, &class->locks_after, entry) {
if (entry->distance == 1) {
seq_printf(m, " -> [%p] ", entry->class->key);
print_name(m, entry->class);
seq_puts(m, "\n");
}
}
seq_puts(m, "\n");
return 0;
}
static const struct seq_operations lockdep_ops = {
.start = l_start,
.next = l_next,
.stop = l_stop,
.show = l_show,
};
static int lockdep_open(struct inode *inode, struct file *file)
{
return seq_open(file, &lockdep_ops);
}
static const struct file_operations proc_lockdep_operations = {
.open = lockdep_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#ifdef CONFIG_PROVE_LOCKING
static void *lc_start(struct seq_file *m, loff_t *pos)
{
if (*pos == 0)
return SEQ_START_TOKEN;
if (*pos - 1 < nr_lock_chains)
return lock_chains + (*pos - 1);
return NULL;
}
static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
{
(*pos)++;
return lc_start(m, pos);
}
static void lc_stop(struct seq_file *m, void *v)
{
}
static int lc_show(struct seq_file *m, void *v)
{
struct lock_chain *chain = v;
struct lock_class *class;
int i;
if (v == SEQ_START_TOKEN) {
seq_printf(m, "all lock chains:\n");
return 0;
}
seq_printf(m, "irq_context: %d\n", chain->irq_context);
for (i = 0; i < chain->depth; i++) {
class = lock_chain_get_class(chain, i);
if (!class->key)
continue;
seq_printf(m, "[%p] ", class->key);
print_name(m, class);
seq_puts(m, "\n");
}
seq_puts(m, "\n");
return 0;
}
static const struct seq_operations lockdep_chains_ops = {
.start = lc_start,
.next = lc_next,
.stop = lc_stop,
.show = lc_show,
};
static int lockdep_chains_open(struct inode *inode, struct file *file)
{
return seq_open(file, &lockdep_chains_ops);
}
static const struct file_operations proc_lockdep_chains_operations = {
.open = lockdep_chains_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#endif /* CONFIG_PROVE_LOCKING */
static void lockdep_stats_debug_show(struct seq_file *m)
{
#ifdef CONFIG_DEBUG_LOCKDEP
unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
hi2 = debug_atomic_read(&hardirqs_off_events),
hr1 = debug_atomic_read(&redundant_hardirqs_on),
hr2 = debug_atomic_read(&redundant_hardirqs_off),
si1 = debug_atomic_read(&softirqs_on_events),
si2 = debug_atomic_read(&softirqs_off_events),
sr1 = debug_atomic_read(&redundant_softirqs_on),
sr2 = debug_atomic_read(&redundant_softirqs_off);
seq_printf(m, " chain lookup misses: %11u\n",
debug_atomic_read(&chain_lookup_misses));
seq_printf(m, " chain lookup hits: %11u\n",
debug_atomic_read(&chain_lookup_hits));
seq_printf(m, " cyclic checks: %11u\n",
debug_atomic_read(&nr_cyclic_checks));
seq_printf(m, " find-mask forwards checks: %11u\n",
debug_atomic_read(&nr_find_usage_forwards_checks));
seq_printf(m, " find-mask backwards checks: %11u\n",
debug_atomic_read(&nr_find_usage_backwards_checks));
seq_printf(m, " hardirq on events: %11u\n", hi1);
seq_printf(m, " hardirq off events: %11u\n", hi2);
seq_printf(m, " redundant hardirq ons: %11u\n", hr1);
seq_printf(m, " redundant hardirq offs: %11u\n", hr2);
seq_printf(m, " softirq on events: %11u\n", si1);
seq_printf(m, " softirq off events: %11u\n", si2);
seq_printf(m, " redundant softirq ons: %11u\n", sr1);
seq_printf(m, " redundant softirq offs: %11u\n", sr2);
#endif
}
static int lockdep_stats_show(struct seq_file *m, void *v)
{
struct lock_class *class;
unsigned long nr_unused = 0, nr_uncategorized = 0,
nr_irq_safe = 0, nr_irq_unsafe = 0,
nr_softirq_safe = 0, nr_softirq_unsafe = 0,
nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
sum_forward_deps = 0, factor = 0;
list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (class->usage_mask == 0)
nr_unused++;
if (class->usage_mask == LOCKF_USED)
nr_uncategorized++;
if (class->usage_mask & LOCKF_USED_IN_IRQ)
nr_irq_safe++;
if (class->usage_mask & LOCKF_ENABLED_IRQ)
nr_irq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
nr_softirq_safe++;
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
nr_softirq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
nr_hardirq_safe++;
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
nr_hardirq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
nr_irq_read_safe++;
if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
nr_irq_read_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
nr_softirq_read_safe++;
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
nr_softirq_read_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
nr_hardirq_read_safe++;
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
nr_hardirq_read_unsafe++;
#ifdef CONFIG_PROVE_LOCKING
sum_forward_deps += lockdep_count_forward_deps(class);
#endif
}
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
nr_list_entries, MAX_LOCKDEP_ENTRIES);
seq_printf(m, " indirect dependencies: %11lu\n",
sum_forward_deps);
/*
* Total number of dependencies:
*
* All irq-safe locks may nest inside irq-unsafe locks,
* plus all the other known dependencies:
*/
seq_printf(m, " all direct dependencies: %11lu\n",
nr_irq_unsafe * nr_irq_safe +
nr_hardirq_unsafe * nr_hardirq_safe +
nr_list_entries);
/*
* Estimated factor between direct and indirect
* dependencies:
*/
if (nr_list_entries)
factor = sum_forward_deps / nr_list_entries;
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
nr_lock_chains, MAX_LOCKDEP_CHAINS);
seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
seq_printf(m, " in-hardirq chains: %11u\n",
nr_hardirq_chains);
seq_printf(m, " in-softirq chains: %11u\n",
nr_softirq_chains);
#endif
seq_printf(m, " in-process chains: %11u\n",
nr_process_chains);
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
seq_printf(m, " combined max dependencies: %11u\n",
(nr_hardirq_chains + 1) *
(nr_softirq_chains + 1) *
(nr_process_chains + 1)
);
seq_printf(m, " hardirq-safe locks: %11lu\n",
nr_hardirq_safe);
seq_printf(m, " hardirq-unsafe locks: %11lu\n",
nr_hardirq_unsafe);
seq_printf(m, " softirq-safe locks: %11lu\n",
nr_softirq_safe);
seq_printf(m, " softirq-unsafe locks: %11lu\n",
nr_softirq_unsafe);
seq_printf(m, " irq-safe locks: %11lu\n",
nr_irq_safe);
seq_printf(m, " irq-unsafe locks: %11lu\n",
nr_irq_unsafe);
seq_printf(m, " hardirq-read-safe locks: %11lu\n",
nr_hardirq_read_safe);
seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
nr_hardirq_read_unsafe);
seq_printf(m, " softirq-read-safe locks: %11lu\n",
nr_softirq_read_safe);
seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
nr_softirq_read_unsafe);
seq_printf(m, " irq-read-safe locks: %11lu\n",
nr_irq_read_safe);
seq_printf(m, " irq-read-unsafe locks: %11lu\n",
nr_irq_read_unsafe);
seq_printf(m, " uncategorized locks: %11lu\n",
nr_uncategorized);
seq_printf(m, " unused locks: %11lu\n",
nr_unused);
seq_printf(m, " max locking depth: %11u\n",
max_lockdep_depth);
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
return 0;
}
static int lockdep_stats_open(struct inode *inode, struct file *file)
{
return single_open(file, lockdep_stats_show, NULL);
}
static const struct file_operations proc_lockdep_stats_operations = {
.open = lockdep_stats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#ifdef CONFIG_LOCK_STAT
struct lock_stat_data {
struct lock_class *class;
struct lock_class_stats stats;
};
struct lock_stat_seq {
struct lock_stat_data *iter_end;
struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
};
/*
* sort on absolute number of contentions
*/
static int lock_stat_cmp(const void *l, const void *r)
{
const struct lock_stat_data *dl = l, *dr = r;
unsigned long nl, nr;
nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
return nr - nl;
}
static void seq_line(struct seq_file *m, char c, int offset, int length)
{
int i;
for (i = 0; i < offset; i++)
seq_puts(m, " ");
for (i = 0; i < length; i++)
seq_printf(m, "%c", c);
seq_puts(m, "\n");
}
static void snprint_time(char *buf, size_t bufsiz, s64 nr)
{
s64 div;
s32 rem;
nr += 5; /* for display rounding */
div = div_s64_rem(nr, 1000, &rem);
snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
}
static void seq_time(struct seq_file *m, s64 time)
{
char num[15];
snprint_time(num, sizeof(num), time);
seq_printf(m, " %14s", num);
}
static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
{
seq_printf(m, "%14lu", lt->nr);
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
{
char name[39];
struct lock_class *class;
struct lock_class_stats *stats;
int i, namelen;
class = data->class;
stats = &data->stats;
namelen = 38;
if (class->name_version > 1)
namelen -= 2; /* XXX truncates versions > 9 */
if (class->subclass)
namelen -= 2;
if (!class->name) {
char str[KSYM_NAME_LEN];
const char *key_name;
key_name = __get_key_name(class->key, str);
snprintf(name, namelen, "%s", key_name);
} else {
snprintf(name, namelen, "%s", class->name);
}
namelen = strlen(name);
if (class->name_version > 1) {
snprintf(name+namelen, 3, "#%d", class->name_version);
namelen += 2;
}
if (class->subclass) {
snprintf(name+namelen, 3, "/%d", class->subclass);
namelen += 2;
}
if (stats->write_holdtime.nr) {
if (stats->read_holdtime.nr)
seq_printf(m, "%38s-W:", name);
else
seq_printf(m, "%40s:", name);
seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]);
seq_lock_time(m, &stats->write_waittime);
seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]);
seq_lock_time(m, &stats->write_holdtime);
seq_puts(m, "\n");
}
if (stats->read_holdtime.nr) {
seq_printf(m, "%38s-R:", name);
seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]);
seq_lock_time(m, &stats->read_waittime);
seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]);
seq_lock_time(m, &stats->read_holdtime);
seq_puts(m, "\n");
}
if (stats->read_waittime.nr + stats->write_waittime.nr == 0)
return;
if (stats->read_holdtime.nr)
namelen += 2;
for (i = 0; i < LOCKSTAT_POINTS; i++) {
char sym[KSYM_SYMBOL_LEN];
char ip[32];
if (class->contention_point[i] == 0)
break;
if (!i)
seq_line(m, '-', 40-namelen, namelen);
sprint_symbol(sym, class->contention_point[i]);
snprintf(ip, sizeof(ip), "[<%p>]",
(void *)class->contention_point[i]);
seq_printf(m, "%40s %14lu %29s %s\n", name,
stats->contention_point[i],
ip, sym);
}
for (i = 0; i < LOCKSTAT_POINTS; i++) {
char sym[KSYM_SYMBOL_LEN];
char ip[32];
if (class->contending_point[i] == 0)
break;
if (!i)
seq_line(m, '-', 40-namelen, namelen);
sprint_symbol(sym, class->contending_point[i]);
snprintf(ip, sizeof(ip), "[<%p>]",
(void *)class->contending_point[i]);
seq_printf(m, "%40s %14lu %29s %s\n", name,
stats->contending_point[i],
ip, sym);
}
if (i) {
seq_puts(m, "\n");
seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
seq_puts(m, "\n");
}
}
static void seq_header(struct seq_file *m)
{
seq_printf(m, "lock_stat version 0.3\n");
if (unlikely(!debug_locks))
seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
"%14s %14s\n",
"class name",
"con-bounces",
"contentions",
"waittime-min",
"waittime-max",
"waittime-total",
"acq-bounces",
"acquisitions",
"holdtime-min",
"holdtime-max",
"holdtime-total");
seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
seq_printf(m, "\n");
}
static void *ls_start(struct seq_file *m, loff_t *pos)
{
struct lock_stat_seq *data = m->private;
struct lock_stat_data *iter;
if (*pos == 0)
return SEQ_START_TOKEN;
iter = data->stats + (*pos - 1);
if (iter >= data->iter_end)
iter = NULL;
return iter;
}
static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
{
(*pos)++;
return ls_start(m, pos);
}
static void ls_stop(struct seq_file *m, void *v)
{
}
static int ls_show(struct seq_file *m, void *v)
{
if (v == SEQ_START_TOKEN)
seq_header(m);
else
seq_stats(m, v);
return 0;
}
static const struct seq_operations lockstat_ops = {
.start = ls_start,
.next = ls_next,
.stop = ls_stop,
.show = ls_show,
};
static int lock_stat_open(struct inode *inode, struct file *file)
{
int res;
struct lock_class *class;
struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq));
if (!data)
return -ENOMEM;
res = seq_open(file, &lockstat_ops);
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
list_for_each_entry(class, &all_lock_classes, lock_entry) {
iter->class = class;
iter->stats = lock_stats(class);
iter++;
}
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
sizeof(struct lock_stat_data),
lock_stat_cmp, NULL);
m->private = data;
} else
vfree(data);
return res;
}
static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
char c;
if (count) {
if (get_user(c, buf))
return -EFAULT;
if (c != '0')
return count;
list_for_each_entry(class, &all_lock_classes, lock_entry)
clear_lock_stats(class);
}
return count;
}
static int lock_stat_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
vfree(seq->private);
return seq_release(inode, file);
}
static const struct file_operations proc_lock_stat_operations = {
.open = lock_stat_open,
.write = lock_stat_write,
.read = seq_read,
.llseek = seq_lseek,
.release = lock_stat_release,
};
#endif /* CONFIG_LOCK_STAT */
static int __init lockdep_proc_init(void)
{
proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
#ifdef CONFIG_PROVE_LOCKING
proc_create("lockdep_chains", S_IRUSR, NULL,
&proc_lockdep_chains_operations);
#endif
proc_create("lockdep_stats", S_IRUSR, NULL,
&proc_lockdep_stats_operations);
#ifdef CONFIG_LOCK_STAT
proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
&proc_lock_stat_operations);
#endif
return 0;
}
__initcall(lockdep_proc_init);
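The write handler above (lock_stat_write) only acts on a leading '0'; writing that character resets every lock class's statistics. A minimal userspace sketch of the reset-then-dump cycle, assuming CONFIG_LOCK_STAT is enabled and procfs is mounted at /proc:

/* Editorial sketch: clear and re-read /proc/lock_stat from userspace. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/lock_stat", "w");

        if (!f) {
                perror("/proc/lock_stat (write)");
                return EXIT_FAILURE;
        }
        fputc('0', f);                  /* any other character is ignored */
        fclose(f);

        f = fopen("/proc/lock_stat", "r");
        if (!f) {
                perror("/proc/lock_stat (read)");
                return EXIT_FAILURE;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* freshly reset statistics */
        fclose(f);
        return EXIT_SUCCESS;
}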

View File

@@ -0,0 +1,9 @@
/*
* Lockdep states,
*
* please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
* you add one, or come up with a nice dynamic solution.
*/
LOCKDEP_STATE(HARDIRQ)
LOCKDEP_STATE(SOFTIRQ)
LOCKDEP_STATE(RECLAIM_FS)

3423
kernel/kernel/module.c Normal file

File diff suppressed because it is too large


110
kernel/kernel/mutex-debug.c Normal file
View File

@@ -0,0 +1,110 @@
/*
* kernel/mutex-debug.c
*
* Debugging code for mutexes
*
* Started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* lock debugging, locking tree, deadlock detection started by:
*
* Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
* Released under the General Public License (GPL).
*/
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
#include "mutex-debug.h"
/*
* Must be called with lock->wait_lock held.
*/
void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
}
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
{
SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
}
void debug_mutex_free_waiter(struct mutex_waiter *waiter)
{
DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
}
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct thread_info *ti)
{
SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
/* Mark the current thread as blocked on the lock: */
ti->task->blocked_on = waiter;
}
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct thread_info *ti)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
ti->task->blocked_on = NULL;
list_del_init(&waiter->list);
waiter->task = NULL;
}
void debug_mutex_unlock(struct mutex *lock)
{
if (unlikely(!debug_locks))
return;
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
mutex_clear_owner(lock);
}
void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
#endif
lock->magic = lock;
}
/***
* mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
*
* This function marks the mutex uninitialized, and any subsequent
* use of the mutex is forbidden. The mutex must not be locked when
* this function is called.
*/
void mutex_destroy(struct mutex *lock)
{
DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
lock->magic = NULL;
}
EXPORT_SYMBOL_GPL(mutex_destroy);
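A brief, hedged sketch of the lifecycle mutex_destroy() is meant to close out, for a dynamically allocated object; the structure and function names below are illustrative, not part of this file:

/* Illustrative only: pair mutex_init() with mutex_destroy(). */
#include <linux/mutex.h>
#include <linux/slab.h>

struct demo_dev {                       /* hypothetical object */
        struct mutex lock;
        int value;
};

static struct demo_dev *demo_dev_create(void)
{
        struct demo_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

        if (dev)
                mutex_init(&dev->lock); /* before first use */
        return dev;
}

static void demo_dev_destroy(struct demo_dev *dev)
{
        mutex_destroy(&dev->lock);      /* must not be held here */
        kfree(dev);
}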

View File

@@ -0,0 +1,55 @@
/*
* Mutexes: blocking mutual exclusion locks
*
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* This file contains mutex debugging related internal declarations,
* prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
* More details are in kernel/mutex-debug.c.
*/
/*
* This must be called with lock->wait_lock held.
*/
extern void debug_mutex_lock_common(struct mutex *lock,
struct mutex_waiter *waiter);
extern void debug_mutex_wake_waiter(struct mutex *lock,
struct mutex_waiter *waiter);
extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
extern void debug_mutex_add_waiter(struct mutex *lock,
struct mutex_waiter *waiter,
struct thread_info *ti);
extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct thread_info *ti);
extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
static inline void mutex_set_owner(struct mutex *lock)
{
lock->owner = current_thread_info();
}
static inline void mutex_clear_owner(struct mutex *lock)
{
lock->owner = NULL;
}
#define spin_lock_mutex(lock, flags) \
do { \
struct mutex *l = container_of(lock, struct mutex, wait_lock); \
\
DEBUG_LOCKS_WARN_ON(in_interrupt()); \
local_irq_save(flags); \
__raw_spin_lock(&(lock)->raw_lock); \
DEBUG_LOCKS_WARN_ON(l->magic != l); \
} while (0)
#define spin_unlock_mutex(lock, flags) \
do { \
__raw_spin_unlock(&(lock)->raw_lock); \
local_irq_restore(flags); \
preempt_check_resched(); \
} while (0)

507
kernel/kernel/mutex.c Normal file
View File

@@ -0,0 +1,507 @@
/*
* kernel/mutex.c
*
* Mutexes: blocking mutual exclusion locks
*
* Started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
* David Howells for suggestions and improvements.
*
* - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
* from the -rt tree, where it was originally implemented for rtmutexes
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
* Also see Documentation/mutex-design.txt.
*/
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
* which forces all calls into the slowpath:
*/
#ifdef CONFIG_DEBUG_MUTEXES
# include "mutex-debug.h"
# include <asm-generic/mutex-null.h>
#else
# include "mutex.h"
# include <asm/mutex.h>
#endif
/***
* mutex_init - initialize the mutex
* @lock: the mutex to be initialized
* @key: the lock_class_key for the class; used by mutex lock debugging
*
* Initialize the mutex to unlocked state.
*
* It is not allowed to initialize an already locked mutex.
*/
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
atomic_set(&lock->count, 1);
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
debug_mutex_init(lock, name, key);
}
EXPORT_SYMBOL(__mutex_init);
#ifndef CONFIG_DEBUG_LOCK_ALLOC
/*
* We split the mutex lock/unlock logic into separate fastpath and
* slowpath functions, to reduce the register pressure on the fastpath.
* We also put the fastpath first in the kernel image, to make sure the
* branch is predicted by the CPU as default-untaken.
*/
static __used noinline void __sched
__mutex_lock_slowpath(atomic_t *lock_count);
/***
* mutex_lock - acquire the mutex
* @lock: the mutex to be acquired
*
* Lock the mutex exclusively for this task. If the mutex is not
* available right now, it will sleep until it can get it.
*
* The mutex must later on be released by the same task that
* acquired it. Recursive locking is not allowed. The task
* may not exit without first unlocking the mutex. Also, kernel
* memory where the mutex resides must not be freed with
* the mutex still locked. The mutex must first be initialized
* (or statically defined) before it can be locked. memset()-ing
* the mutex to 0 is not allowed.
*
* ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
* checks that will enforce the restrictions and will also do
* deadlock debugging. )
*
* This function is similar to (but not equivalent to) down().
*/
void __sched mutex_lock(struct mutex *lock)
{
might_sleep();
/*
* The locking fastpath is the 1->0 transition from
* 'unlocked' into 'locked' state.
*/
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
mutex_set_owner(lock);
}
EXPORT_SYMBOL(mutex_lock);
#endif
static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
/***
* mutex_unlock - release the mutex
* @lock: the mutex to be released
*
* Unlock a mutex that has been locked by this task previously.
*
* This function must not be used in interrupt context. Unlocking
* of a not locked mutex is not allowed.
*
* This function is similar to (but not equivalent to) up().
*/
void __sched mutex_unlock(struct mutex *lock)
{
/*
* The unlocking fastpath is the 0->1 transition from 'locked'
* into 'unlocked' state:
*/
#ifndef CONFIG_DEBUG_MUTEXES
/*
* When debugging is enabled we must not clear the owner before time,
* the slow path will always be taken, and that clears the owner field
* after verifying that it was indeed current.
*/
mutex_clear_owner(lock);
#endif
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
}
EXPORT_SYMBOL(mutex_unlock);
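/*
 * Editorial sketch, not part of the original file: the plain
 * lock/unlock pairing described in the comments above. The mutex and
 * the function below use made-up names.
 */
#if 0
static DEFINE_MUTEX(demo_lock);
static int demo_counter;

static void demo_bump(void)
{
        mutex_lock(&demo_lock);         /* may sleep; never from IRQ context */
        demo_counter++;                 /* critical section */
        mutex_unlock(&demo_lock);       /* must be released by the same task */
}
#endif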
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
static inline int __sched
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
unsigned long ip)
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned long flags;
preempt_disable();
mutex_acquire(&lock->dep_map, subclass, 0, ip);
#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
!defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
/*
* Optimistic spinning.
*
* We try to spin for acquisition when we find that there are no
* pending waiters and the lock owner is currently running on a
* (different) CPU.
*
* The rationale is that if the lock owner is running, it is likely to
* release the lock soon.
*
* Since this needs the lock owner, and this mutex implementation
* doesn't track the owner atomically in the lock field, we need to
* track it non-atomically.
*
* We can't do this for DEBUG_MUTEXES because that relies on wait_lock
* to serialize everything.
*/
for (;;) {
struct thread_info *owner;
/*
* If we own the BKL, then don't spin. The owner of
* the mutex might be waiting on us to release the BKL.
*/
if (unlikely(current->lock_depth >= 0))
break;
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
owner = ACCESS_ONCE(lock->owner);
if (owner && !mutex_spin_on_owner(lock, owner))
break;
if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
preempt_enable();
return 0;
}
/*
* When there's no owner, we might have preempted between the
* owner acquiring the lock and setting the owner field. If
* we're an RT task, that will live-lock because we won't let
* the owner complete.
*/
if (!owner && (need_resched() || rt_task(task)))
break;
/*
* The cpu_relax() call is a compiler barrier which forces
* everything in this loop to be re-loaded. We don't need
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
cpu_relax();
}
#endif
spin_lock_mutex(&lock->wait_lock, flags);
debug_mutex_lock_common(lock, &waiter);
debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
if (atomic_xchg(&lock->count, -1) == 1)
goto done;
lock_contended(&lock->dep_map, ip);
for (;;) {
/*
* Let's try to take the lock again - this is needed even if
* we get here for the first time (shortly after failing to
* acquire the lock), to make sure that we get a wakeup once
* it's unlocked. Later on, if we sleep, this is the
* operation that gives us the lock. We xchg it to -1, so
* that when we release the lock, we properly wake up the
* other waiters:
*/
if (atomic_xchg(&lock->count, -1) == 1)
break;
/*
* got a signal? (This code gets eliminated in the
* TASK_UNINTERRUPTIBLE case.)
*/
if (unlikely(signal_pending_state(state, task))) {
mutex_remove_waiter(lock, &waiter,
task_thread_info(task));
mutex_release(&lock->dep_map, 1, ip);
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
preempt_enable();
return -EINTR;
}
__set_task_state(task, state);
/* didn't get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
preempt_enable_no_resched();
schedule();
preempt_disable();
spin_lock_mutex(&lock->wait_lock, flags);
}
done:
lock_acquired(&lock->dep_map, ip);
/* got the lock - rejoice! */
mutex_remove_waiter(lock, &waiter, current_thread_info());
mutex_set_owner(lock);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
preempt_enable();
return 0;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
int __sched
mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
subclass, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
#endif
/*
* Release the lock, slowpath:
*/
static inline void
__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
unsigned long flags;
spin_lock_mutex(&lock->wait_lock, flags);
mutex_release(&lock->dep_map, nested, _RET_IP_);
debug_mutex_unlock(lock);
/*
* some architectures leave the lock unlocked in the fastpath failure
* case, others need to leave it locked. In the latter case we have to
* unlock it here
*/
if (__mutex_slowpath_needs_to_unlock())
atomic_set(&lock->count, 1);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
struct mutex_waiter *waiter =
list_entry(lock->wait_list.next,
struct mutex_waiter, list);
debug_mutex_wake_waiter(lock, waiter);
wake_up_process(waiter->task);
}
spin_unlock_mutex(&lock->wait_lock, flags);
}
/*
* Release the lock, slowpath:
*/
static __used noinline void
__mutex_unlock_slowpath(atomic_t *lock_count)
{
__mutex_unlock_common_slowpath(lock_count, 1);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
/*
* Here come the less common (and hence less performance-critical) APIs:
* mutex_lock_interruptible() and mutex_trylock().
*/
static noinline int __sched
__mutex_lock_killable_slowpath(atomic_t *lock_count);
static noinline int __sched
__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
/***
* mutex_lock_interruptible - acquire the mutex, interruptible
* @lock: the mutex to be acquired
*
* Lock the mutex like mutex_lock(), and return 0 if the mutex has
* been acquired or sleep until the mutex becomes available. If a
* signal arrives while waiting for the lock then this function
* returns -EINTR.
*
* This function is similar to (but not equivalent to) down_interruptible().
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
int ret;
might_sleep();
ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_interruptible_slowpath);
if (!ret)
mutex_set_owner(lock);
return ret;
}
EXPORT_SYMBOL(mutex_lock_interruptible);
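/*
 * Editorial sketch, not part of the original file: the usual way a
 * syscall path handles the -EINTR that mutex_lock_interruptible() can
 * return when a signal arrives while sleeping. Names are illustrative.
 */
#if 0
static int demo_op(struct mutex *lock)
{
        if (mutex_lock_interruptible(lock))
                return -ERESTARTSYS;    /* back out and let the signal run */
        /* ... protected work ... */
        mutex_unlock(lock);
        return 0;
}
#endif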
int __sched mutex_lock_killable(struct mutex *lock)
{
int ret;
might_sleep();
ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_killable_slowpath);
if (!ret)
mutex_set_owner(lock);
return ret;
}
EXPORT_SYMBOL(mutex_lock_killable);
static __used noinline void __sched
__mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
}
static noinline int __sched
__mutex_lock_killable_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
}
static noinline int __sched
__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
}
#endif
/*
* Spinlock based trylock, we take the spinlock and check whether we
* can get the lock:
*/
static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
unsigned long flags;
int prev;
spin_lock_mutex(&lock->wait_lock, flags);
prev = atomic_xchg(&lock->count, -1);
if (likely(prev == 1)) {
mutex_set_owner(lock);
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
}
/* Set it back to 0 if there are no waiters: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
spin_unlock_mutex(&lock->wait_lock, flags);
return prev == 1;
}
/***
* mutex_trylock - try acquire the mutex, without waiting
* @lock: the mutex to be acquired
*
* Try to acquire the mutex atomically. Returns 1 if the mutex
* has been acquired successfully, and 0 on contention.
*
* NOTE: this function follows the spin_trylock() convention, so
* its return value is the inverse of down_trylock()'s! Be careful
* about this when converting semaphore users to mutexes.
*
* This function must not be used in interrupt context. The
* mutex must be released by the same task that acquired it.
*/
int __sched mutex_trylock(struct mutex *lock)
{
int ret;
ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
if (ret)
mutex_set_owner(lock);
return ret;
}
EXPORT_SYMBOL(mutex_trylock);
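/*
 * Editorial sketch, not part of the original file: mutex_trylock()
 * returns 1 on success and 0 on contention (the opposite sense of
 * down_trylock()), so the success path is the non-zero branch.
 */
#if 0
static int demo_try_fast_path(struct mutex *lock)
{
        if (!mutex_trylock(lock))
                return 0;               /* contended: take a slow path instead */
        /* ... short, non-blocking work ... */
        mutex_unlock(lock);
        return 1;
}
#endif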
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
* @cnt: the atomic counter which we are to decrement
* @lock: the mutex to return holding if we dec to 0
*
* return true and hold the lock if we decremented to 0, return false otherwise
*/
int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
{
/* dec if we can't possibly hit 0 */
if (atomic_add_unless(cnt, -1, 1))
return 0;
/* we might hit 0, so take the lock */
mutex_lock(lock);
if (!atomic_dec_and_test(cnt)) {
/* when we actually did the dec, we didn't hit 0 */
mutex_unlock(lock);
return 0;
}
/* we hit 0, and we hold the lock */
return 1;
}
EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
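A typical caller of atomic_dec_and_mutex_lock() is dropping the last reference to an object whose teardown must run under a list mutex; a hedged sketch with made-up names:

/* Illustrative only: free a refcounted object on its final put. */
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct demo_obj {                       /* hypothetical object */
        atomic_t refcount;
        struct list_head node;
};

static DEFINE_MUTEX(demo_list_lock);

static void demo_put(struct demo_obj *obj)
{
        /* Returns 1 only when the count hits 0; the mutex is then held. */
        if (!atomic_dec_and_mutex_lock(&obj->refcount, &demo_list_lock))
                return;
        list_del(&obj->node);
        mutex_unlock(&demo_list_lock);
        kfree(obj);
}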

48
kernel/kernel/mutex.h Normal file
View File

@@ -0,0 +1,48 @@
/*
* Mutexes: blocking mutual exclusion locks
*
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* This file contains mutex debugging related internal prototypes, for the
* !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
#define spin_lock_mutex(lock, flags) \
do { spin_lock(lock); (void)(flags); } while (0)
#define spin_unlock_mutex(lock, flags) \
do { spin_unlock(lock); (void)(flags); } while (0)
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
#ifdef CONFIG_SMP
static inline void mutex_set_owner(struct mutex *lock)
{
lock->owner = current_thread_info();
}
static inline void mutex_clear_owner(struct mutex *lock)
{
lock->owner = NULL;
}
#else
static inline void mutex_set_owner(struct mutex *lock)
{
}
static inline void mutex_clear_owner(struct mutex *lock)
{
}
#endif
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
#define debug_mutex_unlock(lock) do { } while (0)
#define debug_mutex_init(lock, name, key) do { } while (0)
static inline void
debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
}

586
kernel/kernel/notifier.c Normal file
View File

@@ -0,0 +1,586 @@
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
/*
* Notifier list for kernel code which wants to be called
* at shutdown. This is used to stop any idling DMA operations
* and the like.
*/
BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
/*
* Notifier chain core routines. The exported routines below
* are layered on top of these, with appropriate locking added.
*/
static int notifier_chain_register(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
if (n->priority > (*nl)->priority)
break;
nl = &((*nl)->next);
}
n->next = *nl;
rcu_assign_pointer(*nl, n);
return 0;
}
static int notifier_chain_cond_register(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
if ((*nl) == n)
return 0;
if (n->priority > (*nl)->priority)
break;
nl = &((*nl)->next);
}
n->next = *nl;
rcu_assign_pointer(*nl, n);
return 0;
}
static int notifier_chain_unregister(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
if ((*nl) == n) {
rcu_assign_pointer(*nl, n->next);
return 0;
}
nl = &((*nl)->next);
}
return -ENOENT;
}
/**
* notifier_call_chain - Informs the registered notifiers about an event.
* @nl: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
* @nr_to_call: Number of notifier functions to be called; pass -1
* to call every function on the chain.
* @nr_calls: Records the number of notifications sent; pass NULL
* if the count is not needed.
* @returns: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int __kprobes notifier_call_chain(struct notifier_block **nl,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
nb = rcu_dereference(*nl);
while (nb && nr_to_call) {
next_nb = rcu_dereference(nb->next);
#ifdef CONFIG_DEBUG_NOTIFIERS
if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
WARN(1, "Invalid notifier called!");
nb = next_nb;
continue;
}
#endif
ret = nb->notifier_call(nb, val, v);
if (nr_calls)
(*nr_calls)++;
if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
break;
nb = next_nb;
nr_to_call--;
}
return ret;
}
/*
* Atomic notifier chain routines. Registration and unregistration
* use a spinlock, and call_chain is synchronized by RCU (no locks).
*/
/**
* atomic_notifier_chain_register - Add notifier to an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to an atomic notifier chain.
*
* Currently always returns zero.
*/
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
struct notifier_block *n)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_register(&nh->head, n);
spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
/**
* atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @n: Entry to remove from notifier chain
*
* Removes a notifier from an atomic notifier chain.
*
* Returns zero on success or %-ENOENT on failure.
*/
int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
struct notifier_block *n)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_unregister(&nh->head, n);
spin_unlock_irqrestore(&nh->lock, flags);
synchronize_rcu();
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
/**
* __atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
* @nr_to_call: See the comment for notifier_call_chain.
* @nr_calls: See the comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in an atomic context, so they must not block.
* This routine uses RCU to synchronize with changes to the chain.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
int ret;
rcu_read_lock();
ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v)
{
return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
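/*
 * Editorial sketch, not part of the original file: one head, one
 * callback, one event on an atomic chain. All names are made up.
 */
#if 0
static int demo_event(struct notifier_block *nb, unsigned long val, void *data)
{
        pr_info("demo notifier: event %lu\n", val);
        return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
        .notifier_call  = demo_event,
        .priority       = 0,    /* higher priorities are called earlier */
};

static ATOMIC_NOTIFIER_HEAD(demo_chain);

static void demo_fire(void)
{
        atomic_notifier_chain_register(&demo_chain, &demo_nb);
        atomic_notifier_call_chain(&demo_chain, 1UL, NULL);
        atomic_notifier_chain_unregister(&demo_chain, &demo_nb);
}
#endif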
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
*/
/**
* blocking_notifier_chain_register - Add notifier to a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to a blocking notifier chain.
* Must be called in process context.
*
* Currently always returns zero.
*/
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
struct notifier_block *n)
{
int ret;
/*
* This code gets used during boot-up, when task switching is
* not yet working and interrupts must remain disabled. At
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
return notifier_chain_register(&nh->head, n);
down_write(&nh->rwsem);
ret = notifier_chain_register(&nh->head, n);
up_write(&nh->rwsem);
return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
* blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to a blocking notifier chain, only if not already
* present in the chain.
* Must be called in process context.
*
* Currently always returns zero.
*/
int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
struct notifier_block *n)
{
int ret;
down_write(&nh->rwsem);
ret = notifier_chain_cond_register(&nh->head, n);
up_write(&nh->rwsem);
return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: Entry to remove from notifier chain
*
* Removes a notifier from a blocking notifier chain.
* Must be called from process context.
*
* Returns zero on success or %-ENOENT on failure.
*/
int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
struct notifier_block *n)
{
int ret;
/*
* This code gets used during boot-up, when task switching is
* not yet working and interrupts must remain disabled. At
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
return notifier_chain_unregister(&nh->head, n);
down_write(&nh->rwsem);
ret = notifier_chain_unregister(&nh->head, n);
up_write(&nh->rwsem);
return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
/**
* __blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
* @nr_to_call: See comment for notifier_call_chain.
* @nr_calls: See comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
/*
* We check the head outside the lock, but if this access is
* racy then it does not matter what the result of the test
* is, we re-check the list after having taken the lock anyway:
*/
if (rcu_dereference(nh->head)) {
down_read(&nh->rwsem);
ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
nr_calls);
up_read(&nh->rwsem);
}
return ret;
}
EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
unsigned long val, void *v)
{
return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
/*
* Raw notifier chain routines. There is no protection;
* the caller must provide it. Use at your own risk!
*/
/**
* raw_notifier_chain_register - Add notifier to a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to a raw notifier chain.
* All locking must be provided by the caller.
*
* Currently always returns zero.
*/
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
{
return notifier_chain_register(&nh->head, n);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
/**
* raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @n: Entry to remove from notifier chain
*
* Removes a notifier from a raw notifier chain.
* All locking must be provided by the caller.
*
* Returns zero on success or %-ENOENT on failure.
*/
int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
struct notifier_block *n)
{
return notifier_chain_unregister(&nh->head, n);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
/**
* __raw_notifier_call_chain - Call functions in a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
* @nr_to_call: See comment for notifier_call_chain.
* @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in an undefined context.
* All locking must be provided by the caller.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int __raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
}
EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v)
{
return __raw_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
*/
/**
* srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to an SRCU notifier chain.
* Must be called in process context.
*
* Currently always returns zero.
*/
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *n)
{
int ret;
/*
* This code gets used during boot-up, when task switching is
* not yet working and interrupts must remain disabled. At
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
return notifier_chain_register(&nh->head, n);
mutex_lock(&nh->mutex);
ret = notifier_chain_register(&nh->head, n);
mutex_unlock(&nh->mutex);
return ret;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
/**
* srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @n: Entry to remove from notifier chain
*
* Removes a notifier from an SRCU notifier chain.
* Must be called from process context.
*
* Returns zero on success or %-ENOENT on failure.
*/
int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
struct notifier_block *n)
{
int ret;
/*
* This code gets used during boot-up, when task switching is
* not yet working and interrupts must remain disabled. At
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
return notifier_chain_unregister(&nh->head, n);
mutex_lock(&nh->mutex);
ret = notifier_chain_unregister(&nh->head, n);
mutex_unlock(&nh->mutex);
synchronize_srcu(&nh->srcu);
return ret;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
/**
* __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
* @nr_to_call: See comment for notifier_call_chain.
* @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
int ret;
int idx;
idx = srcu_read_lock(&nh->srcu);
ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
srcu_read_unlock(&nh->srcu, idx);
return ret;
}
EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
unsigned long val, void *v)
{
return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
/**
* srcu_init_notifier_head - Initialize an SRCU notifier head
* @nh: Pointer to head of the srcu notifier chain
*
* Unlike other sorts of notifier heads, SRCU notifier heads require
* dynamic initialization. Be sure to call this routine before
* calling any of the other SRCU notifier routines for this head.
*
* If an SRCU notifier head is deallocated, it must first be cleaned
* up by calling srcu_cleanup_notifier_head(). Otherwise the head's
* per-cpu data (used by the SRCU mechanism) will leak.
*/
void srcu_init_notifier_head(struct srcu_notifier_head *nh)
{
mutex_init(&nh->mutex);
if (init_srcu_struct(&nh->srcu) < 0)
BUG();
nh->head = NULL;
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
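/*
 * Editorial sketch, not part of the original file: the dynamic
 * init/cleanup pairing the comment above requires for SRCU heads.
 * The names are made up.
 */
#if 0
static struct srcu_notifier_head demo_srcu_chain;

static int __init demo_srcu_setup(void)
{
        srcu_init_notifier_head(&demo_srcu_chain);
        return 0;
}

static void __exit demo_srcu_teardown(void)
{
        /* Required before the head goes away, or per-cpu SRCU data leaks. */
        srcu_cleanup_notifier_head(&demo_srcu_chain);
}
#endif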
/**
* register_reboot_notifier - Register function to be called at reboot time
* @nb: Info about notifier function to be called
*
* Registers a function with the list of functions
* to be called at reboot time.
*
* Currently always returns zero, as blocking_notifier_chain_register()
* always returns zero.
*/
int register_reboot_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&reboot_notifier_list, nb);
}
EXPORT_SYMBOL(register_reboot_notifier);
/**
* unregister_reboot_notifier - Unregister previously registered reboot notifier
* @nb: Hook to be unregistered
*
* Unregisters a previously registered reboot
* notifier function.
*
* Returns zero on success, or %-ENOENT on failure.
*/
int unregister_reboot_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
}
EXPORT_SYMBOL(unregister_reboot_notifier);
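/*
 * Editorial sketch, not part of the original file: a module-style
 * reboot hook on the blocking chain above. Names are made up.
 */
#if 0
static int demo_reboot_notify(struct notifier_block *nb,
                              unsigned long action, void *data)
{
        pr_info("demo: system going down (action=%lu)\n", action);
        return NOTIFY_DONE;
}

static struct notifier_block demo_reboot_nb = {
        .notifier_call = demo_reboot_notify,
};

static int __init demo_reboot_init(void)
{
        return register_reboot_notifier(&demo_reboot_nb);
}

static void __exit demo_reboot_exit(void)
{
        unregister_reboot_notifier(&demo_reboot_nb);
}
#endif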
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
.regs = regs,
.str = str,
.err = err,
.trapnr = trap,
.signr = sig,
};
return atomic_notifier_call_chain(&die_chain, val, &args);
}
int register_die_notifier(struct notifier_block *nb)
{
vmalloc_sync_all();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
int unregister_die_notifier(struct notifier_block *nb)
{
return atomic_notifier_chain_unregister(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);

110
kernel/kernel/ns_cgroup.c Normal file
View File

@@ -0,0 +1,110 @@
/*
* ns_cgroup.c - namespace cgroup subsystem
*
* Copyright 2006, 2007 IBM Corp
*/
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>
struct ns_cgroup {
struct cgroup_subsys_state css;
};
struct cgroup_subsys ns_subsys;
static inline struct ns_cgroup *cgroup_to_ns(
struct cgroup *cgroup)
{
return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
struct ns_cgroup, css);
}
int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
{
char name[PROC_NUMBUF];
snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
return cgroup_clone(task, &ns_subsys, name);
}
/*
* Rules:
* 1. you can only enter a cgroup which is a descendant of your current
* cgroup
* 2. you can only place another process into a cgroup if
* a. you have CAP_SYS_ADMIN
* b. your cgroup is an ancestor of task's destination cgroup
* (hence either you are in the same cgroup as task, or in an
* ancestor cgroup thereof)
*/
static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
struct task_struct *task, bool threadgroup)
{
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!cgroup_is_descendant(new_cgroup, current))
return -EPERM;
}
if (!cgroup_is_descendant(new_cgroup, task))
return -EPERM;
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
if (!cgroup_is_descendant(new_cgroup, c)) {
rcu_read_unlock();
return -EPERM;
}
}
rcu_read_unlock();
}
return 0;
}
/*
* Rules: you can only create a cgroup if
* 1. you are capable(CAP_SYS_ADMIN)
* 2. the target cgroup is a descendant of your own cgroup
*/
static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
struct cgroup *cgroup)
{
struct ns_cgroup *ns_cgroup;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (!cgroup_is_descendant(cgroup, current))
return ERR_PTR(-EPERM);
ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
if (!ns_cgroup)
return ERR_PTR(-ENOMEM);
return &ns_cgroup->css;
}
static void ns_destroy(struct cgroup_subsys *ss,
struct cgroup *cgroup)
{
struct ns_cgroup *ns_cgroup;
ns_cgroup = cgroup_to_ns(cgroup);
kfree(ns_cgroup);
}
struct cgroup_subsys ns_subsys = {
.name = "ns",
.can_attach = ns_can_attach,
.create = ns_create,
.destroy = ns_destroy,
.subsys_id = ns_subsys_id,
};

230
kernel/kernel/nsproxy.c Normal file
View File

@@ -0,0 +1,230 @@
/*
* Copyright (C) 2006 IBM Corporation
*
* Author: Serge Hallyn <serue@us.ibm.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, version 2 of the
* License.
*
* Jun 2006 - namespaces support
* OpenVZ, SWsoft Inc.
* Pavel Emelianov <xemul@openvz.org>
*/
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/init_task.h>
#include <linux/mnt_namespace.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
static inline struct nsproxy *create_nsproxy(void)
{
struct nsproxy *nsproxy;
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
if (nsproxy)
atomic_set(&nsproxy->count, 1);
return nsproxy;
}
/*
* Create a new nsproxy and all of its associated namespaces.
* Return the newly created nsproxy. Do not attach this to the task,
* leave it to the caller to do proper locking and attach it to task.
*/
static struct nsproxy *create_new_namespaces(unsigned long flags,
struct task_struct *tsk, struct fs_struct *new_fs)
{
struct nsproxy *new_nsp;
int err;
new_nsp = create_nsproxy();
if (!new_nsp)
return ERR_PTR(-ENOMEM);
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
}
new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
}
new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
}
new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
if (IS_ERR(new_nsp->pid_ns)) {
err = PTR_ERR(new_nsp->pid_ns);
goto out_pid;
}
new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
goto out_net;
}
return new_nsp;
out_net:
if (new_nsp->pid_ns)
put_pid_ns(new_nsp->pid_ns);
out_pid:
if (new_nsp->ipc_ns)
put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
if (new_nsp->uts_ns)
put_uts_ns(new_nsp->uts_ns);
out_uts:
if (new_nsp->mnt_ns)
put_mnt_ns(new_nsp->mnt_ns);
out_ns:
kmem_cache_free(nsproxy_cachep, new_nsp);
return ERR_PTR(err);
}
/*
* Called from clone. This now handles copy for nsproxy and all
* namespaces therein.
*/
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct nsproxy *new_ns;
int err = 0;
if (!old_ns)
return 0;
get_nsproxy(old_ns);
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET)))
return 0;
if (!capable(CAP_SYS_ADMIN)) {
err = -EPERM;
goto out;
}
/*
* CLONE_NEWIPC must detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
* namespace are unreachable. In clone parlance, CLONE_SYSVSEM
* means share undolist with parent, so we must forbid using
* it along with CLONE_NEWIPC.
*/
if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
err = -EINVAL;
goto out;
}
new_ns = create_new_namespaces(flags, tsk, tsk->fs);
if (IS_ERR(new_ns)) {
err = PTR_ERR(new_ns);
goto out;
}
tsk->nsproxy = new_ns;
out:
put_nsproxy(old_ns);
return err;
}
void free_nsproxy(struct nsproxy *ns)
{
if (ns->mnt_ns)
put_mnt_ns(ns->mnt_ns);
if (ns->uts_ns)
put_uts_ns(ns->uts_ns);
if (ns->ipc_ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns)
put_pid_ns(ns->pid_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
/*
* Called from unshare. Unshare all the namespaces part of nsproxy.
* On success, returns the new nsproxy.
*/
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
struct nsproxy **new_nsp, struct fs_struct *new_fs)
{
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWNET)))
return 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
*new_nsp = create_new_namespaces(unshare_flags, current,
new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);
goto out;
}
err = ns_cgroup_clone(current, task_pid(current));
if (err)
put_nsproxy(*new_nsp);
out:
return err;
}
void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
{
struct nsproxy *ns;
might_sleep();
ns = p->nsproxy;
rcu_assign_pointer(p->nsproxy, new);
if (ns && atomic_dec_and_test(&ns->count)) {
/*
* wait for others to get what they want from this nsproxy.
*
* cannot release this nsproxy via the call_rcu() since
* put_mnt_ns() will want to sleep
*/
synchronize_rcu();
free_nsproxy(ns);
}
}
void exit_task_namespaces(struct task_struct *p)
{
switch_task_namespaces(p, NULL);
}
static int __init nsproxy_cache_init(void)
{
nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
return 0;
}
module_init(nsproxy_cache_init);
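The CLONE_NEW* flags handled in copy_namespaces() and unshare_nsproxy_namespaces() are the same ones userspace passes to clone(2) and unshare(2). A hedged userspace sketch, assuming root (CAP_SYS_ADMIN) and a libc that exposes unshare():

/* Illustrative only: detach into a private UTS namespace. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        if (unshare(CLONE_NEWUTS) != 0) {
                perror("unshare(CLONE_NEWUTS)");
                return 1;
        }
        /* Only this process and its future children see the new name. */
        if (sethostname("sandbox", strlen("sandbox")) != 0) {
                perror("sethostname");
                return 1;
        }
        printf("hostname changed inside a private UTS namespace\n");
        return 0;
}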

403
kernel/kernel/panic.c Normal file
View File

@@ -0,0 +1,403 @@
/*
* linux/kernel/panic.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* This function is used through-out the kernel (including mm and fs)
* to indicate a major problem.
*/
#include <linux/debug_locks.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/dmi.h>
int panic_on_oops;
static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
int panic_timeout;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
static long no_blink(long time)
{
return 0;
}
/* Returns how long it waited in ms */
long (*panic_blink)(long time);
EXPORT_SYMBOL(panic_blink);
/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups.
*
* This function never returns.
*/
NORET_TYPE void panic(const char * fmt, ...)
{
static char buf[1024];
va_list args;
long i;
/*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
*/
preempt_disable();
bust_spinlocks(1);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
dump_stack();
#endif
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
* Do we want to call this before we try to display a message?
*/
crash_kexec(NULL);
/*
* Note smp_send_stop is the usual smp shutdown function, which
* unfortunately means it may not be hardened to work in a panic
* situation.
*/
smp_send_stop();
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
bust_spinlocks(0);
if (!panic_blink)
panic_blink = no_blink;
if (panic_timeout > 0) {
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
for (i = 0; i < panic_timeout*1000; ) {
touch_nmi_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
emergency_restart();
}
#ifdef __sparc__
{
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
}
#endif
#if defined(CONFIG_S390)
{
unsigned long caller;
caller = (unsigned long)__builtin_return_address(0);
disabled_wait(caller);
}
#endif
local_irq_enable();
for (i = 0; ; ) {
touch_softlockup_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
}
EXPORT_SYMBOL(panic);
struct tnt {
u8 bit;
char true;
char false;
};
static const struct tnt tnts[] = {
{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
{ TAINT_FORCED_MODULE, 'F', ' ' },
{ TAINT_UNSAFE_SMP, 'S', ' ' },
{ TAINT_FORCED_RMMOD, 'R', ' ' },
{ TAINT_MACHINE_CHECK, 'M', ' ' },
{ TAINT_BAD_PAGE, 'B', ' ' },
{ TAINT_USER, 'U', ' ' },
{ TAINT_DIE, 'D', ' ' },
{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
{ TAINT_WARN, 'W', ' ' },
{ TAINT_CRAP, 'C', ' ' },
};
/**
* print_tainted - return a string to represent the kernel taint state.
*
* 'P' - Proprietary module has been loaded.
* 'F' - Module has been forcibly loaded.
* 'S' - SMP with CPUs not designed for SMP.
* 'R' - User forced a module unload.
* 'M' - System experienced a machine check exception.
* 'B' - System has hit bad_page.
* 'U' - Userspace-defined naughtiness.
* 'D' - Kernel has oopsed before.
* 'A' - ACPI table overridden.
* 'W' - Taint on warning.
* 'C' - Modules from drivers/staging are loaded.
*
* The string is overwritten by the next call to print_tainted().
*/
const char *print_tainted(void)
{
static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
if (tainted_mask) {
char *s;
int i;
s = buf + sprintf(buf, "Tainted: ");
for (i = 0; i < ARRAY_SIZE(tnts); i++) {
const struct tnt *t = &tnts[i];
*s++ = test_bit(t->bit, &tainted_mask) ?
t->true : t->false;
}
*s = 0;
} else
snprintf(buf, sizeof(buf), "Not tainted");
return buf;
}
int test_taint(unsigned flag)
{
return test_bit(flag, &tainted_mask);
}
EXPORT_SYMBOL(test_taint);
unsigned long get_taint(void)
{
return tainted_mask;
}
void add_taint(unsigned flag)
{
/*
* Can't trust the integrity of the kernel anymore.
* We don't call directly debug_locks_off() because the issue
* is not necessarily serious enough to set oops_in_progress to 1
* Also we want to keep up lockdep for staging development and
* post-warning case.
*/
if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
}
EXPORT_SYMBOL(add_taint);
static void spin_msec(int msecs)
{
int i;
for (i = 0; i < msecs; i++) {
touch_nmi_watchdog();
mdelay(1);
}
}
/*
* It just happens that oops_enter() and oops_exit() are identically
* implemented...
*/
static void do_oops_enter_exit(void)
{
unsigned long flags;
static int spin_counter;
if (!pause_on_oops)
return;
spin_lock_irqsave(&pause_on_oops_lock, flags);
if (pause_on_oops_flag == 0) {
/* This CPU may now print the oops message */
pause_on_oops_flag = 1;
} else {
/* We need to stall this CPU */
if (!spin_counter) {
/* This CPU gets to do the counting */
spin_counter = pause_on_oops;
do {
spin_unlock(&pause_on_oops_lock);
spin_msec(MSEC_PER_SEC);
spin_lock(&pause_on_oops_lock);
} while (--spin_counter);
pause_on_oops_flag = 0;
} else {
/* This CPU waits for a different one */
while (spin_counter) {
spin_unlock(&pause_on_oops_lock);
spin_msec(1);
spin_lock(&pause_on_oops_lock);
}
}
}
spin_unlock_irqrestore(&pause_on_oops_lock, flags);
}
/*
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
int oops_may_print(void)
{
return pause_on_oops_flag == 0;
}
/*
* Called when the architecture enters its oops handler, before it prints
* anything. If this is the first CPU to oops, and it's oopsing the first
* time then let it proceed.
*
* This is all enabled by the pause_on_oops kernel boot option. We do all
* this to ensure that oopses don't scroll off the screen. It has the
* side-effect of preventing later-oopsing CPUs from mucking up the display,
* too.
*
* It turns out that the CPU which is allowed to print ends up pausing for
* the right duration, whereas all the other CPUs pause for twice as long:
* once in oops_enter(), once in oops_exit().
*/
void oops_enter(void)
{
tracing_off();
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
do_oops_enter_exit();
}
/*
* 64-bit random ID for oopses:
*/
static u64 oops_id;
static int init_oops_id(void)
{
if (!oops_id)
get_random_bytes(&oops_id, sizeof(oops_id));
else
oops_id++;
return 0;
}
late_initcall(init_oops_id);
static void print_oops_end_marker(void)
{
init_oops_id();
printk(KERN_WARNING "---[ end trace %016llx ]---\n",
(unsigned long long)oops_id);
}
/*
* Called when the architecture exits its oops handler, after printing
* everything.
*/
void oops_exit(void)
{
do_oops_enter_exit();
print_oops_end_marker();
}
#ifdef WANT_WARN_ON_SLOWPATH
struct slowpath_args {
const char *fmt;
va_list args;
};
static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args)
{
const char *board;
printk(KERN_WARNING "------------[ cut here ]------------\n");
printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
board = dmi_get_system_info(DMI_PRODUCT_NAME);
if (board)
printk(KERN_WARNING "Hardware name: %s\n", board);
if (args)
vprintk(args->fmt, args->args);
print_modules();
dump_stack();
print_oops_end_marker();
add_taint(TAINT_WARN);
}
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
{
struct slowpath_args args;
args.fmt = fmt;
va_start(args.args, fmt);
warn_slowpath_common(file, line, __builtin_return_address(0), &args);
va_end(args.args);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
void warn_slowpath_null(const char *file, int line)
{
warn_slowpath_common(file, line, __builtin_return_address(0), NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
#endif
#ifdef CONFIG_CC_STACKPROTECTOR
/*
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
void __stack_chk_fail(void)
{
panic("stack-protector: Kernel stack is corrupted in: %p\n",
__builtin_return_address(0));
}
EXPORT_SYMBOL(__stack_chk_fail);
#endif
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);

797
kernel/kernel/params.c Normal file
View File

@@ -0,0 +1,797 @@
/* Helpers for initial module or kernel cmdline parsing
Copyright (C) 2001 Rusty Russell.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#if 0
#define DEBUGP printk
#else
#define DEBUGP(fmt, a...)
#endif
static inline char dash2underscore(char c)
{
if (c == '-')
return '_';
return c;
}
static inline int parameq(const char *input, const char *paramname)
{
unsigned int i;
for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
if (input[i] == '\0')
return 1;
return 0;
}
static int parse_one(char *param,
char *val,
struct kernel_param *params,
unsigned num_params,
int (*handle_unknown)(char *param, char *val))
{
unsigned int i;
/* Find parameter */
for (i = 0; i < num_params; i++) {
if (parameq(param, params[i].name)) {
DEBUGP("They are equal! Calling %p\n",
params[i].set);
return params[i].set(val, &params[i]);
}
}
if (handle_unknown) {
DEBUGP("Unknown argument: calling %p\n", handle_unknown);
return handle_unknown(param, val);
}
DEBUGP("Unknown argument `%s'\n", param);
return -ENOENT;
}
/* You can use " around spaces, but can't escape ". */
/* Hyphens and underscores equivalent in parameter names. */
static char *next_arg(char *args, char **param, char **val)
{
unsigned int i, equals = 0;
int in_quote = 0, quoted = 0;
char *next;
if (*args == '"') {
args++;
in_quote = 1;
quoted = 1;
}
for (i = 0; args[i]; i++) {
if (isspace(args[i]) && !in_quote)
break;
if (equals == 0) {
if (args[i] == '=')
equals = i;
}
if (args[i] == '"')
in_quote = !in_quote;
}
*param = args;
if (!equals)
*val = NULL;
else {
args[equals] = '\0';
*val = args + equals + 1;
/* Don't include quotes in value. */
if (**val == '"') {
(*val)++;
if (args[i-1] == '"')
args[i-1] = '\0';
}
if (quoted && args[i-1] == '"')
args[i-1] = '\0';
}
if (args[i]) {
args[i] = '\0';
next = args + i + 1;
} else
next = args + i;
/* Chew up trailing spaces. */
while (isspace(*next))
next++;
return next;
}
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
int parse_args(const char *name,
char *args,
struct kernel_param *params,
unsigned num,
int (*unknown)(char *param, char *val))
{
char *param, *val;
DEBUGP("Parsing ARGS: %s\n", args);
/* Chew leading spaces */
while (isspace(*args))
args++;
while (*args) {
int ret;
int irq_was_disabled;
args = next_arg(args, &param, &val);
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, params, num, unknown);
if (irq_was_disabled && !irqs_disabled()) {
printk(KERN_WARNING "parse_args(): option '%s' enabled "
"irq's!\n", param);
}
switch (ret) {
case -ENOENT:
printk(KERN_ERR "%s: Unknown parameter `%s'\n",
name, param);
return ret;
case -ENOSPC:
printk(KERN_ERR
"%s: `%s' too large for parameter `%s'\n",
name, val ?: "", param);
return ret;
case 0:
break;
default:
printk(KERN_ERR
"%s: `%s' invalid for parameter `%s'\n",
name, val ?: "", param);
return ret;
}
}
/* All parsed OK. */
return 0;
}
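/*
 * Illustrative sketch, not part of the original file: feeding a writable
 * command line such as   foo=bar,bar2 baz="fuz wiz"   through parse_args()
 * with a small kernel_param table. All "example" names are hypothetical;
 * quoting and hyphen/underscore handling follow next_arg()/parameq() above.
 */
#if 0
static int example_show_param(const char *val, struct kernel_param *kp)
{
	printk(KERN_INFO "%s = %s\n", kp->name, val ? val : "(set)");
	return 0;
}

static struct kernel_param example_params[] = {
	{ .name = "foo", .set = example_show_param },
	{ .name = "baz", .set = example_show_param },
};

static void example_parse(char *cmdline)	/* cmdline is modified in place */
{
	parse_args("example", cmdline, example_params,
		   ARRAY_SIZE(example_params), NULL);
}
#endif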
/* Lazy bastard, eh? */
#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
int param_set_##name(const char *val, struct kernel_param *kp) \
{ \
tmptype l; \
int ret; \
\
if (!val) return -EINVAL; \
ret = strtolfn(val, 0, &l); \
if (ret == -EINVAL || ((type)l != l)) \
return -EINVAL; \
*((type *)kp->arg) = l; \
return 0; \
} \
int param_get_##name(char *buffer, struct kernel_param *kp) \
{ \
return sprintf(buffer, format, *((type *)kp->arg)); \
}
STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
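/*
 * Illustrative sketch, not part of the original file: a module-level int
 * parameter reaches param_set_int()/param_get_int() above through the
 * module_param() macro. "example_threshold" is a hypothetical name.
 */
#if 0
static int example_threshold = 10;
module_param(example_threshold, int, 0644);
MODULE_PARM_DESC(example_threshold, "Hypothetical threshold handled by param_set_int()");
#endif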
int param_set_charp(const char *val, struct kernel_param *kp)
{
if (!val) {
printk(KERN_ERR "%s: string parameter expected\n",
kp->name);
return -EINVAL;
}
if (strlen(val) > 1024) {
printk(KERN_ERR "%s: string parameter too long\n",
kp->name);
return -ENOSPC;
}
/* This is a hack. We can't kstrdup() in early boot, and we
* don't need to; this mangled commandline is preserved. */
if (slab_is_available()) {
*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
if (!*(char **)kp->arg)
return -ENOMEM;
} else
*(const char **)kp->arg = val;
return 0;
}
int param_get_charp(char *buffer, struct kernel_param *kp)
{
return sprintf(buffer, "%s", *((char **)kp->arg));
}
/* Actually could be a bool or an int, for historical reasons. */
int param_set_bool(const char *val, struct kernel_param *kp)
{
bool v;
/* No equals means "set"... */
if (!val) val = "1";
/* One of =[yYnN01] */
switch (val[0]) {
case 'y': case 'Y': case '1':
v = true;
break;
case 'n': case 'N': case '0':
v = false;
break;
default:
return -EINVAL;
}
if (kp->flags & KPARAM_ISBOOL)
*(bool *)kp->arg = v;
else
*(int *)kp->arg = v;
return 0;
}
int param_get_bool(char *buffer, struct kernel_param *kp)
{
bool val;
if (kp->flags & KPARAM_ISBOOL)
val = *(bool *)kp->arg;
else
val = *(int *)kp->arg;
/* Y and N chosen as being relatively non-controversial */
return sprintf(buffer, "%c", val ? 'Y' : 'N');
}
/* This one must be bool. */
int param_set_invbool(const char *val, struct kernel_param *kp)
{
int ret;
bool boolval;
struct kernel_param dummy;
dummy.arg = &boolval;
dummy.flags = KPARAM_ISBOOL;
ret = param_set_bool(val, &dummy);
if (ret == 0)
*(bool *)kp->arg = !boolval;
return ret;
}
int param_get_invbool(char *buffer, struct kernel_param *kp)
{
return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
}
/* We break the rule and mangle the string. */
static int param_array(const char *name,
const char *val,
unsigned int min, unsigned int max,
void *elem, int elemsize,
int (*set)(const char *, struct kernel_param *kp),
u16 flags,
unsigned int *num)
{
int ret;
struct kernel_param kp;
char save;
/* Get the name right for errors. */
kp.name = name;
kp.arg = elem;
kp.flags = flags;
/* No equals sign? */
if (!val) {
printk(KERN_ERR "%s: expects arguments\n", name);
return -EINVAL;
}
*num = 0;
/* We expect a comma-separated list of values. */
do {
int len;
if (*num == max) {
printk(KERN_ERR "%s: can only take %i arguments\n",
name, max);
return -EINVAL;
}
len = strcspn(val, ",");
/* nul-terminate and parse */
save = val[len];
((char *)val)[len] = '\0';
ret = set(val, &kp);
if (ret != 0)
return ret;
kp.arg += elemsize;
val += len+1;
(*num)++;
} while (save == ',');
if (*num < min) {
printk(KERN_ERR "%s: needs at least %i arguments\n",
name, min);
return -EINVAL;
}
return 0;
}
int param_array_set(const char *val, struct kernel_param *kp)
{
const struct kparam_array *arr = kp->arr;
unsigned int temp_num;
return param_array(kp->name, val, 1, arr->max, arr->elem,
arr->elemsize, arr->set, kp->flags,
arr->num ?: &temp_num);
}
int param_array_get(char *buffer, struct kernel_param *kp)
{
int i, off, ret;
const struct kparam_array *arr = kp->arr;
struct kernel_param p;
p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
if (i)
buffer[off++] = ',';
p.arg = arr->elem + arr->elemsize * i;
ret = arr->get(buffer + off, &p);
if (ret < 0)
return ret;
off += ret;
}
buffer[off] = '\0';
return off;
}
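/*
 * Illustrative sketch, not part of the original file: a comma-separated
 * value list (e.g. example_ports=80,443,8080) is routed through
 * param_array_set()/param_array_get() by the module_param_array() macro.
 * The "example" names are hypothetical.
 */
#if 0
static int example_ports[4];
static int example_nports;
module_param_array(example_ports, int, &example_nports, 0444);
MODULE_PARM_DESC(example_ports, "Hypothetical list of up to 4 ports");
#endif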
int param_set_copystring(const char *val, struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
if (!val) {
printk(KERN_ERR "%s: missing param set value\n", kp->name);
return -EINVAL;
}
if (strlen(val)+1 > kps->maxlen) {
printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
strcpy(kps->string, val);
return 0;
}
int param_get_string(char *buffer, struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
return strlcpy(buffer, kps->string, kps->maxlen);
}
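/*
 * Illustrative sketch, not part of the original file: a fixed-size string
 * parameter backed by param_set_copystring()/param_get_string() via the
 * module_param_string() macro. The "example" names are hypothetical.
 */
#if 0
static char example_label[16] = "default";
module_param_string(example_label, example_label, sizeof(example_label), 0644);
MODULE_PARM_DESC(example_label, "Hypothetical label copied by param_set_copystring()");
#endif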
/* sysfs output in /sys/modules/XYZ/parameters/ */
#define to_module_attr(n) container_of(n, struct module_attribute, attr);
#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
extern struct kernel_param __start___param[], __stop___param[];
struct param_attribute
{
struct module_attribute mattr;
struct kernel_param *param;
};
struct module_param_attrs
{
unsigned int num;
struct attribute_group grp;
struct param_attribute attrs[0];
};
#ifdef CONFIG_SYSFS
#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
static ssize_t param_attr_show(struct module_attribute *mattr,
struct module *mod, char *buf)
{
int count;
struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->get)
return -EPERM;
count = attribute->param->get(buf, attribute->param);
if (count > 0) {
strcat(buf, "\n");
++count;
}
return count;
}
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
static ssize_t param_attr_store(struct module_attribute *mattr,
struct module *owner,
const char *buf, size_t len)
{
int err;
struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->set)
return -EPERM;
err = attribute->param->set(buf, attribute->param);
if (!err)
return len;
return err;
}
#endif
#ifdef CONFIG_MODULES
#define __modinit
#else
#define __modinit __init
#endif
#ifdef CONFIG_SYSFS
/*
* add_sysfs_param - add a parameter to sysfs
* @mk: struct module_kobject
* @kparam: the actual parameter definition to add to sysfs
* @name: name of parameter
*
* Create a kobject for a (per-module) parameter if mk->mp is NULL, and
* create the file in sysfs. Returns an error on out of memory. Always cleans up
* if there's an error.
*/
static __modinit int add_sysfs_param(struct module_kobject *mk,
struct kernel_param *kp,
const char *name)
{
struct module_param_attrs *new;
struct attribute **attrs;
int err, num;
/* We don't bother calling this with invisible parameters. */
BUG_ON(!kp->perm);
if (!mk->mp) {
num = 0;
attrs = NULL;
} else {
num = mk->mp->num;
attrs = mk->mp->grp.attrs;
}
/* Enlarge. */
new = krealloc(mk->mp,
sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
GFP_KERNEL);
if (!new) {
kfree(mk->mp);
err = -ENOMEM;
goto fail;
}
attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
if (!attrs) {
err = -ENOMEM;
goto fail_free_new;
}
/* Sysfs wants everything zeroed. */
memset(new, 0, sizeof(*new));
memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
memset(&attrs[num], 0, sizeof(attrs[num]));
new->grp.name = "parameters";
new->grp.attrs = attrs;
/* Tack new one on the end. */
new->attrs[num].param = kp;
new->attrs[num].mattr.show = param_attr_show;
new->attrs[num].mattr.store = param_attr_store;
new->attrs[num].mattr.attr.name = (char *)name;
new->attrs[num].mattr.attr.mode = kp->perm;
new->num = num+1;
/* Fix up all the pointers, since krealloc can move us */
for (num = 0; num < new->num; num++)
new->grp.attrs[num] = &new->attrs[num].mattr.attr;
new->grp.attrs[num] = NULL;
mk->mp = new;
return 0;
fail_free_new:
kfree(new);
fail:
mk->mp = NULL;
return err;
}
#ifdef CONFIG_MODULES
static void free_module_param_attrs(struct module_kobject *mk)
{
kfree(mk->mp->grp.attrs);
kfree(mk->mp);
mk->mp = NULL;
}
/*
* module_param_sysfs_setup - setup sysfs support for one module
* @mod: module
* @kparam: module parameters (array)
* @num_params: number of module parameters
*
* Adds sysfs entries for module parameters under
* /sys/module/[mod->name]/parameters/
*/
int module_param_sysfs_setup(struct module *mod,
struct kernel_param *kparam,
unsigned int num_params)
{
int i, err;
bool params = false;
for (i = 0; i < num_params; i++) {
if (kparam[i].perm == 0)
continue;
err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
if (err)
return err;
params = true;
}
if (!params)
return 0;
/* Create the param group. */
err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
if (err)
free_module_param_attrs(&mod->mkobj);
return err;
}
/*
* module_param_sysfs_remove - remove sysfs support for one module
* @mod: module
*
* Remove sysfs entries for module parameters and the corresponding
* kobject.
*/
void module_param_sysfs_remove(struct module *mod)
{
if (mod->mkobj.mp) {
sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
/* We are positive that no one is using any param
* attrs at this point. Deallocate immediately. */
free_module_param_attrs(&mod->mkobj);
}
}
#endif
void destroy_params(const struct kernel_param *params, unsigned num)
{
/* FIXME: This should free kmalloced charp parameters. It doesn't. */
}
static void __init kernel_add_sysfs_param(const char *name,
struct kernel_param *kparam,
unsigned int name_skip)
{
struct module_kobject *mk;
struct kobject *kobj;
int err;
kobj = kset_find_obj(module_kset, name);
if (kobj) {
/* We already have one. Remove params so we can add more. */
mk = to_module_kobject(kobj);
/* We need to remove it before adding parameters. */
sysfs_remove_group(&mk->kobj, &mk->mp->grp);
} else {
mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
BUG_ON(!mk);
mk->mod = THIS_MODULE;
mk->kobj.kset = module_kset;
err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
"%s", name);
if (err) {
kobject_put(&mk->kobj);
printk(KERN_ERR "Module '%s' failed add to sysfs, "
"error number %d\n", name, err);
printk(KERN_ERR "The system will be unstable now.\n");
return;
}
/* Grab an extra reference so the final kobject_put() below is balanced in both cases. */
kobject_get(&mk->kobj);
}
/* These should not fail at boot. */
err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
BUG_ON(err);
err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
BUG_ON(err);
kobject_uevent(&mk->kobj, KOBJ_ADD);
kobject_put(&mk->kobj);
}
/*
* param_sysfs_builtin - add contents in /sys/parameters for built-in modules
*
* Add module_parameters to sysfs for "modules" built into the kernel.
*
* The "module" name (KBUILD_MODNAME) is stored before a dot, the
* "parameter" name is stored behind a dot in kernel_param->name. So,
* extract the "module" name for all built-in kernel_param-eters,
* and for each such parameter call kernel_add_sysfs_param() with that name.
*/
static void __init param_sysfs_builtin(void)
{
struct kernel_param *kp;
unsigned int name_len;
char modname[MODULE_NAME_LEN];
for (kp = __start___param; kp < __stop___param; kp++) {
char *dot;
if (kp->perm == 0)
continue;
dot = strchr(kp->name, '.');
if (!dot) {
/* This happens for core_param() */
strcpy(modname, "kernel");
name_len = 0;
} else {
name_len = dot - kp->name + 1;
strlcpy(modname, kp->name, name_len);
}
kernel_add_sysfs_param(modname, kp, name_len);
}
}
/* module-related sysfs stuff */
static ssize_t module_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
attribute = to_module_attr(attr);
mk = to_module_kobject(kobj);
if (!attribute->show)
return -EIO;
ret = attribute->show(attribute, mk->mod, buf);
return ret;
}
static ssize_t module_attr_store(struct kobject *kobj,
struct attribute *attr,
const char *buf, size_t len)
{
struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
attribute = to_module_attr(attr);
mk = to_module_kobject(kobj);
if (!attribute->store)
return -EIO;
ret = attribute->store(attribute, mk->mod, buf, len);
return ret;
}
static struct sysfs_ops module_sysfs_ops = {
.show = module_attr_show,
.store = module_attr_store,
};
static int uevent_filter(struct kset *kset, struct kobject *kobj)
{
struct kobj_type *ktype = get_ktype(kobj);
if (ktype == &module_ktype)
return 1;
return 0;
}
static struct kset_uevent_ops module_uevent_ops = {
.filter = uevent_filter,
};
struct kset *module_kset;
int module_sysfs_initialized;
struct kobj_type module_ktype = {
.sysfs_ops = &module_sysfs_ops,
};
/*
* param_sysfs_init - wrapper for built-in params support
*/
static int __init param_sysfs_init(void)
{
module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
if (!module_kset) {
printk(KERN_WARNING "%s (%d): error creating kset\n",
__FILE__, __LINE__);
return -ENOMEM;
}
module_sysfs_initialized = 1;
param_sysfs_builtin();
return 0;
}
subsys_initcall(param_sysfs_init);
#endif /* CONFIG_SYSFS */
EXPORT_SYMBOL(param_set_byte);
EXPORT_SYMBOL(param_get_byte);
EXPORT_SYMBOL(param_set_short);
EXPORT_SYMBOL(param_get_short);
EXPORT_SYMBOL(param_set_ushort);
EXPORT_SYMBOL(param_get_ushort);
EXPORT_SYMBOL(param_set_int);
EXPORT_SYMBOL(param_get_int);
EXPORT_SYMBOL(param_set_uint);
EXPORT_SYMBOL(param_get_uint);
EXPORT_SYMBOL(param_set_long);
EXPORT_SYMBOL(param_get_long);
EXPORT_SYMBOL(param_set_ulong);
EXPORT_SYMBOL(param_get_ulong);
EXPORT_SYMBOL(param_set_charp);
EXPORT_SYMBOL(param_get_charp);
EXPORT_SYMBOL(param_set_bool);
EXPORT_SYMBOL(param_get_bool);
EXPORT_SYMBOL(param_set_invbool);
EXPORT_SYMBOL(param_get_invbool);
EXPORT_SYMBOL(param_array_set);
EXPORT_SYMBOL(param_array_get);
EXPORT_SYMBOL(param_set_copystring);
EXPORT_SYMBOL(param_get_string);

5172
kernel/kernel/perf_event.c Normal file

File diff suppressed because it is too large

524
kernel/kernel/pid.c Normal file
View File

@@ -0,0 +1,524 @@
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
* (C) 2002-2003 William Irwin, IBM
* (C) 2004 William Irwin, Oracle
* (C) 2002-2004 Ingo Molnar, Red Hat
*
* pid-structures are backing objects for tasks sharing a given ID to chain
* against. There is very little to them aside from hashing them and
* parking tasks using given ID's on a list.
*
* The hash is always changed with the tasklist_lock write-acquired,
* and the hash is only accessed with the tasklist_lock at least
* read-acquired, so there's no additional SMP locking needed here.
*
* We have a list of bitmap pages, which bitmaps represent the PID space.
* Allocating and freeing PIDs is completely lockless. The worst-case
* allocation scenario when all but one out of 1 million PIDs possible are
* allocated already: the scanning of 32 list entries and at most PAGE_SIZE
* bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
*
* Pid namespaces:
* (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
* (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
* Many thanks to Oleg Nesterov for comments and help
*
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
static struct hlist_head *pid_hash;
static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;
int pid_max = PID_MAX_DEFAULT;
#define RESERVED_PIDS 300
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
static inline int mk_pid(struct pid_namespace *pid_ns,
struct pidmap *map, int off)
{
return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
}
#define find_next_offset(map, off) \
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales up to 4 million PIDs at runtime.
*/
struct pid_namespace init_pid_ns = {
.kref = {
.refcount = ATOMIC_INIT(2),
},
.pidmap = {
[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
},
.last_pid = 0,
.level = 0,
.child_reaper = &init_task,
};
EXPORT_SYMBOL_GPL(init_pid_ns);
int is_container_init(struct task_struct *tsk)
{
int ret = 0;
struct pid *pid;
rcu_read_lock();
pid = task_pid(tsk);
if (pid != NULL && pid->numbers[pid->level].nr == 1)
ret = 1;
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(is_container_init);
/*
* Note: disable interrupts while the pidmap_lock is held as an
* interrupt might come in and do read_lock(&tasklist_lock).
*
* If we don't disable interrupts there is a nasty deadlock between
* detach_pid()->free_pid() and another cpu that does
* spin_lock(&pidmap_lock) followed by an interrupt routine that does
* read_lock(&tasklist_lock);
*
* After we clean up the tasklist_lock and know there are no
* irq handlers that take it we can leave the interrupts enabled.
* For now it is easier to be safe than to prove it can't happen.
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
static void free_pidmap(struct upid *upid)
{
int nr = upid->nr;
struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
int offset = nr & BITS_PER_PAGE_MASK;
clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
}
static int alloc_pidmap(struct pid_namespace *pid_ns)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
pid = last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
if (unlikely(!map->page)) {
void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/*
* Free the page if someone raced with us
* installing it:
*/
spin_lock_irq(&pidmap_lock);
if (map->page)
kfree(page);
else
map->page = page;
spin_unlock_irq(&pidmap_lock);
if (unlikely(!map->page))
break;
}
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
pid_ns->last_pid = pid;
return pid;
}
offset = find_next_offset(map, offset);
pid = mk_pid(pid_ns, map, offset);
/*
* find_next_offset() found a bit, the pid from it
* is in-bounds, and if we fell back to the last
* bitmap block and the final block was the same
* as the starting point, pid is before last_pid.
*/
} while (offset < BITS_PER_PAGE && pid < pid_max &&
(i != max_scan || pid < last ||
!((last+1) & BITS_PER_PAGE_MASK)));
}
if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
offset = 0;
} else {
map = &pid_ns->pidmap[0];
offset = RESERVED_PIDS;
if (unlikely(last == offset))
break;
}
pid = mk_pid(pid_ns, map, offset);
}
return -1;
}
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
{
int offset;
struct pidmap *map, *end;
if (last >= PID_MAX_LIMIT)
return -1;
offset = (last + 1) & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
end = &pid_ns->pidmap[PIDMAP_ENTRIES];
for (; map < end; map++, offset = 0) {
if (unlikely(!map->page))
continue;
offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
if (offset < BITS_PER_PAGE)
return mk_pid(pid_ns, map, offset);
}
return -1;
}
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
if (!pid)
return;
ns = pid->numbers[pid->level].ns;
if ((atomic_read(&pid->count) == 1) ||
atomic_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
}
EXPORT_SYMBOL_GPL(put_pid);
static void delayed_put_pid(struct rcu_head *rhp)
{
struct pid *pid = container_of(rhp, struct pid, rcu);
put_pid(pid);
}
void free_pid(struct pid *pid)
{
/* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
unsigned long flags;
spin_lock_irqsave(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++)
hlist_del_rcu(&pid->numbers[i].pid_chain);
spin_unlock_irqrestore(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++)
free_pidmap(pid->numbers + i);
call_rcu(&pid->rcu, delayed_put_pid);
}
struct pid *alloc_pid(struct pid_namespace *ns)
{
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
goto out;
tmp = ns;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
if (nr < 0)
goto out_free;
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
}
get_pid_ns(ns);
pid->level = ns->level;
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
spin_lock_irq(&pidmap_lock);
for (i = ns->level; i >= 0; i--) {
upid = &pid->numbers[i];
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
}
spin_unlock_irq(&pidmap_lock);
out:
return pid;
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
kmem_cache_free(ns->pid_cachep, pid);
pid = NULL;
goto out;
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
struct hlist_node *elem;
struct upid *pnr;
hlist_for_each_entry_rcu(pnr, elem,
&pid_hash[pid_hashfn(nr, ns)], pid_chain)
if (pnr->nr == nr && pnr->ns == ns)
return container_of(pnr, struct pid,
numbers[ns->level]);
return NULL;
}
EXPORT_SYMBOL_GPL(find_pid_ns);
struct pid *find_vpid(int nr)
{
return find_pid_ns(nr, current->nsproxy->pid_ns);
}
EXPORT_SYMBOL_GPL(find_vpid);
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
void attach_pid(struct task_struct *task, enum pid_type type,
struct pid *pid)
{
struct pid_link *link;
link = &task->pids[type];
link->pid = pid;
hlist_add_head_rcu(&link->node, &pid->tasks[type]);
}
static void __change_pid(struct task_struct *task, enum pid_type type,
struct pid *new)
{
struct pid_link *link;
struct pid *pid;
int tmp;
link = &task->pids[type];
pid = link->pid;
hlist_del_rcu(&link->node);
link->pid = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (!hlist_empty(&pid->tasks[tmp]))
return;
free_pid(pid);
}
void detach_pid(struct task_struct *task, enum pid_type type)
{
__change_pid(task, type, NULL);
}
void change_pid(struct task_struct *task, enum pid_type type,
struct pid *pid)
{
__change_pid(task, type, pid);
attach_pid(task, type, pid);
}
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
new->pids[type].pid = old->pids[type].pid;
hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
}
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result = NULL;
if (pid) {
struct hlist_node *first;
first = rcu_dereference(pid->tasks[type].first);
if (first)
result = hlist_entry(first, struct task_struct, pids[(type)].node);
}
return result;
}
EXPORT_SYMBOL(pid_task);
/*
* Must be called under rcu_read_lock() or with tasklist_lock read-held.
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
struct task_struct *find_task_by_vpid(pid_t vnr)
{
return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
}
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
if (type != PIDTYPE_PID)
task = task->group_leader;
pid = get_pid(task->pids[type].pid);
rcu_read_unlock();
return pid;
}
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result;
rcu_read_lock();
result = pid_task(pid, type);
if (result)
get_task_struct(result);
rcu_read_unlock();
return result;
}
struct pid *find_get_pid(pid_t nr)
{
struct pid *pid;
rcu_read_lock();
pid = get_pid(find_vpid(nr));
rcu_read_unlock();
return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);
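/*
 * Illustrative sketch, not part of the original file: resolving a pid_t as
 * seen from the current namespace into a task_struct with correct reference
 * counting, using only the helpers defined above. example_kill_by_nr() is
 * hypothetical.
 */
#if 0
static int example_kill_by_nr(pid_t nr)
{
	struct pid *pid = find_get_pid(nr);
	struct task_struct *task;
	int ret = -ESRCH;

	if (!pid)
		return ret;
	task = get_pid_task(pid, PIDTYPE_PID);
	if (task) {
		ret = send_sig(SIGKILL, task, 1);
		put_task_struct(task);		/* drop get_pid_task() reference */
	}
	put_pid(pid);				/* drop find_get_pid() reference */
	return ret;
}
#endif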
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
struct upid *upid;
pid_t nr = 0;
if (pid && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
nr = upid->nr;
}
return nr;
}
pid_t pid_vnr(struct pid *pid)
{
return pid_nr_ns(pid, current->nsproxy->pid_ns);
}
EXPORT_SYMBOL_GPL(pid_vnr);
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
struct pid_namespace *ns)
{
pid_t nr = 0;
rcu_read_lock();
if (!ns)
ns = current->nsproxy->pid_ns;
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
nr = pid_nr_ns(task->pids[type].pid, ns);
}
rcu_read_unlock();
return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return pid_nr_ns(task_tgid(tsk), ns);
}
EXPORT_SYMBOL(task_tgid_nr_ns);
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);
/*
* Used by proc to find the first pid that is greater than or equal to nr.
*
* If there is a pid at nr this function is exactly the same as find_pid_ns.
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
struct pid *pid;
do {
pid = find_pid_ns(nr, ns);
if (pid)
break;
nr = next_pidmap(ns, nr);
} while (nr > 0);
return pid;
}
/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
* more.
*/
void __init pidhash_init(void)
{
int i, pidhash_size;
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL,
&pidhash_shift, NULL, 4096);
pidhash_size = 1 << pidhash_shift;
for (i = 0; i < pidhash_size; i++)
INIT_HLIST_HEAD(&pid_hash[i]);
}
void __init pidmap_init(void)
{
init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
}

193
kernel/kernel/pid_namespace.c Normal file
View File

@@ -0,0 +1,193 @@
/*
* Pid namespaces
*
* Authors:
* (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
* (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
* Many thanks to Oleg Nesterov for comments and help
*
*/
#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
struct pid_cache {
int nr_ids;
char name[16];
struct kmem_cache *cachep;
struct list_head list;
};
static LIST_HEAD(pid_caches_lh);
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
/*
* creates the kmem cache to allocate pids from.
* @nr_ids: the number of numerical ids this pid will have to carry
*/
static struct kmem_cache *create_pid_cachep(int nr_ids)
{
struct pid_cache *pcache;
struct kmem_cache *cachep;
mutex_lock(&pid_caches_mutex);
list_for_each_entry(pcache, &pid_caches_lh, list)
if (pcache->nr_ids == nr_ids)
goto out;
pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
if (pcache == NULL)
goto err_alloc;
snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
cachep = kmem_cache_create(pcache->name,
sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
0, SLAB_HWCACHE_ALIGN, NULL);
if (cachep == NULL)
goto err_cachep;
pcache->nr_ids = nr_ids;
pcache->cachep = cachep;
list_add(&pcache->list, &pid_caches_lh);
out:
mutex_unlock(&pid_caches_mutex);
return pcache->cachep;
err_cachep:
kfree(pcache);
err_alloc:
mutex_unlock(&pid_caches_mutex);
return NULL;
}
static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
int i;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
goto out;
ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!ns->pidmap[0].page)
goto out_free;
ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
goto out_free_map;
kref_init(&ns->kref);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
return ns;
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
out:
return ERR_PTR(-ENOMEM);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
kmem_cache_free(pid_ns_cachep, ns);
}
struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
return get_pid_ns(old_ns);
if (flags & (CLONE_THREAD|CLONE_PARENT))
return ERR_PTR(-EINVAL);
return create_pid_namespace(old_ns);
}
void free_pid_ns(struct kref *kref)
{
struct pid_namespace *ns, *parent;
ns = container_of(kref, struct pid_namespace, kref);
parent = ns->parent;
destroy_pid_namespace(ns);
if (parent != NULL)
put_pid_ns(parent);
}
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
int nr;
int rc;
struct task_struct *task;
/*
* The last thread in the cgroup-init thread group is terminating.
* Find remaining pid_ts in the namespace, signal and wait for them
* to exit.
*
* Note: This signals each thread in the namespace - even those that
* belong to the same thread group. To avoid this, we would have
* to walk the entire tasklist looking for processes in this
* namespace, but that could be unnecessarily expensive if the
* pid namespace has just a few processes. Or we need to
* maintain a tasklist for each pid namespace.
*
*/
read_lock(&tasklist_lock);
nr = next_pidmap(pid_ns, 1);
while (nr > 0) {
rcu_read_lock();
/*
* Use force_sig() since it clears SIGNAL_UNKILLABLE, ensuring
* that any nested container's init processes don't ignore the
* signal.
*/
task = pid_task(find_vpid(nr), PIDTYPE_PID);
if (task)
force_sig(SIGKILL, task);
rcu_read_unlock();
nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);
do {
clear_thread_flag(TIF_SIGPENDING);
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
acct_exit_ns(pid_ns);
return;
}
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
return 0;
}
__initcall(pid_namespaces_init);

423
kernel/kernel/pm_qos_params.c Normal file
View File

@@ -0,0 +1,423 @@
/*
* This module exposes the interface to kernel space for specifying
* QoS dependencies. It provides infrastructure for registration of:
*
* Dependents on a QoS value : register requirements
* Watchers of QoS value : get notified when target QoS value changes
*
* This QoS design is best effort based. Dependents register their QoS needs.
* Watchers register to keep track of the current QoS needs of the system.
*
* There are 3 basic classes of QoS parameter: latency, timeout, throughput
* each have defined units:
* latency: usec
* timeout: usec <-- currently not used.
* throughput: kbs (kilo byte / sec)
*
* There are lists of pm_qos_objects each one wrapping requirements, notifiers
*
* User mode requirements on a QoS parameter register themselves with the
* subsystem by opening the device node /dev/... and writing their request to
* the node. As long as the process holds a file handle open to the node the
* client continues to be accounted for. Upon file release the usermode
* requirement is removed and a new QoS target is computed. This way the
* requirement held by the application is cleaned up when it closes the file
* descriptor or exits, and the pm_qos_object gets an opportunity to clean up.
*
* Mark Gross <mgross@linux.intel.com>
*/
#include <linux/pm_qos_params.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
#include <linux/string.h>
#include <linux/platform_device.h>
#include <linux/init.h>
#include <linux/uaccess.h>
/*
* locking rule: all changes to requirements or notifiers lists
* or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
* held, taken with _irqsave. One lock to rule them all
*/
struct requirement_list {
struct list_head list;
union {
s32 value;
s32 usec;
s32 kbps;
};
char *name;
};
static s32 max_compare(s32 v1, s32 v2);
static s32 min_compare(s32 v1, s32 v2);
struct pm_qos_object {
struct requirement_list requirements;
struct blocking_notifier_head *notifiers;
struct miscdevice pm_qos_power_miscdev;
char *name;
s32 default_value;
atomic_t target_value;
s32 (*comparitor)(s32, s32);
};
static struct pm_qos_object null_pm_qos;
static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
static struct pm_qos_object cpu_dma_pm_qos = {
.requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)},
.notifiers = &cpu_dma_lat_notifier,
.name = "cpu_dma_latency",
.default_value = 2000 * USEC_PER_SEC,
.target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
.comparitor = min_compare
};
static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
static struct pm_qos_object network_lat_pm_qos = {
.requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)},
.notifiers = &network_lat_notifier,
.name = "network_latency",
.default_value = 2000 * USEC_PER_SEC,
.target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
.comparitor = min_compare
};
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
static struct pm_qos_object network_throughput_pm_qos = {
.requirements =
{LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
.notifiers = &network_throughput_notifier,
.name = "network_throughput",
.default_value = 0,
.target_value = ATOMIC_INIT(0),
.comparitor = max_compare
};
static struct pm_qos_object *pm_qos_array[] = {
&null_pm_qos,
&cpu_dma_pm_qos,
&network_lat_pm_qos,
&network_throughput_pm_qos
};
static DEFINE_SPINLOCK(pm_qos_lock);
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos);
static int pm_qos_power_open(struct inode *inode, struct file *filp);
static int pm_qos_power_release(struct inode *inode, struct file *filp);
static const struct file_operations pm_qos_power_fops = {
.write = pm_qos_power_write,
.open = pm_qos_power_open,
.release = pm_qos_power_release,
};
/* static helper functions */
static s32 max_compare(s32 v1, s32 v2)
{
return max(v1, v2);
}
static s32 min_compare(s32 v1, s32 v2)
{
return min(v1, v2);
}
static void update_target(int target)
{
s32 extreme_value;
struct requirement_list *node;
unsigned long flags;
int call_notifier = 0;
spin_lock_irqsave(&pm_qos_lock, flags);
extreme_value = pm_qos_array[target]->default_value;
list_for_each_entry(node,
&pm_qos_array[target]->requirements.list, list) {
extreme_value = pm_qos_array[target]->comparitor(
extreme_value, node->value);
}
if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
call_notifier = 1;
atomic_set(&pm_qos_array[target]->target_value, extreme_value);
pr_debug("new target for qos %d is %d\n", target,
atomic_read(&pm_qos_array[target]->target_value));
}
spin_unlock_irqrestore(&pm_qos_lock, flags);
if (call_notifier)
blocking_notifier_call_chain(pm_qos_array[target]->notifiers,
(unsigned long) extreme_value, NULL);
}
static int register_pm_qos_misc(struct pm_qos_object *qos)
{
qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
qos->pm_qos_power_miscdev.name = qos->name;
qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
return misc_register(&qos->pm_qos_power_miscdev);
}
static int find_pm_qos_object_by_minor(int minor)
{
int pm_qos_class;
for (pm_qos_class = 0;
pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
if (minor ==
pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
return pm_qos_class;
}
return -1;
}
/**
* pm_qos_requirement - returns current system wide qos expectation
* @pm_qos_class: identification of which qos value is requested
*
* This function returns the current target value in an atomic manner.
*/
int pm_qos_requirement(int pm_qos_class)
{
return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
}
EXPORT_SYMBOL_GPL(pm_qos_requirement);
/**
* pm_qos_add_requirement - inserts new qos request into the list
* @pm_qos_class: identifies which list of qos requests to use
* @name: identifies the request
* @value: defines the qos request
*
* This function inserts a new entry in the pm_qos_class list of requested qos
* performance characteristics. It recomputes the aggregate QoS expectations
* for the pm_qos_class of parameters.
*/
int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
{
struct requirement_list *dep;
unsigned long flags;
dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL);
if (dep) {
if (value == PM_QOS_DEFAULT_VALUE)
dep->value = pm_qos_array[pm_qos_class]->default_value;
else
dep->value = value;
dep->name = kstrdup(name, GFP_KERNEL);
if (!dep->name)
goto cleanup;
spin_lock_irqsave(&pm_qos_lock, flags);
list_add(&dep->list,
&pm_qos_array[pm_qos_class]->requirements.list);
spin_unlock_irqrestore(&pm_qos_lock, flags);
update_target(pm_qos_class);
return 0;
}
cleanup:
kfree(dep);
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
/**
* pm_qos_update_requirement - modifies an existing qos request
* @pm_qos_class: identifies which list of qos requests to use
* @name: identifies the request
* @value: defines the qos request
*
* Updates an existing qos requirement for the pm_qos_class of parameters along
* with updating the target pm_qos_class value.
*
* If the named request isn't in the list then no change is made.
*/
int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
{
unsigned long flags;
struct requirement_list *node;
int pending_update = 0;
spin_lock_irqsave(&pm_qos_lock, flags);
list_for_each_entry(node,
&pm_qos_array[pm_qos_class]->requirements.list, list) {
if (strcmp(node->name, name) == 0) {
if (new_value == PM_QOS_DEFAULT_VALUE)
node->value =
pm_qos_array[pm_qos_class]->default_value;
else
node->value = new_value;
pending_update = 1;
break;
}
}
spin_unlock_irqrestore(&pm_qos_lock, flags);
if (pending_update)
update_target(pm_qos_class);
return 0;
}
EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
/**
* pm_qos_remove_requirement - modifies an existing qos request
* @pm_qos_class: identifies which list of qos requests to use
* @name: identifies the request
*
* Will remove named qos request from pm_qos_class list of parameters and
* recompute the current target value for the pm_qos_class.
*/
void pm_qos_remove_requirement(int pm_qos_class, char *name)
{
unsigned long flags;
struct requirement_list *node;
int pending_update = 0;
spin_lock_irqsave(&pm_qos_lock, flags);
list_for_each_entry(node,
&pm_qos_array[pm_qos_class]->requirements.list, list) {
if (strcmp(node->name, name) == 0) {
kfree(node->name);
list_del(&node->list);
kfree(node);
pending_update = 1;
break;
}
}
spin_unlock_irqrestore(&pm_qos_lock, flags);
if (pending_update)
update_target(pm_qos_class);
}
EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
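/*
 * Illustrative sketch, not part of the original file: a driver keeping the
 * aggregated CPU DMA latency target at or below 50 usec for the duration of
 * an I/O burst. The "example" names are hypothetical; error handling of
 * pm_qos_add_requirement() is omitted for brevity.
 */
#if 0
static void example_start_burst(void)
{
	pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "example_driver", 50);
}

static void example_end_burst(void)
{
	pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "example_driver");
}
#endif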
/**
* pm_qos_add_notifier - sets notification entry for changes to target value
* @pm_qos_class: identifies which qos target changes should be notified.
* @notifier: notifier block managed by caller.
*
* will register the notifier into a notification chain that gets called
* upon changes to the pm_qos_class target value.
*/
int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
{
int retval;
retval = blocking_notifier_chain_register(
pm_qos_array[pm_qos_class]->notifiers, notifier);
return retval;
}
EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
/**
* pm_qos_remove_notifier - deletes notification entry from chain.
* @pm_qos_class: identifies which qos target changes are notified.
* @notifier: notifier block to be removed.
*
* will remove the notifier from the notification chain that gets called
* upon changes to the pm_qos_class target value.
*/
int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
{
int retval;
retval = blocking_notifier_chain_unregister(
pm_qos_array[pm_qos_class]->notifiers, notifier);
return retval;
}
EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
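/*
 * Illustrative sketch, not part of the original file: being notified when
 * the aggregated network latency target changes. The callback and names are
 * hypothetical; the new target value arrives as the unsigned long argument
 * passed by update_target() above.
 */
#if 0
static int example_qos_notify(struct notifier_block *nb,
			      unsigned long new_target, void *unused)
{
	printk(KERN_DEBUG "network latency target is now %lu usec\n",
	       new_target);
	return NOTIFY_OK;
}

static struct notifier_block example_qos_nb = {
	.notifier_call = example_qos_notify,
};

static void example_register_notifier(void)
{
	pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY, &example_qos_nb);
}
#endif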
#define PID_NAME_LEN sizeof("process_1234567890")
static char name[PID_NAME_LEN];
static int pm_qos_power_open(struct inode *inode, struct file *filp)
{
int ret;
long pm_qos_class;
lock_kernel();
pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
if (pm_qos_class >= 0) {
filp->private_data = (void *)pm_qos_class;
sprintf(name, "process_%d", current->pid);
ret = pm_qos_add_requirement(pm_qos_class, name,
PM_QOS_DEFAULT_VALUE);
if (ret >= 0) {
unlock_kernel();
return 0;
}
}
unlock_kernel();
return -EPERM;
}
static int pm_qos_power_release(struct inode *inode, struct file *filp)
{
int pm_qos_class;
pm_qos_class = (long)filp->private_data;
sprintf(name, "process_%d", current->pid);
pm_qos_remove_requirement(pm_qos_class, name);
return 0;
}
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos)
{
s32 value;
int pm_qos_class;
pm_qos_class = (long)filp->private_data;
if (count != sizeof(s32))
return -EINVAL;
if (copy_from_user(&value, buf, sizeof(s32)))
return -EFAULT;
sprintf(name, "process_%d", current->pid);
pm_qos_update_requirement(pm_qos_class, name, value);
return sizeof(s32);
}
static int __init pm_qos_power_init(void)
{
int ret = 0;
ret = register_pm_qos_misc(&cpu_dma_pm_qos);
if (ret < 0) {
printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
return ret;
}
ret = register_pm_qos_misc(&network_lat_pm_qos);
if (ret < 0) {
printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
return ret;
}
ret = register_pm_qos_misc(&network_throughput_pm_qos);
if (ret < 0)
printk(KERN_ERR
"pm_qos_param: network_throughput setup failed\n");
return ret;
}
late_initcall(pm_qos_power_init);

File diff suppressed because it is too large

1039
kernel/kernel/posix-timers.c Normal file

File diff suppressed because it is too large

251
kernel/kernel/power/Kconfig Normal file
View File

@@ -0,0 +1,251 @@
config PM
bool "Power Management support"
depends on !IA64_HP_SIM
---help---
"Power Management" means that parts of your computer are shut
off or put into a power conserving "sleep" mode if they are not
being used. There are two competing standards for doing this: APM
and ACPI. If you want to use either one, say Y here and then also
to the requisite support below.
Power Management is most important for battery powered laptop
computers; if you have a laptop, check out the Linux Laptop home
page on the WWW at <http://www.linux-on-laptops.com/> or
Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
Note that, even if you say N here, Linux on the x86 architecture
will issue the hlt instruction if nothing is to be done, thereby
sending the processor to sleep and saving power.
config PM_DEBUG
bool "Power Management Debug Support"
depends on PM
---help---
This option enables various debugging support in the Power Management
code. This is helpful when debugging and reporting PM bugs, like
suspend support.
config PM_VERBOSE
bool "Verbose Power Management debugging"
depends on PM_DEBUG
default n
---help---
This option enables verbose messages from the Power Management code.
config CAN_PM_TRACE
def_bool y
depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
config PM_TRACE
bool
help
This enables code to save the last PM event point across
reboot. The architecture needs to support this, x86 for
example does by saving things in the RTC, see below.
The architecture specific code must provide the extern
functions from <linux/resume-trace.h> as well as the
<asm/resume-trace.h> header with a TRACE_RESUME() macro.
The way the information is presented is architecture-
dependent; x86 will print the information during a
late_initcall.
config PM_TRACE_RTC
bool "Suspend/resume event tracing"
depends on CAN_PM_TRACE
depends on X86
select PM_TRACE
default n
---help---
This enables some cheesy code to save the last PM event point in the
RTC across reboots, so that you can debug a machine that just hangs
during suspend (or more commonly, during resume).
To use this debugging feature you should attempt to suspend the
machine, reboot it and then run
dmesg -s 1000000 | grep 'hash matches'
CAUTION: this option will cause your machine's real-time clock to be
set to an invalid time after a resume.
config PM_SLEEP_SMP
bool
depends on SMP
depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
depends on PM_SLEEP
select HOTPLUG_CPU
default y
config PM_SLEEP
bool
depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE || HIBERNATION_ON_MEMORY
default y
config SUSPEND
bool "Suspend to RAM and standby"
depends on PM && ARCH_SUSPEND_POSSIBLE
default y
---help---
Allow the system to enter sleep states in which main memory is
powered and thus its contents are preserved, such as the
suspend-to-RAM state (e.g. the ACPI S3 state).
config PM_TEST_SUSPEND
bool "Test suspend/resume and wakealarm during bootup"
depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
---help---
This option will let you suspend your machine during bootup, and
make it wake up a few seconds later using an RTC wakeup alarm.
Enable this with a kernel parameter like "test_suspend=mem".
You probably want to have your system's RTC driver statically
linked, ensuring that it's available when this test runs.
config SUSPEND_FREEZER
bool "Enable freezer for suspend to RAM/standby" \
if ARCH_WANTS_FREEZER_CONTROL || BROKEN
depends on SUSPEND
default y
help
This allows you to turn off the freezer for suspend. If this is
done, no tasks are frozen for suspend to RAM/standby.
Turning OFF this setting is NOT recommended! If in doubt, say Y.
config HIBERNATION_NVS
bool
config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
select HIBERNATION_NVS if HAS_IOMEM
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
system and powers it off, and restores that checkpoint on reboot.
You can suspend your machine with 'echo disk > /sys/power/state'
after placing resume=/dev/swappartition on the kernel command line
in your bootloader's configuration file.
Alternatively, you can use the additional userland tools available
from <http://suspend.sf.net>.
In principle it does not require ACPI or APM, although for example
ACPI will be used for the final steps when it is available. One
of the reasons to use software suspend is that the firmware hooks
for suspend states like suspend-to-RAM (STR) often don't work very
well with Linux.
It creates an image which is saved in your active swap. Upon the next
boot, pass the 'resume=/dev/swappartition' argument to the kernel to
have it detect the saved image, restore memory state from it, and
continue to run as before. If you do not want the previous state to
be reloaded, then use the 'noresume' kernel command line argument.
Note, however, that fsck will be run on your filesystems and you will
need to run mkswap against the swap partition used for the suspend.
It also works with swap files to a limited extent (for details see
<file:Documentation/power/swsusp-and-swap-files.txt>).
Right now you may boot without resuming and resume later but in the
meantime you cannot use the swap partition(s)/file(s) involved in
suspending. Also in this case you must not use the filesystems
that were mounted before the suspend. In particular, you MUST NOT
MOUNT any journaled filesystems mounted before the suspend or they
will get corrupted in a nasty way.
For more information take a look at <file:Documentation/power/swsusp.txt>.
config PM_STD_PARTITION
string "Default resume partition"
depends on HIBERNATION
default ""
---help---
The default resume partition is the partition that the suspend-
to-disk implementation will look in for a suspended disk image.
The partition specified here will be different for almost every user.
It should be a valid swap partition (at least for now) that is turned
on before suspending.
The partition specified can be overridden by specifying:
resume=/dev/<other device>
which will set the resume partition to the device specified.
Note there is currently not a way to specify which device to save the
suspended image to. It will simply pick the first available swap
device.
config HIBERNATION_ON_MEMORY
bool "Hibernation on memory"
depends on PM
select HIBERNATION
---help---
Enable the hibernation on memory (HoM) functionality. This technique
will put the memory in self-refresh and after that it will turn off
the chip.
config HOM_TAG_VIRTUAL_ADDRESS
hex "Hibernation on memory - Tag virtual addess"
depends on HIBERNATION_ON_MEMORY
depends on SUPERH32
default "0x80001800"
---help---
Set the absolute virtual address where the Hibernation on Memory tag is placed.
config HOM_DEBUG
bool "Hibernation on memory - Debug"
depends on HIBERNATION_ON_MEMORY
depends on SUPERH32
default n
---help---
This option enables the hom_printk function. This works
like a standard printk on the empty_zero_page.
It's used as a rudimentary printk for debugging.
config APM_EMULATION
tristate "Advanced Power Management Emulation"
depends on PM && SYS_SUPPORTS_APM_EMULATION
help
APM is a BIOS specification for saving power using several different
techniques. This is mostly useful for battery powered laptops with
APM compliant BIOSes. If you say Y here, the system time will be
reset after a RESUME operation, the /proc/apm device will provide
battery status information, and user-space programs will receive
notification of APM "events" (e.g. battery status change).
In order to use APM, you will need supporting software. For location
and more information, read <file:Documentation/power/pm.txt> and the
Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
This driver does not spin down disk drives (see the hdparm(8)
manpage ("man 8 hdparm") for that), and it doesn't turn off
VESA-compliant "green" monitors.
Generally, if you don't have a battery in your machine, there isn't
much point in using this driver and you should say N. If you get
random kernel OOPSes or reboots that don't seem to be related to
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
config PM_RUNTIME
bool "Run-time PM core functionality"
depends on PM
---help---
Enable functionality allowing I/O devices to be put into energy-saving
(low power) states at run time (or autosuspended) after a specified
period of inactivity and woken up in response to a hardware-generated
wake-up event or a driver's request.
Hardware support is generally required for this functionality to work
and the bus type drivers of the buses the devices are on are
responsible for the actual handling of the autosuspend requests and
wake-up events.

15
kernel/kernel/power/Makefile Normal file
View File

@@ -0,0 +1,15 @@
ifeq ($(CONFIG_PM_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o
obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
obj-$(CONFIG_HIBERNATION_ON_MEMORY) += hom.o

36
kernel/kernel/power/console.c Normal file
View File

@@ -0,0 +1,36 @@
/*
* kernel/power/console.c - Functions for saving/restoring console.
*
* Originally from swsusp.
*/
#include <linux/vt_kern.h>
#include <linux/kbd_kern.h>
#include <linux/console.h>
#include <linux/module.h>
#include "power.h"
#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
int pm_prepare_console(void)
{
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
return 1;
orig_kmsg = kmsg_redirect;
kmsg_redirect = SUSPEND_CONSOLE;
return 0;
}
void pm_restore_console(void)
{
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
kmsg_redirect = orig_kmsg;
}
}
#endif

View File

@@ -0,0 +1,967 @@
/*
* kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
* Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
*
* This file is released under the GPLv2.
*/
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/kmod.h>
#include <linux/delay.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pm.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <scsi/scsi_scan.h>
#include <asm/suspend.h>
#include "power.h"
static int noresume = 0;
static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
enum {
HIBERNATION_INVALID,
HIBERNATION_PLATFORM,
HIBERNATION_TEST,
HIBERNATION_TESTPROC,
HIBERNATION_SHUTDOWN,
HIBERNATION_REBOOT,
/* keep last */
__HIBERNATION_AFTER_LAST
};
#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
static int hibernation_mode = HIBERNATION_SHUTDOWN;
static struct platform_hibernation_ops *hibernation_ops;
/**
* hibernation_set_ops - set the global hibernate operations
* @ops: the hibernation operations to use in subsequent hibernation transitions
*/
void hibernation_set_ops(struct platform_hibernation_ops *ops)
{
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
&& ops->restore_cleanup)) {
WARN_ON(1);
return;
}
mutex_lock(&pm_mutex);
hibernation_ops = ops;
if (ops)
hibernation_mode = HIBERNATION_PLATFORM;
else if (hibernation_mode == HIBERNATION_PLATFORM)
hibernation_mode = HIBERNATION_SHUTDOWN;
mutex_unlock(&pm_mutex);
}
static bool entering_platform_hibernation;
bool system_entering_hibernation(void)
{
return entering_platform_hibernation;
}
EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
mdelay(5000);
}
static int hibernation_testmode(int mode)
{
if (hibernation_mode == mode) {
hibernation_debug_sleep();
return 1;
}
return 0;
}
static int hibernation_test(int level)
{
if (pm_test_level == level) {
hibernation_debug_sleep();
return 1;
}
return 0;
}
#else /* !CONFIG_PM_DEBUG */
static int hibernation_testmode(int mode) { return 0; }
static int hibernation_test(int level) { return 0; }
#endif /* !CONFIG_PM_DEBUG */
/**
* platform_begin - tell the platform driver that we're starting
* hibernation
*/
static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->begin() : 0;
}
/**
* platform_end - tell the platform driver that we've entered the
* working state
*/
static void platform_end(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->end();
}
/**
* platform_pre_snapshot - prepare the machine for hibernation using the
* platform driver if so configured and return an error code if it fails
*/
static int platform_pre_snapshot(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_snapshot() : 0;
}
/**
* platform_leave - prepare the machine for switching to the normal mode
* of operation using the platform driver (called with interrupts disabled)
*/
static void platform_leave(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->leave();
}
/**
* platform_finish - switch the machine to the normal mode of operation
* using the platform driver (must be called after platform_prepare())
*/
static void platform_finish(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->finish();
}
/**
* platform_pre_restore - prepare the platform for the restoration from a
* hibernation image. If the restore fails after this function has been
* called, platform_restore_cleanup() must be called.
*/
static int platform_pre_restore(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_restore() : 0;
}
/**
* platform_restore_cleanup - switch the platform to the normal mode of
* operation after a failing restore. If platform_pre_restore() has been
* called before the failing restore, this function must be called too,
* regardless of the result of platform_pre_restore().
*/
static void platform_restore_cleanup(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->restore_cleanup();
}
/**
* platform_recover - recover the platform from a failure to suspend
* devices.
*/
static void platform_recover(int platform_mode)
{
if (platform_mode && hibernation_ops && hibernation_ops->recover)
hibernation_ops->recover();
}
/**
* create_image - freeze devices that need to be frozen with interrupts
* off, create the hibernation image and thaw those devices. Control
* reappears in this routine after a restore.
*/
static int create_image(int platform_mode)
{
int error;
error = arch_prepare_suspend();
if (error)
return error;
/* At this point, dpm_suspend_start() has been called, but *not*
* dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
* Otherwise, drivers for some devices (e.g. interrupt controllers)
* become desynchronized with the actual state of the hardware
* at resume time, and evil weirdness ensues.
*/
error = dpm_suspend_noirq(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting hibernation\n");
return error;
}
error = platform_pre_snapshot(platform_mode);
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
error = disable_nonboot_cpus();
if (error || hibernation_test(TEST_CPUS)
|| hibernation_testmode(HIBERNATION_TEST))
goto Enable_cpus;
local_irq_disable();
error = sysdev_suspend(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some system devices failed to power down, "
"aborting hibernation\n");
goto Enable_irqs;
}
if (hibernation_test(TEST_CORE))
goto Power_up;
in_suspend = 1;
save_processor_state();
error = swsusp_arch_suspend();
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
/* Restore control flow magically appears here */
restore_processor_state();
if (!in_suspend)
platform_leave(platform_mode);
Power_up:
sysdev_resume();
/* NOTE: dpm_resume_noirq() is just a resume() for devices
* that suspended with irqs off ... no overall powerup.
*/
Enable_irqs:
local_irq_enable();
Enable_cpus:
enable_nonboot_cpus();
Platform_finish:
platform_finish(platform_mode);
dpm_resume_noirq(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
return error;
}
/**
* hibernation_snapshot - quiesce devices and create the hibernation
* snapshot image.
* @platform_mode - if set, use the platform driver, if available, to
* prepare the platform firmware for the power transition.
*
* Must be called with pm_mutex held
*/
int hibernation_snapshot(int platform_mode)
{
int error;
error = platform_begin(platform_mode);
if (error)
return error;
/* Preallocate image memory before shutting down devices. */
error = hibernate_preallocate_memory();
if (error)
goto Close;
suspend_console();
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Recover_platform;
if (hibernation_test(TEST_DEVICES))
goto Recover_platform;
error = create_image(platform_mode);
/* Control returns here after successful restore */
Resume_devices:
/* We may need to release the preallocated image pages here. */
if (error || !in_suspend)
swsusp_free();
dpm_resume_end(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
resume_console();
Close:
platform_end(platform_mode);
return error;
Recover_platform:
platform_recover(platform_mode);
goto Resume_devices;
}
/**
* resume_target_kernel - prepare devices that need to be suspended with
* interrupts off, restore the contents of highmem that have not been
* restored yet from the image and run the low level code that will restore
* the remaining contents of memory and switch to the just restored target
* kernel.
*/
static int resume_target_kernel(bool platform_mode)
{
int error;
error = dpm_suspend_noirq(PMSG_QUIESCE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting resume\n");
return error;
}
error = platform_pre_restore(platform_mode);
if (error)
goto Cleanup;
error = disable_nonboot_cpus();
if (error)
goto Enable_cpus;
local_irq_disable();
error = sysdev_suspend(PMSG_QUIESCE);
if (error)
goto Enable_irqs;
/* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
error = restore_highmem();
if (!error) {
error = swsusp_arch_resume();
/*
* The code below is only ever reached in case of a failure.
* Otherwise execution continues at place where
* swsusp_arch_suspend() was called
*/
BUG_ON(!error);
/* This call to restore_highmem() undos the previous one */
restore_highmem();
}
/*
* The only reason why swsusp_arch_resume() can fail is memory being
* very tight, so we have to free it as soon as we can to avoid
* subsequent failures
*/
swsusp_free();
restore_processor_state();
touch_softlockup_watchdog();
sysdev_resume();
Enable_irqs:
local_irq_enable();
Enable_cpus:
enable_nonboot_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
dpm_resume_noirq(PMSG_RECOVER);
return error;
}
/**
* hibernation_restore - quiesce devices and restore the hibernation
* snapshot image. If successful, control returns in hibernation_snaphot()
* @platform_mode - if set, use the platform driver, if available, to
* prepare the platform firmware for the transition.
*
* Must be called with pm_mutex held
*/
int hibernation_restore(int platform_mode)
{
int error;
pm_prepare_console();
suspend_console();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
dpm_resume_end(PMSG_RECOVER);
}
resume_console();
pm_restore_console();
return error;
}
/**
* hibernation_platform_enter - enter the hibernation state using the
* platform driver (if available)
*/
int hibernation_platform_enter(void)
{
int error;
if (!hibernation_ops)
return -ENOSYS;
/*
* We have cancelled the power transition by running
* hibernation_ops->finish() before saving the image, so we should let
* the firmware know that we're going to enter the sleep state after all
*/
error = hibernation_ops->begin();
if (error)
goto Close;
entering_platform_hibernation = true;
suspend_console();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
hibernation_ops->recover();
goto Resume_devices;
}
error = dpm_suspend_noirq(PMSG_HIBERNATE);
if (error)
goto Resume_devices;
error = hibernation_ops->prepare();
if (error)
goto Platform_finish;
error = disable_nonboot_cpus();
if (error)
goto Platform_finish;
local_irq_disable();
sysdev_suspend(PMSG_HIBERNATE);
hibernation_ops->enter();
/* We should never get here */
while (1);
/*
* We don't need to reenable the nonboot CPUs or resume consoles, since
* the system is going to be halted anyway.
*/
Platform_finish:
hibernation_ops->finish();
dpm_suspend_noirq(PMSG_RESTORE);
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
resume_console();
Close:
hibernation_ops->end();
return error;
}
/**
* power_down - Shut the machine down for hibernation.
*
* Use the platform driver, if configured so; otherwise try
* to power off or reboot.
*/
static void power_down(void)
{
switch (hibernation_mode) {
case HIBERNATION_TEST:
case HIBERNATION_TESTPROC:
break;
case HIBERNATION_REBOOT:
kernel_restart(NULL);
break;
case HIBERNATION_PLATFORM:
hibernation_platform_enter();
case HIBERNATION_SHUTDOWN:
kernel_power_off();
break;
}
kernel_halt();
/*
* Valid image is on the disk, if we continue we risk serious data
* corruption after resume.
*/
printk(KERN_CRIT "PM: Please power down manually\n");
while(1);
}
static int prepare_processes(void)
{
int error = 0;
if (freeze_processes()) {
error = -EBUSY;
thaw_processes();
}
return error;
}
/**
* hibernate - The grandpappy of the built-in hibernation management
*/
int hibernate(void)
{
int error;
mutex_lock(&pm_mutex);
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
error = -EBUSY;
goto Unlock;
}
pm_prepare_console();
error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
if (error)
goto Exit;
error = usermodehelper_disable();
if (error)
goto Exit;
/* Allocate memory management structures */
error = create_basic_memory_bitmaps();
if (error)
goto Exit;
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
error = prepare_processes();
if (error)
goto Finish;
if (hibernation_test(TEST_FREEZER))
goto Thaw;
if (hibernation_testmode(HIBERNATION_TESTPROC))
goto Thaw;
error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
if (error)
goto Thaw;
if (in_suspend) {
unsigned int flags = 0;
if (hibernation_mode == HIBERNATION_PLATFORM)
flags |= SF_PLATFORM_MODE;
pr_debug("PM: writing image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error)
power_down();
} else {
pr_debug("PM: Image restored successfully.\n");
}
Thaw:
thaw_processes();
Finish:
free_basic_memory_bitmaps();
usermodehelper_enable();
Exit:
pm_notifier_call_chain(PM_POST_HIBERNATION);
pm_restore_console();
atomic_inc(&snapshot_device_available);
Unlock:
mutex_unlock(&pm_mutex);
return error;
}
/**
* software_resume - Resume from a saved image.
*
* Called as a late_initcall (so all devices are discovered and
* initialized), we call swsusp to see if we have a saved image or not.
* If so, we quiesce devices, then restore the saved image. We will
* return above (in hibernate() ) if everything goes well.
* Otherwise, we fail gracefully and return to the normally
* scheduled program.
*
*/
static int software_resume(void)
{
int error;
unsigned int flags;
/*
* If the user said "noresume".. bail out early.
*/
if (noresume)
return 0;
/*
* name_to_dev_t() below takes a sysfs buffer mutex when sysfs
* is configured into the kernel. Since the regular hibernate
* trigger path is via sysfs which takes a buffer mutex before
* calling hibernate functions (which take pm_mutex) this can
* cause lockdep to complain about a possible ABBA deadlock
* which cannot happen since we're in the boot code here and
* sysfs can't be invoked yet. Therefore, we use a subclass
* here to avoid lockdep complaining.
*/
mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
if (swsusp_resume_device)
goto Check_image;
if (!strlen(resume_file)) {
error = -ENOENT;
goto Unlock;
}
pr_debug("PM: Checking image partition %s\n", resume_file);
/* Check if the device is there */
swsusp_resume_device = name_to_dev_t(resume_file);
if (!swsusp_resume_device) {
/*
* Some device discovery might still be in progress; we need
* to wait for this to finish.
*/
wait_for_device_probe();
/*
* We can't depend on SCSI devices being available after loading
* one of their modules until scsi_complete_async_scans() is
* called and the resume device usually is a SCSI one.
*/
scsi_complete_async_scans();
swsusp_resume_device = name_to_dev_t(resume_file);
if (!swsusp_resume_device) {
error = -ENODEV;
goto Unlock;
}
}
Check_image:
pr_debug("PM: Resume from partition %d:%d\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
pr_debug("PM: Checking hibernation image.\n");
error = swsusp_check();
if (error)
goto Unlock;
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
error = -EBUSY;
swsusp_close(FMODE_READ);
goto Unlock;
}
pm_prepare_console();
error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
if (error)
goto close_finish;
error = usermodehelper_disable();
if (error)
goto close_finish;
error = create_basic_memory_bitmaps();
if (error)
goto close_finish;
pr_debug("PM: Preparing processes for restore.\n");
error = prepare_processes();
if (error) {
swsusp_close(FMODE_READ);
goto Done;
}
pr_debug("PM: Reading hibernation image.\n");
error = swsusp_read(&flags);
swsusp_close(FMODE_READ);
if (!error)
hibernation_restore(flags & SF_PLATFORM_MODE);
printk(KERN_ERR "PM: Restore failed, recovering.\n");
swsusp_free();
thaw_processes();
Done:
free_basic_memory_bitmaps();
usermodehelper_enable();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
pm_restore_console();
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
mutex_unlock(&pm_mutex);
pr_debug("PM: Resume from disk failed.\n");
return error;
close_finish:
swsusp_close(FMODE_READ);
goto Finish;
}
late_initcall(software_resume);
static const char * const hibernation_modes[] = {
[HIBERNATION_PLATFORM] = "platform",
[HIBERNATION_SHUTDOWN] = "shutdown",
[HIBERNATION_REBOOT] = "reboot",
[HIBERNATION_TEST] = "test",
[HIBERNATION_TESTPROC] = "testproc",
};
/**
* disk - Control hibernation mode
*
* Suspend-to-disk can be handled in several ways. We have a few options
* for putting the system to sleep - using the platform driver (e.g. ACPI
* or other hibernation_ops), powering off the system or rebooting the
* system (for testing) as well as the two test modes.
*
* The system can support 'platform', and that is known a priori (and
* encoded by the presence of hibernation_ops). However, the user may
* choose 'shutdown' or 'reboot' as alternatives, as well as one of the
* test modes, 'test' or 'testproc'.
*
* show() will display what the mode is currently set to.
* store() will accept one of
*
* 'platform'
* 'shutdown'
* 'reboot'
* 'test'
* 'testproc'
*
* It will only change to 'platform' if the system
* supports it (as determined by having hibernation_ops).
*/
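/*
 * Illustrative user-space view (not part of the original file), assuming
 * a platform that has registered hibernation_ops:
 *
 *   # cat /sys/power/disk
 *   [platform] test testproc shutdown reboot
 *   # echo shutdown > /sys/power/disk
 *   # echo disk > /sys/power/state
 *
 * The bracketed word marks the current mode as printed by disk_show()
 * below; disk_store() accepts any of the listed words, and writing
 * "disk" to /sys/power/state (see main.c) then starts hibernate().
 */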
static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
int i;
char *start = buf;
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (!hibernation_modes[i])
continue;
switch (i) {
case HIBERNATION_SHUTDOWN:
case HIBERNATION_REBOOT:
case HIBERNATION_TEST:
case HIBERNATION_TESTPROC:
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
break;
/* not a valid mode, continue with loop */
continue;
}
if (i == hibernation_mode)
buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
else
buf += sprintf(buf, "%s ", hibernation_modes[i]);
}
buf += sprintf(buf, "\n");
return buf-start;
}
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
int error = 0;
int i;
int len;
char *p;
int mode = HIBERNATION_INVALID;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
mutex_lock(&pm_mutex);
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (len == strlen(hibernation_modes[i])
&& !strncmp(buf, hibernation_modes[i], len)) {
mode = i;
break;
}
}
if (mode != HIBERNATION_INVALID) {
switch (mode) {
case HIBERNATION_SHUTDOWN:
case HIBERNATION_REBOOT:
case HIBERNATION_TEST:
case HIBERNATION_TESTPROC:
hibernation_mode = mode;
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
hibernation_mode = mode;
else
error = -EINVAL;
}
} else
error = -EINVAL;
if (!error)
pr_debug("PM: Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
mutex_unlock(&pm_mutex);
return error ? error : n;
}
power_attr(disk);
static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
MINOR(swsusp_resume_device));
}
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
unsigned int maj, min;
dev_t res;
int ret = -EINVAL;
if (sscanf(buf, "%u:%u", &maj, &min) != 2)
goto out;
res = MKDEV(maj,min);
if (maj != MAJOR(res) || min != MINOR(res))
goto out;
mutex_lock(&pm_mutex);
swsusp_resume_device = res;
mutex_unlock(&pm_mutex);
printk(KERN_INFO "PM: Starting manual resume from disk\n");
noresume = 0;
software_resume();
ret = n;
out:
return ret;
}
power_attr(resume);
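/*
 * Illustrative example (not part of the original file): if the image is
 * on the block device with major:minor 8:3 (typically /dev/sda3), an
 * initramfs can request a manual resume with
 *
 *   echo 8:3 > /sys/power/resume
 *
 * which lands in resume_store() above and kicks off software_resume().
 */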
static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%lu\n", image_size);
}
static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
unsigned long size;
if (sscanf(buf, "%lu", &size) == 1) {
image_size = size;
return n;
}
return -EINVAL;
}
power_attr(image_size);
static struct attribute * g[] = {
&disk_attr.attr,
&resume_attr.attr,
&image_size_attr.attr,
NULL,
};
static struct attribute_group attr_group = {
.attrs = g,
};
static int __init pm_disk_init(void)
{
return sysfs_create_group(power_kobj, &attr_group);
}
core_initcall(pm_disk_init);
static int __init resume_setup(char *str)
{
if (noresume)
return 1;
strncpy( resume_file, str, 255 );
return 1;
}
static int __init resume_offset_setup(char *str)
{
unsigned long long offset;
if (noresume)
return 1;
if (sscanf(str, "%llu", &offset) == 1)
swsusp_resume_block = offset;
return 1;
}
static int __init noresume_setup(char *str)
{
noresume = 1;
return 1;
}
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);

View File

@@ -0,0 +1,135 @@
/*
* linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
*
* Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file is released under the GPLv2.
*/
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/suspend.h>
/*
* Platforms, like ACPI, may want us to save some memory used by them during
* hibernation and to restore the contents of this memory during the subsequent
* resume. The code below implements a mechanism allowing us to do that.
*/
struct nvs_page {
unsigned long phys_start;
unsigned int size;
void *kaddr;
void *data;
struct list_head node;
};
static LIST_HEAD(nvs_list);
/**
* hibernate_nvs_register - register platform NVS memory region to save
* @start - physical address of the region
* @size - size of the region
*
* The NVS region need not be page-aligned (both ends) and we arrange
* things so that the data from page-aligned addresses in this region will
* be copied into separate RAM pages.
*/
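/*
 * Worked example (illustrative, not in the original source): with 4 KiB
 * pages, a call such as
 *
 *   hibernate_nvs_register(0x1f80, 0x100);
 *
 * crosses one page boundary, so the loop below creates two nvs_page
 * entries, 0x1f80..0x1fff (0x80 bytes) and 0x2000..0x207f (0x80 bytes),
 * each later backed by its own RAM page from hibernate_nvs_alloc().
 */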
int hibernate_nvs_register(unsigned long start, unsigned long size)
{
struct nvs_page *entry, *next;
while (size > 0) {
unsigned int nr_bytes;
entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
if (!entry)
goto Error;
list_add_tail(&entry->node, &nvs_list);
entry->phys_start = start;
nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
entry->size = (size < nr_bytes) ? size : nr_bytes;
start += entry->size;
size -= entry->size;
}
return 0;
Error:
list_for_each_entry_safe(entry, next, &nvs_list, node) {
list_del(&entry->node);
kfree(entry);
}
return -ENOMEM;
}
/**
* hibernate_nvs_free - free data pages allocated for saving NVS regions
*/
void hibernate_nvs_free(void)
{
struct nvs_page *entry;
list_for_each_entry(entry, &nvs_list, node)
if (entry->data) {
free_page((unsigned long)entry->data);
entry->data = NULL;
if (entry->kaddr) {
iounmap(entry->kaddr);
entry->kaddr = NULL;
}
}
}
/**
* hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
*/
int hibernate_nvs_alloc(void)
{
struct nvs_page *entry;
list_for_each_entry(entry, &nvs_list, node) {
entry->data = (void *)__get_free_page(GFP_KERNEL);
if (!entry->data) {
hibernate_nvs_free();
return -ENOMEM;
}
}
return 0;
}
/**
* hibernate_nvs_save - save NVS memory regions
*/
void hibernate_nvs_save(void)
{
struct nvs_page *entry;
printk(KERN_INFO "PM: Saving platform NVS memory\n");
list_for_each_entry(entry, &nvs_list, node)
if (entry->data) {
entry->kaddr = ioremap(entry->phys_start, entry->size);
memcpy(entry->data, entry->kaddr, entry->size);
}
}
/**
* hibernate_nvs_restore - restore NVS memory regions
*
* This function is going to be called with interrupts disabled, so it
* cannot iounmap the virtual addresses used to access the NVS region.
*/
void hibernate_nvs_restore(void)
{
struct nvs_page *entry;
printk(KERN_INFO "PM: Restoring platform NVS memory\n");
list_for_each_entry(entry, &nvs_list, node)
if (entry->data)
memcpy(entry->kaddr, entry->data, entry->size);
}

271
kernel/kernel/power/hom.c Normal file
View File

@@ -0,0 +1,271 @@
/*
* kernel/power/hom.c - Hibernation on Memory core functionality.
*
* Copyright (c) 2010 STMicroelectronics
* Author: Francesco M. Virlinzi <francesco.virlinzi@st.com>
*
* This file is released under the GPLv2
*
* This file reuses most of the code used to manage
* system wide suspend and system wide hibernation
* to support a new kind of system wide power operation:
*
* Hibernation on Memory where the
* - Chip is off
* - DRAM is in self-refresh mode
*
*/
#include <linux/hom.h>
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/syscalls.h>
#include <asm-generic/sections.h>
#include "power.h"
static struct platform_hom_ops *hom_ops;
/**
* hom_set_ops - Set the global power method table.
* @ops: Pointer to ops structure.
*/
int hom_set_ops(struct platform_hom_ops *ops)
{
pr_debug("%s\n", __func__);
if (!ops)
return -EINVAL;
mutex_lock(&pm_mutex);
hom_ops = ops;
mutex_unlock(&pm_mutex);
return 0;
}
EXPORT_SYMBOL(hom_set_ops);
static inline int platform_begin(void)
{
if (hom_ops->begin)
return hom_ops->begin();
return 0;
}
static inline int platform_prepare(void)
{
if (hom_ops->prepare)
return hom_ops->prepare();
return 0;
}
#ifndef CONFIG_HMEM_MODE_TEST
static inline int platform_enter(void)
{
if (hom_ops->enter)
return hom_ops->enter();
return 0;
}
#else
static inline int platform_enter(void) { return 0; }
#endif
static inline int platform_complete(void)
{
if (hom_ops->complete)
return hom_ops->complete();
return 0;
}
static inline int platform_end(void)
{
if (hom_ops->end)
return hom_ops->end();
return 0;
}
static int prepare_processes(void)
{
int error = 0;
if (freeze_processes()) {
error = -EBUSY;
thaw_processes();
}
return error;
}
/**
* hom_prepare - Do prep work before entering low-power state.
*
* This is common code that is called for each state that we're entering.
* Run suspend notifiers, allocate a console and stop all processes.
*/
static int hom_prepare(void)
{
int error;
pm_prepare_console();
error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
if (error)
goto Exit;
error = usermodehelper_disable();
if (error)
goto Exit;
pr_info("PM: Syncing filesystems ... ");
sys_sync();
pr_info("done.\n");
error = prepare_processes();
if (!error)
return 0;
usermodehelper_enable();
Exit:
pm_notifier_call_chain(PM_POST_HIBERNATION);
pm_restore_console();
return error;
}
/**
* hibernation_on_memory_enter -
* freeze devices and enter hibernation on memory
*/
static int hibernation_on_memory_enter(void)
{
int error = 0;
unsigned long pe_counter;
if (!hom_ops)
return -ENOSYS;
pr_debug("[STM]:[PM]: platform_begin\n");
error = platform_begin();
if (error)
goto Close;
suspend_console();
pr_debug("[STM]:[PM]: Suspend devices\n");
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Resume_devices;
pr_debug("[STM]:[PM]: Suspend devices (noirq)\n");
error = dpm_suspend_noirq(PMSG_FREEZE);
if (error)
goto Resume_devices_noirq;
error = disable_nonboot_cpus();
if (error)
goto Enable_cpus;
local_irq_disable();
pr_debug("[STM]:[PM]: Suspend sysdevices\n");
error = sysdev_suspend(PMSG_FREEZE);
if (error)
goto Enable_irqs;
pr_debug("[STM]:[PM]: platform_prepare\n");
error = platform_prepare();
if (error) {
pr_err("[STM]:[PM]: platform_prepare has refused the HoM\n");
goto Skip_enter;
}
pr_debug("[STM]:[PM]: platform_enter\n");
pe_counter = preempt_count();
platform_enter();
BUG_ON(pe_counter != preempt_count());
Skip_enter:
pr_debug("[STM]:[PM]: platform_complete\n");
platform_complete();
pr_debug("[STM]:[PM]: Resumed sysdevices\n");
sysdev_resume();
Enable_irqs:
local_irq_enable();
Enable_cpus:
enable_nonboot_cpus();
Resume_devices_noirq:
pr_debug("[STM]:[PM]: Resume devices (noirq)\n");
dpm_resume_noirq(PMSG_RESTORE);
Resume_devices:
pr_debug("[STM]:[PM]: Resume devices\n");
dpm_resume_end(PMSG_RESTORE);
resume_console();
Close:
pr_debug("[STM]:[PM]: platform_end\n");
platform_end();
pr_debug("[STM]:[PM]: exit\n");
return error;
}
/**
* hom_finish - Do final work after leaving hibernation on memory.
*
* Thaw processes, re-enable the usermode helper, run the post-hibernation
* notifiers and restore the console allocated in hom_prepare().
*/
static void hom_finish(void)
{
pr_debug("%s\n", __func__);
thaw_processes();
usermodehelper_enable();
pm_notifier_call_chain(PM_POST_HIBERNATION);
pm_restore_console();
}
static int hom_enter(void)
{
int err;
mutex_lock(&pm_mutex);
err = hom_prepare();
if (err)
goto Finish;
err = hibernation_on_memory_enter();
hom_finish();
Finish:
mutex_unlock(&pm_mutex);
pr_info("[STM]:[PM]: HoM End...\n");
return err;
}
int hibernate_on_memory(void)
{
return hom_enter();
}
EXPORT_SYMBOL(hibernate_on_memory);

256
kernel/kernel/power/main.c Normal file
View File

@@ -0,0 +1,256 @@
/*
* kernel/power/main.c - PM subsystem core functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
*
* This file is released under the GPLv2
*
*/
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/resume-trace.h>
#include <linux/workqueue.h>
#include <linux/hom.h>
#include "power.h"
DEFINE_MUTEX(pm_mutex);
unsigned int pm_flags;
EXPORT_SYMBOL(pm_flags);
#ifdef CONFIG_PM_SLEEP
/* Routines for PM-transition notifications */
static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
int register_pm_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&pm_chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_pm_notifier);
int unregister_pm_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&pm_chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
int pm_notifier_call_chain(unsigned long val)
{
return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
== NOTIFY_BAD) ? -EINVAL : 0;
}
#ifdef CONFIG_PM_DEBUG
int pm_test_level = TEST_NONE;
static const char * const pm_tests[__TEST_AFTER_LAST] = {
[TEST_NONE] = "none",
[TEST_CORE] = "core",
[TEST_CPUS] = "processors",
[TEST_PLATFORM] = "platform",
[TEST_DEVICES] = "devices",
[TEST_FREEZER] = "freezer",
};
static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
int level;
for (level = TEST_FIRST; level <= TEST_MAX; level++)
if (pm_tests[level]) {
if (level == pm_test_level)
s += sprintf(s, "[%s] ", pm_tests[level]);
else
s += sprintf(s, "%s ", pm_tests[level]);
}
if (s != buf)
/* convert the last space to a newline */
*(s-1) = '\n';
return (s - buf);
}
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
const char * const *s;
int level;
char *p;
int len;
int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
mutex_lock(&pm_mutex);
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
pm_test_level = level;
error = 0;
break;
}
mutex_unlock(&pm_mutex);
return error ? error : n;
}
power_attr(pm_test);
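/*
 * Illustrative usage (not part of the original file), with
 * CONFIG_PM_DEBUG enabled:
 *
 *   # cat /sys/power/pm_test
 *   [none] core processors platform devices freezer
 *   # echo devices > /sys/power/pm_test
 *   # echo mem > /sys/power/state
 *
 * suspend_test()/hibernation_test() then stop the transition at the
 * chosen stage, wait five seconds and resume, which helps narrow down
 * where a broken suspend path fails.
 */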
#endif /* CONFIG_PM_DEBUG */
#endif /* CONFIG_PM_SLEEP */
struct kobject *power_kobj;
/**
* state - control system power state.
*
* show() returns what states are supported, which is hard-coded to
* 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
* 'disk' (Suspend-to-Disk).
*
* store() accepts one of those strings, translates it into the
* proper enumerated value, and initiates a suspend transition.
*/
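/*
 * Illustrative usage (not part of the original file), assuming a kernel
 * built with CONFIG_SUSPEND, CONFIG_HIBERNATION and
 * CONFIG_HIBERNATION_ON_MEMORY on a platform whose suspend_ops report
 * both suspend states as valid:
 *
 *   # cat /sys/power/state
 *   standby mem hom disk
 *   # echo mem > /sys/power/state    (suspend to RAM)
 *   # echo hom > /sys/power/state    (hibernation on memory)
 *   # echo disk > /sys/power/state   (hibernate to swap)
 */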
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
#ifdef CONFIG_SUSPEND
int i;
for (i = 0; i < PM_SUSPEND_MAX; i++) {
if (pm_states[i] && valid_state(i))
s += sprintf(s,"%s ", pm_states[i]);
}
#endif
#ifdef CONFIG_HIBERNATION_ON_MEMORY
s += sprintf(s, "%s ", "hom");
#endif
#ifdef CONFIG_HIBERNATION
s += sprintf(s, "%s\n", "disk");
#else
if (s != buf)
/* convert the last space to a newline */
*(s-1) = '\n';
#endif
return (s - buf);
}
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
suspend_state_t state = PM_SUSPEND_STANDBY;
const char * const *s;
#endif
char *p;
int len;
int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
/* First, check if we are requested to hibernate */
if (len == 4 && !strncmp(buf, "disk", len)) {
error = hibernate();
goto Exit;
}
#ifdef CONFIG_HIBERNATION_ON_MEMORY
/* Second, check if we are requesting the hibernate on memory */
if (len == 3 && !strncmp(buf, "hom", len)) {
error = hibernate_on_memory();
goto Exit;
}
#endif
#ifdef CONFIG_SUSPEND
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
break;
}
if (state < PM_SUSPEND_MAX && *s)
error = enter_state(state);
#endif
Exit:
return error ? error : n;
}
power_attr(state);
#ifdef CONFIG_PM_TRACE
int pm_trace_enabled;
static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%d\n", pm_trace_enabled);
}
static ssize_t
pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
int val;
if (sscanf(buf, "%d", &val) == 1) {
pm_trace_enabled = !!val;
return n;
}
return -EINVAL;
}
power_attr(pm_trace);
#endif /* CONFIG_PM_TRACE */
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
&pm_trace_attr.attr,
#endif
#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG)
&pm_test_attr.attr,
#endif
NULL,
};
static struct attribute_group attr_group = {
.attrs = g,
};
#ifdef CONFIG_PM_RUNTIME
struct workqueue_struct *pm_wq;
static int __init pm_start_workqueue(void)
{
pm_wq = create_freezeable_workqueue("pm");
return pm_wq ? 0 : -ENOMEM;
}
#else
static inline int pm_start_workqueue(void) { return 0; }
#endif
static int __init pm_init(void)
{
int error = pm_start_workqueue();
if (error)
return error;
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
return sysfs_create_group(power_kobj, &attr_group);
}
core_initcall(pm_init);

238
kernel/kernel/power/power.h Normal file
View File

@@ -0,0 +1,238 @@
#include <linux/suspend.h>
#include <linux/suspend_ioctls.h>
#include <linux/utsname.h>
#include <linux/freezer.h>
struct swsusp_info {
struct new_utsname uts;
u32 version_code;
unsigned long num_physpages;
int cpus;
unsigned long image_pages;
unsigned long pages;
unsigned long size;
} __attribute__((aligned(PAGE_SIZE)));
#ifdef CONFIG_HIBERNATION
#ifdef CONFIG_ARCH_HIBERNATION_HEADER
/* Maximum size of architecture specific data in a hibernation header */
#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
extern int arch_hibernation_header_restore(void *addr);
static inline int init_header_complete(struct swsusp_info *info)
{
return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
}
static inline char *check_image_kernel(struct swsusp_info *info)
{
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
}
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
*/
#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
/*
* Keep 1 MB of memory free so that device drivers can allocate some pages in
* their .suspend() routines without breaking the suspend to disk.
*/
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
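/*
 * Worked example (illustrative): with 4 KiB pages (PAGE_SHIFT == 12)
 * these evaluate to PAGES_FOR_IO == 1024 pages (4 MB) and
 * SPARE_PAGES == 256 pages (1 MB).
 */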
/* kernel/power/hibernate.c */
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
#endif
extern int pfn_is_nosave(unsigned long);
#define power_attr(_name) \
static struct kobj_attribute _name##_attr = { \
.attr = { \
.name = __stringify(_name), \
.mode = 0644, \
}, \
.show = _name##_show, \
.store = _name##_store, \
}
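/*
 * Example expansion (illustrative): power_attr(disk) declares
 *
 *   static struct kobj_attribute disk_attr = {
 *       .attr = { .name = "disk", .mode = 0644 },
 *       .show = disk_show,
 *       .store = disk_store,
 *   };
 *
 * which hibernate.c and main.c then plug into their attribute groups to
 * create the corresponding /sys/power/<name> files.
 */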
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
extern int in_suspend;
extern dev_t swsusp_resume_device;
extern sector_t swsusp_resume_block;
extern asmlinkage int swsusp_arch_suspend(void);
extern asmlinkage int swsusp_arch_resume(void);
extern int create_basic_memory_bitmaps(void);
extern void free_basic_memory_bitmaps(void);
extern int hibernate_preallocate_memory(void);
/**
* Auxiliary structure used for reading the snapshot image data and
* metadata from and writing them to the list of page backup entries
* (PBEs) which is the main data structure of swsusp.
*
* Using struct snapshot_handle we can transfer the image, including its
* metadata, as a continuous sequence of bytes with the help of
* snapshot_read_next() and snapshot_write_next().
*
* The code that writes the image to a storage or transfers it to
* the user land is required to use snapshot_read_next() for this
* purpose and it should not make any assumptions regarding the internal
* structure of the image. Similarly, the code that reads the image from
* a storage or transfers it from the user land is required to use
* snapshot_write_next().
*
* This may allow us to change the internal structure of the image
* in the future with considerably less effort.
*/
struct snapshot_handle {
loff_t offset; /* number of the last byte ready for reading
* or writing in the sequence
*/
unsigned int cur; /* number of the block of PAGE_SIZE bytes the
* next operation will refer to (ie. current)
*/
unsigned int cur_offset; /* offset with respect to the current
* block (for the next operation)
*/
unsigned int prev; /* number of the block of PAGE_SIZE bytes that
* was the current one previously
*/
void *buffer; /* address of the block to read from
* or write to
*/
unsigned int buf_offset; /* location to read from or write to,
* given as a displacement from 'buffer'
*/
int sync_read; /* Set to one to notify the caller of
* snapshot_write_next() that it may
* need to call wait_on_bio_chain()
*/
};
/* This macro returns the address from/to which the caller of
* snapshot_read_next()/snapshot_write_next() is allowed to
* read/write data after the function returns
*/
#define data_of(handle) ((handle).buffer + (handle).buf_offset)
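/*
 * Sketch of the intended calling pattern (illustrative; the real
 * consumers live in swap.c and user.c). It assumes snapshot_read_next()
 * returns the number of bytes made available, zero at the end of the
 * image and a negative value on error:
 *
 *   struct snapshot_handle handle;
 *   int ret;
 *
 *   memset(&handle, 0, sizeof(handle));
 *   while ((ret = snapshot_read_next(&handle, PAGE_SIZE)) > 0)
 *       store_chunk(data_of(handle), ret);   // hypothetical consumer
 *
 * Restoring an image works the same way with snapshot_write_next()
 * followed by snapshot_write_finalize().
 */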
extern unsigned int snapshot_additional_pages(struct zone *zone);
extern unsigned long snapshot_get_image_size(void);
extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
extern void snapshot_write_finalize(struct snapshot_handle *handle);
extern int snapshot_image_loaded(struct snapshot_handle *handle);
/* If unset, the snapshot device cannot be open. */
extern atomic_t snapshot_device_available;
extern sector_t alloc_swapdev_block(int swap);
extern void free_all_swap_pages(int swap);
extern int swsusp_swap_in_use(void);
/*
* Flags that can be passed from the hibernating kernel to the "boot" kernel in
* the image header.
*/
#define SF_PLATFORM_MODE 1
/* kernel/power/hibernate.c */
extern int swsusp_check(void);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
extern void swsusp_close(fmode_t);
struct timeval;
/* kernel/power/swsusp.c */
extern void swsusp_show_speed(struct timeval *, struct timeval *,
unsigned int, char *);
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
extern const char *const pm_states[];
extern bool valid_state(suspend_state_t state);
extern int suspend_devices_and_enter(suspend_state_t state);
extern int enter_state(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
}
static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
static inline bool valid_state(suspend_state_t state) { return false; }
#endif /* !CONFIG_SUSPEND */
#ifdef CONFIG_PM_TEST_SUSPEND
/* kernel/power/suspend_test.c */
extern void suspend_test_start(void);
extern void suspend_test_finish(const char *label);
#else /* !CONFIG_PM_TEST_SUSPEND */
static inline void suspend_test_start(void) {}
static inline void suspend_test_finish(const char *label) {}
#endif /* !CONFIG_PM_TEST_SUSPEND */
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
extern int pm_notifier_call_chain(unsigned long val);
#endif
#ifdef CONFIG_HIGHMEM
int restore_highmem(void);
#else
static inline unsigned int count_highmem_pages(void) { return 0; }
static inline int restore_highmem(void) { return 0; }
#endif
/*
* Suspend test levels
*/
enum {
/* keep first */
TEST_NONE,
TEST_CORE,
TEST_CPUS,
TEST_PLATFORM,
TEST_DEVICES,
TEST_FREEZER,
/* keep last */
__TEST_AFTER_LAST
};
#define TEST_FIRST TEST_NONE
#define TEST_MAX (__TEST_AFTER_LAST - 1)
extern int pm_test_level;
#ifdef CONFIG_SUSPEND_FREEZER
static inline int suspend_freeze_processes(void)
{
return freeze_processes();
}
static inline void suspend_thaw_processes(void)
{
thaw_processes();
}
#else
static inline int suspend_freeze_processes(void)
{
return 0;
}
static inline void suspend_thaw_processes(void)
{
}
#endif

View File

@@ -0,0 +1,46 @@
/*
* poweroff.c - sysrq handler to gracefully power down machine.
*
* This file is released under the GPL v2
*/
#include <linux/kernel.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/pm.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>
#include <linux/cpumask.h>
/*
* When the user hits Sys-Rq o to power down the machine this is the
* callback we use.
*/
static void do_poweroff(struct work_struct *dummy)
{
kernel_power_off();
}
static DECLARE_WORK(poweroff_work, do_poweroff);
static void handle_poweroff(int key, struct tty_struct *tty)
{
/* run sysrq poweroff on boot cpu */
schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
}
static struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
.help_msg = "powerOff",
.action_msg = "Power Off",
.enable_mask = SYSRQ_ENABLE_BOOT,
};
static int pm_sysrq_init(void)
{
register_sysrq_key('o', &sysrq_poweroff_op);
return 0;
}
subsys_initcall(pm_sysrq_init);
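/*
 * Illustrative trigger paths (not part of the original file): once
 * pm_sysrq_init() has registered the 'o' key above, the handler fires
 * on Alt+SysRq+o from the console, or from user space via
 *
 *   echo o > /proc/sysrq-trigger
 *
 * both of which queue poweroff_work on the first online CPU.
 */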

View File

@@ -0,0 +1,160 @@
/*
* kernel/power/process.c - Functions for starting/stopping processes on
* suspend transitions.
*
* Originally from swsusp.
*/
#undef DEBUG
#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
/*
* Timeout for stopping processes
*/
#define TIMEOUT (20 * HZ)
static inline int freezeable(struct task_struct * p)
{
if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
(p->exit_state != 0))
return 0;
return 1;
}
static int try_to_freeze_tasks(bool sig_only)
{
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
struct timeval start, end;
u64 elapsed_csecs64;
unsigned int elapsed_csecs;
do_gettimeofday(&start);
end_time = jiffies + TIMEOUT;
do {
todo = 0;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
if (frozen(p) || !freezeable(p))
continue;
if (!freeze_task(p, sig_only))
continue;
/*
* Now that we've done set_freeze_flag, don't
* perturb a task in TASK_STOPPED or TASK_TRACED.
* It is "frozen enough". If the task does wake
* up, it will immediately call try_to_freeze.
*/
if (!task_is_stopped_or_traced(p) &&
!freezer_should_skip(p))
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
yield(); /* Yield is okay here */
if (time_after(jiffies, end_time))
break;
} while (todo);
do_gettimeofday(&end);
elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
do_div(elapsed_csecs64, NSEC_PER_SEC / 100);
elapsed_csecs = elapsed_csecs64;
if (todo) {
/* This does not unfreeze processes that are already frozen
* (we have slightly ugly calling convention in that respect,
* and caller must call thaw_processes() if something fails),
* but it cleans up leftover PF_FREEZE requests.
*/
printk("\n");
printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
"(%d tasks refusing to freeze):\n",
elapsed_csecs / 100, elapsed_csecs % 100, todo);
show_state();
read_lock(&tasklist_lock);
do_each_thread(g, p) {
task_lock(p);
if (freezing(p) && !freezer_should_skip(p))
printk(KERN_ERR " %s\n", p->comm);
cancel_freezing(p);
task_unlock(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
} else {
printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
elapsed_csecs % 100);
}
return todo ? -EBUSY : 0;
}
/**
* freeze_processes - tell processes to enter the refrigerator
*/
int freeze_processes(void)
{
int error;
printk("Freezing user space processes ... ");
error = try_to_freeze_tasks(true);
if (error)
goto Exit;
printk("done.\n");
printk("Freezing remaining freezable tasks ... ");
error = try_to_freeze_tasks(false);
if (error)
goto Exit;
printk("done.");
oom_killer_disable();
Exit:
BUG_ON(in_atomic());
printk("\n");
return error;
}
static void thaw_tasks(bool nosig_only)
{
struct task_struct *g, *p;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
if (!freezeable(p))
continue;
if (nosig_only && should_send_signal(p))
continue;
if (cgroup_freezing_or_frozen(p))
continue;
thaw_process(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
}
void thaw_processes(void)
{
oom_killer_enable();
printk("Restarting tasks ... ");
thaw_tasks(true);
thaw_tasks(false);
schedule();
printk("done.\n");
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,300 @@
/*
* kernel/power/suspend.c - Suspend to RAM and standby functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file is released under the GPLv2.
*/
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include "power.h"
const char *const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
static struct platform_suspend_ops *suspend_ops;
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Pointer to ops structure.
*/
void suspend_set_ops(struct platform_suspend_ops *ops)
{
mutex_lock(&pm_mutex);
suspend_ops = ops;
mutex_unlock(&pm_mutex);
}
bool valid_state(suspend_state_t state)
{
/*
* All states need lowlevel support and need to be valid to the lowlevel
* implementation, no valid callback implies that none are valid.
*/
return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
}
/**
* suspend_valid_only_mem - generic memory-only valid callback
*
* Platform drivers that implement mem suspend only and only need
* to check for that in their .valid callback can use this instead
* of rolling their own .valid callback.
*/
int suspend_valid_only_mem(suspend_state_t state)
{
return state == PM_SUSPEND_MEM;
}
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
if (pm_test_level == level) {
printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
mdelay(5000);
return 1;
}
#endif /* !CONFIG_PM_DEBUG */
return 0;
}
/**
* suspend_prepare - Do prep work before entering low-power state.
*
* This is common code that is called for each state that we're entering.
* Run suspend notifiers, allocate a console and stop all processes.
*/
static int suspend_prepare(void)
{
int error;
if (!suspend_ops || !suspend_ops->enter)
return -EPERM;
pm_prepare_console();
error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
if (error)
goto Finish;
error = usermodehelper_disable();
if (error)
goto Finish;
error = suspend_freeze_processes();
if (!error)
return 0;
suspend_thaw_processes();
usermodehelper_enable();
Finish:
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
return error;
}
/* default implementation */
void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
{
local_irq_disable();
}
/* default implementation */
void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
{
local_irq_enable();
}
/**
* suspend_enter - enter the desired system sleep state.
* @state: state to enter
*
* This function should be called after devices have been suspended.
*/
static int suspend_enter(suspend_state_t state)
{
int error;
if (suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
return error;
}
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
goto Platform_finish;
}
if (suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
goto Power_up_devices;
}
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
error = sysdev_suspend(PMSG_SUSPEND);
if (!error) {
if (!suspend_test(TEST_CORE))
error = suspend_ops->enter(state);
sysdev_resume();
}
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
Enable_cpus:
enable_nonboot_cpus();
Platform_wake:
if (suspend_ops->wake)
suspend_ops->wake();
Power_up_devices:
dpm_resume_noirq(PMSG_RESUME);
Platform_finish:
if (suspend_ops->finish)
suspend_ops->finish();
return error;
}
/**
* suspend_devices_and_enter - suspend devices and enter the desired system
* sleep state.
* @state: state to enter
*/
int suspend_devices_and_enter(suspend_state_t state)
{
int error;
if (!suspend_ops)
return -ENOSYS;
if (suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
goto Close;
}
suspend_console();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to suspend\n");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
if (suspend_test(TEST_DEVICES))
goto Recover_platform;
suspend_enter(state);
Resume_devices:
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
resume_console();
Close:
if (suspend_ops->end)
suspend_ops->end();
return error;
Recover_platform:
if (suspend_ops->recover)
suspend_ops->recover();
goto Resume_devices;
}
/**
* suspend_finish - Do final work before exiting suspend sequence.
*
* Call platform code to clean up, restart processes, and free the
* console that we've allocated. This is not called for suspend-to-disk.
*/
static void suspend_finish(void)
{
suspend_thaw_processes();
usermodehelper_enable();
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
}
/**
* enter_state - Do common work of entering low-power state.
* @state: pm_state structure for state we're entering.
*
* Make sure we're the only ones trying to enter a sleep state. Fail
* if someone has beat us to it, since we don't want anything weird to
* happen when we wake up.
* Then, do the setup for suspend, enter the state, and clean up (after
* we've woken up).
*/
int enter_state(suspend_state_t state)
{
int error;
if (!valid_state(state))
return -ENODEV;
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
error = suspend_prepare();
if (error)
goto Unlock;
if (suspend_test(TEST_FREEZER))
goto Finish;
pr_debug("PM: Entering %s sleep\n", pm_states[state]);
error = suspend_devices_and_enter(state);
Finish:
pr_debug("PM: Finishing wakeup.\n");
suspend_finish();
Unlock:
mutex_unlock(&pm_mutex);
return error;
}
/**
* pm_suspend - Externally visible function for suspending system.
* @state: Enumerated value of state to enter.
*
* Determine whether or not value is within range, get state
* structure, and enter (above).
*/
int pm_suspend(suspend_state_t state)
{
if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
return enter_state(state);
return -EINVAL;
}
EXPORT_SYMBOL(pm_suspend);

View File

@@ -0,0 +1,188 @@
/*
* kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
*
* Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
*
* This file is released under the GPLv2.
*/
#include <linux/init.h>
#include <linux/rtc.h>
#include "power.h"
/*
* We test the system suspend code by setting an RTC wakealarm a short
* time in the future, then suspending. Suspending the devices won't
* normally take long ... some systems only need a few milliseconds.
*
* The time it takes is system-specific though, so when we test this
* during system bootup we allow a LOT of time.
*/
#define TEST_SUSPEND_SECONDS 10
static unsigned long suspend_test_start_time;
void suspend_test_start(void)
{
/* FIXME Use better timebase than "jiffies", ideally a clocksource.
* What we want is a hardware counter that will work correctly even
* during the irqs-are-off stages of the suspend/resume cycle...
*/
suspend_test_start_time = jiffies;
}
void suspend_test_finish(const char *label)
{
long nj = jiffies - suspend_test_start_time;
unsigned msec;
msec = jiffies_to_msecs(abs(nj));
pr_info("PM: %s took %d.%03d seconds\n", label,
msec / 1000, msec % 1000);
/* Warning on suspend means the RTC alarm period needs to be
* larger -- the system was sooo slooowwww to suspend that the
* alarm (should have) fired before the system went to sleep!
*
* Warning on either suspend or resume also means the system
* has some performance issues. The stack dump of a WARN_ON
* is more likely to get the right attention than a printk...
*/
WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
"Component: %s, time: %u\n", label, msec);
}
/*
* To test system suspend, we need a hands-off mechanism to resume the
* system. RTCs wake alarms are a common self-contained mechanism.
*/
static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
{
static char err_readtime[] __initdata =
KERN_ERR "PM: can't read %s time, err %d\n";
static char err_wakealarm [] __initdata =
KERN_ERR "PM: can't set %s wakealarm, err %d\n";
static char err_suspend[] __initdata =
KERN_ERR "PM: suspend test failed, error %d\n";
static char info_test[] __initdata =
KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
unsigned long now;
struct rtc_wkalrm alm;
int status;
/* this may fail if the RTC hasn't been initialized */
status = rtc_read_time(rtc, &alm.time);
if (status < 0) {
printk(err_readtime, dev_name(&rtc->dev), status);
return;
}
rtc_tm_to_time(&alm.time, &now);
memset(&alm, 0, sizeof alm);
rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
alm.enabled = true;
status = rtc_set_alarm(rtc, &alm);
if (status < 0) {
printk(err_wakealarm, dev_name(&rtc->dev), status);
return;
}
if (state == PM_SUSPEND_MEM) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status == -ENODEV)
state = PM_SUSPEND_STANDBY;
}
if (state == PM_SUSPEND_STANDBY) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
if (status < 0)
printk(err_suspend, status);
/* Some platforms can't detect that the alarm triggered the
* wakeup, or (accordingly) disable it afterwards.
* It's supposed to give oneshot behavior; cope.
*/
alm.enabled = false;
rtc_set_alarm(rtc, &alm);
}
static int __init has_wakealarm(struct device *dev, void *name_ptr)
{
struct rtc_device *candidate = to_rtc_device(dev);
if (!candidate->ops->set_alarm)
return 0;
if (!device_may_wakeup(candidate->dev.parent))
return 0;
*(const char **)name_ptr = dev_name(dev);
return 1;
}
/*
* Kernel options like "test_suspend=mem" force suspend/resume sanity tests
* at startup time. They're normally disabled, for faster boot and because
* we can't know which states really work on this particular system.
*/
static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
static char warn_bad_state[] __initdata =
KERN_WARNING "PM: can't test '%s' suspend state\n";
static int __init setup_test_suspend(char *value)
{
unsigned i;
/* "=mem" ==> "mem" */
value++;
for (i = 0; i < PM_SUSPEND_MAX; i++) {
if (!pm_states[i])
continue;
if (strcmp(pm_states[i], value) != 0)
continue;
test_state = (__force suspend_state_t) i;
return 0;
}
printk(warn_bad_state, value);
return 0;
}
__setup("test_suspend", setup_test_suspend);
static int __init test_suspend(void)
{
static char warn_no_rtc[] __initdata =
KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
char *pony = NULL;
struct rtc_device *rtc = NULL;
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
goto done;
if (!valid_state(test_state)) {
printk(warn_bad_state, pm_states[test_state]);
goto done;
}
/* RTCs have initialized by now too ... can we use one? */
class_find_device(rtc_class, NULL, &pony, has_wakealarm);
if (pony)
rtc = rtc_class_open(pony);
if (!rtc) {
printk(warn_no_rtc);
goto done;
}
/* go for it */
test_wakealarm(rtc, test_state);
rtc_class_close(rtc);
done:
return 0;
}
late_initcall(test_suspend);

646
kernel/kernel/power/swap.c Normal file
View File

@@ -0,0 +1,646 @@
/*
* linux/kernel/power/swap.c
*
* This file provides functions for reading the suspend image from
* and writing it to a swap partition.
*
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*
* This file is released under the GPLv2.
*
*/
#include <linux/module.h>
#include <linux/file.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/genhd.h>
#include <linux/device.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pm.h>
#include "power.h"
#define SWSUSP_SIG "S1SUSPEND"
struct swsusp_header {
char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
sector_t image;
unsigned int flags; /* Flags to pass to the "boot" kernel */
char orig_sig[10];
char sig[10];
} __attribute__((packed));
static struct swsusp_header *swsusp_header;
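/* Illustrative user-space sketch (not kernel code): the reserved[] padding is
 * sized so the whole header is exactly one page and sig lands in the last 10
 * bytes, on top of the regular "SWAP-SPACE"/"SWAPSPACE2" swap signature.
 * DEMO_PAGE_SIZE and demo_sector_t are mocks for a stand-alone check. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#define DEMO_PAGE_SIZE 4096
typedef uint64_t demo_sector_t;
struct demo_swsusp_header {
	char reserved[DEMO_PAGE_SIZE - 20 - sizeof(demo_sector_t) - sizeof(int)];
	demo_sector_t image;
	unsigned int flags;
	char orig_sig[10];
	char sig[10];
} __attribute__((packed));
int main(void)
{
	assert(sizeof(struct demo_swsusp_header) == DEMO_PAGE_SIZE);
	assert(offsetof(struct demo_swsusp_header, sig) == DEMO_PAGE_SIZE - 10);
	printf("sig occupies the last 10 bytes of the page\n");
	return 0;
}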
/*
* General things
*/
static unsigned short root_swap = 0xffff;
static struct block_device *resume_bdev;
/**
* submit - submit BIO request.
* @rw: READ or WRITE.
* @page_off: physical offset of the page.
* @page: page we're reading or writing.
* @bio_chain: list of pending bios (for async reading)
*
* Straight from the textbook - allocate and initialize the bio.
* If we're reading, make sure the page is marked as dirty.
* Then submit it and, if @bio_chain == NULL, wait.
*/
static int submit(int rw, pgoff_t page_off, struct page *page,
struct bio **bio_chain)
{
const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
struct bio *bio;
bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
bio->bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = resume_bdev;
bio->bi_end_io = end_swap_bio_read;
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
page_off);
bio_put(bio);
return -EFAULT;
}
lock_page(page);
bio_get(bio);
if (bio_chain == NULL) {
submit_bio(bio_rw, bio);
wait_on_page_locked(page);
if (rw == READ)
bio_set_pages_dirty(bio);
bio_put(bio);
} else {
if (rw == READ)
get_page(page); /* These pages are freed later */
bio->bi_private = *bio_chain;
*bio_chain = bio;
submit_bio(bio_rw, bio);
}
return 0;
}
static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
{
return submit(READ, page_off, virt_to_page(addr), bio_chain);
}
static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
{
return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
}
static int wait_on_bio_chain(struct bio **bio_chain)
{
struct bio *bio;
struct bio *next_bio;
int ret = 0;
if (bio_chain == NULL)
return 0;
bio = *bio_chain;
if (bio == NULL)
return 0;
while (bio) {
struct page *page;
next_bio = bio->bi_private;
page = bio->bi_io_vec[0].bv_page;
wait_on_page_locked(page);
if (!PageUptodate(page) || PageError(page))
ret = -EIO;
put_page(page);
bio_put(bio);
bio = next_bio;
}
*bio_chain = NULL;
return ret;
}
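/* Illustrative user-space sketch (not kernel code): the async path links every
 * submitted request into a singly linked chain through a "private" pointer,
 * the way submit() uses bio->bi_private, and drains the whole chain afterwards
 * like wait_on_bio_chain(). struct demo_req stands in for struct bio. */
#include <stdio.h>
#include <stdlib.h>
struct demo_req {
	int id;
	struct demo_req *private;	/* plays the role of bio->bi_private */
};
static void submit_demo(int id, struct demo_req **chain)
{
	struct demo_req *req = malloc(sizeof(*req));
	req->id = id;
	req->private = *chain;		/* push onto the pending chain */
	*chain = req;
}
static void drain_demo(struct demo_req **chain)
{
	struct demo_req *req = *chain;
	while (req) {
		struct demo_req *next = req->private;
		printf("completing request %d\n", req->id);	/* ~wait_on_page_locked */
		free(req);
		req = next;
	}
	*chain = NULL;
}
int main(void)
{
	struct demo_req *chain = NULL;
	int i;
	for (i = 0; i < 3; i++)
		submit_demo(i, &chain);
	drain_demo(&chain);	/* requests complete in LIFO order: 2, 1, 0 */
	return 0;
}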
/*
* Saving part
*/
static int mark_swapfiles(sector_t start, unsigned int flags)
{
int error;
bio_read_page(swsusp_resume_block, swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
swsusp_header->image = start;
swsusp_header->flags = flags;
error = bio_write_page(swsusp_resume_block,
swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Swap header not found!\n");
error = -ENODEV;
}
return error;
}
/**
* swsusp_swap_check - check if the resume device is a swap device
* and get its index (if so)
*/
static int swsusp_swap_check(void) /* This is called before saving image */
{
int res;
res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
&resume_bdev);
if (res < 0)
return res;
root_swap = res;
res = blkdev_get(resume_bdev, FMODE_WRITE);
if (res)
return res;
res = set_blocksize(resume_bdev, PAGE_SIZE);
if (res < 0)
blkdev_put(resume_bdev, FMODE_WRITE);
return res;
}
/**
* write_page - Write one page to given swap location.
* @buf: Address we're writing.
* @offset: Offset of the swap page we're writing to.
* @bio_chain: Link the next write BIO here
*/
static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
{
void *src;
if (!offset)
return -ENOSPC;
if (bio_chain) {
src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
if (src) {
memcpy(src, buf, PAGE_SIZE);
} else {
WARN_ON_ONCE(1);
bio_chain = NULL; /* Go synchronous */
src = buf;
}
} else {
src = buf;
}
return bio_write_page(offset, src, bio_chain);
}
/*
* The swap map is a data structure used for keeping track of each page
* written to a swap partition. It consists of many swap_map_page
* structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
* These structures are stored on the swap and linked together with the
* help of the .next_swap member.
*
* The swap map is created during suspend. The swap map pages are
* allocated and populated one at a time, so we only need one memory
* page to set up the entire structure.
*
* During resume we also only need to use one swap_map_page structure
* at a time.
*/
#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
struct swap_map_page {
sector_t entries[MAP_PAGE_ENTRIES];
sector_t next_swap;
};
/**
* The swap_map_handle structure is used for handling swap in
* a file-like way
*/
struct swap_map_handle {
struct swap_map_page *cur;
sector_t cur_swap;
unsigned int k;
};
static void release_swap_writer(struct swap_map_handle *handle)
{
if (handle->cur)
free_page((unsigned long)handle->cur);
handle->cur = NULL;
}
static int get_swap_writer(struct swap_map_handle *handle)
{
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
if (!handle->cur)
return -ENOMEM;
handle->cur_swap = alloc_swapdev_block(root_swap);
if (!handle->cur_swap) {
release_swap_writer(handle);
return -ENOSPC;
}
handle->k = 0;
return 0;
}
static int swap_write_page(struct swap_map_handle *handle, void *buf,
struct bio **bio_chain)
{
int error = 0;
sector_t offset;
if (!handle->cur)
return -EINVAL;
offset = alloc_swapdev_block(root_swap);
error = write_page(buf, offset, bio_chain);
if (error)
return error;
handle->cur->entries[handle->k++] = offset;
if (handle->k >= MAP_PAGE_ENTRIES) {
error = wait_on_bio_chain(bio_chain);
if (error)
goto out;
offset = alloc_swapdev_block(root_swap);
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
error = write_page(handle->cur, handle->cur_swap, NULL);
if (error)
goto out;
memset(handle->cur, 0, PAGE_SIZE);
handle->cur_swap = offset;
handle->k = 0;
}
out:
return error;
}
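/* Illustrative user-space sketch (not kernel code): how data-page sectors are
 * collected into swap_map_page entries and chained through next_swap once a
 * map page fills up. Sector numbers are fake and handed out by a counter; the
 * on-disk writes are replaced by printf(). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#define DEMO_ENTRIES 4			/* stands in for MAP_PAGE_ENTRIES */
struct demo_map_page {
	uint64_t entries[DEMO_ENTRIES];
	uint64_t next_swap;
};
static uint64_t next_sector = 100;	/* fake alloc_swapdev_block() */
int main(void)
{
	struct demo_map_page map;
	uint64_t map_sector = next_sector++;
	unsigned k = 0, page;
	memset(&map, 0, sizeof(map));
	for (page = 0; page < 10; page++) {
		map.entries[k++] = next_sector++;	/* record where the data page went */
		if (k == DEMO_ENTRIES) {
			map.next_swap = next_sector++;	/* reserve the next map page */
			printf("map page at sector %llu full, next map at %llu\n",
			       (unsigned long long)map_sector,
			       (unsigned long long)map.next_swap);
			map_sector = map.next_swap;
			memset(&map, 0, sizeof(map));
			k = 0;
		}
	}
	printf("last map page at sector %llu holds %u entries\n",
	       (unsigned long long)map_sector, k);
	return 0;
}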
static int flush_swap_writer(struct swap_map_handle *handle)
{
if (handle->cur && handle->cur_swap)
return write_page(handle->cur, handle->cur_swap, NULL);
else
return -EINVAL;
}
/**
* save_image - save the suspend image data
*/
static int save_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
unsigned int nr_to_write)
{
unsigned int m;
int ret;
int nr_pages;
int err2;
struct bio *bio;
struct timeval start;
struct timeval stop;
printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ",
nr_to_write);
m = nr_to_write / 100;
if (!m)
m = 1;
nr_pages = 0;
bio = NULL;
do_gettimeofday(&start);
while (1) {
ret = snapshot_read_next(snapshot, PAGE_SIZE);
if (ret <= 0)
break;
ret = swap_write_page(handle, data_of(*snapshot), &bio);
if (ret)
break;
if (!(nr_pages % m))
printk("\b\b\b\b%3d%%", nr_pages / m);
nr_pages++;
}
err2 = wait_on_bio_chain(&bio);
do_gettimeofday(&stop);
if (!ret)
ret = err2;
if (!ret)
printk("\b\b\b\bdone\n");
else
printk("\n");
swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
return ret;
}
/**
* enough_swap - Make sure we have enough swap to save the image.
*
* Returns TRUE or FALSE after checking the total amount of swap
* space available from the resume partition.
*/
static int enough_swap(unsigned int nr_pages)
{
unsigned int free_swap = count_swap_pages(root_swap, 1);
pr_debug("PM: Free swap pages: %u\n", free_swap);
return free_swap > nr_pages + PAGES_FOR_IO;
}
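/* Illustrative sketch (not kernel code): the headroom rule used by
 * enough_swap() - the image only proceeds if free swap covers the image pages
 * plus a reserve for in-flight I/O. DEMO_PAGES_FOR_IO is a mock value, not
 * the kernel's PAGES_FOR_IO. */
#include <stdbool.h>
#include <stdio.h>
#define DEMO_PAGES_FOR_IO 1024
static bool demo_enough_swap(unsigned free_swap, unsigned nr_pages)
{
	return free_swap > nr_pages + DEMO_PAGES_FOR_IO;
}
int main(void)
{
	printf("%d\n", demo_enough_swap(300000, 250000));	/* 1: plenty of room */
	printf("%d\n", demo_enough_swap(250500, 250000));	/* 0: reserve not met */
	return 0;
}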
/**
* swsusp_write - Write entire image and metadata.
* @flags: flags to pass to the "boot" kernel in the image header
*
* It is important _NOT_ to unmount filesystems at this point. We want
* them synced (in case something goes wrong), but we DO NOT want to mark
* the filesystems clean: they are not. (And it does not matter: if we
* resume correctly, we'll mark the system clean anyway.)
*/
int swsusp_write(unsigned int flags)
{
struct swap_map_handle handle;
struct snapshot_handle snapshot;
struct swsusp_info *header;
int error;
error = swsusp_swap_check();
if (error) {
printk(KERN_ERR "PM: Cannot find swap device, try "
"swapon -a.\n");
return error;
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_read_next(&snapshot, PAGE_SIZE);
if (error < PAGE_SIZE) {
if (error >= 0)
error = -EFAULT;
goto out;
}
header = (struct swsusp_info *)data_of(snapshot);
if (!enough_swap(header->pages)) {
printk(KERN_ERR "PM: Not enough free swap\n");
error = -ENOSPC;
goto out;
}
error = get_swap_writer(&handle);
if (!error) {
sector_t start = handle.cur_swap;
error = swap_write_page(&handle, header, NULL);
if (!error)
error = save_image(&handle, &snapshot,
header->pages - 1);
if (!error) {
flush_swap_writer(&handle);
printk(KERN_INFO "PM: S");
error = mark_swapfiles(start, flags);
printk("|\n");
}
}
if (error)
free_all_swap_pages(root_swap);
release_swap_writer(&handle);
out:
swsusp_close(FMODE_WRITE);
return error;
}
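/* Illustrative user-space sketch (not kernel code): swsusp_write() writes
 * every image page first and flips the on-disk signature last (via
 * mark_swapfiles()), so an interrupted write leaves the old signature intact.
 * The same "payload first, magic last" ordering is shown here with a regular
 * file; "demo.img" and the magic string are made up. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
int main(void)
{
	char payload[4096];
	const char magic[10] = "S1DEMOSIG";
	int fd = open("demo.img", O_CREAT | O_WRONLY, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(payload, 0xab, sizeof(payload));
	/* 1. write the payload, leaving the header block alone */
	pwrite(fd, payload, sizeof(payload), 4096);
	fsync(fd);		/* make sure the data is durable first */
	/* 2. only now commit the magic that marks the image as valid */
	pwrite(fd, magic, sizeof(magic), 0);
	fsync(fd);
	close(fd);
	printf("image committed\n");
	return 0;
}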
/**
* The following functions allow us to read data using a swap map
* in a file-like way
*/
static void release_swap_reader(struct swap_map_handle *handle)
{
if (handle->cur)
free_page((unsigned long)handle->cur);
handle->cur = NULL;
}
static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
{
int error;
if (!start)
return -EINVAL;
handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
if (!handle->cur)
return -ENOMEM;
error = bio_read_page(start, handle->cur, NULL);
if (error) {
release_swap_reader(handle);
return error;
}
handle->k = 0;
return 0;
}
static int swap_read_page(struct swap_map_handle *handle, void *buf,
struct bio **bio_chain)
{
sector_t offset;
int error;
if (!handle->cur)
return -EINVAL;
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
error = bio_read_page(offset, buf, bio_chain);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
error = wait_on_bio_chain(bio_chain);
handle->k = 0;
offset = handle->cur->next_swap;
if (!offset)
release_swap_reader(handle);
else if (!error)
error = bio_read_page(offset, handle->cur, NULL);
}
return error;
}
/**
* load_image - load the image using the swap map handle
* @handle and the snapshot handle @snapshot
* (assume there are @nr_to_read pages to load)
*/
static int load_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
unsigned int nr_to_read)
{
unsigned int m;
int error = 0;
struct timeval start;
struct timeval stop;
struct bio *bio;
int err2;
unsigned nr_pages;
printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ",
nr_to_read);
m = nr_to_read / 100;
if (!m)
m = 1;
nr_pages = 0;
bio = NULL;
do_gettimeofday(&start);
for ( ; ; ) {
error = snapshot_write_next(snapshot, PAGE_SIZE);
if (error <= 0)
break;
error = swap_read_page(handle, data_of(*snapshot), &bio);
if (error)
break;
if (snapshot->sync_read)
error = wait_on_bio_chain(&bio);
if (error)
break;
if (!(nr_pages % m))
printk("\b\b\b\b%3d%%", nr_pages / m);
nr_pages++;
}
err2 = wait_on_bio_chain(&bio);
do_gettimeofday(&stop);
if (!error)
error = err2;
if (!error) {
printk("\b\b\b\bdone\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
error = -ENODATA;
} else
printk("\n");
swsusp_show_speed(&start, &stop, nr_to_read, "Read");
return error;
}
/**
* swsusp_read - read the hibernation image.
* @flags_p: flags passed by the "frozen" kernel in the image header should
* be written into this memory location
*/
int swsusp_read(unsigned int *flags_p)
{
int error;
struct swap_map_handle handle;
struct snapshot_handle snapshot;
struct swsusp_info *header;
*flags_p = swsusp_header->flags;
if (IS_ERR(resume_bdev)) {
pr_debug("PM: Image device not initialised\n");
return PTR_ERR(resume_bdev);
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_write_next(&snapshot, PAGE_SIZE);
if (error < PAGE_SIZE)
return error < 0 ? error : -EFAULT;
header = (struct swsusp_info *)data_of(snapshot);
error = get_swap_reader(&handle, swsusp_header->image);
if (!error)
error = swap_read_page(&handle, header, NULL);
if (!error)
error = load_image(&handle, &snapshot, header->pages - 1);
release_swap_reader(&handle);
if (!error)
pr_debug("PM: Image successfully loaded\n");
else
pr_debug("PM: Error %d resuming\n", error);
return error;
}
/**
* swsusp_check - Check for swsusp signature in the resume device
*/
int swsusp_check(void)
{
int error;
resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
if (!IS_ERR(resume_bdev)) {
set_blocksize(resume_bdev, PAGE_SIZE);
memset(swsusp_header, 0, PAGE_SIZE);
error = bio_read_page(swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
error = bio_write_page(swsusp_resume_block,
swsusp_header, NULL);
} else {
error = -EINVAL;
}
put:
if (error)
blkdev_put(resume_bdev, FMODE_READ);
else
pr_debug("PM: Signature found, resuming\n");
} else {
error = PTR_ERR(resume_bdev);
}
if (error)
pr_debug("PM: Error %d checking image file\n", error);
return error;
}
/**
* swsusp_close - close swap device.
*/
void swsusp_close(fmode_t mode)
{
if (IS_ERR(resume_bdev)) {
pr_debug("PM: Image device not initialised\n");
return;
}
blkdev_put(resume_bdev, mode);
}
static int swsusp_header_init(void)
{
swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
if (!swsusp_header)
panic("Could not allocate memory for swsusp_header\n");
return 0;
}
core_initcall(swsusp_header_init);

188
kernel/kernel/power/swsusp.c Normal file
View File

@@ -0,0 +1,188 @@
/*
* linux/kernel/power/swsusp.c
*
* This file provides code to write suspend image to swap and read it back.
*
* Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
*
* This file is released under the GPLv2.
*
* I'd like to thank the following people for their work:
*
* Pavel Machek <pavel@ucw.cz>:
* Modifications, defectiveness pointing, being with me at the very beginning,
* suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
*
* Steve Doddi <dirk@loth.demon.co.uk>:
* Support the possibility of hardware state restoring.
*
* Raph <grey.havens@earthling.net>:
* Support for preserving states of network devices and virtual console
* (including X and svgatextmode)
*
* Kurt Garloff <garloff@suse.de>:
* Straightened the critical function in order to prevent compilers from
* playing tricks with local variables.
*
* Andreas Mohr <a.mohr@mailto.de>
*
* Alex Badea <vampire@go.ro>:
* Fixed runaway init
*
* Rafael J. Wysocki <rjw@sisk.pl>
* Reworked the freeing of memory and the handling of swap
*
* More state savers are welcome. Especially for the scsi layer...
*
* For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
*/
#include <linux/mm.h>
#include <linux/suspend.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/swap.h>
#include <linux/pm.h>
#include <linux/swapops.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/rbtree.h>
#include <linux/io.h>
#include "power.h"
int in_suspend __nosavedata = 0;
/**
* The following functions are used for tracing the allocated
* swap pages, so that they can be freed in case of an error.
*/
struct swsusp_extent {
struct rb_node node;
unsigned long start;
unsigned long end;
};
static struct rb_root swsusp_extents = RB_ROOT;
static int swsusp_extents_insert(unsigned long swap_offset)
{
struct rb_node **new = &(swsusp_extents.rb_node);
struct rb_node *parent = NULL;
struct swsusp_extent *ext;
/* Figure out where to put the new node */
while (*new) {
ext = container_of(*new, struct swsusp_extent, node);
parent = *new;
if (swap_offset < ext->start) {
/* Try to merge */
if (swap_offset == ext->start - 1) {
ext->start--;
return 0;
}
new = &((*new)->rb_left);
} else if (swap_offset > ext->end) {
/* Try to merge */
if (swap_offset == ext->end + 1) {
ext->end++;
return 0;
}
new = &((*new)->rb_right);
} else {
/* It already is in the tree */
return -EINVAL;
}
}
/* Add the new node and rebalance the tree. */
ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
if (!ext)
return -ENOMEM;
ext->start = swap_offset;
ext->end = swap_offset;
rb_link_node(&ext->node, parent, new);
rb_insert_color(&ext->node, &swsusp_extents);
return 0;
}
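/* Illustrative sketch (not kernel code): the extent merge rule used above.
 * Only a single extent is tracked here instead of an rb-tree, which is enough
 * to show when an offset extends an existing [start, end] range and when it
 * would need a new node (or is rejected as a duplicate). */
#include <stdio.h>
struct demo_extent {
	unsigned long start, end;
};
/* 0 = merged, 1 = needs a new extent, -1 = already covered */
static int demo_insert(struct demo_extent *ext, unsigned long offset)
{
	if (offset == ext->start - 1) {
		ext->start--;
		return 0;
	}
	if (offset == ext->end + 1) {
		ext->end++;
		return 0;
	}
	if (offset >= ext->start && offset <= ext->end)
		return -1;
	return 1;
}
int main(void)
{
	struct demo_extent ext = { .start = 100, .end = 100 };
	unsigned long probes[] = { 101, 102, 99, 102, 200 };
	unsigned i;
	for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
		printf("offset %lu -> %d, extent now [%lu, %lu]\n",
		       probes[i], demo_insert(&ext, probes[i]),
		       ext.start, ext.end);
	return 0;
}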
/**
* alloc_swapdev_block - allocate a swap page and register that it has
* been allocated, so that it can be freed in case of an error.
*/
sector_t alloc_swapdev_block(int swap)
{
unsigned long offset;
offset = swp_offset(get_swap_page_of_type(swap));
if (offset) {
if (swsusp_extents_insert(offset))
swap_free(swp_entry(swap, offset));
else
return swapdev_block(swap, offset);
}
return 0;
}
/**
* free_all_swap_pages - free swap pages allocated for saving image data.
* It also frees the extents used to register which swap entries had been
* allocated.
*/
void free_all_swap_pages(int swap)
{
struct rb_node *node;
while ((node = swsusp_extents.rb_node)) {
struct swsusp_extent *ext;
unsigned long offset;
ext = container_of(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
for (offset = ext->start; offset <= ext->end; offset++)
swap_free(swp_entry(swap, offset));
kfree(ext);
}
}
int swsusp_swap_in_use(void)
{
return (swsusp_extents.rb_node != NULL);
}
/**
* swsusp_show_speed - print the time elapsed between two events represented by
* @start and @stop
*
* @nr_pages - number of pages processed between @start and @stop
* @msg - introductory message to print
*/
void swsusp_show_speed(struct timeval *start, struct timeval *stop,
unsigned nr_pages, char *msg)
{
s64 elapsed_centisecs64;
int centisecs;
int k;
int kps;
elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
k = nr_pages * (PAGE_SIZE / 1024);
kps = (k * 100) / centisecs;
printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
msg, k,
centisecs / 100, centisecs % 100,
kps / 1000, (kps % 1000) / 10);
}
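/* Illustrative user-space sketch (not kernel code): the same throughput
 * arithmetic as swsusp_show_speed(), done with plain 64-bit division instead
 * of do_div(). The page count and elapsed time are made-up sample values. */
#include <inttypes.h>
#include <stdio.h>
int main(void)
{
	unsigned nr_pages = 50000;		/* sample: ~195 MB at 4 KiB pages */
	int64_t elapsed_ns = 2500000000LL;	/* sample: 2.5 s */
	int centisecs = (int)(elapsed_ns / (1000000000LL / 100));
	int k, kps;
	if (centisecs == 0)
		centisecs = 1;			/* avoid div-by-zero */
	k = nr_pages * (4096 / 1024);		/* kbytes processed */
	kps = (k * 100) / centisecs;		/* kbytes per second */
	printf("Wrote %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
	       k, centisecs / 100, centisecs % 100,
	       kps / 1000, (kps % 1000) / 10);
	return 0;
}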

451
kernel/kernel/power/user.c Normal file
View File

@@ -0,0 +1,451 @@
/*
* linux/kernel/power/user.c
*
* This file provides the user space interface for software suspend/resume.
*
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*
* This file is released under the GPLv2.
*
*/
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/fs.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <scsi/scsi_scan.h>
#include <asm/uaccess.h>
#include "power.h"
/*
* NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
* will be removed in the future. They are only preserved here for
* compatibility with existing userland utilities.
*/
#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
#define PMOPS_PREPARE 1
#define PMOPS_ENTER 2
#define PMOPS_FINISH 3
/*
* NOTE: The following ioctl definitions are wrong and have been replaced with
* correct ones. They are only preserved here for compatibility with existing
* userland utilities and will be removed in the future.
*/
#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
#define SNAPSHOT_MINOR 231
static struct snapshot_data {
struct snapshot_handle handle;
int swap;
int mode;
char frozen;
char ready;
char platform_support;
} snapshot_state;
atomic_t snapshot_device_available = ATOMIC_INIT(1);
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
int error;
mutex_lock(&pm_mutex);
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
error = -EBUSY;
goto Unlock;
}
if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
atomic_inc(&snapshot_device_available);
error = -ENOSYS;
goto Unlock;
}
if (create_basic_memory_bitmaps()) {
atomic_inc(&snapshot_device_available);
error = -ENOMEM;
goto Unlock;
}
nonseekable_open(inode, filp);
data = &snapshot_state;
filp->private_data = data;
memset(&data->handle, 0, sizeof(struct snapshot_handle));
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
/* Hibernating. The image device should be accessible. */
data->swap = swsusp_resume_device ?
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
if (error)
pm_notifier_call_chain(PM_POST_HIBERNATION);
} else {
/*
* Resuming. We may need to wait for the image device to
* appear.
*/
wait_for_device_probe();
scsi_complete_async_scans();
data->swap = -1;
data->mode = O_WRONLY;
error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
if (error)
pm_notifier_call_chain(PM_POST_RESTORE);
}
if (error)
atomic_inc(&snapshot_device_available);
data->frozen = 0;
data->ready = 0;
data->platform_support = 0;
Unlock:
mutex_unlock(&pm_mutex);
return error;
}
static int snapshot_release(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
mutex_lock(&pm_mutex);
swsusp_free();
free_basic_memory_bitmaps();
data = filp->private_data;
free_all_swap_pages(data->swap);
if (data->frozen)
thaw_processes();
pm_notifier_call_chain(data->mode == O_RDONLY ?
PM_POST_HIBERNATION : PM_POST_RESTORE);
atomic_inc(&snapshot_device_available);
mutex_unlock(&pm_mutex);
return 0;
}
static ssize_t snapshot_read(struct file *filp, char __user *buf,
size_t count, loff_t *offp)
{
struct snapshot_data *data;
ssize_t res;
mutex_lock(&pm_mutex);
data = filp->private_data;
if (!data->ready) {
res = -ENODATA;
goto Unlock;
}
res = snapshot_read_next(&data->handle, count);
if (res > 0) {
if (copy_to_user(buf, data_of(data->handle), res))
res = -EFAULT;
else
*offp = data->handle.offset;
}
Unlock:
mutex_unlock(&pm_mutex);
return res;
}
static ssize_t snapshot_write(struct file *filp, const char __user *buf,
size_t count, loff_t *offp)
{
struct snapshot_data *data;
ssize_t res;
mutex_lock(&pm_mutex);
data = filp->private_data;
res = snapshot_write_next(&data->handle, count);
if (res > 0) {
if (copy_from_user(data_of(data->handle), buf, res))
res = -EFAULT;
else
*offp = data->handle.offset;
}
mutex_unlock(&pm_mutex);
return res;
}
static long snapshot_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
int error = 0;
struct snapshot_data *data;
loff_t size;
sector_t offset;
if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
return -ENOTTY;
if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
return -ENOTTY;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
data = filp->private_data;
switch (cmd) {
case SNAPSHOT_FREEZE:
if (data->frozen)
break;
printk("Syncing filesystems ... ");
sys_sync();
printk("done.\n");
error = usermodehelper_disable();
if (error)
break;
error = freeze_processes();
if (error) {
thaw_processes();
usermodehelper_enable();
}
if (!error)
data->frozen = 1;
break;
case SNAPSHOT_UNFREEZE:
if (!data->frozen || data->ready)
break;
thaw_processes();
usermodehelper_enable();
data->frozen = 0;
break;
case SNAPSHOT_CREATE_IMAGE:
case SNAPSHOT_ATOMIC_SNAPSHOT:
if (data->mode != O_RDONLY || !data->frozen || data->ready) {
error = -EPERM;
break;
}
error = hibernation_snapshot(data->platform_support);
if (!error)
error = put_user(in_suspend, (int __user *)arg);
if (!error)
data->ready = 1;
break;
case SNAPSHOT_ATOMIC_RESTORE:
snapshot_write_finalize(&data->handle);
if (data->mode != O_WRONLY || !data->frozen ||
!snapshot_image_loaded(&data->handle)) {
error = -EPERM;
break;
}
error = hibernation_restore(data->platform_support);
break;
case SNAPSHOT_FREE:
swsusp_free();
memset(&data->handle, 0, sizeof(struct snapshot_handle));
data->ready = 0;
break;
case SNAPSHOT_PREF_IMAGE_SIZE:
case SNAPSHOT_SET_IMAGE_SIZE:
image_size = arg;
break;
case SNAPSHOT_GET_IMAGE_SIZE:
if (!data->ready) {
error = -ENODATA;
break;
}
size = snapshot_get_image_size();
size <<= PAGE_SHIFT;
error = put_user(size, (loff_t __user *)arg);
break;
case SNAPSHOT_AVAIL_SWAP_SIZE:
case SNAPSHOT_AVAIL_SWAP:
size = count_swap_pages(data->swap, 1);
size <<= PAGE_SHIFT;
error = put_user(size, (loff_t __user *)arg);
break;
case SNAPSHOT_ALLOC_SWAP_PAGE:
case SNAPSHOT_GET_SWAP_PAGE:
if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
error = -ENODEV;
break;
}
offset = alloc_swapdev_block(data->swap);
if (offset) {
offset <<= PAGE_SHIFT;
error = put_user(offset, (loff_t __user *)arg);
} else {
error = -ENOSPC;
}
break;
case SNAPSHOT_FREE_SWAP_PAGES:
if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
error = -ENODEV;
break;
}
free_all_swap_pages(data->swap);
break;
case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
if (!swsusp_swap_in_use()) {
/*
* User space encodes device types as two-byte values,
* so we need to recode them
*/
if (old_decode_dev(arg)) {
data->swap = swap_type_of(old_decode_dev(arg),
0, NULL);
if (data->swap < 0)
error = -ENODEV;
} else {
data->swap = -1;
error = -EINVAL;
}
} else {
error = -EPERM;
}
break;
case SNAPSHOT_S2RAM:
if (!data->frozen) {
error = -EPERM;
break;
}
/*
* Tasks are frozen and the notifiers have been called with
* PM_HIBERNATION_PREPARE
*/
error = suspend_devices_and_enter(PM_SUSPEND_MEM);
break;
case SNAPSHOT_PLATFORM_SUPPORT:
data->platform_support = !!arg;
break;
case SNAPSHOT_POWER_OFF:
if (data->platform_support)
error = hibernation_platform_enter();
break;
case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
error = -EINVAL;
switch (arg) {
case PMOPS_PREPARE:
data->platform_support = 1;
error = 0;
break;
case PMOPS_ENTER:
if (data->platform_support)
error = hibernation_platform_enter();
break;
case PMOPS_FINISH:
if (data->platform_support)
error = 0;
break;
default:
printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
}
break;
case SNAPSHOT_SET_SWAP_AREA:
if (swsusp_swap_in_use()) {
error = -EPERM;
} else {
struct resume_swap_area swap_area;
dev_t swdev;
error = copy_from_user(&swap_area, (void __user *)arg,
sizeof(struct resume_swap_area));
if (error) {
error = -EFAULT;
break;
}
/*
* User space encodes device types as two-byte values,
* so we need to recode them
*/
swdev = old_decode_dev(swap_area.dev);
if (swdev) {
offset = swap_area.offset;
data->swap = swap_type_of(swdev, offset, NULL);
if (data->swap < 0)
error = -ENODEV;
} else {
data->swap = -1;
error = -EINVAL;
}
}
break;
default:
error = -ENOTTY;
}
mutex_unlock(&pm_mutex);
return error;
}
static const struct file_operations snapshot_fops = {
.open = snapshot_open,
.release = snapshot_release,
.read = snapshot_read,
.write = snapshot_write,
.llseek = no_llseek,
.unlocked_ioctl = snapshot_ioctl,
};
static struct miscdevice snapshot_device = {
.minor = SNAPSHOT_MINOR,
.name = "snapshot",
.fops = &snapshot_fops,
};
static int __init snapshot_device_init(void)
{
return misc_register(&snapshot_device);
};
device_initcall(snapshot_device_init);
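/* Illustrative user-space sketch (not kernel code): the rough sequence an
 * s2disk-style hibernation helper goes through against /dev/snapshot. Error
 * handling is minimal, and <linux/suspend_ioctls.h> is assumed to provide the
 * SNAPSHOT_* ioctl definitions used below. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/suspend_ioctls.h>
int main(void)
{
	char page[4096];
	int in_suspend = 0;
	ssize_t n;
	int fd = open("/dev/snapshot", O_RDONLY);	/* O_RDONLY => "hibernating" side */
	if (fd < 0) {
		perror("open /dev/snapshot");
		return 1;
	}
	if (ioctl(fd, SNAPSHOT_FREEZE) < 0)		/* freeze user space */
		goto out;
	if (ioctl(fd, SNAPSHOT_CREATE_IMAGE, &in_suspend) < 0)
		goto thaw;
	if (in_suspend) {
		/* we are the "frozen" kernel: stream the image somewhere safe */
		while ((n = read(fd, page, sizeof(page))) > 0)
			fwrite(page, 1, (size_t)n, stdout);
		/* a real tool would now issue SNAPSHOT_POWER_OFF or similar */
	}
thaw:
	ioctl(fd, SNAPSHOT_UNFREEZE);
out:
	close(fd);
	return 0;
}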

Some files were not shown because too many files have changed in this diff.