add idl4k kernel firmware version 1.13.0.105

This commit is contained in:
Jaroslav Kysela
2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions

4
kernel/drivers/md/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
mktables
raid6altivec*.c
raid6int*.c
raid6tables.c

323
kernel/drivers/md/Kconfig Normal file
View File

@@ -0,0 +1,323 @@
#
# Block device driver configuration
#
menuconfig MD
bool "Multiple devices driver support (RAID and LVM)"
depends on BLOCK
help
Support multiple physical spindles through a single logical device.
Required for RAID and logical volume management.
if MD
config BLK_DEV_MD
tristate "RAID support"
---help---
This driver lets you combine several hard disk partitions into one
logical block device. This can be used to simply append one
partition to another one or to combine several redundant hard disks
into a RAID1/4/5 device so as to provide protection against hard
disk failures. This is called "Software RAID" since the combining of
the partitions is done by the kernel. "Hardware RAID" means that the
combining is done by a dedicated controller; if you have such a
controller, you do not need to say Y here.
More information about Software RAID on Linux is contained in the
Software RAID mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>. There you will also learn
where to get the supporting user space utilities raidtools.
If unsure, say N.
config MD_AUTODETECT
bool "Autodetect RAID arrays during kernel boot"
depends on BLK_DEV_MD=y
default y
---help---
If you say Y here, then the kernel will try to autodetect raid
arrays as part of its boot process.
If you don't use raid and say Y, this autodetection can cause
a several-second delay in the boot time due to various
synchronisation steps that are part of this step.
If unsure, say Y.
config MD_LINEAR
tristate "Linear (append) mode"
depends on BLK_DEV_MD
---help---
If you say Y here, then your multiple devices driver will be able to
use the so-called linear mode, i.e. it will combine the hard disk
partitions by simply appending one to the other.
To compile this as a module, choose M here: the module
will be called linear.
If unsure, say Y.
config MD_RAID0
tristate "RAID-0 (striping) mode"
depends on BLK_DEV_MD
---help---
If you say Y here, then your multiple devices driver will be able to
use the so-called raid0 mode, i.e. it will combine the hard disk
partitions into one logical device in such a fashion as to fill them
up evenly, one chunk here and one chunk there. This will increase
the throughput rate if the partitions reside on distinct disks.
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
To compile this as a module, choose M here: the module
will be called raid0.
If unsure, say Y.
config MD_RAID1
tristate "RAID-1 (mirroring) mode"
depends on BLK_DEV_MD
---help---
A RAID-1 set consists of several disk drives which are exact copies
of each other. In the event of a mirror failure, the RAID driver
will continue to use the operational mirrors in the set, providing
an error free MD (multiple device) to the higher levels of the
kernel. In a set with N drives, the available space is the capacity
of a single drive, and the set protects against a failure of (N - 1)
drives.
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
If you want to use such a RAID-1 set, say Y. To compile this code
as a module, choose M here: the module will be called raid1.
If unsure, say Y.
config MD_RAID10
tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
depends on BLK_DEV_MD && EXPERIMENTAL
---help---
RAID-10 provides a combination of striping (RAID-0) and
mirroring (RAID-1) with easier configuration and more flexible
layout.
Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
be the same size (or at least, only as much as the smallest device
will be used).
RAID-10 provides a variety of layouts that provide different levels
of redundancy and performance.
RAID-10 requires mdadm-1.7.0 or later, available at:
ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
If unsure, say Y.
config MD_RAID456
tristate "RAID-4/RAID-5/RAID-6 mode"
depends on BLK_DEV_MD
select MD_RAID6_PQ
select ASYNC_MEMCPY
select ASYNC_XOR
select ASYNC_PQ
select ASYNC_RAID6_RECOV
---help---
A RAID-5 set of N drives with a capacity of C MB per drive provides
the capacity of C * (N - 1) MB, and protects against a failure
of a single drive. For a given sector (row) number, (N - 1) drives
contain data sectors, and one drive contains the parity protection.
For a RAID-4 set, the parity blocks are present on a single drive,
while a RAID-5 set distributes the parity across the drives in one
of the available parity distribution methods.
A RAID-6 set of N drives with a capacity of C MB per drive
provides the capacity of C * (N - 2) MB, and protects
against a failure of any two drives. For a given sector
(row) number, (N - 2) drives contain data sectors, and two
drives contain two independent redundancy syndromes. Like
RAID-5, RAID-6 distributes the syndromes across the drives
in one of the available parity distribution methods.
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
compile this code as a module, choose M here: the module
will be called raid456.
If unsure, say Y.
config MULTICORE_RAID456
bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
depends on MD_RAID456
depends on SMP
depends on EXPERIMENTAL
---help---
Enable the raid456 module to dispatch per-stripe raid operations to a
thread pool.
If unsure, say N.
config MD_RAID6_PQ
tristate
config ASYNC_RAID6_TEST
tristate "Self test for hardware accelerated raid6 recovery"
depends on MD_RAID6_PQ
select ASYNC_RAID6_RECOV
---help---
This is a one-shot self test that permutes through the
recovery of all the possible two disk failure scenarios for a
N-disk array. Recovery is performed with the asynchronous
raid6 recovery routines, and will optionally use an offload
engine if one is available.
If unsure, say N.
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
help
Multipath-IO is the ability of certain devices to address the same
physical disk over multiple 'IO paths'. The code ensures that such
paths can be defined and handled at runtime, and ensures that a
transparent failover to the backup path(s) happens if an IO error
arrives on the primary path.
If unsure, say N.
config MD_FAULTY
tristate "Faulty test module for MD"
depends on BLK_DEV_MD
help
The "faulty" module allows for a block device that occasionally returns
read or write errors. It is useful for testing.
If unsure, say N.
config BLK_DEV_DM
tristate "Device mapper support"
---help---
Device-mapper is a low level volume manager. It works by allowing
people to specify mappings for ranges of logical sectors. Various
mapping types are available, in addition people may write their own
modules containing custom mappings if they wish.
Higher level volume managers such as LVM2 use this driver.
To compile this as a module, choose M here: the module will be
called dm-mod.
If unsure, say N.
config DM_DEBUG
boolean "Device mapper debugging support"
depends on BLK_DEV_DM
---help---
Enable this for messages that may help debug device-mapper problems.
If unsure, say N.
config DM_CRYPT
tristate "Crypt target support"
depends on BLK_DEV_DM
select CRYPTO
select CRYPTO_CBC
---help---
This device-mapper target allows you to create a device that
transparently encrypts the data on it. You'll need to activate
the ciphers you're going to use in the cryptoapi configuration.
Information on how to use dm-crypt can be found on
<http://www.saout.de/misc/dm-crypt/>
To compile this code as a module, choose M here: the module will
be called dm-crypt.
If unsure, say N.
config DM_SNAPSHOT
tristate "Snapshot target"
depends on BLK_DEV_DM
---help---
Allow volume managers to take writable snapshots of a device.
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
---help---
Allow volume managers to mirror logical volumes, also
needed for live data migration tools such as 'pvmove'.
config DM_LOG_USERSPACE
tristate "Mirror userspace logging (EXPERIMENTAL)"
depends on DM_MIRROR && EXPERIMENTAL && NET
select CONNECTOR
---help---
The userspace logging module provides a mechanism for
relaying the dm-dirty-log API to userspace. Log designs
which are more suited to userspace implementation (e.g.
shared storage logs) or experimental logs can be implemented
by leveraging this framework.
config DM_ZERO
tristate "Zero target"
depends on BLK_DEV_DM
---help---
A target that discards writes, and returns all zeroes for
reads. Useful in some recovery situations.
config DM_MULTIPATH
tristate "Multipath target"
depends on BLK_DEV_DM
# nasty syntax but means make DM_MULTIPATH independent
# of SCSI_DH if the latter isn't defined but if
# it is, DM_MULTIPATH must depend on it. We get a build
# error if SCSI_DH=m and DM_MULTIPATH=y
depends on SCSI_DH || !SCSI_DH
---help---
Allow volume managers to support multipath hardware.
config DM_MULTIPATH_QL
tristate "I/O Path Selector based on the number of in-flight I/Os"
depends on DM_MULTIPATH
---help---
This path selector is a dynamic load balancer which selects
the path with the least number of in-flight I/Os.
If unsure, say N.
config DM_MULTIPATH_ST
tristate "I/O Path Selector based on the service time"
depends on DM_MULTIPATH
---help---
This path selector is a dynamic load balancer which selects
the path expected to complete the incoming I/O in the shortest
time.
If unsure, say N.
config DM_DELAY
tristate "I/O delaying target (EXPERIMENTAL)"
depends on BLK_DEV_DM && EXPERIMENTAL
---help---
A target that delays reads and/or writes and can send
them to different devices. Useful for testing.
If unsure, say N.
config DM_UEVENT
bool "DM uevents (EXPERIMENTAL)"
depends on BLK_DEV_DM && EXPERIMENTAL
---help---
Generate udev events for DM events.
endif # MD

119
kernel/drivers/md/Makefile Normal file
View File

@@ -0,0 +1,119 @@
#
# Makefile for the kernel software RAID and LVM drivers.
#
dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
dm-multipath-y += dm-path-selector.o dm-mpath.o
dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-snap-persistent.o
dm-mirror-y += dm-raid1.o
dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
raid6int1.o raid6int2.o raid6int4.o \
raid6int8.o raid6int16.o raid6int32.o \
raid6altivec1.o raid6altivec2.o raid6altivec4.o \
raid6altivec8.o \
raid6mmx.o raid6sse1.o raid6sse2.o
hostprogs-y += mktables
# Note: link order is important. All raid personalities
# must come before md.o, as they each initialise
# themselves, and md.o may use the personalities when it
# is auto-initialised.
obj-$(CONFIG_MD_LINEAR) += linear.o
obj-$(CONFIG_MD_RAID0) += raid0.o
obj-$(CONFIG_MD_RAID1) += raid1.o
obj-$(CONFIG_MD_RAID10) += raid10.o
obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o
obj-$(CONFIG_MD_RAID456) += raid456.o
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
quiet_cmd_unroll = UNROLL $@
cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \
< $< > $@ || ( rm -f $@ && exit 1 )
ifeq ($(CONFIG_ALTIVEC),y)
altivec_flags := -maltivec -mabi=altivec
endif
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
endif
targets += raid6int1.c
$(obj)/raid6int1.c: UNROLL := 1
$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
targets += raid6int2.c
$(obj)/raid6int2.c: UNROLL := 2
$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
targets += raid6int4.c
$(obj)/raid6int4.c: UNROLL := 4
$(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
targets += raid6int8.c
$(obj)/raid6int8.c: UNROLL := 8
$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
targets += raid6int16.c
$(obj)/raid6int16.c: UNROLL := 16
$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
targets += raid6int32.c
$(obj)/raid6int32.c: UNROLL := 32
$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_raid6altivec1.o += $(altivec_flags)
targets += raid6altivec1.c
$(obj)/raid6altivec1.c: UNROLL := 1
$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_raid6altivec2.o += $(altivec_flags)
targets += raid6altivec2.c
$(obj)/raid6altivec2.c: UNROLL := 2
$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_raid6altivec4.o += $(altivec_flags)
targets += raid6altivec4.c
$(obj)/raid6altivec4.c: UNROLL := 4
$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_raid6altivec8.o += $(altivec_flags)
targets += raid6altivec8.c
$(obj)/raid6altivec8.c: UNROLL := 8
$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
quiet_cmd_mktable = TABLE $@
cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
targets += raid6tables.c
$(obj)/raid6tables.c: $(obj)/mktables FORCE
$(call if_changed,mktable)

1712
kernel/drivers/md/bitmap.c Normal file

File diff suppressed because it is too large Load Diff

291
kernel/drivers/md/bitmap.h Normal file
View File

@@ -0,0 +1,291 @@
/*
* bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
*
* additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
*/
#ifndef BITMAP_H
#define BITMAP_H 1
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_HOSTENDIAN 3
#define BITMAP_MINOR 39
/*
* in-memory bitmap:
*
* Use 16 bit block counters to track pending writes to each "chunk".
* The 2 high order bits are special-purpose, the first is a flag indicating
* whether a resync is needed. The second is a flag indicating whether a
* resync is active.
* This means that the counter is actually 14 bits:
*
* +--------+--------+------------------------------------------------+
* | resync | resync | counter |
* | needed | active | |
* | (0-1) | (0-1) | (0-16383) |
* +--------+--------+------------------------------------------------+
*
* The "resync needed" bit is set when:
* a '1' bit is read from storage at startup.
* a write request fails on some drives
* a resync is aborted on a chunk with 'resync active' set
* It is cleared (and resync-active set) when a resync starts across all drives
* of the chunk.
*
*
* The "resync active" bit is set when:
* a resync is started on all drives, and resync_needed is set.
* resync_needed will be cleared (as long as resync_active wasn't already set).
* It is cleared when a resync completes.
*
* The counter counts pending write requests, plus the on-disk bit.
* When the counter is '1' and the resync bits are clear, the on-disk
* bit can be cleared as well, thus setting the counter to 0.
* When we set a bit, or increment the counter (to start a write), if the field is
* 0, we first set the disk bit and set the counter to 1.
*
* If the counter is 0, the on-disk bit is clear and the stripe is clean
* Anything that dirties the stripe pushes the counter to 2 (at least)
* and sets the on-disk bit (lazily).
* If a periodic sweep finds the counter at 2, it is decremented to 1.
* If the sweep finds the counter at 1, the on-disk bit is cleared and the
* counter goes to zero.
*
* Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
* counters as a fallback when "page" memory cannot be allocated:
*
* Normal case (page memory allocated):
*
* page pointer (32-bit)
*
* [ ] ------+
* |
* +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
* c1 c2 c2048
*
* Hijacked case (page memory allocation failed):
*
* hijacked page pointer (32-bit)
*
* [ ][ ] (no page memory allocated)
* counter #1 (16-bit) counter #2 (16-bit)
*
*/
#ifdef __KERNEL__
#define PAGE_BITS (PAGE_SIZE << 3)
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
typedef __u16 bitmap_counter_t;
/*
 * Layout of one in-memory bitmap counter (bitmap_counter_t, 16 bits):
 * bit 15 is the "resync needed" flag, bit 14 is the "resync active" flag
 * and the low 14 bits hold the pending-write count (0..COUNTER_MAX).
 */
#define COUNTER_BITS 16
#define COUNTER_BIT_SHIFT 4
#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
/*
 * Field extractors.  The argument is parenthesized so that the cast applies
 * to the whole expression passed in (e.g. "v >> 16"), not just to its first
 * operand.
 */
#define NEEDED(x) (((bitmap_counter_t) (x)) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) (x)) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) (x)) & COUNTER_MAX)
/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
#define BITMAP_BLOCK_SIZE 512
#define BITMAP_BLOCK_SHIFT 9
/* how many blocks per chunk? (this is variable) */
#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
/* when hijacked, the counters and bits represent even larger "chunks" */
/* there will be 1024 chunks represented by each counter in the page pointers */
#define PAGEPTR_BLOCK_RATIO(bitmap) \
(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
#define PAGEPTR_BLOCK_SHIFT(bitmap) \
(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
/*
* on-disk bitmap:
*
* Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
* file a page at a time. There's a superblock at the start of the file.
*/
/* map chunks (bits) to file pages - offset by the size of the superblock */
#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
#endif
/*
* bitmap structures:
*/
#define BITMAP_MAGIC 0x6d746962
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */
BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
BITMAP_HOSTENDIAN = 0x8000, /* NOTE(review): presumably marks a host-endian (major version 3) bitmap - confirm in bitmap.c */
};
/* the superblock at the front of the bitmap file -- little endian */
/*
 * The number at the start of each field comment is the field's byte offset.
 * The named fields end at byte 64 and the padding extends the structure to
 * 256 bytes.  The (1)/(2)/(3) markers refer to the notes after the struct.
 */
typedef struct bitmap_super_s {
__le32 magic; /* 0 BITMAP_MAGIC */
__le32 version; /* 4 the bitmap major for now, could change... */
__u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */
__le64 events; /* 24 event counter for the bitmap (1)*/
__le64 events_cleared;/*32 event counter when last bit cleared (2) */
__le64 sync_size; /* 40 the size of the md device's sync range(3) */
__le32 state; /* 48 bitmap state information */
__le32 chunksize; /* 52 the bitmap chunk size in bytes */
__le32 daemon_sleep; /* 56 seconds between disk flushes */
__le32 write_behind; /* 60 number of outstanding write-behind writes */
__u8 pad[256 - 64]; /* set to zero */
} bitmap_super_t;
/* notes:
* (1) This event counter is updated before the eventcounter in the md superblock
* When a bitmap is loaded, it is only accepted if this event counter is equal
* to, or one greater than, the event counter in the superblock.
* (2) This event counter is updated when the other one is *if*and*only*if* the
* array is not degraded. As bits are not cleared when the array is degraded,
* this represents the last time that any bits were cleared.
* If a device is being added that has an event count with this value or
* higher, it is accepted as conforming to the bitmap.
* (3)This is the number of sectors represented by the bitmap, and is the range that
* resync happens across. For raid1 and raid5/6 it is the size of individual
* devices. For raid10 it is the size of the array.
*/
#ifdef __KERNEL__
/* the in-memory bitmap is represented by bitmap_pages */
struct bitmap_page {
/*
* map points to the actual memory page
*/
char *map;
/*
* in emergencies (when map cannot be alloced), hijack the map
* pointer and use it as two counters itself
*/
unsigned int hijacked:1;
/*
* count of dirty bits on the page
* (a 31-bit field; together with the 1-bit hijacked flag it fills 32 bits)
*/
unsigned int count:31;
};
/* keep track of bitmap file pages that have pending writes on them */
struct page_list {
struct list_head list; /* linkage into the list of such pages */
struct page *page; /* the bitmap file page itself */
};
/* the main bitmap structure - one per mddev */
struct bitmap {
/* per-page in-memory counter state; presumably `pages` entries - TODO confirm in bitmap.c */
struct bitmap_page *bp;
unsigned long pages; /* total number of pages in the bitmap */
unsigned long missing_pages; /* number of pages not yet allocated */
mddev_t *mddev; /* the md device that the bitmap is for */
int counter_bits; /* how many bits per block counter */
/* bitmap chunksize -- how much data does each bit represent? */
unsigned long chunksize;
unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
unsigned long chunks; /* total number of data chunks for the array */
/* We hold a count on the chunk currently being synced, and drop
* it when the last block is started. If the resync is aborted
* midway, we need to be able to drop that count, so we remember
* the counted chunk..
*/
unsigned long syncchunk;
/* NOTE(review): presumably the in-memory copy of the superblock's events_cleared - confirm */
__u64 events_cleared;
int need_sync;
/* bitmap spinlock */
spinlock_t lock;
long offset; /* offset from superblock if file is NULL */
struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap file superblock */
struct page **filemap; /* list of cache pages for the file */
unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
unsigned long file_pages; /* number of pages in the file */
int last_page_size; /* bytes in the last page */
unsigned long flags; /* enum bitmap_state bits */
int allclean;
unsigned long max_write_behind; /* write-behind mode */
atomic_t behind_writes;
/*
* the bitmap daemon - periodically wakes up and sweeps the bitmap
* file, cleaning up bits and flushing out pages to disk as necessary
*/
unsigned long daemon_lastrun; /* jiffies of last run */
unsigned long daemon_sleep; /* how many seconds between updates? */
unsigned long last_end_sync; /* when we last called end_sync to
* update bitmap with resync progress */
atomic_t pending_writes; /* pending writes to the bitmap file */
wait_queue_head_t write_wait;
wait_queue_head_t overflow_wait;
/* NOTE(review): hidden from genksyms, presumably so adding this field does not change symbol CRCs - confirm */
#ifndef __GENKSYMS__
wait_queue_head_t behind_wait;
#endif
};
/* the bitmap API */
/* these are used only by md/bitmap */
int bitmap_create(mddev_t *mddev);
void bitmap_flush(mddev_t *mddev);
void bitmap_destroy(mddev_t *mddev);
void bitmap_print_sb(struct bitmap *bitmap);
void bitmap_update_sb(struct bitmap *bitmap);
int bitmap_setallbits(struct bitmap *bitmap);
void bitmap_write_all(struct bitmap *bitmap);
void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
/* these are exported */
int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int behind);
void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int success, int behind);
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
void bitmap_close_sync(struct bitmap *bitmap);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(mddev_t *mddev);
#endif
#endif

View File

@@ -0,0 +1,71 @@
/*
* Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#ifndef DM_BIO_RECORD_H
#define DM_BIO_RECORD_H
#include <linux/bio.h>
/*
* There are lots of mutable fields in the bio struct that get
* changed by the lower levels of the block layer. Some targets,
* such as multipath, may wish to resubmit a bio on error. The
* functions in this file help the target record and restore the
* original bio state.
*/
/*
* Saved length/offset of one bio_vec entry (see dm_bio_record()).
* 16-bit fields are used when PAGE_SIZE < 65536, plain unsigned otherwise;
* presumably because neither value can exceed PAGE_SIZE - TODO confirm.
*/
struct dm_bio_vec_details {
#if PAGE_SIZE < 65536
__u16 bv_len;
__u16 bv_offset;
#else
unsigned bv_len;
unsigned bv_offset;
#endif
};
/*
* Snapshot of the bio fields that dm_bio_record() saves and
* dm_bio_restore() writes back.  Each member mirrors the bio member of
* the same name; bi_io_vec holds only the per-vector length and offset.
*/
struct dm_bio_details {
sector_t bi_sector;
struct block_device *bi_bdev;
unsigned int bi_size;
unsigned short bi_idx;
unsigned long bi_flags;
struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
};
/*
 * Save the mutable state of @bio into @bd: sector, block device, size,
 * current vector index, flags, and the length/offset of each of the
 * first bio->bi_vcnt io vectors.
 */
static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
{
	unsigned idx = 0;

	/* Scalar fields first. */
	bd->bi_flags = bio->bi_flags;
	bd->bi_idx = bio->bi_idx;
	bd->bi_size = bio->bi_size;
	bd->bi_bdev = bio->bi_bdev;
	bd->bi_sector = bio->bi_sector;

	/* Then the per-vector length and offset. */
	while (idx < bio->bi_vcnt) {
		bd->bi_io_vec[idx].bv_len = bio->bi_io_vec[idx].bv_len;
		bd->bi_io_vec[idx].bv_offset = bio->bi_io_vec[idx].bv_offset;
		idx++;
	}
}
/*
 * Write the state previously saved in @bd back into @bio: sector, block
 * device, size, current vector index, flags, and the length/offset of
 * each of the first bio->bi_vcnt io vectors.
 */
static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
{
	unsigned idx = 0;

	/* Scalar fields first. */
	bio->bi_flags = bd->bi_flags;
	bio->bi_idx = bd->bi_idx;
	bio->bi_size = bd->bi_size;
	bio->bi_bdev = bd->bi_bdev;
	bio->bi_sector = bd->bi_sector;

	/* Then the per-vector length and offset. */
	while (idx < bio->bi_vcnt) {
		bio->bi_io_vec[idx].bv_len = bd->bi_io_vec[idx].bv_len;
		bio->bi_io_vec[idx].bv_offset = bd->bi_io_vec[idx].bv_offset;
		idx++;
	}
}
#endif

1426
kernel/drivers/md/dm-crypt.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,396 @@
/*
* Copyright (C) 2005-2007 Red Hat GmbH
*
* A target that delays reads and/or writes and can send
* them to different devices.
*
* This file is released under the GPL.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "delay"
/* Per-target context for the "delay" target (stored in ti->private). */
struct delay_c {
struct timer_list delay_timer; /* fires handle_delayed_timer() */
struct mutex timer_lock; /* held around the timer check/re-arm in queue_timeout() */
struct work_struct flush_expired_bios; /* work item that submits expired bios */
struct list_head delayed_bios; /* list of struct dm_delay_info awaiting expiry */
atomic_t may_delay; /* 0 after presuspend, 1 after ctr/resume; 0 disables delaying */
mempool_t *delayed_pool; /* allocator for struct dm_delay_info */
struct dm_dev *dev_read; /* device used for reads (and writes if dev_write is NULL) */
sector_t start_read; /* start sector on dev_read */
unsigned read_delay; /* delay in milliseconds */
unsigned reads; /* bios currently delayed that are not writes */
struct dm_dev *dev_write; /* optional separate write device, NULL if not given */
sector_t start_write; /* start sector on dev_write */
unsigned write_delay; /* delay in milliseconds */
unsigned writes; /* write bios currently delayed */
};
/* Bookkeeping for one delayed bio; allocated from delay_c.delayed_pool. */
struct dm_delay_info {
struct delay_c *context; /* owning target context */
struct list_head list; /* linkage in context->delayed_bios */
struct bio *bio; /* the bio being held back */
unsigned long expires; /* jiffies value at which the bio may be submitted */
};
/* Held while walking/modifying any target's delayed_bios list and its reads/writes counters. */
static DEFINE_MUTEX(delayed_bios_lock);
/* Workqueue on which each target's flush_expired_bios work is queued. */
static struct workqueue_struct *kdelayd_wq;
/* Slab cache of struct dm_delay_info that backs each target's delayed_pool. */
static struct kmem_cache *delayed_cache;
static void handle_delayed_timer(unsigned long data)
{
struct delay_c *dc = (struct delay_c *)data;
queue_work(kdelayd_wq, &dc->flush_expired_bios);
}
/*
 * Make sure the target's timer fires no later than @expires (in jiffies).
 *
 * The timer is (re)armed when it is not pending, or when it is pending for
 * a later time than @expires.  timer_lock serialises the check and the
 * re-arm.
 */
static void queue_timeout(struct delay_c *dc, unsigned long expires)
{
	mutex_lock(&dc->timer_lock);

	/*
	 * Compare with time_before() rather than a raw '<' so the result
	 * stays correct when jiffies wraps around.
	 */
	if (!timer_pending(&dc->delay_timer) ||
	    time_before(expires, dc->delay_timer.expires))
		mod_timer(&dc->delay_timer, expires);

	mutex_unlock(&dc->timer_lock);
}
/*
 * Submit every bio on the singly linked (bi_next) chain starting at @bio.
 * Each bio is unlinked from the chain before it is submitted.
 */
static void flush_bios(struct bio *bio)
{
	struct bio *rest;

	for (; bio; bio = rest) {
		/* Remember the tail before detaching the head. */
		rest = bio->bi_next;
		bio->bi_next = NULL;
		generic_make_request(bio);
	}
}
static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
{
struct dm_delay_info *delayed, *next;
unsigned long next_expires = 0;
int start_timer = 0;
struct bio_list flush_bios = { };
mutex_lock(&delayed_bios_lock);
list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
if (flush_all || time_after_eq(jiffies, delayed->expires)) {
list_del(&delayed->list);
bio_list_add(&flush_bios, delayed->bio);
if ((bio_data_dir(delayed->bio) == WRITE))
delayed->context->writes--;
else
delayed->context->reads--;
mempool_free(delayed, dc->delayed_pool);
continue;
}
if (!start_timer) {
start_timer = 1;
next_expires = delayed->expires;
} else
next_expires = min(next_expires, delayed->expires);
}
mutex_unlock(&delayed_bios_lock);
if (start_timer)
queue_timeout(dc, next_expires);
return bio_list_get(&flush_bios);
}
/*
 * Work handler: submit the bios of the owning target whose delay has
 * expired (non-expired bios stay queued).
 */
static void flush_expired_bios(struct work_struct *work)
{
	struct delay_c *dc = container_of(work, struct delay_c,
					  flush_expired_bios);
	struct bio *expired = flush_delayed_bios(dc, 0);

	flush_bios(expired);
}
/*
* Mapping parameters:
* <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
*
* With separate write parameters, the first set is only used for reads.
* Delays are specified in milliseconds.
*/
static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct delay_c *dc;
unsigned long long tmpll;
if (argc != 3 && argc != 6) {
ti->error = "requires exactly 3 or 6 arguments";
return -EINVAL;
}
dc = kmalloc(sizeof(*dc), GFP_KERNEL);
if (!dc) {
ti->error = "Cannot allocate context";
return -ENOMEM;
}
dc->reads = dc->writes = 0;
if (sscanf(argv[1], "%llu", &tmpll) != 1) {
ti->error = "Invalid device sector";
goto bad;
}
dc->start_read = tmpll;
if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
ti->error = "Invalid delay";
goto bad;
}
if (dm_get_device(ti, argv[0], dc->start_read, ti->len,
dm_table_get_mode(ti->table), &dc->dev_read)) {
ti->error = "Device lookup failed";
goto bad;
}
dc->dev_write = NULL;
if (argc == 3)
goto out;
if (sscanf(argv[4], "%llu", &tmpll) != 1) {
ti->error = "Invalid write device sector";
goto bad_dev_read;
}
dc->start_write = tmpll;
if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
ti->error = "Invalid write delay";
goto bad_dev_read;
}
if (dm_get_device(ti, argv[3], dc->start_write, ti->len,
dm_table_get_mode(ti->table), &dc->dev_write)) {
ti->error = "Write device lookup failed";
goto bad_dev_read;
}
out:
dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
if (!dc->delayed_pool) {
DMERR("Couldn't create delayed bio pool.");
goto bad_dev_write;
}
setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
INIT_LIST_HEAD(&dc->delayed_bios);
mutex_init(&dc->timer_lock);
atomic_set(&dc->may_delay, 1);
ti->num_flush_requests = 1;
ti->private = dc;
return 0;
bad_dev_write:
if (dc->dev_write)
dm_put_device(ti, dc->dev_write);
bad_dev_read:
dm_put_device(ti, dc->dev_read);
bad:
kfree(dc);
return -EINVAL;
}
static void delay_dtr(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
flush_workqueue(kdelayd_wq);
dm_put_device(ti, dc->dev_read);
if (dc->dev_write)
dm_put_device(ti, dc->dev_write);
mempool_destroy(dc->delayed_pool);
kfree(dc);
}
/*
 * Queue @bio on @dc for submission after @delay milliseconds.
 *
 * Returns 1 without queueing when @delay is zero or delaying is currently
 * disabled (may_delay == 0); returns 0 once the bio has been queued and the
 * timer armed.  delay_map() passes this value straight back to its caller
 * (presumably 1 == DM_MAPIO_REMAPPED and 0 == DM_MAPIO_SUBMITTED - confirm
 * against device-mapper.h).
 */
static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
{
	struct dm_delay_info *delayed;
	unsigned long expires;

	if (!delay || !atomic_read(&dc->may_delay))
		return 1;

	delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);

	/*
	 * msecs_to_jiffies() replaces the open-coded "delay * HZ / 1000",
	 * which was signed int arithmetic and overflowed for large delays.
	 */
	expires = jiffies + msecs_to_jiffies(delay);

	delayed->context = dc;
	delayed->bio = bio;
	delayed->expires = expires;

	mutex_lock(&delayed_bios_lock);

	if (bio_data_dir(bio) == WRITE)
		dc->writes++;
	else
		dc->reads++;

	list_add_tail(&delayed->list, &dc->delayed_bios);

	mutex_unlock(&delayed_bios_lock);

	queue_timeout(dc, expires);

	return 0;
}
/*
 * Presuspend hook: stop delaying new bios, cancel the timer and submit
 * everything that is still queued, expired or not.
 */
static void delay_presuspend(struct dm_target *ti)
{
	struct delay_c *ctx = ti->private;
	struct bio *pending;

	atomic_set(&ctx->may_delay, 0);
	del_timer_sync(&ctx->delay_timer);

	pending = flush_delayed_bios(ctx, 1);
	flush_bios(pending);
}
/* Resume hook: allow bios to be delayed again. */
static void delay_resume(struct dm_target *ti)
{
	atomic_set(&((struct delay_c *)ti->private)->may_delay, 1);
}
/*
 * Map hook: redirect @bio to the read or write device and hand it to
 * delay_bio() with the matching delay.
 *
 * Writes go to the separate write device only when one was configured;
 * everything else uses the read device.  On the write-device path the
 * sector is remapped only when the bio carries sectors.
 */
static int delay_map(struct dm_target *ti, struct bio *bio,
		     union map_info *map_context)
{
	struct delay_c *dc = ti->private;
	int use_write_dev = (bio_data_dir(bio) == WRITE) && dc->dev_write;

	if (!use_write_dev) {
		bio->bi_bdev = dc->dev_read->bdev;
		bio->bi_sector = dc->start_read + (bio->bi_sector - ti->begin);
		return delay_bio(dc, dc->read_delay, bio);
	}

	bio->bi_bdev = dc->dev_write->bdev;
	if (bio_sectors(bio))
		bio->bi_sector = dc->start_write + (bio->bi_sector - ti->begin);

	return delay_bio(dc, dc->write_delay, bio);
}
/*
 * Status callback.
 *
 * STATUSTYPE_INFO emits the read and write counters; STATUSTYPE_TABLE
 * emits the device name, start sector and delay for the read device and,
 * when configured, for the write device. Always returns 0.
 */
static int delay_status(struct dm_target *ti, status_type_t type,
			char *result, unsigned maxlen)
{
	struct delay_c *ctx = ti->private;
	int sz = 0;	/* used by DMEMIT() */

	if (type == STATUSTYPE_INFO) {
		DMEMIT("%u %u", ctx->reads, ctx->writes);
	} else if (type == STATUSTYPE_TABLE) {
		DMEMIT("%s %llu %u", ctx->dev_read->name,
		       (unsigned long long) ctx->start_read,
		       ctx->read_delay);

		if (ctx->dev_write)
			DMEMIT(" %s %llu %u", ctx->dev_write->name,
			       (unsigned long long) ctx->start_write,
			       ctx->write_delay);
	}

	return 0;
}
/*
 * Call @fn for the read device and, if that returned 0 and a write device
 * is configured, for the write device as well. Returns the first non-zero
 * result of @fn, or 0.
 */
static int delay_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	struct delay_c *ctx = ti->private;
	int rc;

	rc = fn(ti, ctx->dev_read, ctx->start_read, ti->len, data);
	if (rc == 0 && ctx->dev_write)
		rc = fn(ti, ctx->dev_write, ctx->start_write, ti->len, data);

	return rc;
}
/*
 * Target type descriptor for the "delay" target: hooks the constructor,
 * destructor, map, suspend/resume, status and device-iteration callbacks
 * defined in this file into device-mapper.
 */
static struct target_type delay_target = {
	.name = "delay",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = delay_ctr,
	.dtr = delay_dtr,
	.map = delay_map,
	.presuspend = delay_presuspend,
	.resume = delay_resume,
	.status = delay_status,
	.iterate_devices = delay_iterate_devices,
};
/*
 * Module init: create the kdelayd workqueue and the slab cache for
 * struct dm_delay_info, then register the target. Anything already set
 * up is torn down again if a later step fails.
 *
 * Returns 0 on success, -ENOMEM if the workqueue or cache cannot be
 * created, or the negative error from dm_register_target().
 */
static int __init dm_delay_init(void)
{
	int err;

	kdelayd_wq = create_workqueue("kdelayd");
	if (!kdelayd_wq) {
		DMERR("Couldn't start kdelayd");
		return -ENOMEM;
	}

	delayed_cache = KMEM_CACHE(dm_delay_info, 0);
	if (!delayed_cache) {
		DMERR("Couldn't create delayed bio cache.");
		err = -ENOMEM;
		goto out_destroy_wq;
	}

	err = dm_register_target(&delay_target);
	if (err < 0) {
		DMERR("register failed %d", err);
		goto out_destroy_cache;
	}

	return 0;

out_destroy_cache:
	kmem_cache_destroy(delayed_cache);
out_destroy_wq:
	destroy_workqueue(kdelayd_wq);
	return err;
}
/*
 * Module exit: unregister the target first, then destroy the slab cache
 * and the workqueue.
 */
static void __exit dm_delay_exit(void)
{
	dm_unregister_target(&delay_target);
	kmem_cache_destroy(delayed_cache);
	destroy_workqueue(kdelayd_wq);
}
/* Module hooks: init/exit entry points followed by the module metadata. */
module_init(dm_delay_init);
module_exit(dm_delay_exit);
MODULE_DESCRIPTION(DM_NAME " delay target");
MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,300 @@
/*
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
* Copyright (C) 2006-2008 Red Hat GmbH
*
* This file is released under the GPL.
*/
#include "dm-exception-store.h"
#include <linux/ctype.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#define DM_MSG_PREFIX "snapshot exception stores"
/*
 * List of registered exception store types. The functions in this file
 * take _lock around every lookup, insertion and removal.
 */
static LIST_HEAD(_exception_store_types);
static DEFINE_SPINLOCK(_lock);
static struct dm_exception_store_type *__find_exception_store_type(const char *name)
{
struct dm_exception_store_type *type;
list_for_each_entry(type, &_exception_store_types, list)
if (!strcmp(name, type->name))
return type;
return NULL;
}
/*
 * Look up a registered type by name and take a reference on its module.
 * Returns NULL when the type is unknown or the module reference cannot
 * be obtained.
 */
static struct dm_exception_store_type *_get_exception_store_type(const char *name)
{
	struct dm_exception_store_type *found;

	spin_lock(&_lock);

	found = __find_exception_store_type(name);
	if (found != NULL) {
		if (!try_module_get(found->module))
			found = NULL;
	}

	spin_unlock(&_lock);

	return found;
}
/*
* get_type
* @type_name
*
* Attempt to retrieve the dm_exception_store_type by name. If not already
* available, attempt to load the appropriate module.
*
* Exstore modules are named "dm-exstore-" followed by the 'type_name'.
* Modules may contain multiple types.
* This function will first try the module "dm-exstore-<type_name>",
* then truncate 'type_name' on the last '-' and try again.
*
* For example, if type_name was "clustered-shared", it would search
* 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'.
*
* 'dm-exception-store-<type_name>' is too long of a name in my
* opinion, which is why I've chosen to have the files
* containing exception store implementations be 'dm-exstore-<type_name>'.
* If you want your module to be autoloaded, you will follow this
* naming convention.
*
* Returns: dm_exception_store_type* on success, NULL on failure
*/
static struct dm_exception_store_type *get_type(const char *type_name)
{
	struct dm_exception_store_type *type;
	char *module_suffix;
	char *dash;

	/* Fast path: the type is already registered. */
	type = _get_exception_store_type(type_name);
	if (type)
		return type;

	/* Work on a private copy; it is truncated while probing. */
	module_suffix = kstrdup(type_name, GFP_KERNEL);
	if (!module_suffix) {
		DMERR("No memory left to attempt load for \"%s\"", type_name);
		return NULL;
	}

	for (;;) {
		/*
		 * Only look the type up again when the module load
		 * succeeded. The lookup always uses the full type name,
		 * never the truncated module suffix.
		 */
		if (!request_module("dm-exstore-%s", module_suffix)) {
			type = _get_exception_store_type(type_name);
			if (type)
				break;
		}

		/* Strip the last '-' component and try a shorter name. */
		dash = strrchr(module_suffix, '-');
		if (!dash)
			break;
		*dash = '\0';
	}

	if (!type)
		DMWARN("Module for exstore type \"%s\" not found.", type_name);

	kfree(module_suffix);

	return type;
}
/*
 * Drop the module reference taken by _get_exception_store_type().
 */
static void put_type(struct dm_exception_store_type *type)
{
	spin_lock(&_lock);
	module_put(type->module);
	spin_unlock(&_lock);
}
/*
 * Add @type to the list of known exception store types.
 *
 * Returns 0 on success, or -EEXIST if a type with the same name is
 * already registered.
 */
int dm_exception_store_type_register(struct dm_exception_store_type *type)
{
	int r = -EEXIST;

	spin_lock(&_lock);
	if (!__find_exception_store_type(type->name)) {
		list_add(&type->list, &_exception_store_types);
		r = 0;
	}
	spin_unlock(&_lock);

	return r;
}
EXPORT_SYMBOL(dm_exception_store_type_register);
/*
 * Remove @type from the list of known exception store types.
 *
 * Returns 0 on success, or -EINVAL if no type with that name is
 * registered.
 */
int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
{
	int r = 0;

	spin_lock(&_lock);
	if (__find_exception_store_type(type->name))
		list_del(&type->list);
	else
		r = -EINVAL;
	spin_unlock(&_lock);

	return r;
}
EXPORT_SYMBOL(dm_exception_store_type_unregister);
/*
 * Parse the decimal chunk size argument and apply it to @store.
 *
 * An empty string, trailing garbage or a value above UINT_MAX is rejected
 * with -EINVAL and *error set. A value of 0 clears chunk_size, chunk_mask
 * and chunk_shift and succeeds; any other value is validated and applied
 * by dm_exception_store_set_chunk_size().
 */
static int set_chunk_size(struct dm_exception_store *store,
			  const char *chunk_size_arg, char **error)
{
	char *end;
	unsigned long parsed;

	parsed = simple_strtoul(chunk_size_arg, &end, 10);
	if (*chunk_size_arg == '\0' || *end != '\0' || parsed > UINT_MAX) {
		*error = "Invalid chunk size";
		return -EINVAL;
	}

	if (parsed == 0) {
		store->chunk_shift = 0;
		store->chunk_mask = 0;
		store->chunk_size = 0;
		return 0;
	}

	return dm_exception_store_set_chunk_size(store, (unsigned) parsed,
						 error);
}
/*
 * Validate @chunk_size (in 512-byte sectors) and store it, together with
 * the derived mask and shift, in @store.
 *
 * Returns 0 on success. Returns -EINVAL with *error set when the size is
 * not a power of 2, is not a multiple of the COW device's logical block
 * size, or exceeds INT_MAX >> SECTOR_SHIFT.
 */
int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
				      unsigned chunk_size,
				      char **error)
{
	if (!is_power_of_2(chunk_size)) {
		*error = "Chunk size is not a power of 2";
		return -EINVAL;
	}

	/* The device block size is in bytes; >> 9 converts it to sectors. */
	if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
		*error = "Chunk size is not a multiple of device blocksize";
		return -EINVAL;
	}

	if (chunk_size > INT_MAX >> SECTOR_SHIFT) {
		*error = "Chunk size is too high";
		return -EINVAL;
	}

	store->chunk_size = chunk_size;
	store->chunk_mask = chunk_size - 1;
	/* ffs() is 1-based, so subtract 1 to get log2 of the power of 2. */
	store->chunk_shift = ffs(chunk_size) - 1;

	return 0;
}
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
unsigned *args_used,
struct dm_exception_store **store)
{
int r = 0;
struct dm_exception_store_type *type = NULL;
struct dm_exception_store *tmp_store;
char persistent;
if (argc < 3) {
ti->error = "Insufficient exception store arguments";
return -EINVAL;
}
tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL);
if (!tmp_store) {
ti->error = "Exception store allocation failed";
return -ENOMEM;
}
persistent = toupper(*argv[1]);
if (persistent == 'P')
type = get_type("P");
else if (persistent == 'N')
type = get_type("N");
else {
ti->error = "Persistent flag is not P or N";
r = -EINVAL;
goto bad_type;
}
if (!type) {
ti->error = "Exception store type not recognised";
r = -EINVAL;
goto bad_type;
}
tmp_store->type = type;
tmp_store->ti = ti;
r = dm_get_device(ti, argv[0], 0, 0,
FMODE_READ | FMODE_WRITE, &tmp_store->cow);
if (r) {
ti->error = "Cannot get COW device";
goto bad_cow;
}
r = set_chunk_size(tmp_store, argv[2], &ti->error);
if (r)
goto bad_ctr;
r = type->ctr(tmp_store, 0, NULL);
if (r) {
ti->error = "Exception store type constructor failed";
goto bad_ctr;
}
*args_used = 3;
*store = tmp_store;
return 0;
bad_ctr:
dm_put_device(ti, tmp_store->cow);
bad_cow:
put_type(type);
bad_type:
kfree(tmp_store);
return r;
}
EXPORT_SYMBOL(dm_exception_store_create);
/*
 * Tear down an exception store created by dm_exception_store_create():
 * run the type's destructor, release the COW device and the type's
 * module reference, then free the store itself.
 */
void dm_exception_store_destroy(struct dm_exception_store *store)
{
	store->type->dtr(store);
	dm_put_device(store->ti, store->cow);
	put_type(store->type);
	kfree(store);
}
EXPORT_SYMBOL(dm_exception_store_destroy);
/*
 * Register the built-in transient and persistent exception store types.
 *
 * Returns 0 on success, or the error returned by whichever init step
 * failed. If the persistent type fails to initialise, the transient type
 * that was already set up is torn down again.
 */
int dm_exception_store_init(void)
{
	int r;

	r = dm_transient_snapshot_init();
	if (r) {
		DMERR("Unable to register transient exception store type.");
		goto transient_fail;
	}

	r = dm_persistent_snapshot_init();
	if (r) {
		DMERR("Unable to register persistent exception store type");
		goto persistent_fail;
	}

	return 0;

persistent_fail:
	/* Undo the step that succeeded, not the one that just failed. */
	dm_transient_snapshot_exit();
transient_fail:
	return r;
}
/*
 * Tear down both built-in exception store types, persistent first and
 * transient second.
 */
void dm_exception_store_exit(void)
{
	dm_persistent_snapshot_exit();
	dm_transient_snapshot_exit();
}

View File

@@ -0,0 +1,192 @@
/*
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
* Copyright (C) 2008 Red Hat, Inc. All rights reserved.
*
* Device-mapper snapshot exception store.
*
* This file is released under the GPL.
*/
#ifndef _LINUX_DM_EXCEPTION_STORE
#define _LINUX_DM_EXCEPTION_STORE
#include <linux/blkdev.h>
#include <linux/device-mapper.h>
/*
* The snapshot code deals with largish chunks of the disk at a
* time. Typically 32k - 512k.
*/
/* Chunk numbers share the width of sector_t. */
typedef sector_t chunk_t;
/*
 * An exception is used where an old chunk of data has been
 * replaced by a new one.
 * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
 * of chunks that follow contiguously. Remaining bits hold the number of the
 * chunk within the device.
 */
struct dm_snap_exception {
	struct list_head hash_list;	/* list linkage for this exception */

	chunk_t old_chunk;		/* chunk that was replaced */
	chunk_t new_chunk;		/* replacement chunk, encoded as described above */
};
/*
* Abstraction to handle the meta/layout of exception stores (the
* COW device).
*/
struct dm_exception_store;
/*
 * Operations table describing one exception store implementation.
 */
struct dm_exception_store_type {
	/* Name the type is registered and looked up under. */
	const char *name;
	/* Module providing the type; a reference is held while it is in use. */
	struct module *module;

	/* Constructor for a store of this type; non-zero return means failure. */
	int (*ctr) (struct dm_exception_store *store,
		    unsigned argc, char **argv);

	/*
	 * Destroys this object when you've finished with it.
	 */
	void (*dtr) (struct dm_exception_store *store);

	/*
	 * The target shouldn't read the COW device until this is
	 * called. As exceptions are read from the COW, they are
	 * reported back via the callback.
	 */
	int (*read_metadata) (struct dm_exception_store *store,
			      int (*callback)(void *callback_context,
					      chunk_t old, chunk_t new),
			      void *callback_context);

	/*
	 * Find somewhere to store the next exception.
	 */
	int (*prepare_exception) (struct dm_exception_store *store,
				  struct dm_snap_exception *e);

	/*
	 * Update the metadata with this exception.
	 */
	void (*commit_exception) (struct dm_exception_store *store,
				  struct dm_snap_exception *e,
				  void (*callback) (void *, int success),
				  void *callback_context);

	/*
	 * The snapshot is invalid, note this in the metadata.
	 */
	void (*drop_snapshot) (struct dm_exception_store *store);

	/* Write status text for @status into @result (at most @maxlen bytes). */
	unsigned (*status) (struct dm_exception_store *store,
			    status_type_t status, char *result,
			    unsigned maxlen);

	/*
	 * Return how full the snapshot is.
	 */
	void (*fraction_full) (struct dm_exception_store *store,
			       sector_t *numerator,
			       sector_t *denominator);

	/* For internal device-mapper use only. */
	struct list_head list;
};
/*
 * One exception store instance: its implementation, the owning target,
 * the COW device and the chunk geometry.
 */
struct dm_exception_store {
	struct dm_exception_store_type *type;	/* implementation in use */
	struct dm_target *ti;			/* target this store belongs to */

	struct dm_dev *cow;			/* COW device */

	/* Size of data blocks saved - must be a power of 2 */
	unsigned chunk_size;
	unsigned chunk_mask;			/* chunk_size - 1 */
	unsigned chunk_shift;			/* log2 of chunk_size */

	void *context;				/* NOTE(review): presumably private data of the store type */
};
/*
* Functions to manipulate consecutive chunks
*/
/*
 * With a 64-bit chunk_t the top DM_CHUNK_CONSECUTIVE_BITS bits of
 * new_chunk count consecutive chunks and the low DM_CHUNK_NUMBER_BITS
 * bits hold the chunk number. Otherwise no consecutive count is kept and
 * the helpers degenerate to no-ops.
 */
#  if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
#    define DM_CHUNK_CONSECUTIVE_BITS 8
#    define DM_CHUNK_NUMBER_BITS 56

/* Strip the consecutive-count bits and return the plain chunk number. */
static inline chunk_t dm_chunk_number(chunk_t chunk)
{
	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
}

/* Return the consecutive count stored in the top bits of e->new_chunk. */
static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
{
	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
}

/* Increment the consecutive count; BUG if it wrapped around to zero. */
static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
{
	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);

	BUG_ON(!dm_consecutive_chunk_count(e));
}

#  else
#    define DM_CHUNK_CONSECUTIVE_BITS 0

/* No count bits are reserved: the value already is the chunk number. */
static inline chunk_t dm_chunk_number(chunk_t chunk)
{
	return chunk;
}

/* No consecutive count is tracked in this configuration. */
static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
{
	return 0;
}

/* Nothing to increment in this configuration. */
static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
{
}

#  endif
/*
* Return the number of sectors in the device.
*/
static inline sector_t get_dev_size(struct block_device *bdev)
{
return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
}
/*
 * Convert a sector number into the number of the chunk containing it,
 * using the store's chunk_shift.
 */
static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
				      sector_t sector)
{
	unsigned shift = store->chunk_shift;

	return sector >> shift;
}
/* Registration of exception store implementations. */
int dm_exception_store_type_register(struct dm_exception_store_type *type);
int dm_exception_store_type_unregister(struct dm_exception_store_type *type);

/* Validate a chunk size and set chunk_size/chunk_mask/chunk_shift. */
int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
				      unsigned chunk_size,
				      char **error);

/* Creation and destruction of exception store instances. */
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
			      unsigned *args_used,
			      struct dm_exception_store **store);
void dm_exception_store_destroy(struct dm_exception_store *store);

/* Set up and tear down the built-in store types. */
int dm_exception_store_init(void);
void dm_exception_store_exit(void);

/*
 * Two exception store implementations.
 */
int dm_persistent_snapshot_init(void);
void dm_persistent_snapshot_exit(void);

int dm_transient_snapshot_init(void);
void dm_transient_snapshot_exit(void);
#endif /* _LINUX_DM_EXCEPTION_STORE */

474
kernel/drivers/md/dm-io.c Normal file
View File

@@ -0,0 +1,474 @@
/*
* Copyright (C) 2003 Sistina Software
* Copyright (C) 2006 Red Hat GmbH
*
* This file is released under the GPL.
*/
#include <linux/device-mapper.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
/*
 * Per-client resources for dm-io.
 */
struct dm_io_client {
	mempool_t *pool;	/* pool of struct io, used by async_io() */
	struct bio_set *bios;	/* bio set that do_region() allocates bios from */
};
/*
 * Tracks one dm_io() request across all the bios issued for it.
 */
/* FIXME: can we shrink this ? */
struct io {
	unsigned long error_bits;	/* bit N set: region N saw an error */
	unsigned long eopnotsupp_bits;	/* bit N set: region N failed with -EOPNOTSUPP */
	atomic_t count;			/* outstanding bios, plus one held during dispatch */
	struct task_struct *sleeper;	/* task to wake for sync io; NULL for async */
	struct dm_io_client *client;
	io_notify_fn callback;		/* async completion callback */
	void *context;			/* argument passed to callback */
};
/*
* io contexts are only dynamically allocated for asynchronous
* io. Since async io is likely to be the majority of io we'll
* have the same number of io contexts as bios! (FIXME: must reduce this).
*/
static unsigned int pages_to_ios(unsigned int pages)
{
	/* Four io contexts are reserved per page. */
	const unsigned int ios_per_page = 4;	/* too many ? */

	return ios_per_page * pages;
}
/*
* Create a client with mempool and bioset.
*/
struct dm_io_client *dm_io_client_create(unsigned num_pages)
{
	struct dm_io_client *client;
	unsigned nr_ios = pages_to_ios(num_pages);

	client = kmalloc(sizeof(*client), GFP_KERNEL);
	if (!client)
		return ERR_PTR(-ENOMEM);

	/* Pool of struct io objects for asynchronous requests. */
	client->pool = mempool_create_kmalloc_pool(nr_ios, sizeof(struct io));
	if (!client->pool)
		goto free_client;

	client->bios = bioset_create(16, 0);
	if (!client->bios)
		goto free_pool;

	return client;

free_pool:
	mempool_destroy(client->pool);
free_client:
	kfree(client);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(dm_io_client_create);
/*
 * Resize the client's io pool to match @num_pages. Returns the result of
 * mempool_resize().
 */
int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
{
	unsigned nr_ios = pages_to_ios(num_pages);

	return mempool_resize(client->pool, nr_ios, GFP_KERNEL);
}
EXPORT_SYMBOL(dm_io_client_resize);
/*
 * Free a client created by dm_io_client_create(): its io pool, its bio
 * set and the client structure itself.
 */
void dm_io_client_destroy(struct dm_io_client *client)
{
	mempool_destroy(client->pool);
	bioset_free(client->bios);
	kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);
/*-----------------------------------------------------------------
* We need to keep track of which region a bio is doing io for.
* In order to save a memory allocation we store this in the last
* bvec, which we know is unused (blech).
* XXX This is ugly and can OOPS with some configs... find another way.
*---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
{
bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
}
static inline unsigned bio_get_region(struct bio *bio)
{
return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
}
/*-----------------------------------------------------------------
* We need an io object to keep track of the number of bios that
* have been dispatched for a particular io.
*---------------------------------------------------------------*/
/*
 * Record the result for @region and drop one reference on @io.
 *
 * When the last reference goes away, a synchronous request wakes its
 * sleeping task; an asynchronous one frees the io object and then runs
 * the completion callback with the accumulated error bits.
 */
static void dec_count(struct io *io, unsigned int region, int error)
{
	if (error) {
		set_bit(region, &io->error_bits);
		if (error == -EOPNOTSUPP)
			set_bit(region, &io->eopnotsupp_bits);
	}

	if (atomic_dec_and_test(&io->count)) {
		if (io->sleeper)
			wake_up_process(io->sleeper);

		else {
			/* Copy what the callback needs before io is freed. */
			unsigned long r = io->error_bits;
			io_notify_fn fn = io->callback;
			void *context = io->context;

			mempool_free(io, io->client->pool);
			fn(r, context);
		}
	}
}
/*
 * bio completion handler: zero-fill failed reads, recover the io object
 * and region from the bio, release the bio and account the completion.
 */
static void endio(struct bio *bio, int error)
{
	struct io *io;
	unsigned region;

	if (error && bio_data_dir(bio) == READ)
		zero_fill_bio(bio);

	/*
	 * The bio destructor in bio_put() may use the io object.
	 */
	io = bio->bi_private;
	region = bio_get_region(bio);

	/* Undo the bi_max_vecs decrement done in do_region(). */
	bio->bi_max_vecs++;
	bio_put(bio);

	dec_count(io, region, error);
}
/*-----------------------------------------------------------------
* These little objects provide an abstraction for getting a new
* destination page for io.
*---------------------------------------------------------------*/
/*
 * Page cursor: get_page() reports the current page, length and offset;
 * next_page() advances to the following page.
 */
struct dpages {
	void (*get_page)(struct dpages *dp,
			 struct page **p, unsigned long *len, unsigned *offset);
	void (*next_page)(struct dpages *dp);

	unsigned context_u;	/* provider-specific; used as a page offset by the list/vm/km providers */
	void *context_ptr;	/* provider-specific cursor (page_list, bio_vec or address) */
};
/*
* Functions for getting the pages from a list.
*/
/* Report the current page_list entry; context_u is the offset into it. */
static void list_get_page(struct dpages *dp,
		  struct page **p, unsigned long *len, unsigned *offset)
{
	struct page_list *entry = (struct page_list *) dp->context_ptr;
	unsigned start = dp->context_u;

	*p = entry->page;
	*len = PAGE_SIZE - start;
	*offset = start;
}

/* Step to the next list entry; only the first page has an offset. */
static void list_next_page(struct dpages *dp)
{
	struct page_list *entry = (struct page_list *) dp->context_ptr;

	dp->context_u = 0;
	dp->context_ptr = entry->next;
}

/* Set up @dp to walk the page list @pl starting @offset bytes in. */
static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
	dp->context_ptr = pl;
	dp->context_u = offset;
	dp->next_page = list_next_page;
	dp->get_page = list_get_page;
}
/*
* Functions for getting the pages from a bvec.
*/
/* Report page, length and offset of the current bio_vec. */
static void bvec_get_page(struct dpages *dp,
		  struct page **p, unsigned long *len, unsigned *offset)
{
	struct bio_vec *cur = (struct bio_vec *) dp->context_ptr;

	*offset = cur->bv_offset;
	*len = cur->bv_len;
	*p = cur->bv_page;
}

/* Advance to the next bio_vec in the array. */
static void bvec_next_page(struct dpages *dp)
{
	struct bio_vec *cur = (struct bio_vec *) dp->context_ptr;

	dp->context_ptr = &cur[1];
}

/* Set up @dp to walk the bio_vec array starting at @bvec. */
static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
{
	dp->context_ptr = bvec;
	dp->next_page = bvec_next_page;
	dp->get_page = bvec_get_page;
}
/*
* Functions for getting the pages from a VMA.
*/
/* Report the page behind the current vmalloc address. */
static void vm_get_page(struct dpages *dp,
		 struct page **p, unsigned long *len, unsigned *offset)
{
	unsigned in_page = dp->context_u;

	*p = vmalloc_to_page(dp->context_ptr);
	*offset = in_page;
	*len = PAGE_SIZE - in_page;
}

/* Move the cursor to the start of the next page. */
static void vm_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

/* Set up @dp to walk vmalloc memory starting at @data. */
static void vm_dp_init(struct dpages *dp, void *data)
{
	dp->context_ptr = data;
	/* Offset of @data within its page. */
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->next_page = vm_next_page;
	dp->get_page = vm_get_page;
}
/*
 * bio destructor: return the bio to the bio set of the client that
 * issued it, found through the io object in bi_private.
 */
static void dm_bio_destructor(struct bio *bio)
{
	struct io *io = bio->bi_private;

	bio_free(bio, io->client->bios);
}
/*
* Functions for getting the pages from kernel memory.
*/
/* Report the page behind the current kernel virtual address. */
static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
			unsigned *offset)
{
	unsigned in_page = dp->context_u;

	*p = virt_to_page(dp->context_ptr);
	*offset = in_page;
	*len = PAGE_SIZE - in_page;
}

/* Move the cursor to the start of the next page. */
static void km_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

/* Set up @dp to walk kernel memory starting at @data. */
static void km_dp_init(struct dpages *dp, void *data)
{
	dp->context_ptr = data;
	/* Offset of @data within its page. */
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->next_page = km_next_page;
	dp->get_page = km_get_page;
}
/*-----------------------------------------------------------------
* IO routines that accept a list of pages.
*---------------------------------------------------------------*/
/*
 * Issue the io for one region, splitting it into as many bios as needed.
 *
 * @rw:     request flags passed to submit_bio()
 * @region: index of the region, recorded in each bio for endio()
 * @where:  device, start sector and sector count of the region
 * @dp:     page cursor supplying the data pages; advanced as pages are used
 * @io:     request tracker; its count is incremented once per submitted bio
 */
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	while (remaining) {
		/*
		 * Allocate a suitably sized-bio: we add an extra
		 * bvec for bio_get/set_region() and decrement bi_max_vecs
		 * to hide it from bio_add_page().
		 */
		num_bvecs = dm_sector_div_up(remaining,
					     (PAGE_SIZE >> SECTOR_SHIFT));
		num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
				      num_bvecs);
		if (unlikely(num_bvecs > BIO_MAX_PAGES))
			num_bvecs = BIO_MAX_PAGES;
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		/* Continue where the previous bio for this region stopped. */
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_private = io;
		bio->bi_destructor = dm_bio_destructor;
		bio->bi_max_vecs--;
		bio_set_region(bio, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	}
}
static void dispatch_io(int rw, unsigned int num_regions,
struct dm_io_region *where, struct dpages *dp,
struct io *io, int sync)
{
int i;
struct dpages old_pages = *dp;
if (sync)
rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
/*
* For multiple regions we need to be careful to rewind
* the dp object for each call to do_region.
*/
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
if (where[i].count)
do_region(rw, i, where + i, dp, io);
}
/*
* Drop the extra reference that we were holding to avoid
* the io being completed too early.
*/
dec_count(io, 0, 0);
}
static int sync_io(struct dm_io_client *client, unsigned int num_regions,
struct dm_io_region *where, int rw, struct dpages *dp,
unsigned long *error_bits)
{
struct io io;
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
return -EIO;
}
retry:
io.error_bits = 0;
io.eopnotsupp_bits = 0;
atomic_set(&io.count, 1); /* see dispatch_io() */
io.sleeper = current;
io.client = client;
dispatch_io(rw, num_regions, where, dp, &io, 1);
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!atomic_read(&io.count))
break;
io_schedule();
}
set_current_state(TASK_RUNNING);
if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
rw &= ~(1 << BIO_RW_BARRIER);
goto retry;
}
if (error_bits)
*error_bits = io.error_bits;
return io.error_bits ? -EIO : 0;
}
/*
 * Start the io asynchronously; @fn is called with the error bits and
 * @context once every bio has completed.
 *
 * Only writes may target more than one region; anything else warns,
 * calls @fn with an error value of 1 and returns -EIO. Otherwise returns 0
 * after the io has been dispatched.
 */
static int async_io(struct dm_io_client *client, unsigned int num_regions,
		    struct dm_io_region *where, int rw, struct dpages *dp,
		    io_notify_fn fn, void *context)
{
	struct io *req;

	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	req = mempool_alloc(client->pool, GFP_NOIO);

	req->client = client;
	req->callback = fn;
	req->context = context;
	req->sleeper = NULL;		/* nobody sleeps on async io */
	req->error_bits = 0;
	req->eopnotsupp_bits = 0;
	atomic_set(&req->count, 1);	/* see dispatch_io() */

	dispatch_io(rw, num_regions, where, dp, req, 0);

	return 0;
}
/*
 * Initialise @dp with the page provider matching the request's memory
 * type. Returns 0 on success or -EINVAL for an unknown memory type.
 */
static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
{
	int r = 0;

	switch (io_req->mem.type) {
	case DM_IO_PAGE_LIST:
		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
		break;

	case DM_IO_BVEC:
		bvec_dp_init(dp, io_req->mem.ptr.bvec);
		break;

	case DM_IO_VMA:
		vm_dp_init(dp, io_req->mem.ptr.vma);
		break;

	case DM_IO_KMEM:
		km_dp_init(dp, io_req->mem.ptr.addr);
		break;

	default:
		r = -EINVAL;
		break;
	}

	return r;
}
/*
* New collapsed (a)synchronous interface.
*
* If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
* the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in
* io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
* the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
*/
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
	  struct dm_io_region *where, unsigned long *sync_error_bits)
{
	struct dpages dp;
	int r = dp_init(io_req, &dp);

	if (r)
		return r;

	/* A notify function selects the asynchronous path. */
	if (io_req->notify.fn)
		return async_io(io_req->client, num_regions, where,
				io_req->bi_rw, &dp, io_req->notify.fn,
				io_req->notify.context);

	return sync_io(io_req->client, num_regions, where,
		       io_req->bi_rw, &dp, sync_error_bits);
}
EXPORT_SYMBOL(dm_io);

1618
kernel/drivers/md/dm-ioctl.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,673 @@
/*
* Copyright (C) 2002 Sistina Software (UK) Limited.
* Copyright (C) 2006 Red Hat GmbH
*
* This file is released under the GPL.
*
* Kcopyd provides a simple interface for copying an area of one
* block-device to one or more other block-devices, with an asynchronous
* completion notification.
*/
#include <linux/types.h>
#include <asm/atomic.h>
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/device-mapper.h>
#include <linux/dm-kcopyd.h>
#include "dm.h"
/*-----------------------------------------------------------------
* Each kcopyd client has its own little pool of preallocated
* pages for kcopyd io.
*---------------------------------------------------------------*/
struct dm_kcopyd_client {
	spinlock_t lock;		/* taken around pages / nr_free_pages updates */
	struct page_list *pages;	/* list of currently free pages */
	unsigned int nr_pages;		/* total pages owned by this client */
	unsigned int nr_free_pages;	/* pages currently on the free list */

	struct dm_io_client *io_client;	/* dm-io client used to issue the io */

	wait_queue_head_t destroyq;	/* woken when nr_jobs drops to zero */
	atomic_t nr_jobs;		/* jobs dispatched but not yet completed */

	mempool_t *job_pool;		/* pool that kcopyd_job objects are freed to */

	struct workqueue_struct *kcopyd_wq;	/* workqueue running do_work() */
	struct work_struct kcopyd_work;

/*
 * We maintain three lists of jobs:
 *
 * i)   jobs waiting for pages
 * ii)  jobs that have pages, and are waiting for the io to be issued.
 * iii) jobs that have completed.
 *
 * All three of these are protected by job_lock.
 */
	spinlock_t job_lock;
	struct list_head complete_jobs;
	struct list_head io_jobs;
	struct list_head pages_jobs;
};
/*
 * Queue the client's work item on its workqueue.
 */
static void wake(struct dm_kcopyd_client *kc)
{
	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
}
/*
 * Allocate a page_list node together with the page it refers to.
 * Returns NULL if either allocation fails; the 'next' field of the
 * returned node is left uninitialised.
 */
static struct page_list *alloc_pl(void)
{
	struct page_list *node = kmalloc(sizeof(*node), GFP_KERNEL);

	if (node) {
		node->page = alloc_page(GFP_KERNEL);
		if (node->page)
			return node;

		/* No page available: give the node back as well. */
		kfree(node);
	}

	return NULL;
}
/*
 * Free a page_list node and the page it refers to.
 */
static void free_pl(struct page_list *pl)
{
	__free_page(pl->page);
	kfree(pl);
}
/*
 * Detach the first @nr pages from the client's free list and return them
 * through *pages as a NULL-terminated list.
 *
 * Returns 0 on success, or -ENOMEM when fewer than @nr pages are free.
 * @nr must be at least 1: the walk below pre-decrements an unsigned
 * counter, so 0 would wrap around.
 */
static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
			    unsigned int nr, struct page_list **pages)
{
	struct page_list *pl;

	spin_lock(&kc->lock);
	if (kc->nr_free_pages < nr) {
		spin_unlock(&kc->lock);
		return -ENOMEM;
	}

	kc->nr_free_pages -= nr;
	/* Advance pl to the nr-th node; *pages is the head of the list. */
	for (*pages = pl = kc->pages; --nr; pl = pl->next)
		;

	/* Cut the list after the nr-th node. */
	kc->pages = pl->next;
	pl->next = NULL;

	spin_unlock(&kc->lock);

	return 0;
}
/*
 * Return the NULL-terminated page list @pl to the front of the client's
 * free list and account for the pages returned.
 */
static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
{
	struct page_list *tail = pl;
	unsigned int returned = 1;

	spin_lock(&kc->lock);

	/* Find the last node, counting the nodes on the way. */
	while (tail->next) {
		tail = tail->next;
		returned++;
	}

	kc->nr_free_pages += returned;
	tail->next = kc->pages;
	kc->pages = pl;

	spin_unlock(&kc->lock);
}
/*
* These three functions resize the page pool.
*/
/*
 * Free every node, and its page, on the list starting at @pl.
 */
static void drop_pages(struct page_list *pl)
{
	struct page_list *following;

	for (; pl; pl = following) {
		/* Read the link before the node is freed. */
		following = pl->next;
		free_pl(pl);
	}
}
/*
 * Allocate @nr pages and add them to the client's free list.
 *
 * Returns 0 on success. If any allocation fails the pages allocated so
 * far are freed again and -ENOMEM is returned.
 */
static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr)
{
	struct page_list *head = NULL;
	unsigned int remaining;

	for (remaining = nr; remaining; remaining--) {
		struct page_list *fresh = alloc_pl();

		if (!fresh) {
			if (head)
				drop_pages(head);
			return -ENOMEM;
		}

		/* Prepend to the private list being built. */
		fresh->next = head;
		head = fresh;
	}

	kcopyd_put_pages(kc, head);
	kc->nr_pages += nr;

	return 0;
}
/*
 * Free every page owned by the client. All pages must be back on the
 * free list; anything else triggers BUG_ON.
 */
static void client_free_pages(struct dm_kcopyd_client *kc)
{
	BUG_ON(kc->nr_free_pages != kc->nr_pages);
	drop_pages(kc->pages);
	kc->pages = NULL;
	kc->nr_free_pages = kc->nr_pages = 0;
}
/*-----------------------------------------------------------------
* kcopyd_jobs need to be allocated by the *clients* of kcopyd,
* for this reason we use a mempool to prevent the client from
* ever having to do io (which could cause a deadlock).
*---------------------------------------------------------------*/
/*
 * One copy request: a source region, up to DM_KCOPYD_MAX_REGIONS
 * destinations, the pages used for the transfer and the completion
 * callback.
 */
struct kcopyd_job {
	struct dm_kcopyd_client *kc;	/* owning client */
	struct list_head list;		/* linkage on one of the client's job lists */
	unsigned long flags;		/* e.g. DM_KCOPYD_IGNORE_ERROR */

	/*
	 * Error state of the job.
	 */
	int read_err;
	unsigned long write_err;

	/*
	 * Either READ or WRITE
	 */
	int rw;
	struct dm_io_region source;

	/*
	 * The destinations for the transfer.
	 */
	unsigned int num_dests;
	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

	sector_t offset;		/* passed to dm_io() as the memory offset */
	unsigned int nr_pages;		/* pages needed, computed in run_pages_job() */
	struct page_list *pages;	/* pages obtained from the client's free list */

	/*
	 * Set this to ensure you are notified when the job has
	 * completed.  'context' is for callback to use.
	 */
	dm_kcopyd_notify_fn fn;
	void *context;

	/*
	 * These fields are only used if the job has been split
	 * into more manageable parts.
	 */
	struct mutex lock;
	atomic_t sub_jobs;
	sector_t progress;
};
/* FIXME: this should scale with the number of pages */
#define MIN_JOBS 512

/* Slab cache for struct kcopyd_job, created in dm_kcopyd_init(). */
static struct kmem_cache *_job_cache;
/*
 * Create the slab cache for kcopyd jobs.
 * Returns 0 on success or -ENOMEM if the cache cannot be created.
 */
int __init dm_kcopyd_init(void)
{
	_job_cache = KMEM_CACHE(kcopyd_job, 0);

	return _job_cache ? 0 : -ENOMEM;
}
/*
 * Destroy the job slab cache and clear the pointer to it.
 */
void dm_kcopyd_exit(void)
{
	kmem_cache_destroy(_job_cache);
	_job_cache = NULL;
}
/*
* Functions to push and pop a job onto the head of a given job
* list.
*/
/*
 * Remove and return the first job on @jobs, or NULL if the list is
 * empty. Takes kc->job_lock with interrupts saved.
 */
static struct kcopyd_job *pop(struct list_head *jobs,
			      struct dm_kcopyd_client *kc)
{
	struct kcopyd_job *first;
	unsigned long irq_flags;

	spin_lock_irqsave(&kc->job_lock, irq_flags);

	if (list_empty(jobs)) {
		first = NULL;
	} else {
		first = list_entry(jobs->next, struct kcopyd_job, list);
		list_del(&first->list);
	}

	spin_unlock_irqrestore(&kc->job_lock, irq_flags);

	return first;
}
/*
 * Append @job to the tail of @jobs under the owning client's job_lock.
 */
static void push(struct list_head *jobs, struct kcopyd_job *job)
{
	struct dm_kcopyd_client *owner = job->kc;
	unsigned long irq_flags;

	spin_lock_irqsave(&owner->job_lock, irq_flags);
	list_add_tail(&job->list, jobs);
	spin_unlock_irqrestore(&owner->job_lock, irq_flags);
}
/*
 * Insert @job at the head of @jobs under the owning client's job_lock.
 */
static void push_head(struct list_head *jobs, struct kcopyd_job *job)
{
	struct dm_kcopyd_client *owner = job->kc;
	unsigned long irq_flags;

	spin_lock_irqsave(&owner->job_lock, irq_flags);
	list_add(&job->list, jobs);
	spin_unlock_irqrestore(&owner->job_lock, irq_flags);
}
/*
* These three functions process 1 item from the corresponding
* job list.
*
* They return:
* < 0: error
* 0: success
* > 0: can't process yet.
*/
/*
 * Finish a job: return its pages to the client, free the job, call the
 * completion callback and wake anyone waiting for the client's job count
 * to reach zero. Always returns 0.
 */
static int run_complete_job(struct kcopyd_job *job)
{
	/* Copy everything needed below: the job is freed before fn() runs. */
	void *context = job->context;
	int read_err = job->read_err;
	unsigned long write_err = job->write_err;
	dm_kcopyd_notify_fn fn = job->fn;
	struct dm_kcopyd_client *kc = job->kc;

	if (job->pages)
		kcopyd_put_pages(kc, job->pages);
	mempool_free(job, kc->job_pool);
	fn(read_err, write_err, context);

	if (atomic_dec_and_test(&kc->nr_jobs))
		wake_up(&kc->destroyq);

	return 0;
}
/*
 * dm-io completion callback for a job.
 *
 * Records any error on the job. A failed job (unless errors are ignored)
 * and a finished write go to the complete list; a finished read is
 * turned into a write and put back on the io list. The worker is woken
 * in every case.
 */
static void complete_io(unsigned long error, void *context)
{
	struct kcopyd_job *job = (struct kcopyd_job *) context;
	struct dm_kcopyd_client *kc = job->kc;
	struct list_head *next_list;
	int failed = 0;

	if (error) {
		if (job->rw == WRITE)
			job->write_err |= error;
		else
			job->read_err = 1;

		failed = !test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags);
	}

	if (failed || job->rw == WRITE) {
		next_list = &kc->complete_jobs;
	} else {
		/* Read phase done: now write the data out. */
		job->rw = WRITE;
		next_list = &kc->io_jobs;
	}

	push(next_list, job);
	wake(kc);
}
/*
* Request io on as many buffer heads as we can currently get for
* a particular job.
*/
/*
 * Submit the job's current phase through dm_io(): a read from the single
 * source region, or a write to all destination regions. complete_io() is
 * registered as the completion callback. Returns the result of dm_io().
 */
static int run_io_job(struct kcopyd_job *job)
{
	struct dm_io_request io_req = {
		.bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG),
		.mem.type = DM_IO_PAGE_LIST,
		.mem.ptr.pl = job->pages,
		.mem.offset = job->offset,
		.notify.fn = complete_io,
		.notify.context = job,
		.client = job->kc->io_client,
	};

	if (job->rw == READ)
		return dm_io(&io_req, 1, &job->source, NULL);

	return dm_io(&io_req, job->num_dests, job->dests, NULL);
}
/*
 * Try to obtain the pages a job needs.
 *
 * Returns 0 and moves the job to the io list once it has its pages,
 * 1 when not enough pages are free right now, or another error code
 * passed through from kcopyd_get_pages().
 */
static int run_pages_job(struct kcopyd_job *job)
{
	int rc;

	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
				  PAGE_SIZE >> 9);

	rc = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
	if (rc == 0) {
		/* this job is ready for io */
		push(&job->kc->io_jobs, job);
		return 0;
	}

	/* -ENOMEM only means "can't complete now". */
	return (rc == -ENOMEM) ? 1 : rc;
}
/*
* Run through a list for as long as possible. Returns the count
* of successful jobs.
*/
static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
			int (*fn) (struct kcopyd_job *))
{
	struct kcopyd_job *job;
	int done = 0;

	for (job = pop(jobs, kc); job; job = pop(jobs, kc)) {
		int rc = fn(job);

		if (rc < 0) {
			/* Mark the job as failed and hand it to completion. */
			if (job->rw == WRITE)
				job->write_err = (unsigned long) -1L;
			else
				job->read_err = 1;
			push(&kc->complete_jobs, job);
			break;
		}

		if (rc > 0) {
			/*
			 * The job cannot be serviced at the moment; put
			 * it back at the front of the list and stop.
			 */
			push_head(jobs, job);
			break;
		}

		done++;
	}

	return done;
}
/*
* kcopyd does this every time it's woken up.
*/
/* Work function for kc->kcopyd_work: drains the client's three job lists. */
static void do_work(struct work_struct *work)
{
/* The work item is embedded in the client, so recover the client from it. */
struct dm_kcopyd_client *kc = container_of(work,
struct dm_kcopyd_client, kcopyd_work);
/*
* The order that these are called is *very* important.
* complete jobs can free some pages for pages jobs.
* Pages jobs when successful will jump onto the io jobs
* list. io jobs call wake when they complete and it all
* starts again.
*/
process_jobs(&kc->complete_jobs, kc, run_complete_job);
process_jobs(&kc->pages_jobs, kc, run_pages_job);
process_jobs(&kc->io_jobs, kc, run_io_job);
}
/*
* If we are copying a small region we just dispatch a single job
* to do the copy, otherwise the io has to be split up into many
* jobs.
*/
/* Queue a job on its client's pages list and wake the worker. */
static void dispatch_job(struct kcopyd_job *job)
{
/*
 * NOTE(review): kc is cached in a local before the job is queued,
 * presumably because the job may be consumed by the worker as soon as
 * it is on the list - keep wake() using the local, not job->kc.
 */
struct dm_kcopyd_client *kc = job->kc;
/* Count the job as outstanding before it becomes visible on a list. */
atomic_inc(&kc->nr_jobs);
push(&kc->pages_jobs, job);
wake(kc);
}
#define SUB_JOB_SIZE 128
/*
 * Notify callback for the sub jobs of a split copy; also called directly
 * by split_job() (with no errors) to start the first sub jobs.
 *
 * @read_err/@write_err: outcome of the sub job that just finished.
 * @context: the master job the sub job belongs to.
 *
 * Folds the errors into the master job, then either dispatches the next
 * chunk (at most SUB_JOB_SIZE units of source.count) as a new sub job or,
 * when there is nothing left to hand out, drops one reference on
 * job->sub_jobs and queues the master job for completion on the last one.
 */
static void segment_complete(int read_err, unsigned long write_err,
void *context)
{
/* FIXME: tidy this function */
sector_t progress = 0;
sector_t count = 0;
struct kcopyd_job *job = (struct kcopyd_job *) context;
struct dm_kcopyd_client *kc = job->kc;
/* job->lock protects the master job's error state and progress. */
mutex_lock(&job->lock);
/* update the error */
if (read_err)
job->read_err = 1;
if (write_err)
job->write_err |= write_err;
/*
* Only dispatch more work if there hasn't been an error.
*/
if ((!job->read_err && !job->write_err) ||
test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
/* get the next chunk of work */
progress = job->progress;
count = job->source.count - progress;
if (count) {
if (count > SUB_JOB_SIZE)
count = SUB_JOB_SIZE;
job->progress += count;
}
}
mutex_unlock(&job->lock);
/* count stays 0 when an error stopped the copy or nothing is left. */
if (count) {
int i;
struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
GFP_NOIO);
/* Start from a copy of the master job, then narrow it to this chunk. */
*sub_job = *job;
sub_job->source.sector += progress;
sub_job->source.count = count;
for (i = 0; i < job->num_dests; i++) {
sub_job->dests[i].sector += progress;
sub_job->dests[i].count = count;
}
/* Completion of the sub job re-enters this function with the master. */
sub_job->fn = segment_complete;
sub_job->context = job;
dispatch_job(sub_job);
} else if (atomic_dec_and_test(&job->sub_jobs)) {
/*
* Queue the completion callback to the kcopyd thread.
*
* Some callers assume that all the completions are called
* from a single thread and don't race with each other.
*
* We must not call the callback directly here because this
* code may not be executing in the thread.
*/
push(&kc->complete_jobs, job);
wake(kc);
}
}
/*
* Create some little jobs that will do the move between
* them.
*/
#define SPLIT_COUNT 8
static void split_job(struct kcopyd_job *job)
{
	int remaining = SPLIT_COUNT;

	/* The master job counts as one outstanding job on the client. */
	atomic_inc(&job->kc->nr_jobs);

	/* One reference per initial segment_complete() call below. */
	atomic_set(&job->sub_jobs, SPLIT_COUNT);

	while (remaining--)
		segment_complete(0, 0u, job);
}
/*
 * Submit an asynchronous copy of region @from to @num_dests destination
 * regions in @dests on behalf of client @kc.  @fn is called with @context
 * once the copy has finished.  Always returns 0.
 *
 * Copies smaller than SUB_JOB_SIZE are dispatched as a single job; larger
 * ones are split into sub jobs.
 */
int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
		   unsigned int num_dests, struct dm_io_region *dests,
		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
	struct kcopyd_job *job = mempool_alloc(kc->job_pool, GFP_NOIO);

	/* Ownership and completion reporting. */
	job->kc = kc;
	job->fn = fn;
	job->context = context;
	job->flags = flags;

	/* Every job starts as a read with a clean error state and no pages. */
	job->rw = READ;
	job->read_err = 0;
	job->write_err = 0;
	job->offset = 0;
	job->nr_pages = 0;
	job->pages = NULL;

	/* Where to copy from and to. */
	job->source = *from;
	job->num_dests = num_dests;
	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

	if (job->source.count >= SUB_JOB_SIZE) {
		mutex_init(&job->lock);
		job->progress = 0;
		split_job(job);
	} else {
		dispatch_job(job);
	}

	return 0;
}
EXPORT_SYMBOL(dm_kcopyd_copy);
/*
* Cancels a kcopyd job, eg. someone might be deactivating a
* mirror.
*/
#if 0
/*
 * Unfinished placeholder, compiled out: job cancellation is not
 * implemented and this stub would only ever return -1.
 */
int kcopyd_cancel(struct kcopyd_job *job, int block)
{
/* FIXME: finish */
return -1;
}
#endif /* 0 */
/*-----------------------------------------------------------------
* Client setup
*---------------------------------------------------------------*/
/*
 * Allocate and initialise a kcopyd client with @nr_pages pages.
 *
 * On success stores the new client in *@result and returns 0.  On failure
 * everything acquired so far is released and a negative errno is returned
 * (*@result is left untouched).
 */
int dm_kcopyd_client_create(unsigned int nr_pages,
			    struct dm_kcopyd_client **result)
{
	struct dm_kcopyd_client *kc;
	int r = -ENOMEM;

	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
	if (!kc)
		return -ENOMEM;

	/* Initialisation that cannot fail. */
	spin_lock_init(&kc->lock);
	spin_lock_init(&kc->job_lock);
	INIT_LIST_HEAD(&kc->complete_jobs);
	INIT_LIST_HEAD(&kc->io_jobs);
	INIT_LIST_HEAD(&kc->pages_jobs);
	INIT_WORK(&kc->kcopyd_work, do_work);
	init_waitqueue_head(&kc->destroyq);
	atomic_set(&kc->nr_jobs, 0);
	kc->pages = NULL;
	kc->nr_pages = kc->nr_free_pages = 0;

	/* Resources; each failure unwinds the ones acquired before it. */
	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
	if (!kc->job_pool)
		goto err_free_client;

	kc->kcopyd_wq = create_singlethread_workqueue("kcopyd");
	if (!kc->kcopyd_wq)
		goto err_destroy_pool;

	r = client_alloc_pages(kc, nr_pages);
	if (r)
		goto err_destroy_wq;

	kc->io_client = dm_io_client_create(nr_pages);
	if (IS_ERR(kc->io_client)) {
		r = PTR_ERR(kc->io_client);
		goto err_free_pages;
	}

	*result = kc;
	return 0;

err_free_pages:
	client_free_pages(kc);
err_destroy_wq:
	destroy_workqueue(kc->kcopyd_wq);
err_destroy_pool:
	mempool_destroy(kc->job_pool);
err_free_client:
	kfree(kc);
	return r;
}
EXPORT_SYMBOL(dm_kcopyd_client_create);
/*
 * Tear down a client created by dm_kcopyd_client_create().  Blocks until
 * every job submitted through the client has completed.
 */
void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
{
/* Wait for completion of all jobs submitted by this client. */
wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
/* With no jobs outstanding, all three lists must be empty. */
BUG_ON(!list_empty(&kc->complete_jobs));
BUG_ON(!list_empty(&kc->io_jobs));
BUG_ON(!list_empty(&kc->pages_jobs));
/* Release resources in the reverse order of their creation. */
destroy_workqueue(kc->kcopyd_wq);
dm_io_client_destroy(kc->io_client);
client_free_pages(kc);
mempool_destroy(kc->job_pool);
kfree(kc);
}
EXPORT_SYMBOL(dm_kcopyd_client_destroy);

View File

@@ -0,0 +1,171 @@
/*
* Copyright (C) 2001-2003 Sistina Software (UK) Limited.
*
* This file is released under the GPL.
*/
#include "dm.h"
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "linear"
/*
* Linear: maps a linear range of a device.
*/
/* Per-target context, stored in ti->private by linear_ctr(). */
struct linear_c {
struct dm_dev *dev; /* underlying device, obtained with dm_get_device() */
sector_t start; /* <offset> constructor argument: first sector used on dev */
};
/*
* Construct a linear mapping: <dev_path> <offset>
*/
static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct linear_c *lc;
unsigned long long tmp;
if (argc != 2) {
ti->error = "Invalid argument count";
return -EINVAL;
}
lc = kmalloc(sizeof(*lc), GFP_KERNEL);
if (lc == NULL) {
ti->error = "dm-linear: Cannot allocate linear context";
return -ENOMEM;
}
if (sscanf(argv[1], "%llu", &tmp) != 1) {
ti->error = "dm-linear: Invalid device sector";
goto bad;
}
lc->start = tmp;
if (dm_get_device(ti, argv[0], lc->start, ti->len,
dm_table_get_mode(ti->table), &lc->dev)) {
ti->error = "dm-linear: Device lookup failed";
goto bad;
}
ti->num_flush_requests = 1;
ti->private = lc;
return 0;
bad:
kfree(lc);
return -EINVAL;
}
/* Target destructor: release the device reference and free the context. */
static void linear_dtr(struct dm_target *ti)
{
	struct linear_c *ctx = ti->private;

	dm_put_device(ti, ctx->dev);
	kfree(ctx);
}
/* Translate a sector of the mapped device into a sector of the underlying device. */
static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
{
	struct linear_c *ctx = ti->private;
	sector_t offset_in_target = bi_sector - ti->begin;

	return ctx->start + offset_in_target;
}
/*
 * Redirect a bio to the underlying device.  The sector is only remapped
 * when bio_sectors() reports a non-zero size for the bio.
 */
static void linear_map_bio(struct dm_target *ti, struct bio *bio)
{
	struct linear_c *ctx = ti->private;

	bio->bi_bdev = ctx->dev->bdev;

	if (!bio_sectors(bio))
		return;

	bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
}
/*
 * .map hook: remap the bio onto the underlying device and report it as
 * remapped.  map_context is not used.
 */
static int linear_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
linear_map_bio(ti, bio);
return DM_MAPIO_REMAPPED;
}
/*
 * .status hook.  STATUSTYPE_INFO produces an empty string;
 * STATUSTYPE_TABLE produces "<device name> <start sector>".  Any other
 * type leaves @result untouched.  Always returns 0.
 */
static int linear_status(struct dm_target *ti, status_type_t type,
			 char *result, unsigned int maxlen)
{
	struct linear_c *ctx = ti->private;

	if (type == STATUSTYPE_INFO)
		result[0] = '\0';
	else if (type == STATUSTYPE_TABLE)
		snprintf(result, maxlen, "%s %llu", ctx->dev->name,
			 (unsigned long long)ctx->start);

	return 0;
}
static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long arg)
{
struct linear_c *lc = (struct linear_c *) ti->private;
return __blkdev_driver_ioctl(lc->dev->bdev, lc->dev->mode, cmd, arg);
}
/*
 * .merge hook: if the underlying queue has a merge_bvec_fn, ask it (with
 * the request remapped onto the underlying device) and return the smaller
 * of its answer and @max_size; otherwise return @max_size unchanged.
 */
static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
			struct bio_vec *biovec, int max_size)
{
	struct linear_c *ctx = ti->private;
	struct request_queue *q = bdev_get_queue(ctx->dev->bdev);

	if (q->merge_bvec_fn) {
		bvm->bi_bdev = ctx->dev->bdev;
		bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector);

		return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
	}

	return max_size;
}
/*
 * .iterate_devices hook: the target has exactly one underlying device,
 * so call @fn once for it, covering [start, start + ti->len).
 */
static int linear_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct linear_c *ctx = ti->private;

	return fn(ti, ctx->dev, ctx->start, ti->len, data);
}
/* Target type descriptor handed to dm_register_target() for "linear". */
static struct target_type linear_target = {
.name = "linear",
.version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = linear_ctr, /* parse <dev_path> <offset>, grab the device */
.dtr = linear_dtr, /* drop the device, free the context */
.map = linear_map,
.status = linear_status,
.ioctl = linear_ioctl,
.merge = linear_merge,
.iterate_devices = linear_iterate_devices,
};
/* Register the linear target; returns dm_register_target()'s result. */
int __init dm_linear_init(void)
{
	int err;

	err = dm_register_target(&linear_target);
	if (err >= 0)
		return err;

	DMERR("register failed %d", err);
	return err;
}
/* Counterpart of dm_linear_init(): unregister the linear target. */
void dm_linear_exit(void)
{
dm_unregister_target(&linear_target);
}

View File

@@ -0,0 +1,705 @@
/*
* Copyright (C) 2006-2009 Red Hat, Inc.
*
* This file is released under the LGPL.
*/
#include <linux/bio.h>
#include <linux/dm-dirty-log.h>
#include <linux/device-mapper.h>
#include <linux/dm-log-userspace.h>
#include "dm-log-userspace-transfer.h"
/* One deferred mark/clear request, queued until the next userspace_flush(). */
struct flush_entry {
int type; /* DM_ULOG_MARK_REGION or DM_ULOG_CLEAR_REGION */
region_t region; /* region the request applies to */
struct list_head list; /* link in log_c.flush_list */
};
/* Per-log context, stored in dm_dirty_log.context by userspace_ctr(). */
struct log_c {
struct dm_target *ti;
uint32_t region_size; /* fetched once from the server in userspace_ctr() */
region_t region_count; /* ti->len divided by region_size, rounded up */
uint64_t luid; /* local unique id: the address of this structure */
char uuid[DM_UUID_LEN];
char *usr_argv_str; /* constructor string, re-sent when reconnecting to the server */
uint32_t usr_argc;
/*
 * in_sync_hint gets set when doing is_remote_recovering. It
 * represents the first region that needs recovery, i.e. the
 * first zero bit of sync_bits. This can be useful to limit
 * traffic for calls like is_remote_recovering and get_resync_work,
 * but take care when using it for anything else.
 */
uint64_t in_sync_hint;
spinlock_t flush_lock; /* protects flush_list */
struct list_head flush_list; /* only for clear and mark requests */
};
/* Pool of struct flush_entry, created at module init and shared by all logs. */
static mempool_t *flush_entry_pool;
/* mempool allocation callback for flush_entry_pool; pool_data is unused. */
static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
{
return kmalloc(sizeof(struct flush_entry), gfp_mask);
}
/* mempool free callback for flush_entry_pool; pool_data is unused. */
static void flush_entry_free(void *element, void *pool_data)
{
kfree(element);
}
/*
 * Send a request to the userspace log server, transparently riding out
 * server restarts.
 *
 * If the server isn't there, -ESRCH is returned by the transfer layer.
 * In that case we keep re-sending the constructor string every two
 * seconds until the server accepts it, resume the log, and then retry
 * the original request.
 *
 * Returns the result of the request, or -ESRCH if the log could not be
 * resumed after reconnecting.
 */
static int userspace_do_request(struct log_c *lc, const char *uuid,
				int request_type, char *data, size_t data_size,
				char *rdata, size_t *rdata_size)
{
	int r;

	for (;;) {
		r = dm_consult_userspace(uuid, lc->luid, request_type, data,
					 data_size, rdata, rdata_size);
		if (r != -ESRCH)
			return r;

		DMERR(" Userspace log server not found.");

		/* Poll until the server has rebuilt the log. */
		do {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(2*HZ);
			DMWARN("Attempting to contact userspace log server...");
			r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,
						 lc->usr_argv_str,
						 strlen(lc->usr_argv_str) + 1,
						 NULL, NULL);
		} while (r);

		DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");

		r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,
					 0, NULL, NULL);
		if (r) {
			DMERR("Error trying to resume userspace log: %d", r);
			return -ESRCH;
		}
		/* Resumed: go round again and retry the original request. */
	}
}
/*
 * Build the constructor string sent to the userspace server:
 * "<ti->len> <argv[0]> <argv[1]> ...".
 *
 * On success *ctr_str points to a kzalloc'ed string owned by the caller
 * and the number of characters written (excluding the terminating NUL)
 * is returned.  On allocation failure *ctr_str is NULL and -ENOMEM is
 * returned.
 */
static int build_constructor_string(struct dm_target *ti,
				    unsigned argc, char **argv,
				    char **ctr_str)
{
	unsigned i;
	int str_size;
	char *str = NULL;

	*ctr_str = NULL;

	for (i = 0, str_size = 0; i < argc; i++)
		str_size += strlen(argv[i]) + 1; /* +1 for space between args */

	str_size += 20; /* Max number of chars in a printed u64 number */
	str_size += 1;  /* Terminating NUL, previously unaccounted for */

	str = kzalloc(str_size, GFP_KERNEL);
	if (!str) {
		DMWARN("Unable to allocate memory for constructor string");
		return -ENOMEM;
	}

	str_size = sprintf(str, "%llu", (unsigned long long)ti->len);
	for (i = 0; i < argc; i++)
		str_size += sprintf(str + str_size, " %s", argv[i]);

	*ctr_str = str;
	return str_size;
}
/*
* userspace_ctr
*
* argv contains:
* <UUID> <other args>
* Where 'other args' is the userspace implementation specific log
* arguments. An example might be:
* <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
*
* So, this module will strip off the <UUID> for identification purposes
* when communicating with userspace about a log; but will pass on everything
* else.
*/
/*
 * Dirty-log constructor.
 *
 * Allocates the log context, sends the constructor string (everything
 * after the <UUID> argument, prefixed with the target length) to the
 * userspace server, and caches the region size the server reports.
 *
 * Returns 0 on success with log->context set; on failure nothing is left
 * allocated and a non-zero error is returned.
 */
static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
			 unsigned argc, char **argv)
{
	int r = 0;
	int str_size;
	char *ctr_str = NULL;
	struct log_c *lc = NULL;
	uint64_t rdata;
	size_t rdata_size = sizeof(rdata);

	if (argc < 3) {
		DMWARN("Too few arguments to userspace dirty log");
		return -EINVAL;
	}

	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
	if (!lc) {
		DMWARN("Unable to allocate userspace log context.");
		return -ENOMEM;
	}

	/* The ptr value is sufficient for local unique id */
	lc->luid = (unsigned long)lc;

	lc->ti = ti;

	/*
	 * kmalloc() does not zero the context.  in_sync_hint is otherwise
	 * only written at resume time, so give it a defined "nothing known
	 * to be in sync" value in case it is read before then.
	 */
	lc->in_sync_hint = 0;

	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
		DMWARN("UUID argument too long.");
		kfree(lc);
		return -EINVAL;
	}

	/*
	 * The length check above guarantees NUL termination; strncpy() is
	 * used on purpose because it zero-fills the rest of lc->uuid, and
	 * the whole DM_UUID_LEN bytes are later copied into requests.
	 */
	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
	spin_lock_init(&lc->flush_lock);
	INIT_LIST_HEAD(&lc->flush_list);

	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
	if (str_size < 0) {
		kfree(lc);
		return str_size;
	}

	/* Send table string */
	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
				 ctr_str, str_size, NULL, NULL);

	/*
	 * Any failure here means the server has no log for us.  Do not
	 * carry on to the next request, whose result would overwrite r
	 * and hide the error.
	 */
	if (r < 0) {
		if (r == -ESRCH)
			DMERR("Userspace log server not found");
		else
			DMERR("Userspace log server failed to create log");
		goto out;
	}

	/* Since the region size does not change, get it now */
	rdata_size = sizeof(rdata);
	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,
				 NULL, 0, (char *)&rdata, &rdata_size);

	if (r) {
		DMERR("Failed to get region size of dirty log");
		goto out;
	}

	lc->region_size = (uint32_t)rdata;
	lc->region_count = dm_sector_div_up(ti->len, lc->region_size);

out:
	if (r) {
		kfree(lc);
		kfree(ctr_str);
	} else {
		lc->usr_argv_str = ctr_str;
		lc->usr_argc = argc;
		log->context = lc;
	}

	return r;
}
/*
 * Dirty-log destructor: tell the server the log is going away, then free
 * the context.  The server's answer is not examined.
 */
static void userspace_dtr(struct dm_dirty_log *log)
{
	struct log_c *lc = log->context;

	dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
			     NULL, 0, NULL, NULL);

	kfree(lc->usr_argv_str);
	kfree(lc);
}
/* Forward the presuspend event to the server; returns its result. */
static int userspace_presuspend(struct dm_dirty_log *log)
{
	struct log_c *lc = log->context;

	return dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
				    NULL, 0, NULL, NULL);
}
/* Forward the postsuspend event to the server; returns its result. */
static int userspace_postsuspend(struct dm_dirty_log *log)
{
	struct log_c *lc = log->context;

	return dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
				    NULL, 0, NULL, NULL);
}
/*
 * Forward the resume event to the server; returns its result.  The
 * cached in_sync_hint is discarded first.
 */
static int userspace_resume(struct dm_dirty_log *log)
{
	struct log_c *lc = log->context;

	lc->in_sync_hint = 0;

	return dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
				    NULL, 0, NULL, NULL);
}
static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
{
struct log_c *lc = log->context;
return lc->region_size;
}
/*
* userspace_is_clean
*
* Check whether a region is clean. If there is any sort of
* failure when consulting the server, we return not clean.
*
* Returns: 1 if clean, 0 otherwise
*/
static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
{
	struct log_c *lc = log->context;
	uint64_t region64 = (uint64_t)region;
	int64_t is_clean;
	size_t rdata_size = sizeof(is_clean);

	/* Any failure talking to the server is reported as "not clean". */
	if (userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
				 (char *)&region64, sizeof(region64),
				 (char *)&is_clean, &rdata_size))
		return 0;

	return (int)is_clean;
}
/*
* userspace_in_sync
*
* Check if the region is in-sync. If there is any sort
* of failure when consulting the server, we assume that
* the region is not in sync.
*
* If 'can_block' is not set, return -EWOULDBLOCK immediately
*
* Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
*/
static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
			     int can_block)
{
	struct log_c *lc = log->context;
	uint64_t region64 = region;
	int64_t in_sync;
	size_t rdata_size = sizeof(in_sync);

	/*
	 * We can never respond directly - even if in_sync_hint is
	 * set.  This is because another machine could see a device
	 * failure and mark the region out-of-sync.  If we don't go
	 * to userspace to ask, we might think the region is in-sync
	 * and allow a read to pick up data that is stale.  (This is
	 * very unlikely if a device actually fails; but it is very
	 * likely if a connection to one device from one machine fails.)
	 *
	 * There still might be a problem if the mirror caches the region
	 * state as in-sync... but then this call would not be made.  So,
	 * that is a mirror problem.
	 */
	if (!can_block)
		return -EWOULDBLOCK;

	/* Any failure talking to the server is reported as "not in sync". */
	if (userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
				 (char *)&region64, sizeof(region64),
				 (char *)&in_sync, &rdata_size))
		return 0;

	return (int)in_sync;
}
/*
* userspace_flush
*
* This function is ok to block.
* The flush happens in two stages. First, it sends all
* clear/mark requests that are on the list. Then it
* tells the server to commit them. This gives the
* server a chance to optimise the commit, instead of
* doing it for every request.
*
* Additionally, we could implement another thread that
* sends the requests up to the server - reducing the
* load on flush. Then the flush would have less in
* the list and be responsible for the finishing commit.
*
* Returns: 0 on success, < 0 on failure
*/
static int userspace_flush(struct dm_dirty_log *log)
{
int r = 0;
unsigned long flags;
struct log_c *lc = log->context;
LIST_HEAD(flush_list);
struct flush_entry *fe, *tmp_fe;
spin_lock_irqsave(&lc->flush_lock, flags);
list_splice_init(&lc->flush_list, &flush_list);
spin_unlock_irqrestore(&lc->flush_lock, flags);
if (list_empty(&flush_list))
return 0;
/*
* FIXME: Count up requests, group request types,
* allocate memory to stick all requests in and
* send to server in one go. Failing the allocation,
* do it one by one.
*/
list_for_each_entry(fe, &flush_list, list) {
r = userspace_do_request(lc, lc->uuid, fe->type,
(char *)&fe->region,
sizeof(fe->region),
NULL, NULL);
if (r)
goto fail;
}
r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
NULL, 0, NULL, NULL);
fail:
/*
* We can safely remove these entries, even if failure.
* Calling code will receive an error and will know that
* the log facility has failed.
*/
list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
list_del(&fe->list);
mempool_free(fe, flush_entry_pool);
}
if (r)
dm_table_event(lc->ti->table);
return r;
}
/*
* userspace_mark_region
*
* This function should avoid blocking unless absolutely required.
* (Memory allocation is valid for blocking.)
*/
static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
{
	struct log_c *lc = log->context;
	struct flush_entry *fe;
	unsigned long irq_flags;

	/* Wait for an allocation, but _never_ fail */
	fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
	BUG_ON(!fe);

	/* The entry is private until it is on the list, so fill it first. */
	fe->type = DM_ULOG_MARK_REGION;
	fe->region = region;

	spin_lock_irqsave(&lc->flush_lock, irq_flags);
	list_add(&fe->list, &lc->flush_list);
	spin_unlock_irqrestore(&lc->flush_lock, irq_flags);
}
/*
* userspace_clear_region
*
* This function must not block.
* So, the alloc can't block. In the worst case, it is ok to
* fail. It would simply mean we can't clear the region.
* Does nothing to current sync context, but does mean
* the region will be re-sync'ed on a reload of the mirror
* even though it is in-sync.
*/
static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
{
	struct log_c *lc = log->context;
	struct flush_entry *fe;
	unsigned long irq_flags;

	/*
	 * If we fail to allocate, we skip the clearing of
	 * the region.  This doesn't hurt us in any way, except
	 * to cause the region to be resync'ed when the
	 * device is activated next time.
	 */
	fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
	if (!fe) {
		DMERR("Failed to allocate memory to clear region.");
		return;
	}

	/* The entry is private until it is on the list, so fill it first. */
	fe->type = DM_ULOG_CLEAR_REGION;
	fe->region = region;

	spin_lock_irqsave(&lc->flush_lock, irq_flags);
	list_add(&fe->list, &lc->flush_list);
	spin_unlock_irqrestore(&lc->flush_lock, irq_flags);
}
/*
* userspace_get_resync_work
*
* Get a region that needs recovery. It is valid to return
* an error for this function.
*
* Returns: 1 if region filled, 0 if no work, <0 on error
*/
static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
{
	int r;
	size_t rdata_size;
	struct log_c *lc = log->context;
	struct {
		int64_t i; /* 64-bit for mix arch compatibility */
		region_t r;
	} pkg;

	/* Everything is already known to be in sync: nothing to ask. */
	if (lc->in_sync_hint >= lc->region_count)
		return 0;

	/*
	 * Start from a defined state so that a short reply from the
	 * server can never make us act on stale stack contents.
	 */
	memset(&pkg, 0, sizeof(pkg));

	rdata_size = sizeof(pkg);
	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
				 NULL, 0,
				 (char *)&pkg, &rdata_size);

	/* On failure leave *region alone rather than storing garbage. */
	if (r)
		return r;

	*region = pkg.r;
	return (int)pkg.i;
}
/*
* userspace_set_region_sync
*
* Set the sync status of a given region. This function
* must not fail.
*/
static void userspace_set_region_sync(struct dm_dirty_log *log,
				      region_t region, int in_sync)
{
	struct log_c *lc = log->context;
	struct {
		region_t r;
		int64_t i;
	} pkg;

	/*
	 * The whole structure, including any padding the compiler places
	 * between the members, is copied into a message for user space.
	 * Clear it first so no stale kernel stack bytes can go with it.
	 */
	memset(&pkg, 0, sizeof(pkg));
	pkg.r = region;
	pkg.i = (int64_t)in_sync;

	/*
	 * It would be nice to be able to report failures.
	 * However, it is easy enough to detect and resolve.
	 */
	userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
			     (char *)&pkg, sizeof(pkg),
			     NULL, NULL);
}
/*
* userspace_get_sync_count
*
* If there is any sort of failure when consulting the server,
* we assume that the sync count is zero.
*
* Returns: sync count on success, 0 on failure
*/
static region_t userspace_get_sync_count(struct dm_dirty_log *log)
{
	struct log_c *lc = log->context;
	uint64_t sync_count;
	size_t rdata_size = sizeof(sync_count);

	if (userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
				 NULL, 0,
				 (char *)&sync_count, &rdata_size))
		return 0;

	/* Fully in sync: remember it so later queries can be answered locally. */
	if (sync_count >= lc->region_count)
		lc->in_sync_hint = lc->region_count;

	return (region_t)sync_count;
}
/*
* userspace_status
*
* Returns: amount of space consumed
*/
static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
char *result, unsigned maxlen)
{
int r = 0;
char *table_args;
/*
 * NOTE(review): DMEMIT presumably appends to 'result' using the locals
 * 'sz' and 'maxlen' by name - do not rename them without checking the
 * macro definition.
 */
size_t sz = (size_t)maxlen;
struct log_c *lc = log->context;
switch (status_type) {
case STATUSTYPE_INFO:
/* The server writes its status text straight into 'result'. */
r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
NULL, 0,
result, &sz);
if (r) {
/* Server unreachable or failed: emit a fixed failure status instead. */
sz = 0;
DMEMIT("%s 1 COM_FAILURE", log->type->name);
}
break;
case STATUSTYPE_TABLE:
sz = 0;
/*
 * usr_argv_str starts with the target length written by
 * build_constructor_string(); skip past it to the arguments.
 */
table_args = strchr(lc->usr_argv_str, ' ');
BUG_ON(!table_args); /* There will always be a ' ' */
table_args++;
DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
lc->uuid, table_args);
break;
}
/* Note: a failed request yields 0 even though a failure status was emitted. */
return (r) ? 0 : (int)sz;
}
/*
* userspace_is_remote_recovering
*
* Returns: 1 if region recovering, 0 otherwise
*/
static int userspace_is_remote_recovering(struct dm_dirty_log *log,
					  region_t region)
{
	int r;
	uint64_t region64 = region;
	struct log_c *lc = log->context;
	/*
	 * End of the current throttle window, in jiffies; 0 means no window
	 * has been armed yet.  Kept as unsigned long so it can be compared
	 * with jiffies using the wrap-safe time_before() helper instead of
	 * a plain '<', which misbehaves when jiffies wraps around.
	 */
	static unsigned long limit;
	struct {
		int64_t is_recovering;
		uint64_t in_sync_hint;
	} pkg;
	size_t rdata_size = sizeof(pkg);

	/*
	 * Once the mirror has been reported to be in-sync,
	 * it will never again ask for recovery work.  So,
	 * we can safely say there is not a remote machine
	 * recovering if the device is in-sync.  (in_sync_hint
	 * must be reset at resume time.)
	 */
	if (region < lc->in_sync_hint)
		return 0;
	else if (limit && time_before(jiffies, limit))
		return 1;	/* asked the server less than HZ/4 ago */

	limit = jiffies + (HZ / 4);
	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
				 (char *)&region64, sizeof(region64),
				 (char *)&pkg, &rdata_size);
	if (r)
		return 1;

	lc->in_sync_hint = pkg.in_sync_hint;

	return (int)pkg.is_recovering;
}
/*
 * Dirty-log type descriptor for the "userspace" log, registered with
 * dm_dirty_log_type_register() at module init.
 */
static struct dm_dirty_log_type _userspace_type = {
.name = "userspace",
.module = THIS_MODULE,
.ctr = userspace_ctr,
.dtr = userspace_dtr,
.presuspend = userspace_presuspend,
.postsuspend = userspace_postsuspend,
.resume = userspace_resume,
.get_region_size = userspace_get_region_size,
.is_clean = userspace_is_clean,
.in_sync = userspace_in_sync,
.flush = userspace_flush,
.mark_region = userspace_mark_region,
.clear_region = userspace_clear_region,
.get_resync_work = userspace_get_resync_work,
.set_region_sync = userspace_set_region_sync,
.get_sync_count = userspace_get_sync_count,
.status = userspace_status,
.is_remote_recovering = userspace_is_remote_recovering,
};
/*
 * Module init: create the flush-entry pool, bring up the transfer layer
 * and register the log type.  Each failure undoes the steps before it.
 */
static int __init userspace_dirty_log_init(void)
{
	int r;

	flush_entry_pool = mempool_create(100, flush_entry_alloc,
					  flush_entry_free, NULL);
	if (!flush_entry_pool) {
		DMWARN("Unable to create flush_entry_pool:  No memory.");
		return -ENOMEM;
	}

	r = dm_ulog_tfr_init();
	if (r) {
		DMWARN("Unable to initialize userspace log communications");
		goto destroy_pool;
	}

	r = dm_dirty_log_type_register(&_userspace_type);
	if (r) {
		DMWARN("Couldn't register userspace dirty log type");
		goto exit_tfr;
	}

	DMINFO("version 1.0.0 loaded");
	return 0;

exit_tfr:
	dm_ulog_tfr_exit();
destroy_pool:
	mempool_destroy(flush_entry_pool);
	return r;
}
/* Module exit: undo userspace_dirty_log_init() in reverse order. */
static void __exit userspace_dirty_log_exit(void)
{
dm_dirty_log_type_unregister(&_userspace_type);
dm_ulog_tfr_exit();
mempool_destroy(flush_entry_pool);
DMINFO("version 1.0.0 unloaded");
return;
}
module_init(userspace_dirty_log_init);
module_exit(userspace_dirty_log_exit);
MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,284 @@
/*
* Copyright (C) 2006-2009 Red Hat, Inc.
*
* This file is released under the LGPL.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <net/sock.h>
#include <linux/workqueue.h>
#include <linux/connector.h>
#include <linux/device-mapper.h>
#include <linux/dm-log-userspace.h>
#include "dm-log-userspace-transfer.h"
static uint32_t dm_ulog_seq;
/*
* Netlink/Connector is an unreliable protocol. How long should
* we wait for a response before assuming it was lost and retrying?
* (If we do receive a response after this time, it will be discarded
* and the response to the resent request will be waited for.)
*/
#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
/*
* Pre-allocated space for speed
*/
#define DM_ULOG_PREALLOCED_SIZE 512
static struct cn_msg *prealloced_cn_msg;
static struct dm_ulog_request *prealloced_ulog_tfr;
static struct cb_id ulog_cn_id = {
.idx = CN_IDX_DM,
.val = CN_VAL_DM_USERSPACE_LOG
};
static DEFINE_MUTEX(dm_ulog_lock);
/*
 * Bookkeeping for one request awaiting a reply.  Instances live on the
 * stack of dm_consult_userspace() and are linked on receiving_list while
 * the request is outstanding.
 */
struct receiving_pkg {
struct list_head list; /* link in receiving_list */
struct completion complete; /* completed by fill_pkg() when a reply arrives */
uint32_t seq; /* sequence number used to match the reply */
int error; /* result reported back to the requester */
size_t *data_size; /* in: capacity of data; out: bytes stored */
char *data; /* where reply payload is copied */
};
static DEFINE_SPINLOCK(receiving_list_lock);
static struct list_head receiving_list;
/*
 * Send the request in @tfr to the userspace server through the connector.
 *
 * Only the connector header is filled in here; dm_ulog_tfr_init() sets up
 * the preallocated request buffer directly behind the preallocated header.
 * Returns the result of cn_netlink_send().
 */
static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
{
	struct cn_msg *hdr = prealloced_cn_msg;

	memset(hdr, 0, sizeof(struct cn_msg));

	hdr->id.idx = ulog_cn_id.idx;
	hdr->id.val = ulog_cn_id.val;
	hdr->ack = 0;
	hdr->seq = tfr->seq;
	hdr->len = sizeof(struct dm_ulog_request) + tfr->data_size;

	return cn_netlink_send(hdr, 0, gfp_any());
}
/*
* Parameters for this function can be either msg or tfr, but not
* both. This function fills in the reply for a waiting request.
* If just msg is given, then the reply is simply an ACK from userspace
* that the request was received.
*
* Returns: 0 on success, -ENOENT on failure
*/
static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
{
/* Sequence number of the reply, taken from whichever argument was given. */
uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
struct receiving_pkg *pkg;
/*
* The 'receiving_pkg' entries in this list are statically
* allocated on the stack in 'dm_consult_userspace'.
* Each process that is waiting for a reply from the user
* space server will have an entry in this list.
*
* We are safe to do it this way because the stack space
* is unique to each process, but still addressable by
* other processes.
*/
list_for_each_entry(pkg, &receiving_list, list) {
/* Find the waiter this reply belongs to. */
if (rtn_seq != pkg->seq)
continue;
if (msg) {
/* Bare ACK: the ack field carries a positive errno. */
pkg->error = -msg->ack;
/*
* If we are trying again, we will need to know our
* storage capacity. Otherwise, along with the
* error code, we make explicit that we have no data.
*/
if (pkg->error != -EAGAIN)
*(pkg->data_size) = 0;
} else if (tfr->data_size > *(pkg->data_size)) {
/* Reply payload does not fit in the waiter's buffer. */
DMERR("Insufficient space to receive package [%u] "
"(%u vs %zu)", tfr->request_type,
tfr->data_size, *(pkg->data_size));
*(pkg->data_size) = 0;
pkg->error = -ENOSPC;
} else {
/* Full reply: hand over the error code and the payload. */
pkg->error = tfr->error;
memcpy(pkg->data, tfr->data, tfr->data_size);
*(pkg->data_size) = tfr->data_size;
}
/* Wake the waiter in dm_consult_userspace(). */
complete(&pkg->complete);
return 0;
}
/* Nobody is waiting for this sequence number. */
return -ENOENT;
}
/*
* This is the connector callback that delivers data
* that was sent from userspace.
*/
static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
{
	struct dm_ulog_request *tfr;

	/* Only accept replies from senders with CAP_SYS_ADMIN. */
	if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN))
		return;

	/* The request structure, if any, directly follows the header. */
	tfr = (struct dm_ulog_request *)(msg + 1);

	spin_lock(&receiving_list_lock);
	if (!msg->len)
		fill_pkg(msg, NULL);	/* header only: a bare ACK */
	else if (msg->len >= sizeof(*tfr))
		fill_pkg(NULL, tfr);
	else
		DMERR("Incomplete message received (expected %u, got %u): [%u]",
		      (unsigned)sizeof(*tfr), msg->len, msg->seq);
	spin_unlock(&receiving_list_lock);
}
/**
* dm_consult_userspace
* @uuid: log's universal unique identifier (must be DM_UUID_LEN in size)
* @luid: log's local unique identifier
* @request_type: found in include/linux/dm-log-userspace.h
* @data: data to tx to the server
* @data_size: size of data in bytes
* @rdata: place to put return data from server
* @rdata_size: value-result (amount of space given/amount of space used)
*
* rdata_size is undefined on failure.
*
* Memory used to communicate with userspace is zero'ed
* before populating to ensure that no unwanted bits leak
* from kernel space to user-space. All userspace log communications
* between kernel and user space go through this function.
*
* Returns: 0 on success, -EXXX on failure
**/
int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
char *data, size_t data_size,
char *rdata, size_t *rdata_size)
{
int r = 0;
size_t dummy = 0;
int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
struct dm_ulog_request *tfr = prealloced_ulog_tfr;
/* Lives on this stack frame; linked on receiving_list while we wait. */
struct receiving_pkg pkg;
/*
* Given the space needed to hold the 'struct cn_msg' and
* 'struct dm_ulog_request' - do we have enough payload
* space remaining?
*/
if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
DMINFO("Size of tfr exceeds preallocated size");
return -EINVAL;
}
/* Callers that expect no reply data may pass NULL; fill_pkg() still needs a target. */
if (!rdata_size)
rdata_size = &dummy;
resend:
/*
* We serialize the sending of requests so we can
* use the preallocated space.
*/
mutex_lock(&dm_ulog_lock);
memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
memcpy(tfr->uuid, uuid, DM_UUID_LEN);
tfr->luid = luid;
/* Every (re)send gets a fresh sequence number. */
tfr->seq = dm_ulog_seq++;
/*
* Must be valid request type (all other bits set to
* zero). This reserves other bits for possible future
* use.
*/
tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
tfr->data_size = data_size;
if (data && data_size)
memcpy(tfr->data, data, data_size);
memset(&pkg, 0, sizeof(pkg));
init_completion(&pkg.complete);
pkg.seq = tfr->seq;
pkg.data_size = rdata_size;
pkg.data = rdata;
/* Register as a waiter before sending so the reply cannot be missed. */
spin_lock(&receiving_list_lock);
list_add(&(pkg.list), &receiving_list);
spin_unlock(&receiving_list_lock);
r = dm_ulog_sendto_server(tfr);
mutex_unlock(&dm_ulog_lock);
if (r) {
DMERR("Unable to send log request [%u] to userspace: %d",
request_type, r);
spin_lock(&receiving_list_lock);
list_del_init(&(pkg.list));
spin_unlock(&receiving_list_lock);
goto out;
}
/* wait_for_completion_timeout() yields 0 when the timeout expired. */
r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
/* Always unlink pkg before it can go out of scope. */
spin_lock(&receiving_list_lock);
list_del_init(&(pkg.list));
spin_unlock(&receiving_list_lock);
if (!r) {
/* Log only the last 8 characters of the uuid. */
DMWARN("[%s] Request timed out: [%u/%u] - retrying",
(strlen(uuid) > 8) ?
(uuid + (strlen(uuid) - 8)) : (uuid),
request_type, pkg.seq);
goto resend;
}
r = pkg.error;
/* The server asked us to try again. */
if (r == -EAGAIN)
goto resend;
out:
return r;
}
int dm_ulog_tfr_init(void)
{
int r;
void *prealloced;
INIT_LIST_HEAD(&receiving_list);
prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
if (!prealloced)
return -ENOMEM;
prealloced_cn_msg = prealloced;
prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
if (r) {
cn_del_callback(&ulog_cn_id);
return r;
}
return 0;
}
/*
 * Tear down the transfer layer: unregister the connector callback, then
 * free the preallocated message buffer.  prealloced_cn_msg points at the
 * start of the allocation made by dm_ulog_tfr_init().
 */
void dm_ulog_tfr_exit(void)
{
cn_del_callback(&ulog_cn_id);
kfree(prealloced_cn_msg);
}

View File

@@ -0,0 +1,18 @@
/*
* Copyright (C) 2006-2009 Red Hat, Inc.
*
* This file is released under the LGPL.
*/
#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
#define __DM_LOG_USERSPACE_TRANSFER_H__
#define DM_MSG_PREFIX "dm-log-userspace"
/* Allocate the preallocated request buffer and register the connector callback. */
int dm_ulog_tfr_init(void);
/* Unregister the connector callback and free the preallocated buffer. */
void dm_ulog_tfr_exit(void);
/*
 * Send one request to the userspace log server and wait for its reply.
 *
 * @uuid, @luid:   identify the log the request is for
 * @request_type:  request code (masked with DM_ULOG_REQUEST_MASK)
 * @data:          optional payload of @data_size bytes to send
 * @rdata:         buffer handed to the reply handler for the response
 * @rdata_size:    size of @rdata; may be NULL
 *
 * The request is resent when the wait times out or the reply carries
 * -EAGAIN.  Returns -EINVAL if @data_size does not fit the preallocated
 * buffer, the send error if the request could not be sent, otherwise the
 * error code delivered with the reply.
 */
int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
char *data, size_t data_size,
char *rdata, size_t *rdata_size);
#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */

843
kernel/drivers/md/dm-log.c Normal file
View File

@@ -0,0 +1,843 @@
/*
* Copyright (C) 2003 Sistina Software
* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This file is released under the LGPL.
*/
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/dm-io.h>
#include <linux/dm-dirty-log.h>
#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "dirty region log"
static LIST_HEAD(_log_types);
static DEFINE_SPINLOCK(_lock);
static struct dm_dirty_log_type *__find_dirty_log_type(const char *name)
{
struct dm_dirty_log_type *log_type;
list_for_each_entry(log_type, &_log_types, list)
if (!strcmp(name, log_type->name))
return log_type;
return NULL;
}
/*
 * Look up a registered log type and take a reference on its module.
 * Lookup and try_module_get() both happen under _lock.
 *
 * Returns the type, or NULL if it is not registered or its module
 * reference could not be taken.
 */
static struct dm_dirty_log_type *_get_dirty_log_type(const char *name)
{
	struct dm_dirty_log_type *found;

	spin_lock(&_lock);

	found = __find_dirty_log_type(name);
	if (found) {
		if (!try_module_get(found->module))
			found = NULL;
	}

	spin_unlock(&_lock);

	return found;
}
/*
 * get_type
 * @type_name
 *
 * Attempt to retrieve the dm_dirty_log_type by name. If not already
 * available, attempt to load the appropriate module.
 *
 * Log modules are named "dm-log-" followed by the 'type_name'.
 * Modules may contain multiple types.
 * This function will first try the module "dm-log-<type_name>",
 * then truncate 'type_name' on the last '-' and try again.
 *
 * For example, if type_name was "clustered-disk", it would search
 * 'dm-log-clustered-disk' then 'dm-log-clustered'.
 *
 * On success a module reference is held (taken in _get_dirty_log_type());
 * release it with put_type().
 *
 * Returns: dirty_log_type* on success, NULL on failure
 */
static struct dm_dirty_log_type *get_type(const char *type_name)
{
char *p, *type_name_dup;
struct dm_dirty_log_type *log_type;
if (!type_name)
return NULL;
/* Fast path: the type is already registered. */
log_type = _get_dirty_log_type(type_name);
if (log_type)
return log_type;
/* Work on a private copy so the module name can be truncated in place. */
type_name_dup = kstrdup(type_name, GFP_KERNEL);
if (!type_name_dup) {
DMWARN("No memory left to attempt log module load for \"%s\"",
type_name);
return NULL;
}
/*
 * Only the module name is shortened on each pass; the lookup always
 * uses the full, original type_name.  log_type is still NULL from the
 * lookup above if request_module() fails before it is reassigned.
 */
while (request_module("dm-log-%s", type_name_dup) ||
!(log_type = _get_dirty_log_type(type_name))) {
p = strrchr(type_name_dup, '-');
if (!p)
break;
p[0] = '\0';
}
if (!log_type)
DMWARN("Module for logging type \"%s\" not found.", type_name);
kfree(type_name_dup);
return log_type;
}
/*
 * Drop the module reference held on a log type.  A NULL type is
 * ignored, and the reference is only dropped while the type is still
 * registered (checked under _lock).
 */
static void put_type(struct dm_dirty_log_type *type)
{
	if (!type)
		return;

	spin_lock(&_lock);
	if (__find_dirty_log_type(type->name))
		module_put(type->module);
	spin_unlock(&_lock);
}
/*
 * Add a log type to the list of registered types.
 *
 * Returns 0 on success, or -EEXIST if a type with the same name is
 * already registered.
 */
int dm_dirty_log_type_register(struct dm_dirty_log_type *type)
{
	int ret;

	spin_lock(&_lock);
	if (__find_dirty_log_type(type->name)) {
		ret = -EEXIST;
	} else {
		list_add(&type->list, &_log_types);
		ret = 0;
	}
	spin_unlock(&_lock);

	return ret;
}
EXPORT_SYMBOL(dm_dirty_log_type_register);
/*
 * Remove a log type from the list of registered types.
 *
 * Returns 0 on success, or -EINVAL if no type with that name is
 * registered.
 */
int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
{
	int ret = -EINVAL;

	spin_lock(&_lock);
	if (__find_dirty_log_type(type->name)) {
		list_del(&type->list);
		ret = 0;
	}
	spin_unlock(&_lock);

	return ret;
}
EXPORT_SYMBOL(dm_dirty_log_type_unregister);
/*
 * Allocate a dirty log of the named type and run the type's constructor
 * on it with the supplied arguments.
 *
 * Returns the new log, or NULL if allocation fails, the type cannot be
 * found/loaded, or the constructor returns non-zero.
 */
struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
					 struct dm_target *ti,
					 unsigned int argc, char **argv)
{
	struct dm_dirty_log_type *log_type;
	struct dm_dirty_log *new_log;

	new_log = kmalloc(sizeof(*new_log), GFP_KERNEL);
	if (!new_log)
		return NULL;

	log_type = get_type(type_name);
	if (!log_type) {
		kfree(new_log);
		return NULL;
	}

	new_log->type = log_type;
	if (!log_type->ctr(new_log, ti, argc, argv))
		return new_log;

	/* Constructor failed: undo the allocation and the type reference. */
	kfree(new_log);
	put_type(log_type);

	return NULL;
}
EXPORT_SYMBOL(dm_dirty_log_create);
/*
 * Destroy a dirty log: run the type's destructor, drop the type's
 * module reference via put_type(), then free the log structure.
 * log->type is read before 'log' itself is freed.
 */
void dm_dirty_log_destroy(struct dm_dirty_log *log)
{
log->type->dtr(log);
put_type(log->type);
kfree(log);
}
EXPORT_SYMBOL(dm_dirty_log_destroy);
/*-----------------------------------------------------------------
* Persistent and core logs share a lot of their implementation.
* FIXME: need a reload method to be called from a resume
*---------------------------------------------------------------*/
/*
* Magic for persistent mirrors: "MiRr"
*/
#define MIRROR_MAGIC 0x4D695272
/*
* The on-disk version of the metadata.
*/
#define MIRROR_DISK_VERSION 2
#define LOG_OFFSET 2
/*
 * Log header.  The same layout is used for the in-core copy and for the
 * buffer written to disk; header_to_disk()/header_from_disk() convert
 * between CPU and little-endian byte order.
 *
 * NOTE(review): nr_regions is a sector_t but is converted with
 * cpu_to_le64()/le64_to_cpu() - confirm sector_t is 64 bits wide on
 * every configuration this is built for.
 */
struct log_header {
uint32_t magic;
/*
* Simple, incrementing version. no backward
* compatibility.
*/
uint32_t version;
sector_t nr_regions;
};
/*
 * Per-log context shared by the "core" and "disk" log types.
 */
struct log_c {
struct dm_target *ti;
/* Set by log_set_bit()/log_clear_bit(); cleared by disk_flush() after a successful write */
int touched;
uint32_t region_size;
unsigned int region_count;
/* Count of regions considered in sync */
region_t sync_count;
/* Number of uint32_t words in each bitset */
unsigned bitset_uint32_count;
/* For the disk log this points into the disk_header buffer */
uint32_t *clean_bits;
uint32_t *sync_bits;
uint32_t *recovering_bits; /* FIXME: this seems excessive */
/* Bit index where core_get_resync_work() resumes searching */
int sync_search;
/* Resync flag */
enum sync {
DEFAULTSYNC, /* Synchronize if necessary */
NOSYNC, /* Devices known to be already in sync */
FORCESYNC, /* Force a sync to happen */
} sync;
struct dm_io_request io_req;
/*
* Disk log fields
*/
int log_dev_failed;
struct dm_dev *log_dev;
struct log_header header;
struct dm_io_region header_location;
/* Buffer holding the on-disk header followed by the clean bitset; NULL for the core log */
struct log_header *disk_header;
};
/*
* The touched member needs to be updated every time we access
* one of the bitsets.
*/
/* Return 1 if 'bit' is set in bitset 'bs', otherwise 0. */
static inline int log_test_bit(uint32_t *bs, unsigned bit)
{
	if (ext2_test_bit(bit, (unsigned long *) bs))
		return 1;

	return 0;
}
/* Set 'bit' in bitset 'bs' and flag the log context as modified. */
static inline void log_set_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
ext2_set_bit(bit, (unsigned long *) bs);
l->touched = 1;
}
/* Clear 'bit' in bitset 'bs' and flag the log context as modified. */
static inline void log_clear_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
ext2_clear_bit(bit, (unsigned long *) bs);
l->touched = 1;
}
/*----------------------------------------------------------------
* Header IO
*--------------------------------------------------------------*/
/* Convert the in-core header 'core' to little-endian form in 'disk'. */
static void header_to_disk(struct log_header *core, struct log_header *disk)
{
disk->magic = cpu_to_le32(core->magic);
disk->version = cpu_to_le32(core->version);
disk->nr_regions = cpu_to_le64(core->nr_regions);
}
/* Convert the little-endian header 'disk' to CPU byte order in 'core'. */
static void header_from_disk(struct log_header *core, struct log_header *disk)
{
core->magic = le32_to_cpu(disk->magic);
core->version = le32_to_cpu(disk->version);
core->nr_regions = le64_to_cpu(disk->nr_regions);
}
/*
 * Issue one dm_io() request in direction 'rw' over lc->header_location,
 * using the buffer configured in lc->io_req.  Returns dm_io()'s result.
 */
static int rw_header(struct log_c *lc, int rw)
{
lc->io_req.bi_rw = rw;
return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
}
/*
 * Read the header from the log device into log->header.
 *
 * A fresh header (current magic and version, zero regions) is
 * substituted when the sync mode is not DEFAULTSYNC or the magic read
 * from disk does not match.
 *
 * Returns 0 on success, the rw_header() error if the read fails, or
 * -EINVAL if the header version is not MIRROR_DISK_VERSION.
 */
static int read_header(struct log_c *log)
{
int r;
r = rw_header(log, READ);
if (r)
return r;
header_from_disk(&log->header, log->disk_header);
/* New log required? */
if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) {
log->header.magic = MIRROR_MAGIC;
log->header.version = MIRROR_DISK_VERSION;
log->header.nr_regions = 0;
}
#ifdef __LITTLE_ENDIAN
/* On little-endian builds a version 1 header is accepted and treated as version 2. */
if (log->header.version == 1)
log->header.version = 2;
#endif
if (log->header.version != MIRROR_DISK_VERSION) {
DMWARN("incompatible disk log version");
return -EINVAL;
}
return 0;
}
/*
 * Validate a region size: it must be at least 2, no larger than the
 * target length, and a power of two.  Returns 1 if valid, 0 otherwise.
 */
static int _check_region_size(struct dm_target *ti, uint32_t region_size)
{
	return region_size >= 2 &&
	       region_size <= ti->len &&
	       is_power_of_2(region_size);
}
/*----------------------------------------------------------------
* core log constructor/destructor
*
* argv contains region_size followed optionally by [no]sync
*--------------------------------------------------------------*/
#define BYTE_SHIFT 3
static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
unsigned int argc, char **argv,
struct dm_dev *dev)
{
enum sync sync = DEFAULTSYNC;
struct log_c *lc;
uint32_t region_size;
unsigned int region_count;
size_t bitset_size, buf_size;
int r;
if (argc < 1 || argc > 2) {
DMWARN("wrong number of arguments to dirty region log");
return -EINVAL;
}
if (argc > 1) {
if (!strcmp(argv[1], "sync"))
sync = FORCESYNC;
else if (!strcmp(argv[1], "nosync"))
sync = NOSYNC;
else {
DMWARN("unrecognised sync argument to "
"dirty region log: %s", argv[1]);
return -EINVAL;
}
}
if (sscanf(argv[0], "%u", &region_size) != 1 ||
!_check_region_size(ti, region_size)) {
DMWARN("invalid region size %s", argv[0]);
return -EINVAL;
}
region_count = dm_sector_div_up(ti->len, region_size);
lc = kmalloc(sizeof(*lc), GFP_KERNEL);
if (!lc) {
DMWARN("couldn't allocate core log");
return -ENOMEM;
}
lc->ti = ti;
lc->touched = 0;
lc->region_size = region_size;
lc->region_count = region_count;
lc->sync = sync;
/*
* Work out how many "unsigned long"s we need to hold the bitset.
*/
bitset_size = dm_round_up(region_count,
sizeof(*lc->clean_bits) << BYTE_SHIFT);
bitset_size >>= BYTE_SHIFT;
lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
/*
* Disk log?
*/
if (!dev) {
lc->clean_bits = vmalloc(bitset_size);
if (!lc->clean_bits) {
DMWARN("couldn't allocate clean bitset");
kfree(lc);
return -ENOMEM;
}
lc->disk_header = NULL;
} else {
lc->log_dev = dev;
lc->log_dev_failed = 0;
lc->header_location.bdev = lc->log_dev->bdev;
lc->header_location.sector = 0;
/*
* Buffer holds both header and bitset.
*/
buf_size =
dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
bdev_logical_block_size(lc->header_location.
bdev));
if (buf_size > i_size_read(dev->bdev->bd_inode)) {
DMWARN("log device %s too small: need %llu bytes",
dev->name, (unsigned long long)buf_size);
kfree(lc);
return -EINVAL;
}
lc->header_location.count = buf_size >> SECTOR_SHIFT;
lc->io_req.mem.type = DM_IO_VMA;
lc->io_req.notify.fn = NULL;
lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
PAGE_SIZE));
if (IS_ERR(lc->io_req.client)) {
r = PTR_ERR(lc->io_req.client);
DMWARN("couldn't allocate disk io client");
kfree(lc);
return -ENOMEM;
}
lc->disk_header = vmalloc(buf_size);
if (!lc->disk_header) {
DMWARN("couldn't allocate disk log buffer");
dm_io_client_destroy(lc->io_req.client);
kfree(lc);
return -ENOMEM;
}
lc->io_req.mem.ptr.vma = lc->disk_header;
lc->clean_bits = (void *)lc->disk_header +
(LOG_OFFSET << SECTOR_SHIFT);
}
memset(lc->clean_bits, -1, bitset_size);
lc->sync_bits = vmalloc(bitset_size);
if (!lc->sync_bits) {
DMWARN("couldn't allocate sync bitset");
if (!dev)
vfree(lc->clean_bits);
else
dm_io_client_destroy(lc->io_req.client);
vfree(lc->disk_header);
kfree(lc);
return -ENOMEM;
}
memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
lc->sync_count = (sync == NOSYNC) ? region_count : 0;
lc->recovering_bits = vmalloc(bitset_size);
if (!lc->recovering_bits) {
DMWARN("couldn't allocate sync bitset");
vfree(lc->sync_bits);
if (!dev)
vfree(lc->clean_bits);
else
dm_io_client_destroy(lc->io_req.client);
vfree(lc->disk_header);
kfree(lc);
return -ENOMEM;
}
memset(lc->recovering_bits, 0, bitset_size);
lc->sync_search = 0;
log->context = lc;
return 0;
}
/* Core log constructor: build the shared context with no log device. */
static int core_ctr(struct dm_dirty_log *log, struct dm_target *ti,
unsigned int argc, char **argv)
{
return create_log_context(log, ti, argc, argv, NULL);
}
/*
 * Free the parts of the context common to both log types: the sync and
 * recovering bitsets and the context itself.  The clean bitset is freed
 * by the type-specific destructors.
 */
static void destroy_log_context(struct log_c *lc)
{
vfree(lc->sync_bits);
vfree(lc->recovering_bits);
kfree(lc);
}
/* Core log destructor: free the clean bitset, then the shared context. */
static void core_dtr(struct dm_dirty_log *log)
{
struct log_c *lc = (struct log_c *) log->context;
vfree(lc->clean_bits);
destroy_log_context(lc);
}
/*----------------------------------------------------------------
* disk log constructor/destructor
*
* argv contains log_device region_size followed optionally by [no]sync
*--------------------------------------------------------------*/
/*
 * Disk log constructor.  argv[0] names the log device; the remaining
 * arguments are handed to create_log_context().  The device reference
 * is dropped again if building the context fails.
 *
 * Returns 0 on success, -EINVAL for a wrong argument count, or the
 * error from dm_get_device()/create_log_context().
 */
static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
		    unsigned int argc, char **argv)
{
	struct dm_dev *log_dev;
	int ret;

	if (argc < 2 || argc > 3) {
		DMWARN("wrong number of arguments to disk dirty region log");
		return -EINVAL;
	}

	ret = dm_get_device(ti, argv[0], 0, 0 /* FIXME */,
			    FMODE_READ | FMODE_WRITE, &log_dev);
	if (ret)
		return ret;

	ret = create_log_context(log, ti, argc - 1, argv + 1, log_dev);
	if (ret)
		dm_put_device(ti, log_dev);

	return ret;
}
/*
 * Disk log destructor: release the log device, free the I/O buffer
 * (which also holds the clean bitset), destroy the io client and then
 * free the shared context.
 */
static void disk_dtr(struct dm_dirty_log *log)
{
struct log_c *lc = (struct log_c *) log->context;
dm_put_device(lc->ti, lc->log_dev);
vfree(lc->disk_header);
dm_io_client_destroy(lc->io_req.client);
destroy_log_context(lc);
}
/* Return the total number of set bits in 'size' consecutive 32-bit words. */
static int count_bits32(uint32_t *addr, unsigned size)
{
	uint32_t *end = addr + size;
	int total = 0;

	while (addr < end)
		total += hweight32(*addr++);

	return total;
}
/*
 * Record that the log device has failed and raise a table event.
 * Only the first failure raises the event; later calls do nothing.
 */
static void fail_log_device(struct log_c *lc)
{
	if (!lc->log_dev_failed) {
		lc->log_dev_failed = 1;
		dm_table_event(lc->ti->table);
	}
}
/*
 * Resume a disk log: read the header, adjust the clean bitset to the
 * current region count, derive the sync bitset and sync count from it,
 * and write the updated header back.
 *
 * A failed header read is not fatal here: the log device is marked
 * failed and all regions are treated as new.  Returns the result of
 * the final header write.
 */
static int disk_resume(struct dm_dirty_log *log)
{
int r;
unsigned i;
struct log_c *lc = (struct log_c *) log->context;
size_t size = lc->bitset_uint32_count * sizeof(uint32_t);
/* read the disk header */
r = read_header(lc);
if (r) {
DMWARN("%s: Failed to read header on dirty region log device",
lc->log_dev->name);
fail_log_device(lc);
/*
* If the log device cannot be read, we must assume
* all regions are out-of-sync. If we simply return
* here, the state will be uninitialized and could
* lead us to return 'in-sync' status for regions
* that are actually 'out-of-sync'.
*/
lc->header.nr_regions = 0;
}
/* set or clear any new bits -- device has grown */
if (lc->sync == NOSYNC)
for (i = lc->header.nr_regions; i < lc->region_count; i++)
/* FIXME: amazingly inefficient */
log_set_bit(lc, lc->clean_bits, i);
else
for (i = lc->header.nr_regions; i < lc->region_count; i++)
/* FIXME: amazingly inefficient */
log_clear_bit(lc, lc->clean_bits, i);
/* clear any old bits -- device has shrunk */
/* (only the padding bits up to the end of the last uint32_t word) */
for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++)
log_clear_bit(lc, lc->clean_bits, i);
/* copy clean across to sync */
memcpy(lc->sync_bits, lc->clean_bits, size);
lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
lc->sync_search = 0;
/* set the correct number of regions in the header */
lc->header.nr_regions = lc->region_count;
header_to_disk(&lc->header, lc->disk_header);
/* write the new header */
r = rw_header(lc, WRITE);
if (r) {
DMWARN("%s: Failed to write header on dirty region log device",
lc->log_dev->name);
fail_log_device(lc);
}
return r;
}
/* Return the region size stored in the log context. */
static uint32_t core_get_region_size(struct dm_dirty_log *log)
{
	struct log_c *ctx = log->context;

	return ctx->region_size;
}
/* Resume a core log: restart the resync search from region 0. Always returns 0. */
static int core_resume(struct dm_dirty_log *log)
{
	struct log_c *ctx = log->context;

	ctx->sync_search = 0;

	return 0;
}
/* Return 1 if 'region' has its bit set in the clean bitset, else 0. */
static int core_is_clean(struct dm_dirty_log *log, region_t region)
{
	struct log_c *ctx = log->context;

	return log_test_bit(ctx->clean_bits, region);
}
/*
 * Return 1 if 'region' has its bit set in the sync bitset, else 0.
 * The 'block' argument is not used by this implementation.
 */
static int core_in_sync(struct dm_dirty_log *log, region_t region, int block)
{
	struct log_c *ctx = log->context;

	return log_test_bit(ctx->sync_bits, region);
}
/* Flush for the core log: there is nothing to write, so always succeed. */
static int core_flush(struct dm_dirty_log *log)
{
/* no op */
return 0;
}
/*
 * Write the log buffer to the log device if any bit changed since the
 * last successful flush.  On write failure the log device is marked
 * failed and 'touched' stays set.
 *
 * Returns 0 if nothing needed writing or the write succeeded, otherwise
 * the rw_header() error.
 */
static int disk_flush(struct dm_dirty_log *log)
{
	struct log_c *ctx = log->context;
	int ret;

	/* only write if the log has changed */
	if (!ctx->touched)
		return 0;

	ret = rw_header(ctx, WRITE);
	if (ret) {
		fail_log_device(ctx);
		return ret;
	}

	ctx->touched = 0;

	return 0;
}
/* Mark a region: clear its bit in the clean bitset. */
static void core_mark_region(struct dm_dirty_log *log, region_t region)
{
	struct log_c *ctx = log->context;

	log_clear_bit(ctx, ctx->clean_bits, region);
}
/* Clear a region's mark: set its bit in the clean bitset. */
static void core_clear_region(struct dm_dirty_log *log, region_t region)
{
	struct log_c *ctx = log->context;

	log_set_bit(ctx, ctx->clean_bits, region);
}
/*
 * Find the next region that needs resynchronising.
 *
 * Starting at lc->sync_search, looks for a region whose sync bit is
 * clear and that is not already flagged as recovering.  On success the
 * region is stored in *region and flagged as recovering.
 *
 * Returns 1 if a region was found, 0 if there is none left.
 */
static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
{
struct log_c *lc = (struct log_c *) log->context;
if (lc->sync_search >= lc->region_count)
return 0;
do {
*region = ext2_find_next_zero_bit(
(unsigned long *) lc->sync_bits,
lc->region_count,
lc->sync_search);
/* Advance the cursor past this candidate before testing it. */
lc->sync_search = *region + 1;
if (*region >= lc->region_count)
return 0;
} while (log_test_bit(lc->recovering_bits, *region));
log_set_bit(lc, lc->recovering_bits, *region);
return 1;
}
/*
 * Record the outcome of recovering 'region': clear its recovering bit,
 * then either set its sync bit and bump sync_count (in_sync != 0) or,
 * if it was previously in sync, clear the bit and drop sync_count.
 *
 * NOTE(review): the in_sync path increments sync_count without checking
 * whether the sync bit was already set - confirm callers never report
 * the same region in sync twice.
 */
static void core_set_region_sync(struct dm_dirty_log *log, region_t region,
int in_sync)
{
struct log_c *lc = (struct log_c *) log->context;
log_clear_bit(lc, lc->recovering_bits, region);
if (in_sync) {
log_set_bit(lc, lc->sync_bits, region);
lc->sync_count++;
} else if (log_test_bit(lc->sync_bits, region)) {
lc->sync_count--;
log_clear_bit(lc, lc->sync_bits, region);
}
}
/* Return the current count of in-sync regions. */
static region_t core_get_sync_count(struct dm_dirty_log *log)
{
	struct log_c *ctx = log->context;

	return ctx->sync_count;
}
/*
 * Emit the "sync "/"nosync " keyword when the log was created with an
 * explicit sync argument.  Expects 'lc' and the variables DMEMIT() uses
 * to be in scope.  Wrapped in do { } while (0) so it behaves as a single
 * statement and cannot capture a following 'else'.
 */
#define DMEMIT_SYNC \
	do { \
		if (lc->sync != DEFAULTSYNC) \
			DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : ""); \
	} while (0)
/*
 * Format the core log's status into 'result' (at most 'maxlen' bytes).
 *
 * STATUSTYPE_INFO emits "1 <type>"; STATUSTYPE_TABLE emits the type
 * name, the argument count, the region size and, if one was given, the
 * sync keyword.  Returns the number of bytes emitted.
 */
static int core_status(struct dm_dirty_log *log, status_type_t status,
char *result, unsigned int maxlen)
{
int sz = 0;
struct log_c *lc = log->context;
switch(status) {
case STATUSTYPE_INFO:
DMEMIT("1 %s", log->type->name);
break;
case STATUSTYPE_TABLE:
DMEMIT("%s %u %u ", log->type->name,
lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size);
DMEMIT_SYNC;
}
return sz;
}
/*
 * Format the disk log's status into 'result' (at most 'maxlen' bytes).
 *
 * STATUSTYPE_INFO emits "3 <type> <log device> <A|D>", where 'D' means
 * the log device has been marked failed; STATUSTYPE_TABLE emits the
 * type name, the argument count, the log device, the region size and,
 * if one was given, the sync keyword.  Returns the number of bytes
 * emitted.
 */
static int disk_status(struct dm_dirty_log *log, status_type_t status,
char *result, unsigned int maxlen)
{
int sz = 0;
struct log_c *lc = log->context;
switch(status) {
case STATUSTYPE_INFO:
DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
lc->log_dev_failed ? 'D' : 'A');
break;
case STATUSTYPE_TABLE:
DMEMIT("%s %u %s %u ", log->type->name,
lc->sync == DEFAULTSYNC ? 2 : 3, lc->log_dev->name,
lc->region_size);
DMEMIT_SYNC;
}
return sz;
}
/* Operations table for the in-memory "core" log type. */
static struct dm_dirty_log_type _core_type = {
.name = "core",
.module = THIS_MODULE,
.ctr = core_ctr,
.dtr = core_dtr,
.resume = core_resume,
.get_region_size = core_get_region_size,
.is_clean = core_is_clean,
.in_sync = core_in_sync,
.flush = core_flush,
.mark_region = core_mark_region,
.clear_region = core_clear_region,
.get_resync_work = core_get_resync_work,
.set_region_sync = core_set_region_sync,
.get_sync_count = core_get_sync_count,
.status = core_status,
};
/*
 * Operations table for the persistent "disk" log type.  It reuses the
 * core bitset operations and overrides construction, destruction,
 * resume, flush (also used for postsuspend) and status.
 */
static struct dm_dirty_log_type _disk_type = {
.name = "disk",
.module = THIS_MODULE,
.ctr = disk_ctr,
.dtr = disk_dtr,
.postsuspend = disk_flush,
.resume = disk_resume,
.get_region_size = core_get_region_size,
.is_clean = core_is_clean,
.in_sync = core_in_sync,
.flush = disk_flush,
.mark_region = core_mark_region,
.clear_region = core_clear_region,
.get_resync_work = core_get_resync_work,
.set_region_sync = core_set_region_sync,
.get_sync_count = core_get_sync_count,
.status = disk_status,
};
/*
 * Module init: register the built-in "core" and "disk" log types.
 *
 * Returns 0 on success or the registration error.  A failure to
 * register "core" is now reported to the caller instead of being
 * overwritten by the result of registering "disk", and "core" is only
 * unregistered when this function actually registered it.
 */
static int __init dm_dirty_log_init(void)
{
	int r;

	r = dm_dirty_log_type_register(&_core_type);
	if (r) {
		DMWARN("couldn't register core log");
		return r;
	}

	r = dm_dirty_log_type_register(&_disk_type);
	if (r) {
		DMWARN("couldn't register disk type");
		dm_dirty_log_type_unregister(&_core_type);
	}

	return r;
}
/* Module exit: unregister the built-in log types, disk first, then core. */
static void __exit dm_dirty_log_exit(void)
{
dm_dirty_log_type_unregister(&_disk_type);
dm_dirty_log_type_unregister(&_core_type);
}
module_init(dm_dirty_log_init);
module_exit(dm_dirty_log_exit);
MODULE_DESCRIPTION(DM_NAME " dirty region log");
MODULE_AUTHOR("Joe Thornber, Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");

1634
kernel/drivers/md/dm-mpath.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,22 @@
/*
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*
* Multipath.
*/
#ifndef DM_MPATH_H
#define DM_MPATH_H
struct dm_dev;
/* A single path as seen by a path selector. */
struct dm_path {
struct dm_dev *dev; /* Read-only */
void *pscontext; /* For path-selector use */
};
/* Callback for hwh_pg_init_fn to use when complete */
void dm_pg_init_complete(struct dm_path *path, unsigned err_flags);
#endif

View File

@@ -0,0 +1,139 @@
/*
* Copyright (C) 2003 Sistina Software.
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
*
* Module Author: Heinz Mauelshagen
*
* This file is released under the GPL.
*
* Path selector registration.
*/
#include <linux/device-mapper.h>
#include "dm-path-selector.h"
#include <linux/slab.h>
/*
 * Registry entry for a path selector type: a private copy of the
 * caller's path_selector_type plus the link on _path_selectors.
 */
struct ps_internal {
struct path_selector_type pst;
struct list_head list;
};
/* Map a pointer to the embedded 'pst' member back to its ps_internal. */
#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
static LIST_HEAD(_path_selectors);
static DECLARE_RWSEM(_ps_lock);
static struct ps_internal *__find_path_selector_type(const char *name)
{
struct ps_internal *psi;
list_for_each_entry(psi, &_path_selectors, list) {
if (!strcmp(name, psi->pst.name))
return psi;
}
return NULL;
}
/*
 * Look up a registered path selector and take a reference on its
 * module, both under the read side of _ps_lock.
 *
 * Returns the registry entry, or NULL if it is not registered or the
 * module reference could not be taken.
 */
static struct ps_internal *get_path_selector(const char *name)
{
	struct ps_internal *found;

	down_read(&_ps_lock);

	found = __find_path_selector_type(name);
	if (found) {
		if (!try_module_get(found->pst.module))
			found = NULL;
	}

	up_read(&_ps_lock);

	return found;
}
/*
 * Return the registered path selector type called 'name', taking a
 * module reference.  If it is not registered yet, try loading the
 * module "dm-<name>" once and look again.
 *
 * Returns NULL if 'name' is NULL or the selector cannot be found.
 */
struct path_selector_type *dm_get_path_selector(const char *name)
{
	struct ps_internal *entry;

	if (!name)
		return NULL;

	entry = get_path_selector(name);
	if (entry)
		return &entry->pst;

	/* Not registered yet: try to load its module, then retry once. */
	request_module("dm-%s", name);
	entry = get_path_selector(name);
	if (!entry)
		return NULL;

	return &entry->pst;
}
/*
 * Drop the module reference held on a path selector type.  A NULL type
 * is ignored, and the reference is only dropped while the type is still
 * registered (checked under the read side of _ps_lock).
 */
void dm_put_path_selector(struct path_selector_type *pst)
{
	struct ps_internal *entry;

	if (!pst)
		return;

	down_read(&_ps_lock);
	entry = __find_path_selector_type(pst->name);
	if (entry)
		module_put(entry->pst.module);
	up_read(&_ps_lock);
}
static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst)
{
struct ps_internal *psi = kzalloc(sizeof(*psi), GFP_KERNEL);
if (psi)
psi->pst = *pst;
return psi;
}
/*
 * Register a path selector type.  The registry stores its own copy of
 * *pst, allocated before the lock is taken.
 *
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or
 * -EEXIST if a selector with the same name is already registered.
 */
int dm_register_path_selector(struct path_selector_type *pst)
{
	struct ps_internal *entry;
	int ret = 0;

	entry = _alloc_path_selector(pst);
	if (!entry)
		return -ENOMEM;

	down_write(&_ps_lock);

	if (!__find_path_selector_type(pst->name)) {
		list_add(&entry->list, &_path_selectors);
	} else {
		kfree(entry);
		ret = -EEXIST;
	}

	up_write(&_ps_lock);

	return ret;
}
/*
 * Unregister a path selector type: unlink its registry entry under the
 * write lock, then free the entry.
 *
 * Returns 0 on success, or -EINVAL if no selector with that name is
 * registered.
 */
int dm_unregister_path_selector(struct path_selector_type *pst)
{
	struct ps_internal *entry;

	down_write(&_ps_lock);
	entry = __find_path_selector_type(pst->name);
	if (entry)
		list_del(&entry->list);
	up_write(&_ps_lock);

	if (!entry)
		return -EINVAL;

	kfree(entry);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_register_path_selector);
EXPORT_SYMBOL_GPL(dm_unregister_path_selector);

View File

@@ -0,0 +1,97 @@
/*
* Copyright (C) 2003 Sistina Software.
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
*
* Module Author: Heinz Mauelshagen
*
* This file is released under the GPL.
*
* Path-Selector registration.
*/
#ifndef DM_PATH_SELECTOR_H
#define DM_PATH_SELECTOR_H
#include <linux/device-mapper.h>
#include "dm-mpath.h"
/*
* We provide an abstraction for the code that chooses which path
* to send some io down.
*/
struct path_selector_type;
/* One path selector instance: its type plus the type's private data. */
struct path_selector {
struct path_selector_type *type;
void *context;
};
/* Information about a path selector type */
struct path_selector_type {
/* Name the selector is registered and looked up under */
char *name;
/* Owning module; a reference is taken when the selector is looked up */
struct module *module;
unsigned int table_args;
unsigned int info_args;
/*
* Constructs a path selector object, takes custom arguments
*/
int (*create) (struct path_selector *ps, unsigned argc, char **argv);
void (*destroy) (struct path_selector *ps);
/*
* Add an opaque path object, along with some selector specific
* path args (eg, path priority).
*/
int (*add_path) (struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error);
/*
* Chooses a path for this io, if no paths are available then
* NULL will be returned.
*
* repeat_count is the number of times to use the path before
* calling the function again. 0 means don't call it again unless
* the path fails.
*/
struct dm_path *(*select_path) (struct path_selector *ps,
unsigned *repeat_count,
size_t nr_bytes);
/*
* Notify the selector that a path has failed.
*/
void (*fail_path) (struct path_selector *ps, struct dm_path *p);
/*
* Ask selector to reinstate a path.
*/
int (*reinstate_path) (struct path_selector *ps, struct dm_path *p);
/*
* Table content based on parameters added in ps_add_path_fn
* or path selector status
*/
int (*status) (struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen);
/*
* Per-I/O hooks for 'path' with the I/O size in bytes.
* NOTE(review): presumably called when an I/O is dispatched to and
* completes on the path - confirm against the multipath caller.
*/
int (*start_io) (struct path_selector *ps, struct dm_path *path,
size_t nr_bytes);
int (*end_io) (struct path_selector *ps, struct dm_path *path,
size_t nr_bytes);
};
/* Register a path selector */
int dm_register_path_selector(struct path_selector_type *type);
/* Unregister a path selector */
int dm_unregister_path_selector(struct path_selector_type *type);
/* Returns a registered path selector type */
struct path_selector_type *dm_get_path_selector(const char *name);
/* Releases a path selector */
void dm_put_path_selector(struct path_selector_type *pst);
#endif

View File

@@ -0,0 +1,263 @@
/*
* Copyright (C) 2004-2005 IBM Corp. All Rights Reserved.
* Copyright (C) 2006-2009 NEC Corporation.
*
* dm-queue-length.c
*
* Module Author: Stefan Bader, IBM
* Modified by: Kiyoshi Ueda, NEC
*
* This file is released under the GPL.
*
* queue-length path selector - choose a path with the least number of
* in-flight I/Os.
*/
#include "dm.h"
#include "dm-path-selector.h"
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <asm/atomic.h>
#define DM_MSG_PREFIX "multipath queue-length"
#define QL_MIN_IO 128
#define QL_VERSION "0.1.0"
/* Private state of one queue-length selector: usable and failed paths. */
struct selector {
struct list_head valid_paths;
struct list_head failed_paths;
};
/* Per-path state kept by the queue-length selector. */
struct path_info {
struct list_head list; /* link on valid_paths or failed_paths */
struct dm_path *path;
unsigned repeat_count; /* handed back to the caller by ql_select_path() */
atomic_t qlen; /* the number of in-flight I/Os */
};
static struct selector *alloc_selector(void)
{
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s) {
INIT_LIST_HEAD(&s->valid_paths);
INIT_LIST_HEAD(&s->failed_paths);
}
return s;
}
static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
{
struct selector *s = alloc_selector();
if (!s)
return -ENOMEM;
ps->context = s;
return 0;
}
/*
 * Unlink and free every path_info on 'paths'.  The _safe iterator is
 * used because entries are freed while walking the list.
 */
static void ql_free_paths(struct list_head *paths)
{
struct path_info *pi, *next;
list_for_each_entry_safe(pi, next, paths, list) {
list_del(&pi->list);
kfree(pi);
}
}
/* Free all path state on both lists, then the selector itself. */
static void ql_destroy(struct path_selector *ps)
{
struct selector *s = ps->context;
ql_free_paths(&s->valid_paths);
ql_free_paths(&s->failed_paths);
kfree(s);
ps->context = NULL;
}
/*
 * Format status into 'result' (at most 'maxlen' bytes).
 *
 * With a NULL path, emits "0 " for the selector itself.  Otherwise
 * emits the path's in-flight I/O count for STATUSTYPE_INFO or its
 * repeat count for STATUSTYPE_TABLE.  Returns the number of bytes
 * emitted.
 */
static int ql_status(struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned maxlen)
{
unsigned sz = 0;
struct path_info *pi;
/* When called with NULL path, return selector status/args. */
if (!path)
DMEMIT("0 ");
else {
pi = path->pscontext;
switch (type) {
case STATUSTYPE_INFO:
DMEMIT("%d ", atomic_read(&pi->qlen));
break;
case STATUSTYPE_TABLE:
DMEMIT("%u ", pi->repeat_count);
break;
}
}
return sz;
}
/*
 * Add 'path' to the selector's list of valid paths.
 *
 * Arguments: [<repeat_count>]
 *	<repeat_count>: The number of I/Os before switching path.
 *			If not given, default (QL_MIN_IO) is used.
 *
 * Returns 0 on success; -EINVAL for bad arguments or -ENOMEM on
 * allocation failure, with *error set to a description.
 */
static int ql_add_path(struct path_selector *ps, struct dm_path *path,
		       int argc, char **argv, char **error)
{
	struct selector *sel = ps->context;
	struct path_info *info;
	unsigned count = QL_MIN_IO;

	if (argc > 1) {
		*error = "queue-length ps: incorrect number of arguments";
		return -EINVAL;
	}

	if (argc == 1) {
		if (sscanf(argv[0], "%u", &count) != 1) {
			*error = "queue-length ps: invalid repeat count";
			return -EINVAL;
		}
	}

	/* Allocate the path information structure */
	info = kmalloc(sizeof(*info), GFP_KERNEL);
	if (!info) {
		*error = "queue-length ps: Error allocating path information";
		return -ENOMEM;
	}

	info->path = path;
	info->repeat_count = count;
	atomic_set(&info->qlen, 0);

	path->pscontext = info;
	list_add_tail(&info->list, &sel->valid_paths);

	return 0;
}
/* Move a path's state onto the selector's failed list. */
static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
{
	struct path_info *info = path->pscontext;
	struct selector *sel = ps->context;

	list_move(&info->list, &sel->failed_paths);
}
/* Move a path's state to the tail of the valid list. Always returns 0. */
static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
{
	struct path_info *info = path->pscontext;
	struct selector *sel = ps->context;

	list_move_tail(&info->list, &sel->valid_paths);

	return 0;
}
/*
 * Select a path having the minimum number of in-flight I/Os
 *
 * The first valid path is rotated to the tail before scanning, and the
 * scan stops early at a path with no in-flight I/O.  On success
 * *repeat_count is set from the chosen path.  Returns NULL when there
 * are no valid paths.  nr_bytes is not used.
 */
static struct dm_path *ql_select_path(struct path_selector *ps,
unsigned *repeat_count, size_t nr_bytes)
{
struct selector *s = ps->context;
struct path_info *pi = NULL, *best = NULL;
if (list_empty(&s->valid_paths))
return NULL;
/* Change preferred (first in list) path to evenly balance. */
list_move_tail(s->valid_paths.next, &s->valid_paths);
list_for_each_entry(pi, &s->valid_paths, list) {
if (!best ||
(atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
best = pi;
/* An idle path cannot be beaten: stop looking. */
if (!atomic_read(&best->qlen))
break;
}
/* Defensive: the list was checked to be non-empty above. */
if (!best)
return NULL;
*repeat_count = best->repeat_count;
return best->path;
}
/* Count one more in-flight I/O on 'path'. Always returns 0. */
static int ql_start_io(struct path_selector *ps, struct dm_path *path,
		       size_t nr_bytes)
{
	struct path_info *info = path->pscontext;

	atomic_inc(&info->qlen);

	return 0;
}
/* Count one fewer in-flight I/O on 'path'. Always returns 0. */
static int ql_end_io(struct path_selector *ps, struct dm_path *path,
		     size_t nr_bytes)
{
	struct path_info *info = path->pscontext;

	atomic_dec(&info->qlen);

	return 0;
}
/* Operations table registered for the "queue-length" path selector. */
static struct path_selector_type ql_ps = {
.name = "queue-length",
.module = THIS_MODULE,
.table_args = 1,
.info_args = 1,
.create = ql_create,
.destroy = ql_destroy,
.status = ql_status,
.add_path = ql_add_path,
.fail_path = ql_fail_path,
.reinstate_path = ql_reinstate_path,
.select_path = ql_select_path,
.start_io = ql_start_io,
.end_io = ql_end_io,
};
/*
 * Module init: register the queue-length path selector.
 *
 * Returns 0 on success or the (negative) registration error.  The
 * "loaded" message is only logged when registration succeeded, so the
 * log no longer claims success right after reporting a failure.
 */
static int __init dm_ql_init(void)
{
	int r = dm_register_path_selector(&ql_ps);

	if (r < 0) {
		DMERR("register failed %d", r);
		return r;
	}

	DMINFO("version " QL_VERSION " loaded");

	return r;
}
/* Module exit: unregister the selector and log an error if that fails. */
static void __exit dm_ql_exit(void)
{
	int ret;

	ret = dm_unregister_path_selector(&ql_ps);
	if (ret >= 0)
		return;

	DMERR("unregister failed %d", ret);
}
module_init(dm_ql_init);
module_exit(dm_ql_exit);
MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
MODULE_DESCRIPTION(
"(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
DM_NAME " path selector to balance the number of in-flight I/Os"
);
MODULE_LICENSE("GPL");

1359
kernel/drivers/md/dm-raid1.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,702 @@
/*
* Copyright (C) 2003 Sistina Software Limited.
* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#include <linux/dm-dirty-log.h>
#include <linux/dm-region-hash.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include "dm.h"
#define DM_MSG_PREFIX "region hash"
/*-----------------------------------------------------------------
* Region hash
*
* The mirror splits itself up into discrete regions. Each
* region can be in one of three states: clean, dirty,
* nosync. There is no need to put clean regions in the hash.
*
* In addition to being present in the hash table a region _may_
* be present on one of three lists.
*
* clean_regions: Regions on this list have no io pending to
* them, they are in sync, we are no longer interested in them,
* they are dull. dm_rh_update_states() will remove them from the
* hash table.
*
* quiesced_regions: These regions have been spun down, ready
* for recovery. rh_recovery_start() will remove regions from
* this list and hand them to kmirrord, which will schedule the
* recovery io with kcopyd.
*
* recovered_regions: Regions that kcopyd has successfully
* recovered. dm_rh_update_states() will now schedule any delayed
* io, up the recovery_count, and remove the region from the
* hash.
*
* There are 2 locks:
* A rw spin lock 'hash_lock' protects just the hash table,
* this is never held in write mode from interrupt context,
* which I believe means that we only have to disable irqs when
* doing a write lock.
*
* An ordinary spin lock 'region_lock' that protects the three
* lists in the region_hash, with the 'state', 'list' and
* 'delayed_bios' fields of the regions. This is used from irq
* context, so all other uses will have to suspend local irqs.
*---------------------------------------------------------------*/
/* State of one region hash. */
struct dm_region_hash {
uint32_t region_size;
/* Shift used to convert between sectors and region numbers */
unsigned region_shift;
/* holds persistent region state */
struct dm_dirty_log *log;
/* hash table */
rwlock_t hash_lock;
mempool_t *region_pool;
unsigned mask;
unsigned nr_buckets;
unsigned prime;
unsigned shift;
struct list_head *buckets;
unsigned max_recovery; /* Max # of regions to recover in parallel */
spinlock_t region_lock;
atomic_t recovery_in_flight;
struct semaphore recovery_count;
struct list_head clean_regions;
struct list_head quiesced_regions;
struct list_head recovered_regions;
struct list_head failed_recovered_regions;
/* Opaque pointer returned by dm_rh_region_context() */
void *context;
/* Subtracted from a bio's sector before mapping it to a region */
sector_t target_begin;
/* Callback function to schedule bios writes */
void (*dispatch_bios)(void *context, struct bio_list *bios);
/* Callback function to wakeup callers worker thread. */
void (*wakeup_workers)(void *context);
/* Callback function to wakeup callers recovery waiters. */
void (*wakeup_all_recovery_waiters)(void *context);
};
/*
 * One tracked region. Entries live in a hash bucket (hash_list) and may
 * additionally sit on one of the region hash's state lists (list).
 */
struct dm_region {
struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */
/* Region number; the lookup key used by __rh_lookup(). */
region_t key;
/* One of DM_RH_CLEAN, DM_RH_DIRTY, DM_RH_NOSYNC, DM_RH_RECOVERING. */
int state;
/* Link in rh->buckets[rh_hash(key)]. */
struct list_head hash_list;
/* Link in one of the rh state lists, or empty. */
struct list_head list;
/* Incremented by rh_inc(), decremented by dm_rh_dec(). */
atomic_t pending;
/* Bios queued by dm_rh_delay(); dispatched by complete_resync_work(). */
struct bio_list delayed_bios;
};
/*
 * Conversion helpers
 */

/*
 * Map a sector to the number of the region that contains it by shifting
 * out the low region_shift bits.
 */
static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
{
	const unsigned shift = rh->region_shift;

	return sector >> shift;
}
/*
 * Return the first sector of @region. target_begin is not added, so the
 * result is relative to the same origin dm_rh_sector_to_region() uses.
 */
sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
{
	sector_t first_sector = region << rh->region_shift;

	return first_sector;
}
EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
{
return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
}
EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
void *dm_rh_region_context(struct dm_region *reg)
{
return reg->rh->context;
}
EXPORT_SYMBOL_GPL(dm_rh_region_context);
/* Return the region number stored in @reg. */
region_t dm_rh_get_region_key(struct dm_region *reg)
{
	region_t key = reg->key;

	return key;
}
EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
/* Return the region size this hash was created with, widened to sector_t. */
sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
{
	sector_t size = rh->region_size;

	return size;
}
EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
/*
* FIXME: shall we pass in a structure instead of all these args to
* dm_region_hash_create()????
*/
/* Multiplier and shift stored in rh->prime / rh->shift and used by rh_hash(). */
#define RH_HASH_MULT 2654435387U
#define RH_HASH_SHIFT 12
/* Minimum number of struct dm_region objects reserved in the mempool. */
#define MIN_REGIONS 64
/*
 * Allocate and initialise a region hash.
 *
 * @context:       opaque pointer passed back to the three callbacks
 * @dispatch_bios, @wakeup_workers, @wakeup_all_recovery_waiters:
 *                 callbacks stored in the hash (see struct dm_region_hash)
 * @target_begin:  sector subtracted from bios before region mapping
 * @max_recovery:  number of recovery slots handed out by
 *                 dm_rh_start_recovery()
 * @log:           dirty log; it is destroyed by dm_region_hash_destroy()
 * @region_size:   region size; region_shift is derived with ffs(), so this
 *                 is presumably expected to be a power of two - TODO confirm
 * @nr_regions:    used only to size the bucket array
 *
 * Returns the new hash, or ERR_PTR(-ENOMEM) if any allocation fails.
 */
struct dm_region_hash *dm_region_hash_create(
void *context, void (*dispatch_bios)(void *context,
struct bio_list *bios),
void (*wakeup_workers)(void *context),
void (*wakeup_all_recovery_waiters)(void *context),
sector_t target_begin, unsigned max_recovery,
struct dm_dirty_log *log, uint32_t region_size,
region_t nr_regions)
{
struct dm_region_hash *rh;
unsigned nr_buckets, max_buckets;
size_t i;
/*
* Calculate a suitable number of buckets for our hash
* table.
*/
/*
 * Start at 128 and double until reaching nr_regions / 64, then halve:
 * the result is a power of two and never less than 64, which is what
 * lets 'mask = nr_buckets - 1' work as a bucket selector.
 */
max_buckets = nr_regions >> 6;
for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
;
nr_buckets >>= 1;
rh = kmalloc(sizeof(*rh), GFP_KERNEL);
if (!rh) {
DMERR("unable to allocate region hash memory");
return ERR_PTR(-ENOMEM);
}
rh->context = context;
rh->dispatch_bios = dispatch_bios;
rh->wakeup_workers = wakeup_workers;
rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
rh->target_begin = target_begin;
rh->max_recovery = max_recovery;
rh->log = log;
rh->region_size = region_size;
rh->region_shift = ffs(region_size) - 1;
rwlock_init(&rh->hash_lock);
rh->mask = nr_buckets - 1;
rh->nr_buckets = nr_buckets;
rh->shift = RH_HASH_SHIFT;
rh->prime = RH_HASH_MULT;
rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
if (!rh->buckets) {
DMERR("unable to allocate region hash bucket memory");
kfree(rh);
return ERR_PTR(-ENOMEM);
}
for (i = 0; i < nr_buckets; i++)
INIT_LIST_HEAD(rh->buckets + i);
spin_lock_init(&rh->region_lock);
/* No recovery slots yet: dm_rh_start_recovery() raises the semaphore. */
sema_init(&rh->recovery_count, 0);
atomic_set(&rh->recovery_in_flight, 0);
INIT_LIST_HEAD(&rh->clean_regions);
INIT_LIST_HEAD(&rh->quiesced_regions);
INIT_LIST_HEAD(&rh->recovered_regions);
INIT_LIST_HEAD(&rh->failed_recovered_regions);
rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
sizeof(struct dm_region));
if (!rh->region_pool) {
/* Unwind everything allocated above; the log is left untouched. */
vfree(rh->buckets);
kfree(rh);
rh = ERR_PTR(-ENOMEM);
}
return rh;
}
EXPORT_SYMBOL_GPL(dm_region_hash_create);
/*
 * Free a region hash and everything it owns: all regions still in the hash
 * table, the dirty log (if set), the region mempool, the bucket array and
 * the hash itself.
 *
 * BUGs if any region is still on the quiesced list or has pending I/O.
 */
void dm_region_hash_destroy(struct dm_region_hash *rh)
{
unsigned h;
struct dm_region *reg, *nreg;
BUG_ON(!list_empty(&rh->quiesced_regions));
for (h = 0; h < rh->nr_buckets; h++) {
/* Entries are freed without unlinking; the buckets are freed below. */
list_for_each_entry_safe(reg, nreg, rh->buckets + h,
hash_list) {
BUG_ON(atomic_read(&reg->pending));
mempool_free(reg, rh->region_pool);
}
}
if (rh->log)
dm_dirty_log_destroy(rh->log);
if (rh->region_pool)
mempool_destroy(rh->region_pool);
vfree(rh->buckets);
kfree(rh);
}
EXPORT_SYMBOL_GPL(dm_region_hash_destroy);
/* Return the dirty log attached to this region hash. */
struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
{
	struct dm_dirty_log *log = rh->log;

	return log;
}
EXPORT_SYMBOL_GPL(dm_rh_dirty_log);
/*
 * Multiplicative hash: scale the region number by rh->prime, drop the low
 * rh->shift bits and mask the result down to a bucket index.
 */
static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
{
	unsigned mixed = (unsigned) ((region * rh->prime) >> rh->shift);

	return mixed & rh->mask;
}
/*
 * Search the bucket that @region hashes to and return the matching entry,
 * or NULL when the region is not in the hash. Takes no locks itself.
 */
static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
{
	struct list_head *head = rh->buckets + rh_hash(rh, region);
	struct dm_region *entry;

	list_for_each_entry(entry, head, hash_list) {
		if (entry->key == region)
			return entry;
	}

	return NULL;
}
/* Link @reg at the head of the bucket selected by its key. */
static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
{
	struct list_head *head = rh->buckets + rh_hash(rh, reg->key);

	list_add(&reg->hash_list, head);
}
/*
 * Allocate a region entry for @region and insert it into the hash, unless
 * another context inserted the same region first, in which case the
 * existing entry is returned and the new one is freed.
 *
 * Takes hash_lock for writing itself, so it must be called without
 * hash_lock held (__rh_find() drops its read lock around the call).
 */
static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
{
struct dm_region *reg, *nreg;
/* Try the pool without sleeping first; fall back to a no-fail kmalloc. */
nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
if (unlikely(!nreg))
nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
/* Initial state comes from the dirty log: in sync -> clean, else nosync. */
nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
DM_RH_CLEAN : DM_RH_NOSYNC;
nreg->rh = rh;
nreg->key = region;
INIT_LIST_HEAD(&nreg->list);
atomic_set(&nreg->pending, 0);
bio_list_init(&nreg->delayed_bios);
write_lock_irq(&rh->hash_lock);
/* Re-check under the write lock: the region may have appeared meanwhile. */
reg = __rh_lookup(rh, region);
if (reg)
/* We lost the race. */
mempool_free(nreg, rh->region_pool);
else {
__rh_insert(rh, nreg);
if (nreg->state == DM_RH_CLEAN) {
/* Clean regions go on clean_regions for dm_rh_update_states(). */
spin_lock(&rh->region_lock);
list_add(&nreg->list, &rh->clean_regions);
spin_unlock(&rh->region_lock);
}
reg = nreg;
}
write_unlock_irq(&rh->hash_lock);
return reg;
}
/*
 * Return the entry for @region, creating it when it is not yet hashed.
 *
 * Called with hash_lock read-held. On a miss the read lock is released
 * for the duration of __rh_alloc() and re-acquired before returning.
 */
static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
{
	struct dm_region *found = __rh_lookup(rh, region);

	if (found)
		return found;

	read_unlock(&rh->hash_lock);
	found = __rh_alloc(rh, region);
	read_lock(&rh->hash_lock);

	return found;
}
/*
 * Return the state of @region.
 *
 * If the region is in the hash its cached state is returned; otherwise the
 * dirty log's in_sync() method is asked (with @may_block passed through)
 * and the answer is mapped to DM_RH_CLEAN or DM_RH_NOSYNC.
 */
int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
{
int r;
struct dm_region *reg;
read_lock(&rh->hash_lock);
reg = __rh_lookup(rh, region);
read_unlock(&rh->hash_lock);
/* Note: reg->state is read after hash_lock has been dropped. */
if (reg)
return reg->state;
/*
* The region wasn't in the hash, so we fall back to the
* dirty log.
*/
r = rh->log->type->in_sync(rh->log, region, may_block);
/*
* Any error from the dirty log (eg. -EWOULDBLOCK) gets
* taken as a DM_RH_NOSYNC
*/
return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
}
EXPORT_SYMBOL_GPL(dm_rh_get_state);
/*
 * Finish recovery of @reg: record the outcome in the dirty log, dispatch
 * the bios that were delayed on the region, drop the in-flight recovery
 * count (waking recovery waiters when it reaches zero) and return the
 * recovery slot to the recovery_count semaphore.
 *
 * @success is passed unchanged to the log's set_region_sync() method.
 */
static void complete_resync_work(struct dm_region *reg, int success)
{
struct dm_region_hash *rh = reg->rh;
rh->log->type->set_region_sync(rh->log, reg->key, success);
/*
* Dispatch the bios before we call 'wake_up_all'.
* This is important because if we are suspending,
* we want to know that recovery is complete and
* the work queue is flushed. If we wake_up_all
* before we dispatch_bios (queue bios and call wake()),
* then we risk suspending before the work queue
* has been properly flushed.
*/
rh->dispatch_bios(rh->context, &reg->delayed_bios);
if (atomic_dec_and_test(&rh->recovery_in_flight))
rh->wakeup_all_recovery_waiters(rh->context);
up(&rh->recovery_count);
}
/* dm_rh_mark_nosync
* @rh: region hash the bio's region belongs to
* @bio: the bio to complete
* @done: not used by this function
* @error: status passed to bio_endio()
*
* The bio was written on some mirror(s) but failed on other mirror(s).
* We can successfully endio the bio but should avoid the region being
* marked clean by setting the state DM_RH_NOSYNC.
*
* If the region was in DM_RH_RECOVERING state, the recovery is completed
* as failed via complete_resync_work(reg, 0).
*
* This function is _not_ safe in interrupt context!
*/
void dm_rh_mark_nosync(struct dm_region_hash *rh,
struct bio *bio, unsigned done, int error)
{
unsigned long flags;
struct dm_dirty_log *log = rh->log;
struct dm_region *reg;
region_t region = dm_rh_bio_to_region(rh, bio);
int recovering = 0;
/* We must inform the log that the sync count has changed. */
log->type->set_region_sync(log, region, 0);
read_lock(&rh->hash_lock);
reg = __rh_find(rh, region);
read_unlock(&rh->hash_lock);
/* region hash entry should exist because write was in-flight */
BUG_ON(!reg);
BUG_ON(!list_empty(&reg->list));
spin_lock_irqsave(&rh->region_lock, flags);
/*
* Possible cases:
* 1) DM_RH_DIRTY
* 2) DM_RH_NOSYNC: was dirty, other preceding writes failed
* 3) DM_RH_RECOVERING: flushing pending writes
* Either case, the region should have not been connected to list.
*/
recovering = (reg->state == DM_RH_RECOVERING);
reg->state = DM_RH_NOSYNC;
BUG_ON(!list_empty(&reg->list));
spin_unlock_irqrestore(&rh->region_lock, flags);
bio_endio(bio, error);
if (recovering)
complete_resync_work(reg, 0);
}
EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);
/*
 * Retire regions that are on the clean, recovered and failed-recovered
 * lists: they are unhashed under both locks, then processed and freed
 * without locks, and finally the dirty log is flushed.
 *
 * @errors_handled: when non-zero, regions whose recovery failed are
 *                  reported to complete_resync_work() as not in sync (0);
 *                  when zero they are reported as in sync (1).
 */
void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
{
struct dm_region *reg, *next;
LIST_HEAD(clean);
LIST_HEAD(recovered);
LIST_HEAD(failed_recovered);
/*
* Quickly grab the lists.
*/
write_lock_irq(&rh->hash_lock);
spin_lock(&rh->region_lock);
/* Move each list to a private head and unhash every entry on it. */
if (!list_empty(&rh->clean_regions)) {
list_splice_init(&rh->clean_regions, &clean);
list_for_each_entry(reg, &clean, list)
list_del(&reg->hash_list);
}
if (!list_empty(&rh->recovered_regions)) {
list_splice_init(&rh->recovered_regions, &recovered);
list_for_each_entry(reg, &recovered, list)
list_del(&reg->hash_list);
}
if (!list_empty(&rh->failed_recovered_regions)) {
list_splice_init(&rh->failed_recovered_regions,
&failed_recovered);
list_for_each_entry(reg, &failed_recovered, list)
list_del(&reg->hash_list);
}
spin_unlock(&rh->region_lock);
write_unlock_irq(&rh->hash_lock);
/*
* All the regions on the recovered and clean lists have
* now been pulled out of the system, so no need to do
* any more locking.
*/
list_for_each_entry_safe(reg, next, &recovered, list) {
rh->log->type->clear_region(rh->log, reg->key);
complete_resync_work(reg, 1);
mempool_free(reg, rh->region_pool);
}
list_for_each_entry_safe(reg, next, &failed_recovered, list) {
complete_resync_work(reg, errors_handled ? 0 : 1);
mempool_free(reg, rh->region_pool);
}
list_for_each_entry_safe(reg, next, &clean, list) {
rh->log->type->clear_region(rh->log, reg->key);
mempool_free(reg, rh->region_pool);
}
rh->log->type->flush(rh->log);
}
EXPORT_SYMBOL_GPL(dm_rh_update_states);
/*
 * Account one more pending I/O against @region, creating the region entry
 * if needed. A clean region becomes dirty: it is taken off the clean list
 * and the dirty log's mark_region() method is called for it.
 */
static void rh_inc(struct dm_region_hash *rh, region_t region)
{
struct dm_region *reg;
read_lock(&rh->hash_lock);
reg = __rh_find(rh, region);
spin_lock_irq(&rh->region_lock);
atomic_inc(&reg->pending);
if (reg->state == DM_RH_CLEAN) {
reg->state = DM_RH_DIRTY;
list_del_init(&reg->list); /* take off the clean list */
/* region_lock is released before calling into the log. */
spin_unlock_irq(&rh->region_lock);
rh->log->type->mark_region(rh->log, reg->key);
} else
spin_unlock_irq(&rh->region_lock);
read_unlock(&rh->hash_lock);
}
/* Call rh_inc() for the region of every bio on @bios, in list order. */
void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
{
	struct bio *cur = bios->head;

	while (cur) {
		rh_inc(rh, dm_rh_bio_to_region(rh, cur));
		cur = cur->bi_next;
	}
}
EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
/*
 * Drop one pending I/O from @region. When the count reaches zero the
 * region is queued according to its state (recovering -> quiesced list,
 * dirty -> clean list, nosync -> no list) and the owner's workers are
 * woken.
 *
 * NOTE(review): the lookup result is dereferenced without a NULL check,
 * so the region is assumed to be in the hash already - confirm callers
 * always pair this with an earlier increment.
 */
void dm_rh_dec(struct dm_region_hash *rh, region_t region)
{
unsigned long flags;
struct dm_region *reg;
int should_wake = 0;
read_lock(&rh->hash_lock);
reg = __rh_lookup(rh, region);
read_unlock(&rh->hash_lock);
spin_lock_irqsave(&rh->region_lock, flags);
if (atomic_dec_and_test(&reg->pending)) {
/*
* There is no pending I/O for this region.
* We can move the region to corresponding list for next action.
* At this point, the region is not yet connected to any list.
*
* If the state is DM_RH_NOSYNC, the region should be kept off
* from clean list.
* The hash entry for DM_RH_NOSYNC will remain in memory
* until the region is recovered or the map is reloaded.
*/
/* do nothing for DM_RH_NOSYNC */
if (reg->state == DM_RH_RECOVERING) {
list_add_tail(&reg->list, &rh->quiesced_regions);
} else if (reg->state == DM_RH_DIRTY) {
reg->state = DM_RH_CLEAN;
list_add(&reg->list, &rh->clean_regions);
}
should_wake = 1;
}
spin_unlock_irqrestore(&rh->region_lock, flags);
/* Wake the workers only after region_lock has been released. */
if (should_wake)
rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_dec);
/*
* Starts quiescing a region in preparation for recovery.
*
* Returns the dirty log's get_resync_work() result when it is <= 0,
* otherwise 1 after the region has been put into DM_RH_RECOVERING state.
*/
static int __rh_recovery_prepare(struct dm_region_hash *rh)
{
int r;
region_t region;
struct dm_region *reg;
/*
* Ask the dirty log what's next.
*/
r = rh->log->type->get_resync_work(rh->log, &region);
if (r <= 0)
return r;
/*
* Get this region, and start it quiescing by setting the
* recovering flag.
*/
read_lock(&rh->hash_lock);
reg = __rh_find(rh, region);
read_unlock(&rh->hash_lock);
spin_lock_irq(&rh->region_lock);
reg->state = DM_RH_RECOVERING;
/* Already quiesced ? */
/*
 * With I/O still pending the region is only unlinked here; dm_rh_dec()
 * adds it to quiesced_regions once the pending count drops to zero.
 */
if (atomic_read(&reg->pending))
list_del_init(&reg->list);
else
list_move(&reg->list, &rh->quiesced_regions);
spin_unlock_irq(&rh->region_lock);
return 1;
}
/*
 * Start quiescing as many regions as there are free recovery slots.
 *
 * Each iteration takes one slot from recovery_count (without blocking) and
 * bumps recovery_in_flight; if __rh_recovery_prepare() finds no work, the
 * slot and the count are given back and the loop stops.
 */
void dm_rh_recovery_prepare(struct dm_region_hash *rh)
{
/* Extra reference to avoid race with dm_rh_stop_recovery */
atomic_inc(&rh->recovery_in_flight);
while (!down_trylock(&rh->recovery_count)) {
atomic_inc(&rh->recovery_in_flight);
if (__rh_recovery_prepare(rh) <= 0) {
atomic_dec(&rh->recovery_in_flight);
up(&rh->recovery_count);
break;
}
}
/* Drop the extra reference */
if (atomic_dec_and_test(&rh->recovery_in_flight))
rh->wakeup_all_recovery_waiters(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
/*
* Returns any quiesced regions.
*/
struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
{
struct dm_region *reg = NULL;
spin_lock_irq(&rh->region_lock);
if (!list_empty(&rh->quiesced_regions)) {
reg = list_entry(rh->quiesced_regions.next,
struct dm_region, list);
list_del_init(&reg->list); /* remove from the quiesced list */
}
spin_unlock_irq(&rh->region_lock);
return reg;
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
/*
 * Record the outcome of a region's recovery: queue @reg on the recovered
 * list when @success is non-zero, otherwise on the failed-recovered list,
 * then wake the owner's workers.
 */
void dm_rh_recovery_end(struct dm_region *reg, int success)
{
	struct dm_region_hash *rh = reg->rh;
	struct list_head *dest;

	spin_lock_irq(&rh->region_lock);
	dest = success ? &rh->recovered_regions
		       : &rh->failed_recovered_regions;
	list_add(&reg->list, dest);
	spin_unlock_irq(&rh->region_lock);

	rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
/* Report the current value of the recovery_in_flight counter. */
int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
{
	int in_flight = atomic_read(&rh->recovery_in_flight);

	return in_flight;
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
int dm_rh_flush(struct dm_region_hash *rh)
{
return rh->log->type->flush(rh->log);
}
EXPORT_SYMBOL_GPL(dm_rh_flush);
/*
 * Queue @bio on the delayed_bios list of the region it maps to, creating
 * the region entry if necessary. The bios are handed to dispatch_bios()
 * later by complete_resync_work().
 *
 * NOTE(review): delayed_bios is modified with only hash_lock read-held;
 * presumably callers are serialized by other means - confirm.
 */
void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
{
struct dm_region *reg;
read_lock(&rh->hash_lock);
reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio));
bio_list_add(&reg->delayed_bios, bio);
read_unlock(&rh->hash_lock);
}
EXPORT_SYMBOL_GPL(dm_rh_delay);
/*
 * Take every recovery slot (max_recovery of them) from recovery_count,
 * sleeping in down() until recoveries still holding a slot release it.
 */
void dm_rh_stop_recovery(struct dm_region_hash *rh)
{
	int taken = 0;

	/* wait for any recovering regions */
	while (taken < rh->max_recovery) {
		down(&rh->recovery_count);
		taken++;
	}
}
EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
/*
 * Make max_recovery recovery slots available on recovery_count and wake
 * the owner's workers.
 */
void dm_rh_start_recovery(struct dm_region_hash *rh)
{
	int released = 0;

	while (released < rh->max_recovery) {
		up(&rh->recovery_count);
		released++;
	}

	rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
MODULE_DESCRIPTION(DM_NAME " region hash");
MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,217 @@
/*
* Copyright (C) 2003 Sistina Software.
* Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
*
* Module Author: Heinz Mauelshagen
*
* This file is released under the GPL.
*
* Round-robin path selector.
*/
#include <linux/device-mapper.h>
#include "dm-path-selector.h"
#include <linux/slab.h>
#define DM_MSG_PREFIX "multipath round-robin"
/*-----------------------------------------------------------------
* Path-handling code, paths are held in lists
*---------------------------------------------------------------*/
/* Per-path selector data, stored in dm_path->pscontext by rr_add_path(). */
struct path_info {
/* Link in selector->valid_paths or selector->invalid_paths. */
struct list_head list;
struct dm_path *path;
/* Value reported through *repeat_count by rr_select_path(). */
unsigned repeat_count;
};
static void free_paths(struct list_head *paths)
{
struct path_info *pi, *next;
list_for_each_entry_safe(pi, next, paths, list) {
list_del(&pi->list);
kfree(pi);
}
}
/*-----------------------------------------------------------------
* Round-robin selector
*---------------------------------------------------------------*/
/* Default repeat_count used by rr_add_path() when none is given. */
#define RR_MIN_IO 1000
/* Selector state: usable paths in rotation order, and failed paths. */
struct selector {
struct list_head valid_paths;
struct list_head invalid_paths;
};
static struct selector *alloc_selector(void)
{
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s) {
INIT_LIST_HEAD(&s->valid_paths);
INIT_LIST_HEAD(&s->invalid_paths);
}
return s;
}
/*
 * Path-selector constructor: attach a fresh selector to @ps->context.
 * @argc and @argv are not used. Returns 0, or -ENOMEM on allocation failure.
 */
static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
{
	struct selector *sel = alloc_selector();

	if (!sel)
		return -ENOMEM;

	ps->context = sel;

	return 0;
}
/* Path-selector destructor: free all path_info entries and the selector. */
static void rr_destroy(struct path_selector *ps)
{
	struct selector *sel = ps->context;

	free_paths(&sel->valid_paths);
	free_paths(&sel->invalid_paths);
	kfree(sel);

	ps->context = NULL;
}
/*
 * Emit selector status into @result and return sz.
 *
 * With no @path, "0 " is emitted. For a path, STATUSTYPE_TABLE emits the
 * path's repeat_count and STATUSTYPE_INFO emits nothing.
 *
 * DMEMIT is a macro defined outside this file; it presumably appends to
 * @result (bounded by @maxlen) and advances the local 'sz' - TODO confirm.
 */
static int rr_status(struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen)
{
struct path_info *pi;
int sz = 0;
if (!path)
DMEMIT("0 ");
else {
switch(type) {
case STATUSTYPE_INFO:
break;
case STATUSTYPE_TABLE:
pi = path->pscontext;
DMEMIT("%u ", pi->repeat_count);
break;
}
}
return sz;
}
/*
 * Called during initialisation to register each path with an
 * optional repeat_count.
 *
 * @argc/@argv: at most one argument, the number of I/Os before switching
 *              path; defaults to RR_MIN_IO when absent.
 * @error:      set to a static message on failure.
 *
 * Returns 0 on success, -EINVAL for bad arguments, -ENOMEM when the path
 * context cannot be allocated. On success the new path_info is stored in
 * path->pscontext and appended to the valid path list.
 */
static int rr_add_path(struct path_selector *ps, struct dm_path *path,
		       int argc, char **argv, char **error)
{
	struct selector *s = (struct selector *) ps->context;
	struct path_info *pi;
	unsigned repeat_count = RR_MIN_IO;
	char dummy;

	if (argc > 1) {
		*error = "round-robin ps: incorrect number of arguments";
		return -EINVAL;
	}

	/*
	 * First path argument is number of I/Os before switching path.
	 * The trailing %c only converts if characters follow the number,
	 * so a result other than 1 rejects both "abc" and "10abc".
	 */
	if ((argc == 1) &&
	    (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
		*error = "round-robin ps: invalid repeat count";
		return -EINVAL;
	}

	/* allocate the path */
	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "round-robin ps: Error allocating path context";
		return -ENOMEM;
	}

	pi->path = path;
	pi->repeat_count = repeat_count;

	path->pscontext = pi;

	list_add_tail(&pi->list, &s->valid_paths);

	return 0;
}
/* Move the path's entry to the head of the invalid path list. */
static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
{
	struct path_info *info = p->pscontext;
	struct selector *sel = ps->context;

	list_move(&info->list, &sel->invalid_paths);
}
/* Move the path's entry to the head of the valid path list; returns 0. */
static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
{
	struct path_info *info = p->pscontext;
	struct selector *sel = ps->context;

	list_move(&info->list, &sel->valid_paths);

	return 0;
}
/*
 * Pick the path at the head of the valid list, rotate it to the tail and
 * report its repeat_count through @repeat_count. Returns NULL (leaving
 * *repeat_count untouched) when there is no valid path. @nr_bytes is not
 * used.
 */
static struct dm_path *rr_select_path(struct path_selector *ps,
				      unsigned *repeat_count, size_t nr_bytes)
{
	struct selector *sel = ps->context;
	struct path_info *chosen;

	if (list_empty(&sel->valid_paths))
		return NULL;

	chosen = list_entry(sel->valid_paths.next, struct path_info, list);
	list_move_tail(&chosen->list, &sel->valid_paths);
	*repeat_count = chosen->repeat_count;

	return chosen->path;
}
/*
 * Descriptor registered with dm_register_path_selector() under the name
 * "round-robin". table_args is 1 because rr_status() emits one value
 * (repeat_count) per path for STATUSTYPE_TABLE and none for
 * STATUSTYPE_INFO.
 */
static struct path_selector_type rr_ps = {
.name = "round-robin",
.module = THIS_MODULE,
.table_args = 1,
.info_args = 0,
.create = rr_create,
.destroy = rr_destroy,
.status = rr_status,
.add_path = rr_add_path,
.fail_path = rr_fail_path,
.reinstate_path = rr_reinstate_path,
.select_path = rr_select_path,
};
/*
 * Module init: register the round-robin path selector.
 *
 * Returns the result of dm_register_path_selector(). The "loaded" message
 * is only logged when registration did not fail, so the log never claims
 * success right after reporting an error.
 */
static int __init dm_rr_init(void)
{
	int r = dm_register_path_selector(&rr_ps);

	if (r < 0) {
		DMERR("register failed %d", r);
		return r;
	}

	DMINFO("version 1.0.0 loaded");

	return r;
}
/* Module exit: unregister the selector, logging an error if that fails. */
static void __exit dm_rr_exit(void)
{
int r = dm_unregister_path_selector(&rr_ps);
if (r < 0)
DMERR("unregister failed %d", r);
}
/* Module entry points and metadata. */
module_init(dm_rr_init);
module_exit(dm_rr_exit);
MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,339 @@
/*
* Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved.
*
* Module Author: Kiyoshi Ueda
*
* This file is released under the GPL.
*
* Throughput oriented path selector.
*/
#include "dm.h"
#include "dm-path-selector.h"
#define DM_MSG_PREFIX "multipath service-time"
/* Default repeat_count used by st_add_path() when none is given. */
#define ST_MIN_IO 1
/* Largest relative_throughput value st_add_path() accepts. */
#define ST_MAX_RELATIVE_THROUGHPUT 100
/*
 * st_compare_load() shifts sizes right by this many bits once they reach
 * ST_MAX_INFLIGHT_SIZE, before multiplying by a relative_throughput.
 * 2^7 = 128 exceeds ST_MAX_RELATIVE_THROUGHPUT.
 */
#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7
#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
#define ST_VERSION "0.2.0"
/* Selector state: usable paths and failed paths. */
struct selector {
struct list_head valid_paths;
struct list_head failed_paths;
};
/* Per-path selector data, stored in dm_path->pscontext by st_add_path(). */
struct path_info {
/* Link in selector->valid_paths or selector->failed_paths. */
struct list_head list;
struct dm_path *path;
/* Value reported through *repeat_count by st_select_path(). */
unsigned repeat_count;
/* Weight in the range 0..ST_MAX_RELATIVE_THROUGHPUT; see st_compare_load(). */
unsigned relative_throughput;
atomic_t in_flight_size; /* Total size of in-flight I/Os */
};
static struct selector *alloc_selector(void)
{
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s) {
INIT_LIST_HEAD(&s->valid_paths);
INIT_LIST_HEAD(&s->failed_paths);
}
return s;
}
static int st_create(struct path_selector *ps, unsigned argc, char **argv)
{
struct selector *s = alloc_selector();
if (!s)
return -ENOMEM;
ps->context = s;
return 0;
}
static void free_paths(struct list_head *paths)
{
struct path_info *pi, *next;
list_for_each_entry_safe(pi, next, paths, list) {
list_del(&pi->list);
kfree(pi);
}
}
/* Path-selector destructor: free all path_info entries and the selector. */
static void st_destroy(struct path_selector *ps)
{
	struct selector *sel = ps->context;

	free_paths(&sel->valid_paths);
	free_paths(&sel->failed_paths);
	kfree(sel);

	ps->context = NULL;
}
/*
 * Emit selector status into @result and return sz.
 *
 * With no @path, "0 " is emitted. For a path, STATUSTYPE_INFO emits the
 * current in-flight size and relative_throughput, and STATUSTYPE_TABLE
 * emits repeat_count and relative_throughput.
 *
 * DMEMIT is a macro defined outside this file; it presumably appends to
 * @result (bounded by @maxlen) and advances the local 'sz' - TODO confirm.
 */
static int st_status(struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned maxlen)
{
unsigned sz = 0;
struct path_info *pi;
if (!path)
DMEMIT("0 ");
else {
pi = path->pscontext;
switch (type) {
case STATUSTYPE_INFO:
DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
pi->relative_throughput);
break;
case STATUSTYPE_TABLE:
DMEMIT("%u %u ", pi->repeat_count,
pi->relative_throughput);
break;
}
}
return sz;
}
/*
 * Register a path with the selector.
 *
 * Returns 0 on success, -EINVAL for bad arguments, -ENOMEM when the path
 * context cannot be allocated. On success the new path_info is stored in
 * path->pscontext and appended to the valid path list. @error is set to a
 * static message on failure.
 */
static int st_add_path(struct path_selector *ps, struct dm_path *path,
		       int argc, char **argv, char **error)
{
	struct selector *s = ps->context;
	struct path_info *pi;
	unsigned repeat_count = ST_MIN_IO;
	unsigned relative_throughput = 1;
	char dummy;

	/*
	 * Arguments: [<repeat_count> [<relative_throughput>]]
	 *	<repeat_count>: The number of I/Os before switching path.
	 *			If not given, default (ST_MIN_IO) is used.
	 *	<relative_throughput>: The relative throughput value of
	 *			the path among all paths in the path-group.
	 *			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
	 *			If not given, minimum value '1' is used.
	 *			If '0' is given, the path isn't selected while
	 *			other paths having a positive value are
	 *			available.
	 */
	if (argc > 2) {
		*error = "service-time ps: incorrect number of arguments";
		return -EINVAL;
	}

	/*
	 * The trailing %c only converts if characters follow the number, so
	 * requiring a result of exactly 1 rejects input such as "10abc".
	 */
	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
		*error = "service-time ps: invalid repeat count";
		return -EINVAL;
	}

	if ((argc == 2) &&
	    (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
		*error = "service-time ps: invalid relative_throughput value";
		return -EINVAL;
	}

	/* allocate the path */
	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "service-time ps: Error allocating path context";
		return -ENOMEM;
	}

	pi->path = path;
	pi->repeat_count = repeat_count;
	pi->relative_throughput = relative_throughput;
	atomic_set(&pi->in_flight_size, 0);

	path->pscontext = pi;

	list_add_tail(&pi->list, &s->valid_paths);

	return 0;
}
/* Move the path's entry to the head of the failed path list. */
static void st_fail_path(struct path_selector *ps, struct dm_path *path)
{
	struct path_info *info = path->pscontext;
	struct selector *sel = ps->context;

	list_move(&info->list, &sel->failed_paths);
}
/* Move the path's entry to the tail of the valid path list; returns 0. */
static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
{
	struct path_info *info = path->pscontext;
	struct selector *sel = ps->context;

	list_move_tail(&info->list, &sel->valid_paths);

	return 0;
}
/*
 * Compare the estimated service time of 2 paths, pi1 and pi2,
 * for the incoming I/O.
 *
 * Returns:
 * < 0 : pi1 is better
 * 0   : no difference between pi1 and pi2
 * > 0 : pi2 is better
 *
 * Only the sign of the result is meaningful.
 *
 * Description:
 * Basically, the service time is estimated by:
 *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
 * To reduce the calculation, some optimizations are made.
 * (See comments inline)
 */
static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
			   size_t incoming)
{
	size_t sz1, sz2, st1, st2;

	sz1 = atomic_read(&pi1->in_flight_size);
	sz2 = atomic_read(&pi2->in_flight_size);

	/*
	 * Case 1: Both have same throughput value. Choose less loaded path.
	 */
	if (pi1->relative_throughput == pi2->relative_throughput)
		return sz1 - sz2;

	/*
	 * Case 2a: Both have same load. Choose higher throughput path.
	 * Case 2b: One path has no throughput value. Choose the other one.
	 */
	if (sz1 == sz2 ||
	    !pi1->relative_throughput || !pi2->relative_throughput)
		return pi2->relative_throughput - pi1->relative_throughput;

	/*
	 * Case 3: Calculate service time. Choose faster path.
	 *         Service time using pi1:
	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
	 *         Service time using pi2:
	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
	 *
	 *         To avoid the division, transform the expression to use
	 *         multiplication.
	 *         Because ->relative_throughput > 0 here, if st1 < st2,
	 *         the expressions below are the same meaning:
	 *             (sz1 + incoming) / pi1->relative_throughput <
	 *                 (sz2 + incoming) / pi2->relative_throughput
	 *             (sz1 + incoming) * pi2->relative_throughput <
	 *                 (sz2 + incoming) * pi1->relative_throughput
	 *         So use the later one.
	 */
	sz1 += incoming;
	sz2 += incoming;
	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
		/*
		 * Size may be too big for multiplying pi->relative_throughput
		 * and overflow.
		 * To avoid the overflow and mis-selection, shift down both.
		 */
		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
	}
	st1 = sz1 * pi2->relative_throughput;
	st2 = sz2 * pi1->relative_throughput;
	/*
	 * Compare explicitly instead of returning st1 - st2: the products are
	 * size_t and their difference does not fit in the int return value
	 * once it reaches 2^31, which would flip the sign (or yield 0) and
	 * select the slower path.
	 */
	if (st1 != st2)
		return st1 < st2 ? -1 : 1;

	/*
	 * Case 4: Service time is equal. Choose higher throughput path.
	 */
	return pi2->relative_throughput - pi1->relative_throughput;
}
/*
 * Choose the valid path with the lowest estimated service time for an I/O
 * of @nr_bytes, as judged by st_compare_load(), and report its
 * repeat_count through @repeat_count.
 *
 * Returns NULL (leaving *repeat_count untouched) when no path is valid.
 */
static struct dm_path *st_select_path(struct path_selector *ps,
unsigned *repeat_count, size_t nr_bytes)
{
struct selector *s = ps->context;
struct path_info *pi = NULL, *best = NULL;
if (list_empty(&s->valid_paths))
return NULL;
/* Change preferred (first in list) path to evenly balance. */
list_move_tail(s->valid_paths.next, &s->valid_paths);
/* A later path replaces 'best' only when it compares strictly better. */
list_for_each_entry(pi, &s->valid_paths, list)
if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
best = pi;
if (!best)
return NULL;
*repeat_count = best->repeat_count;
return best->path;
}
/* Add @nr_bytes to the path's in-flight size. Always returns 0. */
static int st_start_io(struct path_selector *ps, struct dm_path *path,
		       size_t nr_bytes)
{
	struct path_info *info = path->pscontext;

	atomic_add(nr_bytes, &info->in_flight_size);

	return 0;
}
/* Subtract @nr_bytes from the path's in-flight size. Always returns 0. */
static int st_end_io(struct path_selector *ps, struct dm_path *path,
		     size_t nr_bytes)
{
	struct path_info *info = path->pscontext;

	atomic_sub(nr_bytes, &info->in_flight_size);

	return 0;
}
/*
 * Descriptor registered with dm_register_path_selector() under the name
 * "service-time". table_args and info_args are both 2 because st_status()
 * emits two values per path for each status type.
 */
static struct path_selector_type st_ps = {
.name = "service-time",
.module = THIS_MODULE,
.table_args = 2,
.info_args = 2,
.create = st_create,
.destroy = st_destroy,
.status = st_status,
.add_path = st_add_path,
.fail_path = st_fail_path,
.reinstate_path = st_reinstate_path,
.select_path = st_select_path,
.start_io = st_start_io,
.end_io = st_end_io,
};
/*
 * Module init: register the service-time path selector.
 *
 * Returns the result of dm_register_path_selector(). The "loaded" message
 * is only logged when registration did not fail, so the log never claims
 * success right after reporting an error.
 */
static int __init dm_st_init(void)
{
	int r = dm_register_path_selector(&st_ps);

	if (r < 0) {
		DMERR("register failed %d", r);
		return r;
	}

	DMINFO("version " ST_VERSION " loaded");

	return r;
}
/* Module exit: unregister the selector, logging an error if that fails. */
static void __exit dm_st_exit(void)
{
int r = dm_unregister_path_selector(&st_ps);
if (r < 0)
DMERR("unregister failed %d", r);
}
/* Module entry points and metadata. */
module_init(dm_st_init);
module_exit(dm_st_exit);
MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,787 @@
/*
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
* Copyright (C) 2006-2008 Red Hat GmbH
*
* This file is released under the GPL.
*/
#include "dm-exception-store.h"
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
#define DM_MSG_PREFIX "persistent snapshot"
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
/*-----------------------------------------------------------------
* Persistent snapshots, by persistent we mean that the snapshot
* will survive a reboot.
*---------------------------------------------------------------*/
/*
* We need to store a record of which parts of the origin have
* been copied to the snapshot device. The snapshot code
* requires that we copy exception chunks to chunk aligned areas
* of the COW store. It makes sense therefore, to store the
* metadata in chunk size blocks.
*
* There is no backward or forward compatibility implemented,
* snapshots with different disk versions than the kernel will
* not be usable. It is expected that "lvcreate" will blank out
* the start of a fresh COW device before calling the snapshot
* constructor.
*
* The first chunk of the COW device just contains the header.
* After this there is a chunk filled with exception metadata,
* followed by as many exception chunks as can fit in the
* metadata areas.
*
* All on disk structures are in little-endian format. The end
* of the exceptions info is indicated by an exception with a
* new_chunk of 0, which is invalid since it would point to the
* header chunk.
*/
/*
* Magic for persistent snapshots: "SnAp" - Feeble isn't it.
*/
#define SNAP_MAGIC 0x70416e53
/*
* The on-disk version of the metadata.
*/
#define SNAPSHOT_DISK_VERSION 1
/*
 * Header stored in the first chunk of the COW device. Fields are read
 * with le32_to_cpu(), i.e. stored little-endian on disk.
 */
struct disk_header {
/* SNAP_MAGIC for an existing snapshot; 0 is treated as a new snapshot. */
uint32_t magic;
/*
* Is this snapshot valid. There is no way of recovering
* an invalid snapshot.
*/
uint32_t valid;
/*
* Simple, incrementing version. no backward
* compatibility.
*/
uint32_t version;
/* In sectors */
uint32_t chunk_size;
};
/*
 * One on-disk exception record: a pair of chunk numbers. Per the format
 * notes earlier in this file, a new_chunk of 0 marks the end of the
 * exceptions.
 */
struct disk_exception {
uint64_t old_chunk;
uint64_t new_chunk;
};
/*
 * A callback together with the context pointer to call it with; the int
 * argument reports success. Stored in the pstore 'callbacks' array.
 */
struct commit_callback {
void (*callback)(void *, int success);
void *context;
};
/*
* The top level structure for a persistent exception store.
*/
struct pstore {
/* Owning exception store; supplies chunk_size and the COW device. */
struct dm_exception_store *store;
int version;
int valid;
uint32_t exceptions_per_area;
/*
* Now that we have an asynchronous kcopyd there is no
* need for large chunk sizes, so it won't hurt to have a
* whole chunk's worth of metadata in memory at once.
*/
void *area;
/*
* An area of zeros used to clear the next area.
*/
void *zero_area;
/*
* An area used for header. The header can be written
* concurrently with metadata (when invalidating the snapshot),
* so it needs a separate buffer.
*/
void *header_area;
/*
* Used to keep track of which metadata area the data in
* 'area' refers to.
*/
chunk_t current_area;
/*
* The next free chunk for an exception.
*/
chunk_t next_free;
/*
* The index of next free exception in the current
* metadata area.
*/
uint32_t current_committed;
atomic_t pending_count;
uint32_t callback_count;
struct commit_callback *callbacks;
/* dm-io client used for all chunk I/O issued by chunk_io(). */
struct dm_io_client *io_client;
/* Workqueue that chunk_io() uses when its 'metadata' flag is set. */
struct workqueue_struct *metadata_wq;
};
/*
 * Number of pages needed to hold @sectors 512-byte sectors, rounded up
 * (PAGE_SIZE >> 9 is the number of sectors per page).
 */
static unsigned sectors_to_pages(unsigned sectors)
{
return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
}
static int alloc_area(struct pstore *ps)
{
int r = -ENOMEM;
size_t len;
len = ps->store->chunk_size << SECTOR_SHIFT;
/*
* Allocate the chunk_size block of memory that will hold
* a single metadata area.
*/
ps->area = vmalloc(len);
if (!ps->area)
goto err_area;
ps->zero_area = vmalloc(len);
if (!ps->zero_area)
goto err_zero_area;
memset(ps->zero_area, 0, len);
ps->header_area = vmalloc(len);
if (!ps->header_area)
goto err_header_area;
return 0;
err_header_area:
vfree(ps->zero_area);
err_zero_area:
vfree(ps->area);
err_area:
return r;
}
/*
 * Release whichever of the three pstore buffers are allocated and leave
 * all three pointers NULL.
 */
static void free_area(struct pstore *ps)
{
	if (ps->area) {
		vfree(ps->area);
		ps->area = NULL;
	}

	if (ps->zero_area) {
		vfree(ps->zero_area);
		ps->zero_area = NULL;
	}

	if (ps->header_area) {
		vfree(ps->header_area);
		ps->header_area = NULL;
	}
}
/*
 * A dm_io() request packaged as a work item so that chunk_io() can issue
 * it from the metadata workqueue; do_metadata() stores the dm_io() return
 * value in 'result'.
 */
struct mdata_req {
struct dm_io_region *where;
struct dm_io_request *io_req;
struct work_struct work;
int result;
};
/* Work function: run the embedded dm_io() request and record its result. */
static void do_metadata(struct work_struct *work)
{
	struct mdata_req *mreq = container_of(work, struct mdata_req, work);
	int rc = dm_io(mreq->io_req, 1, mreq->where, NULL);

	mreq->result = rc;
}
/*
* Read or write a chunk aligned and sized block of data from a device.
*
* @area:     buffer used for the transfer (passed to dm-io as DM_IO_VMA)
* @chunk:    chunk index on the COW device; converted to a sector offset
*            by multiplying with the store's chunk_size
* @rw:       passed through as the request's bi_rw
* @metadata: when zero, dm_io() is called directly; when non-zero the
*            request is run on ps->metadata_wq and waited for
*
* Returns the dm_io() result.
*/
static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
int metadata)
{
struct dm_io_region where = {
.bdev = ps->store->cow->bdev,
.sector = ps->store->chunk_size * chunk,
.count = ps->store->chunk_size,
};
struct dm_io_request io_req = {
.bi_rw = rw,
.mem.type = DM_IO_VMA,
.mem.ptr.vma = area,
.client = ps->io_client,
.notify.fn = NULL,
};
struct mdata_req req;
if (!metadata)
return dm_io(&io_req, 1, &where, NULL);
req.where = &where;
req.io_req = &io_req;
/*
* Issue the synchronous I/O from a different thread
* to avoid generic_make_request recursion.
*/
INIT_WORK(&req.work, do_metadata);
queue_work(ps->metadata_wq, &req.work);
/* 'req' lives on this stack frame, so wait for the work before returning. */
flush_workqueue(ps->metadata_wq);
return req.result;
}
/*
 * Convert a metadata area index to a chunk index.
 *
 * Chunk 0 holds the header; after that each area occupies one metadata
 * chunk followed by exceptions_per_area data chunks.
 */
static chunk_t area_location(struct pstore *ps, chunk_t area)
{
	chunk_t stride = ps->exceptions_per_area + 1;

	return 1 + stride * area;
}
/*
 * Read or write the current metadata area (ps->current_area) from or to
 * ps->area.  area_location() takes care of skipping the header chunk.
 *
 * Returns 0 on success or the chunk_io() error.
 */
static int area_io(struct pstore *ps, int rw)
{
	chunk_t where = area_location(ps, ps->current_area);

	return chunk_io(ps, ps->area, where, rw, 0);
}
static void zero_memory_area(struct pstore *ps)
{
memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
}
/*
 * Overwrite metadata area 'area' on disk with the zero-filled buffer.
 * Returns the chunk_io() result.
 */
static int zero_disk_area(struct pstore *ps, chunk_t area)
{
	chunk_t target = area_location(ps, area);

	return chunk_io(ps, ps->zero_area, target, WRITE, 0);
}
static int read_header(struct pstore *ps, int *new_snapshot)
{
int r;
struct disk_header *dh;
unsigned chunk_size;
int chunk_size_supplied = 1;
char *chunk_err;
/*
* Use default chunk size (or logical_block_size, if larger)
* if none supplied
*/
if (!ps->store->chunk_size) {
ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
bdev_logical_block_size(ps->store->cow->bdev) >> 9);
ps->store->chunk_mask = ps->store->chunk_size - 1;
ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
chunk_size_supplied = 0;
}
ps->io_client = dm_io_client_create(sectors_to_pages(ps->store->
chunk_size));
if (IS_ERR(ps->io_client))
return PTR_ERR(ps->io_client);
r = alloc_area(ps);
if (r)
return r;
r = chunk_io(ps, ps->header_area, 0, READ, 1);
if (r)
goto bad;
dh = ps->header_area;
if (le32_to_cpu(dh->magic) == 0) {
*new_snapshot = 1;
return 0;
}
if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
DMWARN("Invalid or corrupt snapshot");
r = -ENXIO;
goto bad;
}
*new_snapshot = 0;
ps->valid = le32_to_cpu(dh->valid);
ps->version = le32_to_cpu(dh->version);
chunk_size = le32_to_cpu(dh->chunk_size);
if (ps->store->chunk_size == chunk_size)
return 0;
if (chunk_size_supplied)
DMWARN("chunk size %u in device metadata overrides "
"table chunk size of %u.",
chunk_size, ps->store->chunk_size);
/* We had a bogus chunk_size. Fix stuff up. */
free_area(ps);
r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
&chunk_err);
if (r) {
DMERR("invalid on-disk chunk size %u: %s.",
chunk_size, chunk_err);
return r;
}
r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
ps->io_client);
if (r)
return r;
r = alloc_area(ps);
return r;
bad:
free_area(ps);
return r;
}
/*
 * Rebuild the header in ps->header_area from the in-core state (magic,
 * valid flag, version, chunk size) and write it to chunk 0.
 * Returns the chunk_io() result.
 */
static int write_header(struct pstore *ps)
{
	struct disk_header *hdr = ps->header_area;

	memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT);

	hdr->magic = cpu_to_le32(SNAP_MAGIC);
	hdr->valid = cpu_to_le32(ps->valid);
	hdr->version = cpu_to_le32(ps->version);
	hdr->chunk_size = cpu_to_le32(ps->store->chunk_size);

	return chunk_io(ps, ps->header_area, 0, WRITE, 1);
}
/*
 * Access functions for the disk exceptions, these do the endian conversions.
 */

/*
 * Return a pointer to slot 'index' of the in-memory metadata area.
 * An index beyond the area is a programming error (BUG).
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	struct disk_exception *slots = (struct disk_exception *) ps->area;

	BUG_ON(index >= ps->exceptions_per_area);

	return &slots[index];
}
/*
 * Copy slot 'index' of the current area into 'result', converting both
 * fields from little-endian to CPU byte order.
 */
static void read_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *result)
{
	const struct disk_exception *src = get_exception(ps, index);

	result->old_chunk = le64_to_cpu(src->old_chunk);
	result->new_chunk = le64_to_cpu(src->new_chunk);
}
/*
 * Store 'de' into slot 'index' of the current area, converting both
 * fields from CPU byte order to little-endian.
 */
static void write_exception(struct pstore *ps,
			    uint32_t index, struct disk_exception *de)
{
	struct disk_exception *dst = get_exception(ps, index);

	dst->old_chunk = cpu_to_le64(de->old_chunk);
	dst->new_chunk = cpu_to_le64(de->new_chunk);
}
/*
 * Registers the exceptions that are present in the current area.
 *
 * Every exception found is reported through 'callback'; ps->next_free is
 * pushed past the highest new_chunk seen.  '*full' is set to 1 when all
 * slots of the area are in use and to 0 when a terminating entry was
 * found, in which case ps->current_committed records its index.
 *
 * Returns 0, or the first non-zero value returned by 'callback'.
 */
static int insert_exceptions(struct pstore *ps,
			     int (*callback)(void *callback_context,
					     chunk_t old, chunk_t new),
			     void *callback_context,
			     int *full)
{
	struct disk_exception de;
	unsigned int slot;
	int rc;

	/* Assume the area is full until a terminator proves otherwise. */
	*full = 1;

	for (slot = 0; slot < ps->exceptions_per_area; slot++) {
		read_exception(ps, slot, &de);

		/*
		 * A new_chunk of 0 would point at the start of the COW
		 * device, which is never a data chunk, so it marks the
		 * end of the exceptions: the area is only partly used.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = slot;
			*full = 0;
			return 0;
		}

		/* Track where the free chunks start. */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/* Hand the exception to the snapshot. */
		rc = callback(callback_context, de.old_chunk, de.new_chunk);
		if (rc)
			return rc;
	}

	return 0;
}
/*
 * Load every metadata area from disk, starting with area 0, and feed its
 * exceptions to 'callback' until a partially filled area is found.  On
 * success ps->current_area is left pointing at that last area.
 *
 * Returns 0 on success, or the first error from area_io() or
 * insert_exceptions().
 */
static int read_exceptions(struct pstore *ps,
			   int (*callback)(void *callback_context, chunk_t old,
					   chunk_t new),
			   void *callback_context)
{
	int rc;
	int full = 1;

	ps->current_area = 0;
	do {
		rc = area_io(ps, READ);
		if (rc)
			return rc;

		rc = insert_exceptions(ps, callback, callback_context, &full);
		if (rc)
			return rc;

		ps->current_area++;
	} while (full);

	/* The loop stepped one past the partially filled area. */
	ps->current_area--;

	return 0;
}
/*
 * Fetch the persistent-store private data stashed in store->context.
 */
static struct pstore *get_info(struct dm_exception_store *store)
{
	struct pstore *ps = (struct pstore *) store->context;

	return ps;
}
/*
 * Report usage of the COW device: sectors up to the next free chunk over
 * the size of the device.
 */
static void persistent_fraction_full(struct dm_exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	struct pstore *ps = get_info(store);

	*numerator = ps->next_free * store->chunk_size;
	*denominator = get_dev_size(store->cow->bdev);
}
static void persistent_dtr(struct dm_exception_store *store)
{
struct pstore *ps = get_info(store);
destroy_workqueue(ps->metadata_wq);
/* Created in read_header */
if (ps->io_client)
dm_io_client_destroy(ps->io_client);
free_area(ps);
/* Allocated in persistent_read_metadata */
if (ps->callbacks)
vfree(ps->callbacks);
kfree(ps);
}
/*
 * Load (or initialise) the on-disk metadata of a persistent snapshot.
 *
 * Reads the header, sizes the per-area bookkeeping, and then either
 * writes a fresh header plus an empty first area (new snapshot) or
 * replays all stored exceptions through 'callback'.
 *
 * Returns 0 on success, 1 when the metadata is readable but the snapshot
 * has been invalidated, or a negative errno.
 */
static int persistent_read_metadata(struct dm_exception_store *store,
				    int (*callback)(void *callback_context,
						    chunk_t old, chunk_t new),
				    void *callback_context)
{
	int r, uninitialized_var(new_snapshot);
	struct pstore *ps = get_info(store);

	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/* The chunk size is final now, so finish the initialisation. */
	ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	if (!new_snapshot) {
		/* Existing snapshot: sanity-check it before loading. */
		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/* Metadata are valid, but the snapshot is invalidated. */
		if (!ps->valid)
			return 1;

		return read_exceptions(ps, callback, callback_context);
	}

	/* Brand new snapshot: lay down a header and an empty first area. */
	r = write_header(ps);
	if (r) {
		DMWARN("write_header failed");
		return r;
	}

	ps->current_area = 0;
	zero_memory_area(ps);

	r = zero_disk_area(ps, 0);
	if (r) {
		DMWARN("zero_disk_area(0) failed");
		return r;
	}

	return 0;
}
/*
* Reserve the next free chunk on the COW device for exception 'e'.
*
* Stores the chunk index in e->new_chunk, advances ps->next_free past it
* (and past a metadata chunk if the new position would land on one) and
* bumps ps->pending_count.
*
* Returns 0 on success or -ENOSPC if the COW device has no room for
* another chunk.
*/
static int persistent_prepare_exception(struct dm_exception_store *store,
struct dm_snap_exception *e)
{
struct pstore *ps = get_info(store);
uint32_t stride;
chunk_t next_free;
sector_t size = get_dev_size(store->cow->bdev);
/* Is there enough room ? */
if (size < ((ps->next_free + 1) * store->chunk_size))
return -ENOSPC;
e->new_chunk = ps->next_free;
/*
* Move onto the next free pending, making sure to take
* into account the location of the metadata chunks.
*/
stride = (ps->exceptions_per_area + 1);
/* sector_div() works on this local copy, not on ps->next_free. */
next_free = ++ps->next_free;
/*
* A remainder of 1 matches the chunk indices area_location() yields
* for metadata areas (1 + stride * area), so step over that chunk.
*/
if (sector_div(next_free, stride) == 1)
ps->next_free++;
atomic_inc(&ps->pending_count);
return 0;
}
/*
* Record exception 'e' in the in-memory metadata area and queue
* 'callback' to be invoked once the area has been written to disk.
*
* The area is written out when no prepared exceptions remain in flight
* or when the area has just become full; all queued callbacks are then
* called with ps->valid as their 'success' argument.  Any I/O failure
* clears ps->valid.
*/
static void persistent_commit_exception(struct dm_exception_store *store,
struct dm_snap_exception *e,
void (*callback) (void *, int success),
void *callback_context)
{
unsigned int i;
struct pstore *ps = get_info(store);
struct disk_exception de;
struct commit_callback *cb;
de.old_chunk = e->old_chunk;
de.new_chunk = e->new_chunk;
write_exception(ps, ps->current_committed++, &de);
/*
* Add the callback to the back of the array. This code
* is the only place where the callback array is
* manipulated, and we know that it will never be called
* multiple times concurrently.
*/
cb = ps->callbacks + ps->callback_count++;
cb->callback = callback;
cb->context = callback_context;
/*
* If there are exceptions in flight and we have not yet
* filled this metadata area there's nothing more to do.
*/
if (!atomic_dec_and_test(&ps->pending_count) &&
(ps->current_committed != ps->exceptions_per_area))
return;
/*
* If we completely filled the current area, then wipe the next one.
*/
if ((ps->current_committed == ps->exceptions_per_area) &&
zero_disk_area(ps, ps->current_area + 1))
ps->valid = 0;
/*
* Commit exceptions to disk.
*/
if (ps->valid && area_io(ps, WRITE_BARRIER))
ps->valid = 0;
/*
* Advance to the next area if this one is full.
*/
if (ps->current_committed == ps->exceptions_per_area) {
ps->current_committed = 0;
ps->current_area++;
zero_memory_area(ps);
}
/* Report the outcome to everyone waiting on this batch. */
for (i = 0; i < ps->callback_count; i++) {
cb = ps->callbacks + i;
cb->callback(cb->context, ps->valid);
}
ps->callback_count = 0;
}
/*
 * Mark the snapshot invalid and write the header so the state reaches
 * the disk.  A failed write is only logged.
 */
static void persistent_drop_snapshot(struct dm_exception_store *store)
{
	struct pstore *ps = get_info(store);
	int rc;

	ps->valid = 0;

	rc = write_header(ps);
	if (rc)
		DMWARN("write header failed");
}
/*
 * Constructor for the persistent exception store.
 *
 * Allocates and initialises the pstore, starts the "ksnaphd" workqueue
 * used for header I/O, and attaches the pstore to store->context.
 * Returns 0 on success or -ENOMEM.
 */
static int persistent_ctr(struct dm_exception_store *store,
			  unsigned argc, char **argv)
{
	struct pstore *ps = kzalloc(sizeof(*ps), GFP_KERNEL);

	if (!ps)
		return -ENOMEM;

	ps->store = store;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;

	/* Buffers and callbacks are allocated later, when metadata is read. */
	ps->area = NULL;
	ps->zero_area = NULL;
	ps->header_area = NULL;
	ps->callbacks = NULL;

	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;
	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);

	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
	if (!ps->metadata_wq) {
		kfree(ps);
		DMERR("couldn't start header metadata update thread");
		return -ENOMEM;
	}

	store->context = ps;

	return 0;
}
/*
 * Emit status text into 'result'.  Only STATUSTYPE_TABLE produces
 * output: the COW device name, the store type letter "P" and the chunk
 * size.  Returns the number of characters accounted for by DMEMIT.
 */
static unsigned persistent_status(struct dm_exception_store *store,
				  status_type_t status, char *result,
				  unsigned maxlen)
{
	unsigned sz = 0;

	if (status == STATUSTYPE_TABLE) {
		DMEMIT(" %s P %llu", store->cow->name,
		       (unsigned long long)store->chunk_size);
	}

	return sz;
}
/*
* Exception store type "persistent"; registered by
* dm_persistent_snapshot_init().
*/
static struct dm_exception_store_type _persistent_type = {
.name = "persistent",
.module = THIS_MODULE,
.ctr = persistent_ctr,
.dtr = persistent_dtr,
.read_metadata = persistent_read_metadata,
.prepare_exception = persistent_prepare_exception,
.commit_exception = persistent_commit_exception,
.drop_snapshot = persistent_drop_snapshot,
.fraction_full = persistent_fraction_full,
.status = persistent_status,
};
/*
* Same operations as _persistent_type, registered under the old-style
* one-letter name "P".
*/
static struct dm_exception_store_type _persistent_compat_type = {
.name = "P",
.module = THIS_MODULE,
.ctr = persistent_ctr,
.dtr = persistent_dtr,
.read_metadata = persistent_read_metadata,
.prepare_exception = persistent_prepare_exception,
.commit_exception = persistent_commit_exception,
.drop_snapshot = persistent_drop_snapshot,
.fraction_full = persistent_fraction_full,
.status = persistent_status,
};
/*
 * Register both names of the persistent exception store.  If the
 * old-style name cannot be registered, the new-style registration is
 * rolled back.  Returns 0 or the registration error.
 */
int dm_persistent_snapshot_init(void)
{
	int r = dm_exception_store_type_register(&_persistent_type);

	if (r) {
		DMERR("Unable to register persistent exception store type");
		return r;
	}

	r = dm_exception_store_type_register(&_persistent_compat_type);
	if (!r)
		return 0;

	DMERR("Unable to register old-style persistent exception "
	      "store type");
	dm_exception_store_type_unregister(&_persistent_type);

	return r;
}
/*
* Unregister both persistent exception store types.
*/
void dm_persistent_snapshot_exit(void)
{
dm_exception_store_type_unregister(&_persistent_type);
dm_exception_store_type_unregister(&_persistent_compat_type);
}

View File

@@ -0,0 +1,150 @@
/*
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
* Copyright (C) 2006-2008 Red Hat GmbH
*
* This file is released under the GPL.
*/
#include "dm-exception-store.h"
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
#define DM_MSG_PREFIX "transient snapshot"
/*-----------------------------------------------------------------
* Implementation of the store for non-persistent snapshots.
*---------------------------------------------------------------*/
/*
* Private data of a transient store, kept in store->context.
*/
struct transient_c {
sector_t next_free; /* sector offset of the next unused chunk on the COW device */
};
/*
 * Destructor: release the transient_c allocated by transient_ctr().
 */
static void transient_dtr(struct dm_exception_store *store)
{
	struct transient_c *tc = store->context;

	kfree(tc);
}
/*
* A transient store reads nothing: 'callback' is never invoked and the
* function always returns 0.
*/
static int transient_read_metadata(struct dm_exception_store *store,
int (*callback)(void *callback_context,
chunk_t old, chunk_t new),
void *callback_context)
{
return 0;
}
static int transient_prepare_exception(struct dm_exception_store *store,
struct dm_snap_exception *e)
{
struct transient_c *tc = store->context;
sector_t size = get_dev_size(store->cow->bdev);
if (size < (tc->next_free + store->chunk_size))
return -1;
e->new_chunk = sector_to_chunk(store, tc->next_free);
tc->next_free += store->chunk_size;
return 0;
}
/*
* Nothing is written for a transient store, so the commit completes
* immediately: 'callback' is invoked with success = 1.
*/
static void transient_commit_exception(struct dm_exception_store *store,
struct dm_snap_exception *e,
void (*callback) (void *, int success),
void *callback_context)
{
/* Just succeed */
callback(callback_context, 1);
}
/*
 * Report usage of the COW device: sectors handed out so far over the
 * size of the device.
 */
static void transient_fraction_full(struct dm_exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	struct transient_c *tc = (struct transient_c *) store->context;

	*numerator = tc->next_free;
	*denominator = get_dev_size(store->cow->bdev);
}
/*
 * Constructor: allocate the private state, start handing out chunks at
 * sector 0 and attach the state to store->context.
 * Returns 0 or -ENOMEM.
 */
static int transient_ctr(struct dm_exception_store *store,
			 unsigned argc, char **argv)
{
	struct transient_c *tc = kmalloc(sizeof(*tc), GFP_KERNEL);

	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}
/*
 * Emit status text into 'result'.  Only STATUSTYPE_TABLE produces
 * output: the COW device name, the store type letter "N" and the chunk
 * size.  Returns the number of characters accounted for by DMEMIT.
 */
static unsigned transient_status(struct dm_exception_store *store,
				 status_type_t status, char *result,
				 unsigned maxlen)
{
	unsigned sz = 0;

	if (status == STATUSTYPE_TABLE) {
		DMEMIT(" %s N %llu", store->cow->name,
		       (unsigned long long)store->chunk_size);
	}

	return sz;
}
/*
* Exception store type "transient"; registered by
* dm_transient_snapshot_init().  No drop_snapshot operation is provided.
*/
static struct dm_exception_store_type _transient_type = {
.name = "transient",
.module = THIS_MODULE,
.ctr = transient_ctr,
.dtr = transient_dtr,
.read_metadata = transient_read_metadata,
.prepare_exception = transient_prepare_exception,
.commit_exception = transient_commit_exception,
.fraction_full = transient_fraction_full,
.status = transient_status,
};
/*
* Same operations as _transient_type, registered under the old-style
* one-letter name "N".
*/
static struct dm_exception_store_type _transient_compat_type = {
.name = "N",
.module = THIS_MODULE,
.ctr = transient_ctr,
.dtr = transient_dtr,
.read_metadata = transient_read_metadata,
.prepare_exception = transient_prepare_exception,
.commit_exception = transient_commit_exception,
.fraction_full = transient_fraction_full,
.status = transient_status,
};
/*
 * Register both names of the transient exception store.  If the
 * old-style name cannot be registered, the new-style registration is
 * rolled back.  Returns 0 or the registration error.
 */
int dm_transient_snapshot_init(void)
{
	int r = dm_exception_store_type_register(&_transient_type);

	if (r) {
		DMWARN("Unable to register transient exception store type");
		return r;
	}

	r = dm_exception_store_type_register(&_transient_compat_type);
	if (!r)
		return 0;

	DMWARN("Unable to register old-style transient "
	       "exception store type");
	dm_exception_store_type_unregister(&_transient_type);

	return r;
}
/*
* Unregister both transient exception store types.
*/
void dm_transient_snapshot_exit(void)
{
dm_exception_store_type_unregister(&_transient_type);
dm_exception_store_type_unregister(&_transient_compat_type);
}

1570
kernel/drivers/md/dm-snap.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,381 @@
/*
* Copyright (C) 2001-2003 Sistina Software (UK) Limited.
*
* This file is released under the GPL.
*/
#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/log2.h>
#define DM_MSG_PREFIX "striped"
#define DM_IO_ERROR_THRESHOLD 15
/*
* One leg of a striped target.
*/
struct stripe {
struct dm_dev *dev; /* underlying device, obtained with dm_get_device() */
sector_t physical_start; /* start offset on 'dev' given in the table line */
atomic_t error_count; /* I/O errors seen on this leg, see stripe_end_io() */
};
/*
* Per-target context of a striped mapping.  Allocated by alloc_context()
* with room for 'stripes' trailing struct stripe entries.
*/
struct stripe_c {
uint32_t stripes;
/* The size of this target / num. stripes */
sector_t stripe_width;
/* stripe chunk size */
uint32_t chunk_shift;
sector_t chunk_mask;
/* Needed for handling events */
struct dm_target *ti;
/* Work struct used for triggering events */
struct work_struct kstriped_ws;
struct stripe stripe[0];
};
/* Workqueue that runs trigger_event(); created in dm_stripe_init(). */
static struct workqueue_struct *kstriped;
/*
 * Work function queued by stripe_end_io() when a drive in a stripe
 * volume reports an I/O error: raises a table event for the owning
 * target.
 */
static void trigger_event(struct work_struct *work)
{
	struct stripe_c *ctx;

	ctx = container_of(work, struct stripe_c, kstriped_ws);
	dm_table_event(ctx->ti->table);
}
/*
 * Allocate a stripe_c with room for 'stripes' trailing stripe entries.
 * Returns NULL if the size check or the allocation fails.  The memory
 * is not zeroed.
 */
static inline struct stripe_c *alloc_context(unsigned int stripes)
{
	if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
			     stripes))
		return NULL;

	return kmalloc(sizeof(struct stripe_c) +
		       (sizeof(struct stripe) * stripes), GFP_KERNEL);
}
/*
* Parse a single <dev> <sector> pair
*/
static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
unsigned int stripe, char **argv)
{
unsigned long long start;
if (sscanf(argv[1], "%llu", &start) != 1)
return -EINVAL;
if (dm_get_device(ti, argv[0], start, sc->stripe_width,
dm_table_get_mode(ti->table),
&sc->stripe[stripe].dev))
return -ENXIO;
sc->stripe[stripe].physical_start = start;
return 0;
}
/*
* Construct a striped mapping.
* <number of stripes> <chunk size (a power of two)> [<dev_path> <offset>]+
*
* Validates the stripe count, the chunk size and the target length,
* allocates the context and acquires every stripe device.  On failure
* ti->error is set, any devices already acquired are released and a
* negative errno is returned.  On success the context is stored in
* ti->private and 0 is returned.
*/
static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct stripe_c *sc;
sector_t width;
uint32_t stripes;
uint32_t chunk_size;
char *end;
int r;
unsigned int i;
if (argc < 2) {
ti->error = "Not enough arguments";
return -EINVAL;
}
stripes = simple_strtoul(argv[0], &end, 10);
if (!stripes || *end) {
ti->error = "Invalid stripe count";
return -EINVAL;
}
chunk_size = simple_strtoul(argv[1], &end, 10);
if (*end) {
ti->error = "Invalid chunk_size";
return -EINVAL;
}
/*
* chunk_size must be a power of two and at least one page
*/
if (!is_power_of_2(chunk_size) ||
(chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
ti->error = "Invalid chunk size";
return -EINVAL;
}
if (ti->len & (chunk_size - 1)) {
ti->error = "Target length not divisible by "
"chunk size";
return -EINVAL;
}
width = ti->len;
/* width is divided in place; a non-zero result rejects the table. */
if (sector_div(width, stripes)) {
ti->error = "Target length not divisible by "
"number of stripes";
return -EINVAL;
}
/*
* Do we have enough arguments for that many stripes ?
*/
if (argc != (2 + 2 * stripes)) {
ti->error = "Not enough destinations "
"specified";
return -EINVAL;
}
sc = alloc_context(stripes);
if (!sc) {
ti->error = "Memory allocation for striped context "
"failed";
return -ENOMEM;
}
INIT_WORK(&sc->kstriped_ws, trigger_event);
/* Set pointer to dm target; used in trigger_event */
sc->ti = ti;
sc->stripes = stripes;
sc->stripe_width = width;
ti->split_io = chunk_size;
ti->num_flush_requests = stripes;
sc->chunk_mask = ((sector_t) chunk_size) - 1;
/* Compute log2(chunk_size); the loop consumes chunk_size. */
for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
chunk_size >>= 1;
sc->chunk_shift--;
/*
* Get the stripe destinations.
*/
for (i = 0; i < stripes; i++) {
argv += 2;
r = get_stripe(ti, sc, i, argv);
if (r < 0) {
ti->error = "Couldn't parse stripe destination";
/* Release the devices acquired by earlier iterations. */
while (i--)
dm_put_device(ti, sc->stripe[i].dev);
kfree(sc);
return r;
}
atomic_set(&(sc->stripe[i].error_count), 0);
}
ti->private = sc;
return 0;
}
/*
 * Destructor: release every stripe device, flush the kstriped workqueue
 * and free the context.
 */
static void stripe_dtr(struct dm_target *ti)
{
	struct stripe_c *sc = ti->private;
	unsigned int idx;

	for (idx = 0; idx < sc->stripes; idx++)
		dm_put_device(ti, sc->stripe[idx].dev);

	flush_workqueue(kstriped);
	kfree(sc);
}
static int stripe_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
struct stripe_c *sc = (struct stripe_c *) ti->private;
sector_t offset, chunk;
uint32_t stripe;
if (unlikely(bio_empty_barrier(bio))) {
BUG_ON(map_context->flush_request >= sc->stripes);
bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
return DM_MAPIO_REMAPPED;
}
offset = bio->bi_sector - ti->begin;
chunk = offset >> sc->chunk_shift;
stripe = sector_div(chunk, sc->stripes);
bio->bi_bdev = sc->stripe[stripe].dev->bdev;
bio->bi_sector = sc->stripe[stripe].physical_start +
(chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
return DM_MAPIO_REMAPPED;
}
/*
 * Stripe status:
 *
 * INFO
 * #stripes [stripe_name <stripe_name>] [group word count]
 * [error count 'A|D' <error count 'A|D'>]
 *
 * TABLE
 * #stripes [stripe chunk size]
 * [stripe_name physical_start <stripe_name physical_start>]
 *
 * The per-stripe 'A'/'D' characters are emitted directly into 'result'
 * instead of being staged in a variable-length array on the stack,
 * whose size would be set by the stripe count from the table.
 *
 * Always returns 0.
 */
static int stripe_status(struct dm_target *ti,
			 status_type_t type, char *result, unsigned int maxlen)
{
	struct stripe_c *sc = (struct stripe_c *) ti->private;
	unsigned int sz = 0;
	unsigned int i;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%d ", sc->stripes);
		for (i = 0; i < sc->stripes; i++) {
			DMEMIT("%s ", sc->stripe[i].dev->name);
		}
		DMEMIT("1 ");
		/* 'D' if the stripe has recorded any error, else 'A'. */
		for (i = 0; i < sc->stripes; i++) {
			DMEMIT("%c",
			       atomic_read(&(sc->stripe[i].error_count)) ?
			       'D' : 'A');
		}
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%d %llu", sc->stripes,
		       (unsigned long long)sc->chunk_mask + 1);
		for (i = 0; i < sc->stripes; i++) {
			DMEMIT(" %s %llu", sc->stripe[i].dev->name,
			       (unsigned long long)sc->stripe[i].physical_start);
		}
		break;
	}

	return 0;
}
static int stripe_end_io(struct dm_target *ti, struct bio *bio,
int error, union map_info *map_context)
{
unsigned i;
char major_minor[16];
struct stripe_c *sc = ti->private;
if (!error)
return 0; /* I/O complete */
if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
return error;
if (error == -EOPNOTSUPP)
return error;
memset(major_minor, 0, sizeof(major_minor));
sprintf(major_minor, "%d:%d",
MAJOR(disk_devt(bio->bi_bdev->bd_disk)),
MINOR(disk_devt(bio->bi_bdev->bd_disk)));
/*
* Test to see which stripe drive triggered the event
* and increment error count for all stripes on that device.
* If the error count for a given device exceeds the threshold
* value we will no longer trigger any further events.
*/
for (i = 0; i < sc->stripes; i++)
if (!strcmp(sc->stripe[i].dev->name, major_minor)) {
atomic_inc(&(sc->stripe[i].error_count));
if (atomic_read(&(sc->stripe[i].error_count)) <
DM_IO_ERROR_THRESHOLD)
queue_work(kstriped, &sc->kstriped_ws);
}
return error;
}
/*
 * Call 'fn' for each stripe device in order, stopping at the first
 * non-zero return value, which is passed back to the caller.  'fn' is
 * always called for stripe 0.
 */
static int stripe_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct stripe_c *sc = ti->private;
	unsigned idx = 0;
	int ret;

	for (;;) {
		ret = fn(ti, sc->stripe[idx].dev,
			 sc->stripe[idx].physical_start,
			 sc->stripe_width, data);
		if (ret)
			break;
		if (++idx >= sc->stripes)
			break;
	}

	return ret;
}
/*
 * Advertise I/O hints: minimum I/O size is one chunk (in bytes), optimal
 * I/O size is one chunk per stripe.
 */
static void stripe_io_hints(struct dm_target *ti,
			    struct queue_limits *limits)
{
	struct stripe_c *sc = ti->private;
	unsigned chunk_bytes = (sc->chunk_mask + 1) << 9;

	blk_limits_io_min(limits, chunk_bytes);
	blk_limits_io_opt(limits, chunk_bytes * sc->stripes);
}
/*
* Target type "striped"; registered by dm_stripe_init().
*/
static struct target_type stripe_target = {
.name = "striped",
.version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
.map = stripe_map,
.end_io = stripe_end_io,
.status = stripe_status,
.iterate_devices = stripe_iterate_devices,
.io_hints = stripe_io_hints,
};
/*
 * Register the striped target and create the "kstriped" workqueue used
 * for error events.  If the workqueue cannot be created the target is
 * unregistered again.  Returns the registration result, or -ENOMEM.
 */
int __init dm_stripe_init(void)
{
	int r = dm_register_target(&stripe_target);

	if (r < 0) {
		DMWARN("target registration failed");
		return r;
	}

	kstriped = create_singlethread_workqueue("kstriped");
	if (kstriped)
		return r;

	DMERR("failed to create workqueue kstriped");
	dm_unregister_target(&stripe_target);

	return -ENOMEM;
}
/*
 * Undo dm_stripe_init(): unregister the target, then destroy the
 * workqueue.
 */
void dm_stripe_exit(void)
{
	dm_unregister_target(&stripe_target);
	destroy_workqueue(kstriped);
}

View File

@@ -0,0 +1,108 @@
/*
* Copyright (C) 2008 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#include <linux/sysfs.h>
#include <linux/dm-ioctl.h>
#include "dm.h"
/*
* A sysfs attribute of a mapped device.  dm_attr_show() recovers this
* structure from the embedded 'attr' and calls 'show'.
*/
struct dm_sysfs_attr {
struct attribute attr;
ssize_t (*show)(struct mapped_device *, char *); /* fills the page buffer */
ssize_t (*store)(struct mapped_device *, char *); /* NULL for attributes made with DM_ATTR_RO */
};
/*
* Define a read-only (S_IRUGO) attribute dm_attr_<name> whose show
* handler is dm_attr_<name>_show and which has no store handler.
*/
#define DM_ATTR_RO(_name) \
struct dm_sysfs_attr dm_attr_##_name = \
__ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL)
/*
 * sysfs show entry point: look up the mapped device behind 'kobj' and
 * forward to the attribute's own show handler.
 *
 * Returns the handler's result, -EIO if the attribute has no show
 * handler, or -EINVAL if no mapped device is found.
 */
static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
			    char *page)
{
	struct dm_sysfs_attr *dm_attr =
		container_of(attr, struct dm_sysfs_attr, attr);
	struct mapped_device *md;
	ssize_t len;

	if (!dm_attr->show)
		return -EIO;

	md = dm_get_from_kobject(kobj);
	if (!md)
		return -EINVAL;

	len = dm_attr->show(md, page);

	/* Drop the reference obtained by the lookup above. */
	dm_put(md);

	return len;
}
/*
 * "name" attribute: the device name followed by a newline.
 * Returns the string length, or -EIO if the name cannot be copied.
 */
static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
{
	if (dm_copy_name_and_uuid(md, buf, NULL) != 0)
		return -EIO;

	strcat(buf, "\n");

	return strlen(buf);
}
/*
 * "uuid" attribute: the device UUID followed by a newline.
 * Returns the string length, or -EIO if the UUID cannot be copied.
 */
static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
{
	if (dm_copy_name_and_uuid(md, NULL, buf) != 0)
		return -EIO;

	strcat(buf, "\n");

	return strlen(buf);
}
/*
 * "suspended" attribute: the value of dm_suspended() as a decimal number
 * followed by a newline.  Returns the number of characters written.
 */
static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
{
	/* sprintf() returns the length of the string it produced. */
	return sprintf(buf, "%d\n", dm_suspended(md));
}
/* Read-only attributes exposed for each mapped device. */
static DM_ATTR_RO(name);
static DM_ATTR_RO(uuid);
static DM_ATTR_RO(suspended);
/* NULL-terminated list used as the default attributes of dm_ktype. */
static struct attribute *dm_attrs[] = {
&dm_attr_name.attr,
&dm_attr_uuid.attr,
&dm_attr_suspended.attr,
NULL,
};
/* Only a show operation is provided; no store operation is set. */
static struct sysfs_ops dm_sysfs_ops = {
.show = dm_attr_show,
};
/*
* The dm kobject is embedded in the mapped_device structure, so there is
* no need to define a release function here.
*/
static struct kobj_type dm_ktype = {
.sysfs_ops = &dm_sysfs_ops,
.default_attrs = dm_attrs,
};
/*
 * Initialize the kobject of 'md' and add it as "dm" below the kobject of
 * the device's disk.  Nobody is using md yet, so no explicit dm_get/put
 * is needed.  Returns the kobject_init_and_add() result.
 */
int dm_sysfs_init(struct mapped_device *md)
{
	struct kobject *parent = &disk_to_dev(dm_disk(md))->kobj;

	return kobject_init_and_add(dm_kobject(md), &dm_ktype, parent,
				    "%s", "dm");
}
/*
* Remove the kobject by dropping its reference; called after all other
* references have been removed.
*/
void dm_sysfs_exit(struct mapped_device *md)
{
kobject_put(dm_kobject(md));
}

1259
kernel/drivers/md/dm-table.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,150 @@
/*
* Copyright (C) 2001 Sistina Software (UK) Limited
*
* This file is released under the GPL.
*/
#include "dm.h"
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/bio.h>
#include <linux/slab.h>
#define DM_MSG_PREFIX "target"
/* List of registered target types. */
static LIST_HEAD(_targets);
/* Taken for reading by lookups and iteration, for writing by (un)registration. */
static DECLARE_RWSEM(_lock);
#define DM_MOD_NAME_SIZE 32
static inline struct target_type *__find_target_type(const char *name)
{
struct target_type *tt;
list_for_each_entry(tt, &_targets, list)
if (!strcmp(name, tt->name))
return tt;
return NULL;
}
/*
 * Find a target type by name and take a reference on its module.
 * Returns NULL if the type is not registered or the module reference
 * cannot be obtained.
 */
static struct target_type *get_target_type(const char *name)
{
	struct target_type *found;

	down_read(&_lock);

	found = __find_target_type(name);
	if (found != NULL) {
		if (!try_module_get(found->module))
			found = NULL;
	}

	up_read(&_lock);

	return found;
}
/*
* Request the module named "dm-<name>".  The result of request_module()
* is ignored; callers simply retry the lookup afterwards.
*/
static void load_module(const char *name)
{
request_module("dm-%s", name);
}
/*
 * Get a referenced target type by name.  If it is not registered yet, an
 * attempt is made to load the "dm-<name>" module and the lookup is
 * repeated once.  Returns NULL if the type is still unavailable.
 */
struct target_type *dm_get_target_type(const char *name)
{
	struct target_type *tt = get_target_type(name);

	if (tt)
		return tt;

	load_module(name);

	return get_target_type(name);
}
/*
* Drop the module reference taken by get_target_type().
*/
void dm_put_target_type(struct target_type *tt)
{
down_read(&_lock);
module_put(tt->module);
up_read(&_lock);
}
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
void *param), void *param)
{
struct target_type *tt;
down_read(&_lock);
list_for_each_entry(tt, &_targets, list)
iter_func(tt, param);
up_read(&_lock);
return 0;
}
/*
 * Add 'tt' to the list of target types.
 * Returns 0, or -EEXIST if a type with the same name is already
 * registered.
 */
int dm_register_target(struct target_type *tt)
{
	int rv;

	down_write(&_lock);

	if (__find_target_type(tt->name)) {
		rv = -EEXIST;
	} else {
		list_add(&tt->list, &_targets);
		rv = 0;
	}

	up_write(&_lock);

	return rv;
}
void dm_unregister_target(struct target_type *tt)
{
down_write(&_lock);
if (!__find_target_type(tt->name)) {
DMCRIT("Unregistering unrecognised target: %s", tt->name);
BUG();
}
list_del(&tt->list);
up_write(&_lock);
}
/*
 * io-err: always fails an io, useful for bringing
 * up LVs that have holes in them.
 */

/*
 * Constructor of the error target: takes no arguments and keeps no
 * state, so it always succeeds.
 */
static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
{
	return 0;
}
/*
* Destructor of the error target: there is nothing to release.
*/
static void io_err_dtr(struct dm_target *tt)
{
/* empty */
}
/*
 * Map function of the error target: every bio is failed with -EIO.
 */
static int io_err_map(struct dm_target *tt, struct bio *bio,
		      union map_info *map_context)
{
	return -EIO;
}
/*
* Target type "error"; registered by dm_target_init().
*/
static struct target_type error_target = {
.name = "error",
.version = {1, 0, 1},
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
};
/*
* Register the built-in "error" target.  Returns the
* dm_register_target() result.
*/
int __init dm_target_init(void)
{
return dm_register_target(&error_target);
}
/*
* Unregister the built-in "error" target.
*/
void dm_target_exit(void)
{
dm_unregister_target(&error_target);
}
EXPORT_SYMBOL(dm_register_target);
EXPORT_SYMBOL(dm_unregister_target);

View File

@@ -0,0 +1,221 @@
/*
* Device Mapper Uevent Support (dm-uevent)
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2007
* Author: Mike Anderson <andmike@linux.vnet.ibm.com>
*/
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/kobject.h>
#include <linux/dm-ioctl.h>
#include "dm.h"
#include "dm-uevent.h"
#define DM_MSG_PREFIX "uevent"
/*
* Maps each path event type to the kobject action and the DM_ACTION
* string used for it.  dm_path_uevent() indexes this table directly with
* the event type.
*/
static const struct {
enum dm_uevent_type type;
enum kobject_action action;
char *name;
} _dm_uevent_type_names[] = {
{DM_UEVENT_PATH_FAILED, KOBJ_CHANGE, "PATH_FAILED"},
{DM_UEVENT_PATH_REINSTATED, KOBJ_CHANGE, "PATH_REINSTATED"},
};
/* Slab cache for struct dm_uevent; created in dm_uevent_init(). */
static struct kmem_cache *_dm_event_cache;
/*
* A queued uevent for a mapped device.
*/
struct dm_uevent {
struct mapped_device *md; /* device the event belongs to */
enum kobject_action action; /* action passed to kobject_uevent_env() */
struct kobj_uevent_env ku_env; /* environment variables of the event */
struct list_head elist; /* link in the list of pending events */
char name[DM_NAME_LEN]; /* filled in by dm_send_uevents() */
char uuid[DM_UUID_LEN]; /* filled in by dm_send_uevents() */
};
/*
* Return an event to the slab cache.
*/
static void dm_uevent_free(struct dm_uevent *event)
{
kmem_cache_free(_dm_event_cache, event);
}
/*
 * Allocate a zeroed event for 'md' with GFP_ATOMIC.
 * Returns NULL if the allocation fails.
 */
static struct dm_uevent *dm_uevent_alloc(struct mapped_device *md)
{
	struct dm_uevent *ev;

	ev = kmem_cache_zalloc(_dm_event_cache, GFP_ATOMIC);
	if (ev) {
		INIT_LIST_HEAD(&ev->elist);
		ev->md = md;
	}

	return ev;
}
/*
* Allocate a path event and fill its environment with DM_TARGET,
* DM_ACTION, DM_SEQNUM, DM_PATH and DM_NR_VALID_PATHS.
*
* Returns the new event, or ERR_PTR(-ENOMEM) if the allocation or any
* add_uevent_var() call fails (the event is freed in that case).
*/
static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md,
struct dm_target *ti,
enum kobject_action action,
const char *dm_action,
const char *path,
unsigned nr_valid_paths)
{
struct dm_uevent *event;
event = dm_uevent_alloc(md);
if (!event) {
DMERR("%s: dm_uevent_alloc() failed", __func__);
goto err_nomem;
}
event->action = action;
if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) {
DMERR("%s: add_uevent_var() for DM_TARGET failed",
__func__);
goto err_add;
}
if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) {
DMERR("%s: add_uevent_var() for DM_ACTION failed",
__func__);
goto err_add;
}
if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u",
dm_next_uevent_seq(md))) {
DMERR("%s: add_uevent_var() for DM_SEQNUM failed",
__func__);
goto err_add;
}
if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) {
DMERR("%s: add_uevent_var() for DM_PATH failed", __func__);
goto err_add;
}
/* NOTE(review): nr_valid_paths is unsigned but is formatted with %d. */
if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d",
nr_valid_paths)) {
DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed",
__func__);
goto err_add;
}
return event;
err_add:
dm_uevent_free(event);
err_nomem:
return ERR_PTR(-ENOMEM);
}
/**
* dm_send_uevents - send uevents for given list
*
* @events: list of events to send
* @kobj: kobject generating event
*
* Each event is removed from the list, completed with DM_NAME and
* DM_UUID, sent with kobject_uevent_env() and then freed.  An event is
* freed without being sent if the name/uuid copy or either
* add_uevent_var() call fails.
*/
void dm_send_uevents(struct list_head *events, struct kobject *kobj)
{
int r;
struct dm_uevent *event, *next;
list_for_each_entry_safe(event, next, events, elist) {
list_del_init(&event->elist);
/*
* When a device is being removed this copy fails and we
* discard these unsent events.
*/
if (dm_copy_name_and_uuid(event->md, event->name,
event->uuid)) {
DMINFO("%s: skipping sending uevent for lost device",
__func__);
goto uevent_free;
}
if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) {
DMERR("%s: add_uevent_var() for DM_NAME failed",
__func__);
goto uevent_free;
}
if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) {
DMERR("%s: add_uevent_var() for DM_UUID failed",
__func__);
goto uevent_free;
}
r = kobject_uevent_env(kobj, event->action, event->ku_env.envp);
if (r)
DMERR("%s: kobject_uevent_env failed", __func__);
/* Reached on both the success and the failure paths of this iteration. */
uevent_free:
dm_uevent_free(event);
}
}
EXPORT_SYMBOL_GPL(dm_send_uevents);
/**
 * dm_path_uevent - build a path event and queue it on the device
 *
 * @event_type:	path event type enum
 * @ti:			pointer to a dm_target
 * @path:		string containing pathname
 * @nr_valid_paths:	number of valid paths remaining
 *
 * An out-of-range @event_type is logged and ignored; a failure to
 * build the event is silently dropped.  dm_put() is called on the
 * device obtained from dm_table_get_md() on every path (presumably
 * balancing a reference taken there - confirm against dm-table.c).
 */
void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
		    const char *path, unsigned nr_valid_paths)
{
	struct mapped_device *md = dm_table_get_md(ti->table);
	struct dm_uevent *event;

	if (event_type < ARRAY_SIZE(_dm_uevent_type_names)) {
		event = dm_build_path_uevent(md, ti,
				_dm_uevent_type_names[event_type].action,
				_dm_uevent_type_names[event_type].name,
				path, nr_valid_paths);
		if (!IS_ERR(event))
			dm_uevent_add(md, &event->elist);
	} else {
		DMERR("%s: Invalid event_type %d", __func__, event_type);
	}

	dm_put(md);
}
EXPORT_SYMBOL_GPL(dm_path_uevent);
/*
 * Create the slab cache used for struct dm_uevent and announce the
 * module version.  Returns 0 on success, -ENOMEM if the cache could
 * not be created.
 */
int dm_uevent_init(void)
{
	_dm_event_cache = KMEM_CACHE(dm_uevent, 0);
	if (_dm_event_cache) {
		DMINFO("version 1.0.3");
		return 0;
	}

	return -ENOMEM;
}
/* Destroy the slab cache created by dm_uevent_init(). */
void dm_uevent_exit(void)
{
kmem_cache_destroy(_dm_event_cache);
}

View File

@@ -0,0 +1,59 @@
/*
* Device Mapper Uevent Support
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2007
* Author: Mike Anderson <andmike@linux.vnet.ibm.com>
*/
#ifndef DM_UEVENT_H
#define DM_UEVENT_H
/* Kinds of path event that can be passed to dm_path_uevent(). */
enum dm_uevent_type {
DM_UEVENT_PATH_FAILED,
DM_UEVENT_PATH_REINSTATED,
};
/*
 * With CONFIG_DM_UEVENT the functions below are real externals;
 * without it they collapse to empty inline stubs so that callers need
 * no #ifdef of their own.
 */
#ifdef CONFIG_DM_UEVENT
extern int dm_uevent_init(void);
extern void dm_uevent_exit(void);
extern void dm_send_uevents(struct list_head *events, struct kobject *kobj);
extern void dm_path_uevent(enum dm_uevent_type event_type,
struct dm_target *ti, const char *path,
unsigned nr_valid_paths);
#else
/* Stub: nothing to set up, always reports success. */
static inline int dm_uevent_init(void)
{
return 0;
}
/* Stub: nothing to tear down. */
static inline void dm_uevent_exit(void)
{
}
/* Stub: the list is left untouched. */
static inline void dm_send_uevents(struct list_head *events,
struct kobject *kobj)
{
}
/* Stub: no event is created or queued. */
static inline void dm_path_uevent(enum dm_uevent_type event_type,
struct dm_target *ti, const char *path,
unsigned nr_valid_paths)
{
}
#endif /* CONFIG_DM_UEVENT */
#endif /* DM_UEVENT_H */

View File

@@ -0,0 +1,80 @@
/*
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
*
* This file is released under the GPL.
*/
#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#define DM_MSG_PREFIX "zero"
/*
 * Constructor of the "zero" target.  The target takes no arguments:
 * any argument makes construction fail with -EINVAL and an
 * explanatory ti->error.
 */
static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	if (argc == 0)
		return 0;

	ti->error = "No arguments required";
	return -EINVAL;
}
/*
 * Map function of the "zero" target.
 *
 * Readahead is refused with -EIO (readahead of null bytes only wastes
 * buffer cache).  Plain reads get their pages zero-filled, everything
 * else (i.e. writes) is silently dropped; in both cases the bio is
 * completed here and DM_MAPIO_SUBMITTED is returned so that no new
 * request is made for it.
 */
static int zero_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	if (bio_rw(bio) == READA)
		return -EIO;

	if (bio_rw(bio) == READ)
		zero_fill_bio(bio);

	bio_endio(bio, 0);

	return DM_MAPIO_SUBMITTED;
}
/*
 * Target type descriptor registered by dm_zero_init(); only a
 * constructor and a map function are provided.
 */
static struct target_type zero_target = {
.name = "zero",
.version = {1, 0, 0},
.module = THIS_MODULE,
.ctr = zero_ctr,
.map = zero_map,
};
/*
 * Module entry point: register the "zero" target.  The result of
 * dm_register_target() is returned unchanged; a negative result is
 * logged first.
 */
static int __init dm_zero_init(void)
{
	int r;

	r = dm_register_target(&zero_target);
	if (r >= 0)
		return r;

	DMERR("register failed %d", r);
	return r;
}
/* Module exit point: unregister the "zero" target. */
static void __exit dm_zero_exit(void)
{
dm_unregister_target(&zero_target);
}
module_init(dm_zero_init)
module_exit(dm_zero_exit)
MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros");
MODULE_LICENSE("GPL");

2707
kernel/drivers/md/dm.c Normal file

File diff suppressed because it is too large Load Diff

130
kernel/drivers/md/dm.h Normal file
View File

@@ -0,0 +1,130 @@
/*
* Internal header file for device mapper
*
* Copyright (C) 2001, 2002 Sistina Software
* Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
*
* This file is released under the LGPL.
*/
#ifndef DM_INTERNAL_H
#define DM_INTERNAL_H
#include <linux/fs.h>
#include <linux/device-mapper.h>
#include <linux/list.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
/*
* Suspend feature flags
*/
#define DM_SUSPEND_LOCKFS_FLAG (1 << 0)
#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
/*
* Type of table and mapped_device's mempool
*/
#define DM_TYPE_NONE 0
#define DM_TYPE_BIO_BASED 1
#define DM_TYPE_REQUEST_BASED 2
/*
 * List of devices that a metadevice uses and should open/close.
 * Wraps the public struct dm_dev with bookkeeping that stays private
 * to the device-mapper core.
 */
struct dm_dev_internal {
struct list_head list; /* linkage on the owning device list */
atomic_t count; /* presumably a use count for this entry - confirm in dm-table.c */
struct dm_dev dm_dev; /* the public part of the device description */
};
struct dm_table;
struct dm_md_mempools;
/*-----------------------------------------------------------------
* Internal table functions.
*---------------------------------------------------------------*/
void dm_table_destroy(struct dm_table *t);
void dm_table_event_callback(struct dm_table *t,
void (*fn)(void *), void *context);
struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits);
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits);
struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
int dm_table_any_busy_target(struct dm_table *t);
int dm_table_set_type(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
int dm_table_alloc_md_mempools(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
/*
 * To check the return value from dm_table_find_target().
 * Evaluates to the target's 'table' pointer, so it is true only when
 * that pointer is non-NULL.
 */
#define dm_target_is_valid(t) ((t)->table)
/*
 * To check whether the target type is request-based or not (bio-based).
 * True when the target's type provides a map_rq method.
 */
#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
/*-----------------------------------------------------------------
* A registry of target types.
*---------------------------------------------------------------*/
int dm_target_init(void);
void dm_target_exit(void);
struct target_type *dm_get_target_type(const char *name);
void dm_put_target_type(struct target_type *tt);
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
void *param), void *param);
int dm_split_args(int *argc, char ***argvp, char *input);
/*
* The device-mapper can be driven through one of two interfaces:
* ioctl or filesystem, depending on which patch you have applied.
*/
int dm_interface_init(void);
void dm_interface_exit(void);
/*
* sysfs interface
*/
int dm_sysfs_init(struct mapped_device *md);
void dm_sysfs_exit(struct mapped_device *md);
struct kobject *dm_kobject(struct mapped_device *md);
struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
/*
* Targets for linear and striped mappings
*/
int dm_linear_init(void);
void dm_linear_exit(void);
int dm_stripe_init(void);
void dm_stripe_exit(void);
int dm_open_count(struct mapped_device *md);
int dm_lock_for_deletion(struct mapped_device *md);
void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
unsigned cookie);
int dm_kcopyd_init(void);
void dm_kcopyd_exit(void);
/*
* Mempool operations
*/
struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
void dm_free_md_mempools(struct dm_md_mempools *pools);
#endif

365
kernel/drivers/md/faulty.c Normal file
View File

@@ -0,0 +1,365 @@
/*
* faulty.c : Multiple Devices driver for Linux
*
* Copyright (C) 2004 Neil Brown
*
* faulty-device-simulator personality for md
*
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* The "faulty" personality causes some requests to fail.
*
* Possible failure modes are:
* reads fail "randomly" but succeed on retry
* writes fail "randomly" but succeed on retry
* reads for some address fail and then persist until a write
* reads for some address fail and then persist irrespective of write
* writes for some address fail and persist
* all writes fail
*
* Different modes can be active at a time, but only
* one can be set at array creation. Others can be added later.
* A mode can be one-shot or recurrent with the recurrence being
* once in every N requests.
* The bottom 5 bits of the "layout" indicate the mode. The
* remainder indicate a period, or 0 for one-shot.
*
* There is an implementation limit on the number of concurrently
* persisting-faulty blocks. When a new fault is requested that would
* exceed the limit, it is ignored.
* All current faults can be cleared using a layout of "0".
*
* Requests are always sent to the device. If they are to fail,
* we clone the bio and insert a new b_end_io into the chain.
*/
/*
 * Failure modes.  Values below 'Modes' index the period[] and
 * counters[] arrays of the configuration and can be selected through
 * the low bits of the layout.
 */
#define WriteTransient 0
#define ReadTransient 1
#define WritePersistent 2
#define ReadPersistent 3
#define WriteAll 4 /* doesn't go to device */
#define ReadFixable 5
#define Modes 6
/* Layout-only commands handled by reshape(): reset counters / forget faults. */
#define ClearErrors 31
#define ClearFaults 30
/* Values stored only in the per-fault modes[] array. */
#define AllPersist 100 /* internal use only */
#define NoPersist 101
/* layout = (count << ModeShift) | mode, with mode taken through ModeMask */
#define ModeMask 0x1f
#define ModeShift 5
/* Capacity of the faults[] / modes[] arrays */
#define MaxFault 50
#include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include "md.h"
#include <linux/seq_file.h>
/*
 * Completion handler installed on the clone made by make_request()
 * for a request that has to fail.  The clone's size and sector are
 * copied back into the original bio (found in bi_private), the clone
 * is dropped, and the original is completed with an I/O error
 * whatever @error says.
 */
static void faulty_fail(struct bio *bio, int error)
{
	struct bio *parent = bio->bi_private;

	parent->bi_sector = bio->bi_sector;
	parent->bi_size = bio->bi_size;

	bio_put(bio);
	bio_io_error(parent);
}
/* Per-array state of the "faulty" personality. */
typedef struct faulty_conf {
int period[Modes]; /* value reloaded into counters[] when a mode fires; 0 = one-shot */
atomic_t counters[Modes]; /* countdown until the mode fires next */
sector_t faults[MaxFault]; /* sectors with a remembered fault */
int modes[MaxFault]; /* fault mode of the matching faults[] entry */
int nfaults; /* number of faults[] / modes[] slots in use */
mdk_rdev_t *rdev; /* device that requests are forwarded to */
} conf_t;
/*
 * Decide whether failure mode @mode fires for the current request.
 *
 * A mode with no period and an exhausted counter is inactive and is
 * left untouched.  Otherwise the counter is decremented; when it
 * reaches zero the mode fires (return 1) and, for a recurrent mode,
 * the counter is reloaded from the period.  Returns 0 in all other
 * cases.
 */
static int check_mode(conf_t *conf, int mode)
{
	atomic_t *counter = &conf->counters[mode];

	/* no failure, no decrement */
	if (!conf->period[mode] && atomic_read(counter) <= 0)
		return 0;

	if (!atomic_dec_and_test(counter))
		return 0;

	if (conf->period[mode])
		atomic_set(counter, conf->period[mode]);

	return 1;
}
/*
 * Look for a remembered fault inside [@start, @end) and report whether
 * a request in direction @dir must fail because of it.
 *
 * Only the first recorded fault inside the range is considered.  A
 * write over a ReadFixable fault "fixes" it: the slot is turned into
 * NoPersist and the write succeeds.  Returns 1 to fail the request,
 * 0 otherwise.
 */
static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
{
	int i;

	for (i = 0; i < conf->nfaults; i++) {
		if (conf->faults[i] < start || conf->faults[i] >= end)
			continue;

		/* found it ... */
		switch (conf->modes[i] * 2 + dir) {
		case WritePersistent*2+WRITE:
		case ReadPersistent*2+READ:
		case ReadFixable*2+READ:
		case AllPersist*2+READ:
		case AllPersist*2+WRITE:
			return 1;
		case ReadFixable*2+WRITE:
			/* If we find a ReadFixable sector, we fix it ... */
			conf->modes[i] = NoPersist;
			return 0;
		default:
			return 0;
		}
	}

	return 0;
}
/*
 * Remember that sector @start is faulty in mode @mode.
 *
 * If the sector is already recorded, the existing entry is updated:
 * combining a read fault with a write fault yields AllPersist.
 * Otherwise the fault is stored in a slot whose mode is NoPersist if
 * one exists (the last such slot seen wins), else appended.  When the
 * table is full (MaxFault entries) the new fault is ignored.
 */
static void add_sector(conf_t *conf, sector_t start, int mode)
{
int i;
/* slot to use for a new entry; defaults to "append" */
int n = conf->nfaults;
for (i=0; i<conf->nfaults; i++)
if (conf->faults[i] == start) {
switch(mode) {
case NoPersist: conf->modes[i] = mode; return;
case WritePersistent:
if (conf->modes[i] == ReadPersistent ||
conf->modes[i] == ReadFixable)
conf->modes[i] = AllPersist;
else
conf->modes[i] = WritePersistent;
return;
case ReadPersistent:
if (conf->modes[i] == WritePersistent)
conf->modes[i] = AllPersist;
else
conf->modes[i] = ReadPersistent;
return;
case ReadFixable:
if (conf->modes[i] == WritePersistent ||
conf->modes[i] == ReadPersistent)
conf->modes[i] = AllPersist;
else
conf->modes[i] = ReadFixable;
return;
}
/* any other mode leaves the switch and the scan continues */
} else if (conf->modes[i] == NoPersist)
n = i;
/* table full: silently drop the new fault */
if (n >= MaxFault)
return;
conf->faults[n] = start;
conf->modes[n] = mode;
/* only an append grows the table; slot reuse leaves nfaults alone */
if (conf->nfaults == n)
conf->nfaults = n+1;
}
/*
 * Request entry point of the "faulty" personality.
 *
 * Decides whether @bio has to fail, based on the remembered faulty
 * sectors (check_sector) and on the per-mode counters (check_mode,
 * which decrements them as a side effect, so every applicable mode is
 * always evaluated).
 *
 * Returns 1 after redirecting @bio to the backing device when the
 * request may proceed normally, and 0 when the request has been
 * handled here: either completed at once with -EIO (WriteAll) or
 * replaced by a clone whose completion handler, faulty_fail(), fails
 * the original.
 */
static int make_request(struct request_queue *q, struct bio *bio)
{
mddev_t *mddev = q->queuedata;
conf_t *conf = (conf_t*)mddev->private;
int failit = 0;
if (bio_data_dir(bio) == WRITE) {
/* write request */
if (atomic_read(&conf->counters[WriteAll])) {
/* special case - don't decrement, don't generic_make_request,
* just fail immediately
*/
bio_endio(bio, -EIO);
return 0;
}
if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
WRITE))
failit = 1;
if (check_mode(conf, WritePersistent)) {
/* remember the sector so that later writes keep failing */
add_sector(conf, bio->bi_sector, WritePersistent);
failit = 1;
}
if (check_mode(conf, WriteTransient))
failit = 1;
} else {
/* read request */
if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
READ))
failit = 1;
if (check_mode(conf, ReadTransient))
failit = 1;
if (check_mode(conf, ReadPersistent)) {
add_sector(conf, bio->bi_sector, ReadPersistent);
failit = 1;
}
if (check_mode(conf, ReadFixable)) {
add_sector(conf, bio->bi_sector, ReadFixable);
failit = 1;
}
}
if (failit) {
/*
 * Send a clone to the device; faulty_fail() fails the original
 * once the clone completes.
 * NOTE(review): the result of bio_clone() is used unchecked -
 * confirm it cannot return NULL for GFP_NOIO here.
 */
struct bio *b = bio_clone(bio, GFP_NOIO);
b->bi_bdev = conf->rdev->bdev;
b->bi_private = bio;
b->bi_end_io = faulty_fail;
generic_make_request(b);
return 0;
} else {
bio->bi_bdev = conf->rdev->bdev;
return 1;
}
}
/*
 * Print the state of the personality into @seq: one "Name=counter(period)"
 * item per periodic mode whose counter is non-zero, " WriteAll" when
 * that mode is armed, and finally the number of recorded faults.
 */
static void status(struct seq_file *seq, mddev_t *mddev)
{
	/* periodic modes in the order in which they are reported */
	static const struct {
		int mode;
		const char *fmt;
	} periodic[] = {
		{ WriteTransient,  " WriteTransient=%d(%d)" },
		{ ReadTransient,   " ReadTransient=%d(%d)" },
		{ WritePersistent, " WritePersistent=%d(%d)" },
		{ ReadPersistent,  " ReadPersistent=%d(%d)" },
		{ ReadFixable,     " ReadFixable=%d(%d)" },
	};
	conf_t *conf = (conf_t*)mddev->private;
	unsigned int k;
	int n;

	for (k = 0; k < sizeof(periodic) / sizeof(periodic[0]); k++) {
		n = atomic_read(&conf->counters[periodic[k].mode]);
		if (n != 0)
			seq_printf(seq, periodic[k].fmt,
				   n, conf->period[periodic[k].mode]);
	}

	if (atomic_read(&conf->counters[WriteAll]) != 0)
		seq_printf(seq, " WriteAll");

	seq_printf(seq, " nfaults=%d", conf->nfaults);
}
/*
 * Apply the command encoded in mddev->new_layout.
 *
 * The low ModeShift bits select the mode, the remaining bits give the
 * period.  ClearFaults forgets all recorded faults, ClearErrors resets
 * every period and counter, and a regular mode is armed with the given
 * period (a period of 0 arms it for a single shot).  Any other mode is
 * rejected with -EINVAL.  A negative new_layout means "nothing to do".
 * After a successful change both layout fields are set to -1 so that
 * further changes come through.
 */
static int reshape(mddev_t *mddev)
{
	conf_t *conf = mddev->private;
	int mode, count, i;

	if (mddev->new_layout < 0)
		return 0;

	mode = mddev->new_layout & ModeMask;
	count = mddev->new_layout >> ModeShift;

	switch (mode) {
	case ClearFaults:
		conf->nfaults = 0;
		break;
	case ClearErrors:
		for (i = 0; i < Modes; i++) {
			conf->period[i] = 0;
			atomic_set(&conf->counters[i], 0);
		}
		break;
	default:
		if (mode >= Modes)
			return -EINVAL;
		conf->period[mode] = count;
		/* a one-shot mode still has to fire once */
		atomic_set(&conf->counters[mode], count ? count : 1);
		break;
	}

	mddev->new_layout = -1;
	mddev->layout = -1; /* makes sure further changes come through */
	return 0;
}
/*
 * Size callback: return @sectors if it is non-zero, otherwise the
 * array's dev_sectors.  A non-zero @raid_disks triggers a one-time
 * warning because generic reshape is not supported.
 */
static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	WARN_ONCE(raid_disks,
		  "%s does not support generic reshape\n", __func__);

	return sectors ? sectors : mddev->dev_sectors;
}
/*
 * Start the personality on @mddev: allocate the configuration with all
 * modes disabled and no recorded faults, pick the backing device, set
 * the array size and apply the initial layout through reshape().
 *
 * Returns 0 on success, -EINVAL if md_check_no_bitmap() objects and
 * -ENOMEM if the configuration cannot be allocated.
 */
static int run(mddev_t *mddev)
{
	conf_t *conf;
	mdk_rdev_t *dev;
	int mode;

	if (md_check_no_bitmap(mddev))
		return -EINVAL;

	conf = kmalloc(sizeof(*conf), GFP_KERNEL);
	if (conf == NULL)
		return -ENOMEM;

	conf->nfaults = 0;
	for (mode = 0; mode < Modes; mode++) {
		conf->period[mode] = 0;
		atomic_set(&conf->counters[mode], 0);
	}

	/* each member overwrites the previous one: the last in the list wins */
	list_for_each_entry(dev, &mddev->disks, same_set)
		conf->rdev = dev;

	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
	mddev->private = conf;

	reshape(mddev);

	return 0;
}
/* Stop the personality: free the configuration and clear the pointer. */
static int stop(mddev_t *mddev)
{
	kfree(mddev->private);
	mddev->private = NULL;

	return 0;
}
/*
 * Personality descriptor registered with the md core.  Layout changes
 * arrive through check_reshape and are handled by reshape().
 */
static struct mdk_personality faulty_personality =
{
.name = "faulty",
.level = LEVEL_FAULTY,
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
.stop = stop,
.status = status,
.check_reshape = reshape,
.size = faulty_size,
};
/* Module entry point: register the "faulty" personality with md. */
static int __init raid_init(void)
{
return register_md_personality(&faulty_personality);
}
/* Module exit point: unregister the "faulty" personality. */
static void raid_exit(void)
{
unregister_md_personality(&faulty_personality);
}
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-10"); /* faulty */
MODULE_ALIAS("md-faulty");
MODULE_ALIAS("md-level--5");

390
kernel/drivers/md/linear.c Normal file
View File

@@ -0,0 +1,390 @@
/*
linear.c : Multiple Devices driver for Linux
Copyright (C) 1994-96 Marc ZYNGIER
<zyngier@ufr-info-p7.ibp.fr> or
<maz@gloups.fdn.fr>
Linear mode management functions.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
You should have received a copy of the GNU General Public License
(for example /usr/src/linux/COPYING); if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include "md.h"
#include "linear.h"
/*
 * Find the member device holding @sector: binary search for the first
 * disk whose end_sector lies beyond @sector.  If no such disk exists
 * the last one is returned, so callers must range-check the result.
 * The configuration is read through rcu_dereference().
 */
static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
{
	int first = 0;
	int last = mddev->raid_disks - 1;
	linear_conf_t *conf = rcu_dereference(mddev->private);

	while (first < last) {
		int probe = (first + last) / 2;

		if (conf->disks[probe].end_sector > sector)
			last = probe;
		else
			first = probe + 1;
	}

	return &conf->disks[first];
}
/**
* linear_mergeable_bvec -- tell bio layer if two requests can be merged
* @q: request queue
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can take at this offset
*
* The limit is the distance from the end of the bio described by @bvm
* to the end of the member device holding its first sector, so that a
* bio never grows across a device boundary.  An empty bio is always
* allowed to take its first bio_vec.
*/
static int linear_mergeable_bvec(struct request_queue *q,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
mddev_t *mddev = q->queuedata;
dev_info_t *dev0;
unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
rcu_read_lock();
dev0 = which_dev(mddev, sector);
/* sectors left on this device, counted from the start of the bio */
maxsectors = dev0->end_sector - sector;
rcu_read_unlock();
/* subtract what the bio already holds, clamping at zero */
if (maxsectors < bio_sectors)
maxsectors = 0;
else
maxsectors -= bio_sectors;
/* an empty bio must be able to accept at least one bio_vec */
if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
return biovec->bv_len;
/* The bytes available at this offset could be really big,
* so we cap at 2^31 to avoid overflow */
/* NOTE(review): 1<<31 does not fit in a signed int - confirm how callers compare the result */
if (maxsectors > (1 << (31-9)))
return 1<<31;
return maxsectors << 9;
}
/*
 * Unplug callback of the array queue: unplug the queue of every
 * member device.  The configuration is walked under rcu_read_lock().
 */
static void linear_unplug(struct request_queue *q)
{
	mddev_t *mddev = q->queuedata;
	linear_conf_t *conf;
	int disk;

	rcu_read_lock();
	conf = rcu_dereference(mddev->private);
	for (disk = 0; disk < mddev->raid_disks; disk++)
		blk_unplug(bdev_get_queue(conf->disks[disk].rdev->bdev));
	rcu_read_unlock();
}
/*
 * Congestion callback: returns 1 when mddev_congested() says so,
 * otherwise the bdi_congested() result of the first member device
 * that reports congestion for @bits, or 0 if none does.
 */
static int linear_congested(void *data, int bits)
{
	mddev_t *mddev = data;
	linear_conf_t *conf;
	int congested = 0;
	int disk;

	if (mddev_congested(mddev, bits))
		return 1;

	rcu_read_lock();
	conf = rcu_dereference(mddev->private);
	for (disk = 0; disk < mddev->raid_disks; disk++) {
		struct request_queue *rq =
			bdev_get_queue(conf->disks[disk].rdev->bdev);

		congested = bdi_congested(&rq->backing_dev_info, bits);
		if (congested)
			break;
	}
	rcu_read_unlock();

	return congested;
}
/*
 * Size callback: return the array size recorded in the current
 * configuration.  Generic reshape is not supported, so a non-zero
 * @sectors or @raid_disks only triggers a one-time warning.
 */
static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	linear_conf_t *conf;
	sector_t size;

	WARN_ONCE(sectors || raid_disks,
		  "%s does not support generic reshape\n", __func__);

	rcu_read_lock();
	conf = rcu_dereference(mddev->private);
	size = conf->array_sectors;
	rcu_read_unlock();

	return size;
}
/*
 * Build a configuration for @raid_disks member devices from the disks
 * currently attached to @mddev.
 *
 * Every disk is placed in the slot given by its raid_disk number, its
 * size is rounded down to a multiple of the chunk size (if one is
 * set), its queue limits are stacked onto the array and its size is
 * added to the total.  Afterwards the cumulative end sector of each
 * slot is computed.
 *
 * Returns the new configuration, or NULL if allocation fails, a slot
 * number is out of range or used twice, or the number of disks found
 * differs from @raid_disks.
 */
static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
{
	linear_conf_t *conf;
	mdk_rdev_t *rdev;
	int found = 0;
	int i;

	conf = kzalloc(sizeof(*conf) + raid_disks * sizeof(dev_info_t),
		       GFP_KERNEL);
	if (!conf)
		return NULL;

	conf->array_sectors = 0;

	list_for_each_entry(rdev, &mddev->disks, same_set) {
		int slot = rdev->raid_disk;

		if (slot < 0 || slot >= raid_disks || conf->disks[slot].rdev) {
			printk("linear: disk numbering problem. Aborting!\n");
			goto out;
		}
		conf->disks[slot].rdev = rdev;

		if (mddev->chunk_sectors) {
			/* round the device down to a whole number of chunks */
			sector_t chunks = rdev->sectors;

			sector_div(chunks, mddev->chunk_sectors);
			rdev->sectors = chunks * mddev->chunk_sectors;
		}

		disk_stack_limits(mddev->gendisk, rdev->bdev,
				  rdev->data_offset << 9);
		/* as we don't honour merge_bvec_fn, we must never risk
		 * violating it, so limit max_phys_segments to 1 lying within
		 * a single page.
		 */
		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
			blk_queue_max_phys_segments(mddev->queue, 1);
			blk_queue_segment_boundary(mddev->queue,
						   PAGE_CACHE_SIZE - 1);
		}

		conf->array_sectors += rdev->sectors;
		found++;
	}

	if (found != raid_disks) {
		printk("linear: not enough drives present. Aborting!\n");
		goto out;
	}

	/*
	 * Here we calculate the device offsets.
	 */
	conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
	for (i = 1; i < raid_disks; i++)
		conf->disks[i].end_sector = conf->disks[i - 1].end_sector +
					    conf->disks[i].rdev->sectors;

	return conf;

out:
	kfree(conf);
	return NULL;
}
/*
 * Start a linear array: build the configuration, publish it in
 * mddev->private, set the array size and install the queue callbacks
 * (merge_bvec, unplug, congestion).
 *
 * Returns 0 on success, -EINVAL if md_check_no_bitmap() objects, and
 * 1 (not a negative errno) when the configuration cannot be built.
 */
static int linear_run (mddev_t *mddev)
{
	struct request_queue *queue;
	linear_conf_t *conf;

	if (md_check_no_bitmap(mddev))
		return -EINVAL;

	queue = mddev->queue;
	queue->queue_lock = &queue->__queue_lock;

	conf = linear_conf(mddev, mddev->raid_disks);
	if (conf == NULL)
		return 1;

	mddev->private = conf;
	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));

	blk_queue_merge_bvec(queue, linear_mergeable_bvec);
	queue->unplug_fn = linear_unplug;
	queue->backing_dev_info.congested_fn = linear_congested;
	queue->backing_dev_info.congested_data = mddev;
	md_integrity_register(mddev);

	return 0;
}
/*
 * RCU callback queued by linear_add(): free the configuration that
 * embeds @head.
 */
static void free_conf(struct rcu_head *head)
{
	kfree(container_of(head, linear_conf_t, rcu));
}
/*
 * Hot-add callback: grow the array by one device.
 *
 * Returns 0 on success, -EINVAL if @rdev's saved_raid_disk is not the
 * next free slot, and -ENOMEM if a new configuration cannot be built.
 */
static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
{
/* Adding a drive to a linear array allows the array to grow.
* It is permitted if the new drive has a matching superblock
* already on it, with raid_disk equal to raid_disks.
* It is achieved by creating a new linear_private_data structure
* and swapping it in in-place of the current one.
* The old structure is not freed directly: it is handed to
* call_rcu() and released by free_conf(), so that readers still
* using it under rcu_read_lock() are not disturbed.
*/
linear_conf_t *newconf, *oldconf;
if (rdev->saved_raid_disk != mddev->raid_disks)
return -EINVAL;
rdev->raid_disk = rdev->saved_raid_disk;
newconf = linear_conf(mddev,mddev->raid_disks+1);
if (!newconf)
return -ENOMEM;
oldconf = rcu_dereference(mddev->private);
/*
 * NOTE(review): raid_disks is raised before the new configuration is
 * published; confirm that a concurrent which_dev() cannot pair the
 * new count with the old, shorter disks[] array.
 */
mddev->raid_disks++;
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
call_rcu(&oldconf->rcu, free_conf);
return 0;
}
/*
 * Stop the array and free its configuration.  Always returns 0.
 */
static int linear_stop (mddev_t *mddev)
{
linear_conf_t *conf = mddev->private;
/*
* We do not require rcu protection here since
* we hold reconfig_mutex for both linear_add and
* linear_stop, so they cannot race.
* We should make sure any old 'conf's are properly
* freed though.
*/
/* wait for free_conf() callbacks queued by linear_add() to finish */
rcu_barrier();
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
kfree(conf);
return 0;
}
/*
 * Request entry point of the linear personality.
 *
 * Barrier requests are rejected with -EOPNOTSUPP.  Otherwise the bio
 * is accounted, the member device holding its first sector is looked
 * up, and the bio is either
 *   - failed, if that sector lies outside the device found,
 *   - split in two at the device boundary, each half being passed
 *     through this function again, or
 *   - remapped onto the member device.
 *
 * Returns 1 when @bio has been remapped and still has to be
 * submitted, 0 when it has been fully handled here.
 */
static int linear_make_request (struct request_queue *q, struct bio *bio)
{
const int rw = bio_data_dir(bio);
mddev_t *mddev = q->queuedata;
dev_info_t *tmp_dev;
sector_t start_sector;
int cpu;
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
/* per-array I/O statistics */
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
bio_sectors(bio));
part_stat_unlock();
/* tmp_dev points into the RCU-protected configuration */
rcu_read_lock();
tmp_dev = which_dev(mddev, bio->bi_sector);
/* first array sector that maps onto this device */
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
|| (bio->bi_sector < start_sector))) {
char b[BDEVNAME_SIZE];
printk("linear_make_request: Sector %llu out of bounds on "
"dev %s: %llu sectors, offset %llu\n",
(unsigned long long)bio->bi_sector,
bdevname(tmp_dev->rdev->bdev, b),
(unsigned long long)tmp_dev->rdev->sectors,
(unsigned long long)start_sector);
rcu_read_unlock();
bio_io_error(bio);
return 0;
}
if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
tmp_dev->end_sector)) {
/* This bio crosses a device boundary, so we have to
* split it.
*/
struct bio_pair *bp;
/* copy the boundary while still inside the RCU read section */
sector_t end_sector = tmp_dev->end_sector;
rcu_read_unlock();
bp = bio_split(bio, end_sector - bio->bi_sector);
/* a return of 1 means the half was remapped and must be submitted */
if (linear_make_request(q, &bp->bio1))
generic_make_request(&bp->bio1);
if (linear_make_request(q, &bp->bio2))
generic_make_request(&bp->bio2);
bio_pair_release(bp);
return 0;
}
/* remap: array sector -> sector on the member device */
bio->bi_bdev = tmp_dev->rdev->bdev;
bio->bi_sector = bio->bi_sector - start_sector
+ tmp_dev->rdev->data_offset;
rcu_read_unlock();
return 1;
}
/* Status callback: print the rounding size, i.e. chunk_sectors / 2, with a "k" suffix. */
static void linear_status (struct seq_file *seq, mddev_t *mddev)
{
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
}
/*
 * Personality descriptor registered with the md core.  Growing the
 * array is supported through hot_add_disk (linear_add).
 */
static struct mdk_personality linear_personality =
{
.name = "linear",
.level = LEVEL_LINEAR,
.owner = THIS_MODULE,
.make_request = linear_make_request,
.run = linear_run,
.stop = linear_stop,
.status = linear_status,
.hot_add_disk = linear_add,
.size = linear_size,
};
/* Module entry point: register the "linear" personality with md. */
static int __init linear_init (void)
{
return register_md_personality (&linear_personality);
}
/* Module exit point: unregister the "linear" personality. */
static void linear_exit (void)
{
unregister_md_personality (&linear_personality);
}
module_init(linear_init);
module_exit(linear_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
MODULE_ALIAS("md-linear");
MODULE_ALIAS("md-level--1");

View File

@@ -0,0 +1,21 @@
#ifndef _LINEAR_H
#define _LINEAR_H
/* One member device of a linear array. */
struct dev_info {
mdk_rdev_t *rdev; /* the underlying md device */
sector_t end_sector; /* array sector just past the end of this device */
};
typedef struct dev_info dev_info_t;
/*
 * Private data of a linear array.
 *
 * The structure is allocated with room for one dev_info_t per member
 * device after it, reached through the zero-length array 'disks'.
 * 'disks' must therefore stay the LAST member: anything declared after
 * it occupies the same storage as disks[0].  In particular 'rcu' has
 * to precede it, because call_rcu() writes to 'rcu' while readers may
 * still be walking disks[] of the old configuration.
 */
struct linear_private_data
{
	sector_t		array_sectors;	/* total size of the array */
	struct rcu_head		rcu;		/* used to free the structure via call_rcu() */
	dev_info_t		disks[0];	/* one entry per member device */
};
typedef struct linear_private_data linear_conf_t;
#endif

7049
kernel/drivers/md/md.c Normal file

File diff suppressed because it is too large Load Diff

449
kernel/drivers/md/md.h Normal file
View File

@@ -0,0 +1,449 @@
/*
md_k.h : kernel internal structure of the Linux MD driver
Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
You should have received a copy of the GNU General Public License
(for example /usr/src/linux/COPYING); if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _MD_MD_H
#define _MD_MD_H
#include <linux/blkdev.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#define MaxSector (~(sector_t)0)
typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;
/*
* MD's 'extended' device
*
* One instance describes a single component device of an array.
*/
struct mdk_rdev_s
{
struct list_head same_set; /* RAID devices within the same set */
sector_t sectors; /* Device size (in 512bytes sectors) */
mddev_t *mddev; /* RAID array if running */
int last_events; /* IO event timestamp */
struct block_device *bdev; /* block device handle */
struct page *sb_page;
int sb_loaded;
__u64 sb_events;
sector_t data_offset; /* start of data in array */
sector_t sb_start; /* offset of the super block (in 512byte sectors) */
int sb_size; /* bytes in the superblock */
int preferred_minor; /* autorun support */
struct kobject kobj;
/* A device can be in one of three states based on two flags:
* Not working: faulty==1 in_sync==0
* Fully working: faulty==0 in_sync==1
* Working, but not
* in sync with array
* faulty==0 in_sync==0
*
* It can never have faulty==1, in_sync==1
* This reduces the burden of testing multiple flags in many cases
*/
unsigned long flags;
/* NOTE(review): the values below look like bit numbers within 'flags' rather than masks - confirm against the users in md.c */
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */
#define AllReserved 6 /* If whole device is reserved for
* one array */
#define AutoDetected 7 /* added by auto-detect */
#define Blocked 8 /* An error occurred on an externally
* managed array, don't allow writes
* until it is cleared */
#define StateChanged 9 /* Faulty or Blocked has changed during
* interrupt, so it needs to be
* notified by the thread */
wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */
int raid_disk; /* role of device in array */
int saved_raid_disk; /* role that device used to have in the
* array and could again if we did a partial
* resync from the bitmap
*/
sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were
* up to.
*/
atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
* support hot removal
*/
atomic_t read_errors; /* number of consecutive read errors that
* we have tried to ignore.
*/
atomic_t corrected_errors; /* number of corrected read errors,
* for reporting to userspace and storing
* in superblock.
*/
struct work_struct del_work; /* used for delayed sysfs removal */
struct sysfs_dirent *sysfs_state; /* handle for 'state'
* sysfs entry */
};
/*
 * Per-array state of one md device: the active personality and its
 * private data, the list of member devices, the geometry recorded in
 * the superblock, resync/recovery/reshape bookkeeping, the locks that
 * serialise reconfiguration, and the write-intent bitmap handles.
 */
struct mddev_s
{
void *private;
struct mdk_personality *pers;
dev_t unit;
int md_minor;
struct list_head disks;
unsigned long flags;
/* bit numbers used in 'flags' above */
#define MD_CHANGE_DEVS 0 /* Some device status has changed */
#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
#define MD_CHANGE_PENDING 2 /* superblock update in progress */
int suspended;
atomic_t active_io;
int ro;
struct gendisk *gendisk;
struct kobject kobj;
int hold_active;
/* values for 'hold_active' above */
#define UNTIL_IOCTL 1
#define UNTIL_STOP 2
/* Superblock information */
int major_version,
minor_version,
patch_version;
int persistent;
int external; /* metadata is
* managed externally */
char metadata_type[17]; /* externally set*/
int chunk_sectors;
time_t ctime, utime;
int level, layout;
char clevel[16];
int raid_disks;
int max_disks;
sector_t dev_sectors; /* used size of
* component devices */
sector_t array_sectors; /* exported array size */
int external_size; /* size managed
* externally */
__u64 events;
char uuid[16];
/* If the array is being reshaped, we need to record the
* new shape and an indication of where we are up to.
* This is written to the superblock.
* If reshape_position is MaxSector, then no reshape is happening (yet).
*/
sector_t reshape_position;
int delta_disks, new_level, new_layout;
int new_chunk_sectors;
struct mdk_thread_s *thread; /* management thread */
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
sector_t curr_resync; /* last block scheduled */
/* As resync requests can complete out of order, we cannot easily track
* how much resync has been completed. So we occasionally pause until
* everything completes, then set curr_resync_completed to curr_resync.
* As such it may be well behind the real resync mark, but it is a value
* we are certain of.
*/
sector_t curr_resync_completed;
unsigned long resync_mark; /* a recent timestamp */
sector_t resync_mark_cnt;/* blocks written at resync_mark */
sector_t curr_mark_cnt; /* blocks scheduled now */
sector_t resync_max_sectors; /* may be set by personality */
sector_t resync_mismatches; /* count of sectors where
* parity/replica mismatch found
*/
/* allow user-space to request suspension of IO to regions of the array */
sector_t suspend_lo;
sector_t suspend_hi;
/* if zero, use the system-wide default */
int sync_speed_min;
int sync_speed_max;
/* resync even though the same disks are shared among md-devices */
int parallel_resync;
int ok_start_degraded;
/* recovery/resync flags (bit numbers used in 'recovery' below)
* NEEDED: we might need to start a resync/recover
* RUNNING: a thread is running, or about to be started
* SYNC: actually doing a resync, not a recovery
* RECOVER: doing recovery, or need to try it.
* INTR: resync needs to be aborted for some reason
* DONE: thread is done and is waiting to be reaped
* REQUESTED: user-space has requested a sync (used with SYNC)
* CHECK: user-space request for check-only, no repair
* RESHAPE: A reshape is happening
* FROZEN: not described by the original author
* (NOTE(review): document once semantics are confirmed)
*
* If neither SYNC nor RESHAPE is set, then it is a recovery.
*/
#define MD_RECOVERY_RUNNING 0
#define MD_RECOVERY_SYNC 1
#define MD_RECOVERY_RECOVER 2
#define MD_RECOVERY_INTR 3
#define MD_RECOVERY_DONE 4
#define MD_RECOVERY_NEEDED 5
#define MD_RECOVERY_REQUESTED 6
#define MD_RECOVERY_CHECK 7
#define MD_RECOVERY_RESHAPE 8
#define MD_RECOVERY_FROZEN 9
unsigned long recovery;
int recovery_disabled; /* if we detect that recovery
* will always fail, set this
* so we don't loop trying */
int in_sync; /* array is known not to need a resync */
/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
* that we are never stopping an array while it is open.
* 'reconfig_mutex' protects all other reconfiguration.
* These locks are separate due to conflicting interactions
* with bdev->bd_mutex.
* Lock ordering is:
* reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
* bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
*/
struct mutex open_mutex;
struct mutex reconfig_mutex;
atomic_t active; /* general refcount */
atomic_t openers; /* number of active opens */
int changed; /* true if we might need to reread partition info */
int degraded; /* whether md should consider
* adding a spare
*/
int barriers_work; /* initialised to true, cleared as soon
* as a barrier request to slave
* fails. (The original comment
* continues "Only supported" and
* then breaks off.)
*/
struct bio *biolist; /* bios that need to be retried
* because BIO_RW_BARRIER is not supported
*/
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
sector_t recovery_cp;
sector_t resync_min; /* user requested sync
* starts here */
sector_t resync_max; /* resync should pause
* when it gets here */
struct sysfs_dirent *sysfs_state; /* handle for 'array_state'
* file in sysfs.
*/
struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */
struct work_struct del_work; /* used for delayed sysfs removal */
spinlock_t write_lock;
wait_queue_head_t sb_wait; /* for waiting on superblock updates */
atomic_t pending_writes; /* number of active superblock writes */
unsigned int safemode; /* if set, update "clean" superblock
* when no writes pending.
*/
unsigned int safemode_delay;
struct timer_list safemode_timer;
atomic_t writes_pending;
struct request_queue *queue; /* for plugging ... */
atomic_t write_behind; /* outstanding async IO */
unsigned int max_write_behind; /* 0 = sync */
struct bitmap *bitmap; /* the bitmap for the device */
struct file *bitmap_file; /* the bitmap file */
long bitmap_offset; /* offset from superblock of
* start of bitmap. May be
* negative, but not '0'
*/
long default_bitmap_offset; /* this is the offset to use when
* hot-adding a bitmap. It should
* eventually be settable by sysfs.
*/
struct mutex bitmap_mutex;
struct list_head all_mddevs;
};
/*
 * Drop one pending-I/O reference on @rdev.  When the count reaches zero
 * and the device was marked Faulty, flag @mddev as needing recovery.
 */
static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
{
	/* Faulty is sampled before the reference is dropped, as in the
	 * original code (presumably so rdev is not touched after the
	 * final decrement -- confirm).
	 */
	const int was_faulty = test_bit(Faulty, &rdev->flags);

	if (!atomic_dec_and_test(&rdev->nr_pending))
		return;
	if (was_faulty)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
}
/*
 * Operations table that each md personality (RAID level implementation)
 * fills in and hands to register_md_personality().  Members that a
 * personality does not set are left NULL by its static initialiser.
 */
struct mdk_personality
{
char *name;
int level;
struct list_head list;
struct module *owner;
/* request entry point; receives the array's request queue and the bio */
int (*make_request)(struct request_queue *q, struct bio *bio);
/* bring the array up / tear it down; return 0 or a negative errno */
int (*run)(mddev_t *mddev);
int (*stop)(mddev_t *mddev);
/* append personality-specific status text to @seq */
void (*status)(struct seq_file *seq, mddev_t *mddev);
/* error_handler must set ->faulty and clear ->in_sync
* if appropriate, and should abort recovery if needed
*/
void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
int (*hot_remove_disk) (mddev_t *mddev, int number);
int (*spare_active) (mddev_t *mddev);
sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
int (*resize) (mddev_t *mddev, sector_t sectors);
sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
int (*check_reshape) (mddev_t *mddev);
int (*start_reshape) (mddev_t *mddev);
void (*finish_reshape) (mddev_t *mddev);
/* quiesce moves between quiescence states
* 0 - fully active
* 1 - no new requests allowed
* others - reserved
*/
void (*quiesce) (mddev_t *mddev, int state);
/* takeover is used to transition an array from one
* personality to another. The new personality must be able
* to handle the data in the current layout.
* e.g. 2drive raid1 -> 2drive raid5
* ndrive raid5 -> degraded n+1drive raid6 with special layout
* If the takeover succeeds, a new 'private' structure is returned.
* This needs to be installed and then ->run used to activate the
* array.
*/
void *(*takeover) (mddev_t *mddev);
};
/*
 * A sysfs attribute bound to an md array: the generic attribute plus
 * show/store callbacks that take the owning mddev_t instead of a kobject.
 */
struct md_sysfs_entry {
struct attribute attr;
/* format the attribute value into the supplied buffer */
ssize_t (*show)(mddev_t *, char *);
/* parse a value of the given length written by user space */
ssize_t (*store)(mddev_t *, const char *, size_t);
};
/*
 * Return the disk name of @mddev's gendisk, or the placeholder "mdX"
 * when the array has no gendisk attached.
 */
static inline char * mdname (mddev_t * mddev)
{
	if (!mddev->gendisk)
		return "mdX";
	return mddev->gendisk->disk_name;
}
/*
 * Iterate over an arbitrary list of rdevs linked through ->same_set.
 * It is safe to remove the current 'rdev' from the list inside the
 * loop body.  Don't touch 'tmp' though: it holds the next entry.
 */
#define rdev_for_each_list(rdev, tmp, head) \
list_for_each_entry_safe(rdev, tmp, head, same_set)
/*
 * Iterate over the member devices of 'mddev' (its ->disks list);
 * removal-safe in the same way as rdev_for_each_list().
 */
#define rdev_for_each(rdev, tmp, mddev) \
list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
/*
 * RCU variant of the member-device walk, built on
 * list_for_each_entry_rcu().
 */
#define rdev_for_each_rcu(rdev, mddev) \
list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
/*
 * Descriptor of an md helper kernel thread, as returned by
 * md_register_thread() and consumed by md_wakeup_thread() /
 * md_unregister_thread().
 */
typedef struct mdk_thread_s {
void (*run) (mddev_t *mddev); /* thread body, called with 'mddev' below */
mddev_t *mddev;
wait_queue_head_t wqueue;
unsigned long flags; /* bit field; see THREAD_WAKEUP */
struct task_struct *tsk;
unsigned long timeout;
} mdk_thread_t;
/* bit number in mdk_thread_s.flags */
#define THREAD_WAKEUP 0
/*
 * Slow path of wait_event_lock_irq(): sleep uninterruptibly on 'wq'
 * until 'condition' becomes true.  'condition' is evaluated after the
 * task state has been set to TASK_UNINTERRUPTIBLE; while it is false,
 * 'lock' is dropped with spin_unlock_irq(), 'cmd' is executed,
 * schedule() is called, and 'lock' is retaken with spin_lock_irq()
 * before the condition is tested again.  'wq' and 'lock' are used as
 * '&wq' / '&lock', so pass the objects themselves, not pointers.
 */
#define __wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
wait_queue_t __wait; \
init_waitqueue_entry(&__wait, current); \
\
add_wait_queue(&wq, &__wait); \
for (;;) { \
set_current_state(TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock); \
} \
current->state = TASK_RUNNING; \
remove_wait_queue(&wq, &__wait); \
} while (0)
/*
 * Wait for 'condition' to become true.  If it already holds, nothing
 * else is done; otherwise fall into __wait_event_lock_irq(), which
 * releases 'lock' around each sleep and runs 'cmd' before sleeping.
 * Note that 'condition' may be evaluated more than once.
 */
#define wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
if (condition) \
break; \
__wait_event_lock_irq(wq, condition, lock, cmd); \
} while (0)
static inline void safe_put_page(struct page *p)
{
if (p) put_page(p);
}
extern int register_md_personality(struct mdk_personality *p);
extern int unregister_md_personality(struct mdk_personality *p);
extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
mddev_t *mddev, const char *name);
extern void md_unregister_thread(mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_check_recovery(mddev_t *mddev);
extern void md_write_start(mddev_t *mddev, struct bio *bi);
extern void md_write_end(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
extern int mddev_congested(mddev_t *mddev, int bits);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(mddev_t *mddev);
extern int md_integrity_register(mddev_t *mddev);
void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
#endif /* _MD_MD_H */

View File

@@ -0,0 +1,132 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2 or (at your
* option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* mktables.c
*
* Make RAID-6 tables. This is a host user space program to be run at
* compile time.
*/
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <stdlib.h>
#include <time.h>
/*
 * Multiply two elements of GF(2^8) by shift-and-add: for every set bit
 * of b, XOR the current multiple of a into the product.
 */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t product = 0;

	for (; b != 0; b >>= 1) {
		if (b & 1)
			product ^= a;
		/* Double a; when bit 7 shifts out, fold it back in by
		 * XORing 0x1d (the low byte of the field polynomial).
		 */
		if (a & 0x80)
			a = (uint8_t)((a << 1) ^ 0x1d);
		else
			a = (uint8_t)(a << 1);
	}
	return product;
}
/*
 * Raise a to the power b in GF(2^8) by square-and-multiply.  The
 * exponent is first reduced modulo 255 into the range 0..254, so
 * negative exponents are accepted.
 */
static uint8_t gfpow(uint8_t a, int b)
{
	uint8_t result = 1;
	int exp = b % 255;

	if (exp < 0)
		exp += 255;

	for (; exp != 0; exp >>= 1) {
		if (exp & 1)
			result = gfmul(result, a);
		a = gfmul(a, a);
	}
	return result;
}
/*
 * Emit, on stdout, C source for the RAID-6 lookup tables raid6_gfmul,
 * raid6_gfexp, raid6_gfinv and raid6_gfexi, eight entries per line.
 * The command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
	int row, col, idx;
	uint8_t v;
	uint8_t exptbl[256], invtbl[256];

	printf("#include <linux/raid/pq.h>\n");

	/* Compute multiplication table */
	printf("\nconst u8 __attribute__((aligned(256)))\n"
	       "raid6_gfmul[256][256] =\n"
	       "{\n");
	for (row = 0; row < 256; row++) {
		printf("\t{\n");
		for (col = 0; col < 256; col++) {
			if ((col & 7) == 0)
				printf("\t\t");
			printf("0x%02x,%c", gfmul(row, col),
			       ((col & 7) == 7) ? '\n' : ' ');
		}
		printf("\t},\n");
	}
	printf("};\n");
	printf("#ifdef __KERNEL__\n");
	printf("EXPORT_SYMBOL(raid6_gfmul);\n");
	printf("#endif\n");

	/* Compute power-of-2 table (exponent) */
	v = 1;
	printf("\nconst u8 __attribute__((aligned(256)))\n"
	       "raid6_gfexp[256] =\n" "{\n");
	for (idx = 0; idx < 256; idx++) {
		if ((idx & 7) == 0)
			printf("\t");
		exptbl[idx] = v;
		printf("0x%02x,%c", v, ((idx & 7) == 7) ? '\n' : ' ');
		v = gfmul(v, 2);
		if (v == 1)
			v = 0;	/* For entry 255, not a real entry */
	}
	printf("};\n");
	printf("#ifdef __KERNEL__\n");
	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
	printf("#endif\n");

	/* Compute inverse table x^-1 == x^254 */
	printf("\nconst u8 __attribute__((aligned(256)))\n"
	       "raid6_gfinv[256] =\n" "{\n");
	for (idx = 0; idx < 256; idx++) {
		if ((idx & 7) == 0)
			printf("\t");
		v = gfpow(idx, 254);
		invtbl[idx] = v;
		printf("0x%02x,%c", v, ((idx & 7) == 7) ? '\n' : ' ');
	}
	printf("};\n");
	printf("#ifdef __KERNEL__\n");
	printf("EXPORT_SYMBOL(raid6_gfinv);\n");
	printf("#endif\n");

	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
	printf("\nconst u8 __attribute__((aligned(256)))\n"
	       "raid6_gfexi[256] =\n" "{\n");
	for (idx = 0; idx < 256; idx++) {
		if ((idx & 7) == 0)
			printf("\t");
		printf("0x%02x,%c", invtbl[exptbl[idx] ^ 1],
		       ((idx & 7) == 7) ? '\n' : ' ');
	}
	printf("};\n");
	printf("#ifdef __KERNEL__\n");
	printf("EXPORT_SYMBOL(raid6_gfexi);\n");
	printf("#endif\n");

	return 0;
}

View File

@@ -0,0 +1,590 @@
/*
* multipath.c : Multiple Devices driver for Linux
*
* Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
*
* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
*
* MULTIPATH management functions.
*
* derived from raid1.c.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include "md.h"
#include "multipath.h"
#define MAX_WORK_PER_DISK 128
#define NR_RESERVED_BUFS 32
/*
 * Pick the path to use for the next request: the lowest-numbered slot
 * whose rdev is In_sync (no read balancing is done).  On success the
 * chosen rdev's nr_pending is incremented and the slot index returned;
 * if no path qualifies an error is logged and -1 is returned.
 */
static int multipath_map (multipath_conf_t *conf)
{
	const int nr_paths = conf->raid_disks;
	int chosen = -1;
	int path;

	rcu_read_lock();
	for (path = 0; path < nr_paths; path++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[path].rdev);

		if (!rdev || !test_bit(In_sync, &rdev->flags))
			continue;
		/* take the pending reference before leaving the RCU section */
		atomic_inc(&rdev->nr_pending);
		chosen = path;
		break;
	}
	rcu_read_unlock();

	if (chosen < 0)
		printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
	return chosen;
}
/*
 * Queue @mp_bh on the array's retry list (under device_lock, with
 * interrupts saved) and wake the array's management thread.
 */
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
	mddev_t *mddev = mp_bh->mddev;
	multipath_conf_t *conf = mddev->private;
	unsigned long irq_flags;

	spin_lock_irqsave(&conf->device_lock, irq_flags);
	list_add(&mp_bh->retry_list, &conf->retry_list);
	spin_unlock_irqrestore(&conf->device_lock, irq_flags);

	md_wakeup_thread(mddev->thread);
}
/*
 * multipath_end_bh_io() finishes a multipathed operation: the original
 * (master) bio is completed with @err and the multipath_bh is handed
 * back to the array's mempool.  @mp_bh must not be used afterwards.
 */
static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
{
	multipath_conf_t *conf = mp_bh->mddev->private;

	bio_endio(mp_bh->master_bio, err);
	mempool_free(mp_bh, conf->pool);
}
/*
 * bi_end_io handler for the per-path bio embedded in a multipath_bh.
 *
 *  - I/O succeeded (BIO_UPTODATE set): complete the master bio with 0.
 *  - I/O failed and was not read-ahead: report the path to md_error(),
 *    log it, and queue the request for a retry by the array thread.
 *  - Read-ahead I/O failed: complete the master bio with @error.
 *
 * In every case the nr_pending reference on the path's rdev is dropped
 * at the end.
 */
static void multipath_end_request(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
/* conf and rdev are looked up now because multipath_end_bh_io()
 * returns mp_bh to the mempool, after which it must not be read.
 */
multipath_conf_t *conf = mp_bh->mddev->private;
mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
if (uptodate)
multipath_end_bh_io(mp_bh, 0);
else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) {
/*
* oops, IO error:
*/
char b[BDEVNAME_SIZE];
md_error (mp_bh->mddev, rdev);
printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
bdevname(rdev->bdev,b),
(unsigned long long)bio->bi_sector);
multipath_reschedule_retry(mp_bh);
} else
multipath_end_bh_io(mp_bh, error);
rdev_dec_pending(rdev, conf->mddev);
}
/*
 * Unplug the request queue of every path that is not Faulty and has
 * I/O pending.
 */
static void unplug_slaves(mddev_t *mddev)
{
multipath_conf_t *conf = mddev->private;
int i;
rcu_read_lock();
for (i=0; i<mddev->raid_disks; i++) {
mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)
&& atomic_read(&rdev->nr_pending)) {
struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
/* Pin the rdev with an extra nr_pending reference so it stays
 * valid while the RCU read lock is dropped around blk_unplug()
 * (presumably because blk_unplug() may sleep -- confirm).
 */
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
blk_unplug(r_queue);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
}
}
rcu_read_unlock();
}
static void multipath_unplug(struct request_queue *q)
{
unplug_slaves(q->queuedata);
}
/*
 * Request entry point of the multipath personality.
 *
 * Barrier requests are rejected with -EOPNOTSUPP.  Otherwise a
 * multipath_bh is allocated, the I/O is accounted against the array's
 * gendisk, a path is chosen with multipath_map(), and a copy of @bio
 * redirected to that path's device is submitted.  If no path is
 * available the bio is failed with -EIO.  Always returns 0.
 */
static int multipath_make_request (struct request_queue *q, struct bio * bio)
{
mddev_t *mddev = q->queuedata;
multipath_conf_t *conf = mddev->private;
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
const int rw = bio_data_dir(bio);
int cpu;
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
/* NOTE(review): the result is not NULL-checked; presumably
 * mempool_alloc() with GFP_NOIO waits rather than fails -- confirm.
 */
mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
mp_bh->master_bio = bio;
mp_bh->mddev = mddev;
/* per-array I/O statistics (done before a path is selected) */
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
bio_sectors(bio));
part_stat_unlock();
mp_bh->path = multipath_map(conf);
if (mp_bh->path < 0) {
bio_endio(bio, -EIO);
mempool_free(mp_bh, conf->pool);
return 0;
}
multipath = conf->multipaths + mp_bh->path;
/* Clone the caller's bio by structure copy, then retarget the copy
 * at the chosen path and hook up our completion handler.
 */
mp_bh->bio = *bio;
mp_bh->bio.bi_sector += multipath->rdev->data_offset;
mp_bh->bio.bi_bdev = multipath->rdev->bdev;
mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
mp_bh->bio.bi_end_io = multipath_end_request;
mp_bh->bio.bi_private = mp_bh;
generic_make_request(&mp_bh->bio);
return 0;
}
/*
 * Append the personality status to @seq: " [raid_disks/working_disks] "
 * followed by one character per slot in brackets, "U" for a slot whose
 * rdev is present and In_sync and "_" otherwise.
 */
static void multipath_status (struct seq_file *seq, mddev_t *mddev)
{
	multipath_conf_t *conf = mddev->private;
	int path;

	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
		    conf->working_disks);

	for (path = 0; path < conf->raid_disks; path++) {
		mdk_rdev_t *rdev = conf->multipaths[path].rdev;
		const char *mark = "_";

		if (rdev && test_bit(In_sync, &rdev->flags))
			mark = "U";
		seq_printf (seq, "%s", mark);
	}

	seq_printf (seq, "]");
}
/*
 * congested_fn callback for the array's backing_dev_info; @data is the
 * mddev.  Returns 1 if mddev_congested() says so, otherwise the
 * bdi_congested() state of the first path that is present and not
 * Faulty, or 0 when there is no such path.
 */
static int multipath_congested(void *data, int bits)
{
	mddev_t *mddev = data;
	multipath_conf_t *conf = mddev->private;
	int congested = 0;
	int path;

	if (mddev_congested(mddev, bits))
		return 1;

	rcu_read_lock();
	for (path = 0; path < mddev->raid_disks; path++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[path].rdev);
		struct request_queue *q;

		if (!rdev || test_bit(Faulty, &rdev->flags))
			continue;

		/* Just like multipath_map, only the first available
		 * device is consulted.
		 */
		q = bdev_get_queue(rdev->bdev);
		congested = bdi_congested(&q->backing_dev_info, bits);
		break;
	}
	rcu_read_unlock();

	return congested;
}
/*
 * Error handler of the multipath personality.
 * Careful, this can execute in IRQ contexts as well!
 *
 * When at most one working path remains, the failure is only logged
 * and the path is left active, since it is all we have.  Otherwise a
 * path not yet marked Faulty is taken out of service: In_sync is
 * cleared, Faulty and MD_CHANGE_DEVS are set, and the working/degraded
 * counters are adjusted.
 */
static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
{
	multipath_conf_t *conf = mddev->private;
	char b[BDEVNAME_SIZE];

	if (conf->working_disks <= 1) {
		printk(KERN_ALERT
			"multipath: only one IO path left and IO error.\n");
		return;
	}

	/* already disabled by an earlier error: nothing more to do */
	if (test_bit(Faulty, &rdev->flags))
		return;

	/* Mark disk as unusable */
	clear_bit(In_sync, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	conf->working_disks--;
	mddev->degraded++;

	printk(KERN_ALERT "multipath: IO failure on %s,"
		" disabling IO path.\n"
		"multipath: Operation continuing"
		" on %d IO paths.\n",
		bdevname (rdev->bdev,b),
		conf->working_disks);
}
/*
 * Debug dump of a multipath configuration to the kernel log: the
 * working/total disk counts, then one line per occupied slot giving its
 * index, whether it is operational (not Faulty) and its device name.
 * A NULL @conf is reported and otherwise ignored.
 */
static void print_multipath_conf (multipath_conf_t *conf)
{
	int slot;

	printk("MULTIPATH conf printout:\n");
	if (conf == NULL) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(" --- wd:%d rd:%d\n", conf->working_disks,
	       conf->raid_disks);

	for (slot = 0; slot < conf->raid_disks; slot++) {
		char name[BDEVNAME_SIZE];
		struct multipath_info *info = &conf->multipaths[slot];

		if (!info->rdev)
			continue;
		printk(" disk%d, o:%d, dev:%s\n",
		       slot, !test_bit(Faulty, &info->rdev->flags),
		       bdevname(info->rdev->bdev, name));
	}
}
/*
 * hot_add_disk handler: install @rdev in a free path slot.
 *
 * If rdev->raid_disk is already >= 0 only that slot is tried, otherwise
 * every slot from 0 to raid_disks - 1.  Returns 0 when the device was
 * installed, or -EEXIST when no eligible slot was empty.
 */
static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
multipath_conf_t *conf = mddev->private;
struct request_queue *q;
int err = -EEXIST;
int path;
struct multipath_info *p;
int first = 0;
int last = mddev->raid_disks - 1;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
print_multipath_conf(conf);
for (path = first; path <= last; path++)
if ((p=conf->multipaths+path)->rdev == NULL) {
q = rdev->bdev->bd_disk->queue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_phys_segments to one, lying
* within a single page.
* (Note: it is very unlikely that a device with
* merge_bvec_fn will be involved in multipath.)
*/
if (q->merge_bvec_fn) {
blk_queue_max_phys_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
conf->working_disks++;
mddev->degraded--;
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
/* publish the fully set-up rdev to RCU readers last */
rcu_assign_pointer(p->rdev, rdev);
err = 0;
md_integrity_add_rdev(rdev, mddev);
break;
}
print_multipath_conf(conf);
return err;
}
/*
 * hot_remove_disk handler: detach the rdev in slot @number.
 *
 * Returns 0 on success or when the slot is already empty, and -EBUSY
 * when the device is still In_sync or has I/O pending.
 */
static int multipath_remove_disk(mddev_t *mddev, int number)
{
multipath_conf_t *conf = mddev->private;
int err = 0;
mdk_rdev_t *rdev;
struct multipath_info *p = conf->multipaths + number;
print_multipath_conf(conf);
rdev = p->rdev;
if (rdev) {
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
printk(KERN_ERR "hot-remove-disk, slot %d is identified"
" but is still operational!\n", number);
err = -EBUSY;
goto abort;
}
/* Unpublish the slot, wait for existing RCU readers to finish,
 * then re-check nr_pending: a reader may have taken a reference
 * before it could see the NULL pointer.
 */
p->rdev = NULL;
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
goto abort;
}
md_integrity_register(mddev);
}
abort:
print_multipath_conf(conf);
return err;
}
/*
 * This is the body of the array's management thread (registered in
 * multipath_run).  On each invocation it:
 *
 * 1. Calls md_check_recovery() for the array.
 * 2. Drains conf->retry_list: every request queued there by
 *    multipath_reschedule_retry() is resubmitted on a path chosen by
 *    multipath_map(), or completed with -EIO if no path is left.
 */
static void multipathd (mddev_t *mddev)
{
struct multipath_bh *mp_bh;
struct bio *bio;
unsigned long flags;
multipath_conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
md_check_recovery(mddev);
for (;;) {
char b[BDEVNAME_SIZE];
spin_lock_irqsave(&conf->device_lock, flags);
/* note: the loop is left with device_lock still held; it is
 * released by the unlock after the loop.
 */
if (list_empty(head))
break;
/* entries are added at the head, so head->prev is the oldest */
mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
list_del(head->prev);
spin_unlock_irqrestore(&conf->device_lock, flags);
bio = &mp_bh->bio;
/* restore the array-relative sector for the log messages below */
bio->bi_sector = mp_bh->master_bio->bi_sector;
if ((mp_bh->path = multipath_map (conf))<0) {
printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
" error for block %llu\n",
bdevname(bio->bi_bdev,b),
(unsigned long long)bio->bi_sector);
multipath_end_bh_io(mp_bh, -EIO);
} else {
printk(KERN_ERR "multipath: %s: redirecting sector %llu"
" to another IO path\n",
bdevname(bio->bi_bdev,b),
(unsigned long long)bio->bi_sector);
/* start again from a fresh copy of the master bio and
 * retarget it at the newly chosen path
 */
*bio = *(mp_bh->master_bio);
bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
bio->bi_end_io = multipath_end_request;
bio->bi_private = mp_bh;
generic_make_request(bio);
}
}
spin_unlock_irqrestore(&conf->device_lock, flags);
}
/*
 * ->size method: the array size is simply mddev->dev_sectors.  Generic
 * reshape is not supported, so a one-time warning is emitted if either
 * @sectors or @raid_disks is non-zero.
 */
static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	const int reshape_requested = (sectors || raid_disks);

	WARN_ONCE(reshape_requested,
		  "%s does not support generic reshape\n", __func__);

	return mddev->dev_sectors;
}
/*
 * ->run method: build the private multipath configuration for @mddev
 * and activate the array.
 *
 * Returns 0 on success, -EINVAL if md_check_no_bitmap() rejects the
 * array, and -EIO for every other failure (wrong level, allocation
 * failure, no operational path, thread creation failure).  On the
 * failure paths that reach out_free_conf, everything allocated here is
 * released and mddev->private is reset to NULL.
 */
static int multipath_run (mddev_t *mddev)
{
multipath_conf_t *conf;
int disk_idx;
struct multipath_info *disk;
mdk_rdev_t *rdev;
if (md_check_no_bitmap(mddev))
return -EINVAL;
if (mddev->level != LEVEL_MULTIPATH) {
printk("multipath: %s: raid level not set to multipath IO (%d)\n",
mdname(mddev), mddev->level);
goto out;
}
/*
* copy the already verified devices into our private MULTIPATH
* bookkeeping area. [whatever we allocate in multipath_run(),
* should be freed in multipath_stop()]
*/
mddev->queue->queue_lock = &mddev->queue->__queue_lock;
conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
mddev->private = conf;
if (!conf) {
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out;
}
conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
GFP_KERNEL);
if (!conf->multipaths) {
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out_free_conf;
}
conf->working_disks = 0;
/* place each member device in the slot named by its raid_disk;
 * devices with an out-of-range raid_disk are ignored
 */
list_for_each_entry(rdev, &mddev->disks, same_set) {
disk_idx = rdev->raid_disk;
if (disk_idx < 0 ||
disk_idx >= mddev->raid_disks)
continue;
disk = conf->multipaths + disk_idx;
disk->rdev = rdev;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, not that we ever expect a device with
* a merge_bvec_fn to be involved in multipath */
if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
blk_queue_max_phys_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
if (!test_bit(Faulty, &rdev->flags))
conf->working_disks++;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
if (!conf->working_disks) {
printk(KERN_ERR "multipath: no operational IO paths for %s\n",
mdname(mddev));
goto out_free_conf;
}
mddev->degraded = conf->raid_disks - conf->working_disks;
conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
sizeof(struct multipath_bh));
if (conf->pool == NULL) {
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
mdname(mddev));
goto out_free_conf;
}
{
mddev->thread = md_register_thread(multipathd, mddev, NULL);
if (!mddev->thread) {
printk(KERN_ERR "multipath: couldn't allocate thread"
" for %s\n", mdname(mddev));
goto out_free_conf;
}
}
printk(KERN_INFO
"multipath: array %s active with %d out of %d IO paths\n",
mdname(mddev), conf->working_disks, mddev->raid_disks);
/*
* Ok, everything is just fine now
*/
md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
md_integrity_register(mddev);
return 0;
out_free_conf:
/* conf was zero-allocated, so pool and multipaths are NULL unless
 * they were successfully set up above
 */
if (conf->pool)
mempool_destroy(conf->pool);
kfree(conf->multipaths);
kfree(conf);
mddev->private = NULL;
out:
return -EIO;
}
/*
 * ->stop method: undo multipath_run().  The management thread is
 * stopped first, the queue is synced, and only then are the mempool,
 * the path table and the configuration itself freed.  Always returns 0.
 */
static int multipath_stop (mddev_t *mddev)
{
multipath_conf_t *conf = mddev->private;
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
mempool_destroy(conf->pool);
kfree(conf->multipaths);
kfree(conf);
mddev->private = NULL;
return 0;
}
/*
 * Personality descriptor registered with the md core by
 * multipath_init().  Only the operations implemented in this file are
 * set; the remaining mdk_personality members stay NULL.
 */
static struct mdk_personality multipath_personality =
{
.name = "multipath",
.level = LEVEL_MULTIPATH,
.owner = THIS_MODULE,
.make_request = multipath_make_request,
.run = multipath_run,
.stop = multipath_stop,
.status = multipath_status,
.error_handler = multipath_error,
.hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk,
.size = multipath_size,
};
/* Module entry point: register the multipath personality with md. */
static int __init multipath_init (void)
{
	int err = register_md_personality(&multipath_personality);

	return err;
}
/*
 * Module exit point: unregister the multipath personality.  The return
 * value of unregister_md_personality() is deliberately not examined.
 */
static void __exit multipath_exit (void)
{
	(void)unregister_md_personality(&multipath_personality);
}
module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
MODULE_ALIAS("md-multipath");
MODULE_ALIAS("md-level--4");

View File

@@ -0,0 +1,34 @@
#ifndef _MULTIPATH_H
#define _MULTIPATH_H
/* One path slot of a multipath array. */
struct multipath_info {
mdk_rdev_t *rdev; /* device behind this path, or NULL if the slot is empty */
};
/*
 * Private per-array state of the multipath personality, stored in
 * mddev->private.
 */
struct multipath_private_data {
mddev_t *mddev; /* back pointer to the owning array */
struct multipath_info *multipaths; /* array of raid_disks path slots */
int raid_disks; /* number of path slots */
int working_disks; /* paths not marked Faulty */
spinlock_t device_lock; /* protects retry_list */
struct list_head retry_list; /* multipath_bh's waiting to be retried */
mempool_t *pool; /* allocator for struct multipath_bh */
};
typedef struct multipath_private_data multipath_conf_t;
/*
* this is our 'private' 'collective' MULTIPATH buffer head.
* it contains information about what kind of IO operations were started
* for this MULTIPATH operation, and about their status:
*/
struct multipath_bh {
mddev_t *mddev; /* array this request belongs to */
struct bio *master_bio; /* the bio originally submitted to the array */
struct bio bio; /* copy of master_bio sent down the chosen path */
int path; /* index of the chosen path in conf->multipaths */
struct list_head retry_list; /* link in conf->retry_list while awaiting retry */
};
#endif

573
kernel/drivers/md/raid0.c Normal file
View File

@@ -0,0 +1,573 @@
/*
raid0.c : Multiple Devices driver for Linux
Copyright (C) 1994-96 Marc ZYNGIER
<zyngier@ufr-info-p7.ibp.fr> or
<maz@gloups.fdn.fr>
Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
RAID-0 management functions.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
You should have received a copy of the GNU General Public License
(for example /usr/src/linux/COPYING); if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
#include "raid0.h"
/*
 * unplug_fn for a raid0 array: unplug the request queue of each of the
 * first raid_disks entries of the device list.
 */
static void raid0_unplug(struct request_queue *q)
{
	mddev_t *mddev = q->queuedata;
	raid0_conf_t *conf = mddev->private;
	mdk_rdev_t **devs = conf->devlist;
	int disk = 0;

	while (disk < mddev->raid_disks) {
		mdk_rdev_t *rdev = devs[disk++];

		blk_unplug(bdev_get_queue(rdev->bdev));
	}
}
/*
 * congested_fn for a raid0 array; @data is the mddev.  Returns 1 if
 * mddev_congested() reports congestion, otherwise the first non-zero
 * bdi_congested() result among the member devices, or 0 if none of
 * them is congested.
 */
static int raid0_congested(void *data, int bits)
{
	mddev_t *mddev = data;
	raid0_conf_t *conf = mddev->private;
	mdk_rdev_t **devs = conf->devlist;
	int disk;

	if (mddev_congested(mddev, bits))
		return 1;

	for (disk = 0; disk < mddev->raid_disks; disk++) {
		struct request_queue *q = bdev_get_queue(devs[disk]->bdev);
		int state = bdi_congested(&q->backing_dev_info, bits);

		/* stop at the first congested member */
		if (state)
			return state;
	}
	return 0;
}
/*
 * Inform the user of the raid0 configuration: for every strip zone,
 * log the member device names followed by the zone's offset within the
 * array, its starting offset on the devices, and its size.  Values are
 * stored in sectors and shifted right by one before printing.
 */
static void dump_zones(mddev_t *mddev)
{
	int j, k;
	sector_t zone_size = 0;
	sector_t zone_start = 0;
	char b[BDEVNAME_SIZE];
	raid0_conf_t *conf = mddev->private;

	printk(KERN_INFO "******* %s configuration *********\n",
		mdname(mddev));
	for (j = 0; j < conf->nr_strip_zones; j++) {
		printk(KERN_INFO "zone%d=[", j);
		/* devlist holds raid_disks slots per zone; the first
		 * nb_dev of them are in use for zone j
		 */
		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
			printk("%s/",
			       bdevname(conf->devlist[j*mddev->raid_disks
						      + k]->bdev, b));
		printk("]\n");

		/* zone_end is cumulative, so the size is the distance
		 * from the end of the previous zone
		 */
		zone_size = conf->strip_zone[j].zone_end - zone_start;
		printk(KERN_INFO " zone offset=%llukb "
			"device offset=%llukb size=%llukb\n",
			(unsigned long long)zone_start>>1,
			(unsigned long long)conf->strip_zone[j].dev_start>>1,
			(unsigned long long)zone_size>>1);
		zone_start = conf->strip_zone[j].zone_end;
	}
	printk(KERN_INFO "**********************************\n\n");
}
/*
 * Build the raid0 zone layout for @mddev and install it as
 * mddev->private.
 *
 * Member devices may differ in size.  Each distinct (chunk-rounded)
 * device size gives rise to one strip zone; zone 0 spans all devices up
 * to the size of the smallest, and each further zone spans the devices
 * that extend beyond the previous zone.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EINVAL for
 * an inconsistent device set or an unusable chunk size.  On failure
 * everything allocated here is freed and mddev->private is set to NULL.
 */
static int create_strip_zones(mddev_t *mddev)
{
int i, c, err;
sector_t curr_zone_end, sectors;
mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
struct strip_zone *zone;
int cnt;
char b[BDEVNAME_SIZE];
raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
if (!conf)
return -ENOMEM;
/* Pass 1: round every device down to a whole number of chunks and
 * count the distinct sizes.  Each device is compared only with the
 * devices that precede it in the list.
 */
list_for_each_entry(rdev1, &mddev->disks, same_set) {
printk(KERN_INFO "raid0: looking at %s\n",
bdevname(rdev1->bdev,b));
c = 0;
/* round size to chunk_size */
sectors = rdev1->sectors;
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
list_for_each_entry(rdev2, &mddev->disks, same_set) {
printk(KERN_INFO "raid0: comparing %s(%llu)",
bdevname(rdev1->bdev,b),
(unsigned long long)rdev1->sectors);
printk(KERN_INFO " with %s(%llu)\n",
bdevname(rdev2->bdev,b),
(unsigned long long)rdev2->sectors);
if (rdev2 == rdev1) {
printk(KERN_INFO "raid0: END\n");
break;
}
if (rdev2->sectors == rdev1->sectors) {
/*
* Not unique, don't count it as a new
* group
*/
printk(KERN_INFO "raid0: EQUAL\n");
c = 1;
break;
}
printk(KERN_INFO "raid0: NOT EQUAL\n");
}
if (!c) {
printk(KERN_INFO "raid0: ==> UNIQUE\n");
conf->nr_strip_zones++;
printk(KERN_INFO "raid0: %d zones\n",
conf->nr_strip_zones);
}
}
printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
err = -ENOMEM;
conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
conf->nr_strip_zones, GFP_KERNEL);
if (!conf->strip_zone)
goto abort;
/* devlist has raid_disks slots for each zone */
conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
conf->nr_strip_zones*mddev->raid_disks,
GFP_KERNEL);
if (!conf->devlist)
goto abort;
/* The first zone must contain all devices, so here we check that
* there is a proper alignment of slots to devices and find them all
*/
zone = &conf->strip_zone[0];
cnt = 0;
smallest = NULL;
dev = conf->devlist;
err = -EINVAL;
list_for_each_entry(rdev1, &mddev->disks, same_set) {
int j = rdev1->raid_disk;
if (j < 0 || j >= mddev->raid_disks) {
printk(KERN_ERR "raid0: bad disk number %d - "
"aborting!\n", j);
goto abort;
}
if (dev[j]) {
printk(KERN_ERR "raid0: multiple devices for %d - "
"aborting!\n", j);
goto abort;
}
dev[j] = rdev1;
disk_stack_limits(mddev->gendisk, rdev1->bdev,
rdev1->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_phys_segments to 1, lying within
* a single page.
*/
if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
blk_queue_max_phys_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "raid0: too few disks (%d of %d) - "
"aborting!\n", cnt, mddev->raid_disks);
goto abort;
}
zone->nb_dev = cnt;
zone->zone_end = smallest->sectors * cnt;
curr_zone_end = zone->zone_end;
/* now do the other zones */
for (i = 1; i < conf->nr_strip_zones; i++)
{
int j;
zone = conf->strip_zone + i;
dev = conf->devlist + i * mddev->raid_disks;
printk(KERN_INFO "raid0: zone %d\n", i);
/* this zone starts where the previous zone's smallest device ends */
zone->dev_start = smallest->sectors;
smallest = NULL;
c = 0;
/* collect the devices that extend beyond dev_start */
for (j=0; j<cnt; j++) {
rdev = conf->devlist[j];
printk(KERN_INFO "raid0: checking %s ...",
bdevname(rdev->bdev, b));
if (rdev->sectors <= zone->dev_start) {
printk(KERN_INFO " nope.\n");
continue;
}
printk(KERN_INFO " contained as device %d\n", c);
dev[c] = rdev;
c++;
if (!smallest || rdev->sectors < smallest->sectors) {
smallest = rdev;
printk(KERN_INFO " (%llu) is smallest!.\n",
(unsigned long long)rdev->sectors);
}
}
zone->nb_dev = c;
/* NOTE(review): 'smallest' is dereferenced without a NULL check;
 * this relies on every zone counted in pass 1 containing at least
 * one device -- confirm.
 */
sectors = (smallest->sectors - zone->dev_start) * c;
printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
zone->nb_dev, (unsigned long long)sectors);
curr_zone_end += sectors;
zone->zone_end = curr_zone_end;
printk(KERN_INFO "raid0: current zone start: %llu\n",
(unsigned long long)smallest->sectors);
}
mddev->queue->unplug_fn = raid0_unplug;
mddev->queue->backing_dev_info.congested_fn = raid0_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
*/
if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
printk(KERN_ERR "%s chunk_size of %d not valid\n",
mdname(mddev),
mddev->chunk_sectors << 9);
goto abort;
}
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
printk(KERN_INFO "raid0: done.\n");
mddev->private = conf;
return 0;
abort:
/* kfree(NULL) is a no-op, so partially built state is handled too */
kfree(conf->strip_zone);
kfree(conf->devlist);
kfree(conf);
mddev->private = NULL;
return err;
}
/**
 * raid0_mergeable_bvec -- tell the bio layer how much more data a bio may take
 * @q: request queue of the array
 * @bvm: properties (device, start sector, current size) of the bio being built
 * @biovec: the segment that the caller would like to append
 *
 * A bio must not cross a chunk boundary, so the answer is the number of
 * bytes left between the current end of the bio and the end of its chunk.
 *
 * Return: the number of bytes that can be accepted at this offset.  For an
 * empty bio the full length of @biovec is returned whenever the remaining
 * room is not larger than that length.
 */
static int raid0_mergeable_bvec(struct request_queue *q,
				struct bvec_merge_data *bvm,
				struct bio_vec *biovec)
{
	mddev_t *mddev = q->queuedata;
	unsigned int chunk_sectors = mddev->chunk_sectors;
	unsigned int bio_sectors = bvm->bi_size >> 9;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	unsigned int offset;	/* start sector relative to its chunk */
	int max;

	if (is_power_of_2(chunk_sectors))
		offset = sector & (chunk_sectors - 1);
	else
		offset = sector_div(sector, chunk_sectors);

	max = (chunk_sectors - (offset + bio_sectors)) << 9;

	/* bio_add cannot handle a negative return */
	if (max < 0)
		max = 0;

	if (bio_sectors == 0 && max <= biovec->bv_len)
		return biovec->bv_len;

	return max;
}
/*
 * raid0_size - total the sizes (in sectors) of all member devices.
 *
 * @sectors and @raid_disks must both be zero; anything else triggers a
 * one-time warning because this personality has no generic reshape.
 */
static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	mdk_rdev_t *member;
	sector_t total = 0;

	WARN_ONCE(sectors || raid_disks,
		  "%s does not support generic reshape\n", __func__);

	list_for_each_entry(member, &mddev->disks, same_set) {
		total += member->sectors;
	}

	return total;
}
/*
 * raid0_run - the ->run hook of the raid0 personality.
 *
 * Validates the configuration, builds the strip zones, publishes the array
 * size and tunes the request queue (max request size, read-ahead, merge
 * callback).  Returns 0 on success or a negative errno.
 */
static int raid0_run(mddev_t *mddev)
{
	int stripe;
	int err;

	if (!mddev->chunk_sectors) {
		printk(KERN_ERR "md/raid0: chunk size must be set.\n");
		return -EINVAL;
	}
	if (md_check_no_bitmap(mddev))
		return -EINVAL;

	blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors);
	mddev->queue->queue_lock = &mddev->queue->__queue_lock;

	err = create_strip_zones(mddev);
	if (err < 0)
		return err;

	/* calculate array device size */
	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
	       (unsigned long long)mddev->array_sectors);

	/*
	 * Read-ahead sizing: large sequential reads are only effective when
	 * at least two whole stripes (number of devices * chunk size * 2)
	 * are read ahead.  A member device whose own ra_pages exceeds the
	 * chunk size will not be driven as hard as it wants; that is treated
	 * as a configuration error (a larger chunk size should be used).
	 */
	stripe = mddev->raid_disks * (mddev->chunk_sectors << 9) / PAGE_SIZE;
	if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
		mddev->queue->backing_dev_info.ra_pages = 2 * stripe;

	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
	dump_zones(mddev);
	md_integrity_register(mddev);
	return 0;
}
/*
 * raid0_stop - the ->stop hook: release the private configuration.
 *
 * The queue is synced first because the unplug function still
 * dereferences the configuration being freed.  Always returns 0.
 */
static int raid0_stop(mddev_t *mddev)
{
	raid0_conf_t *priv = mddev->private;

	blk_sync_queue(mddev->queue);

	kfree(priv->devlist);
	kfree(priv->strip_zone);
	kfree(priv);
	mddev->private = NULL;

	return 0;
}
/*
 * find_zone - locate the strip zone containing an array sector.
 *
 * On entry *sectorp is a sector in the array.  The zones are walked in
 * order and the first one whose zone_end lies beyond that sector is
 * returned.  For every zone but the first, *sectorp is rewritten to the
 * offset from the start of that zone (i.e. from the previous zone's end).
 * A sector beyond the last zone is a BUG().
 */
static struct strip_zone *find_zone(struct raid0_private_data *conf,
				    sector_t *sectorp)
{
	struct strip_zone *first = conf->strip_zone;
	struct strip_zone *limit = first + conf->nr_strip_zones;
	struct strip_zone *zone;
	sector_t sector = *sectorp;

	for (zone = first; zone < limit; zone++) {
		if (sector >= zone->zone_end)
			continue;
		if (zone != first)
			*sectorp = sector - (zone - 1)->zone_end;
		return zone;
	}
	BUG();
}
/*
 * map_sector - pick the member device for an array sector and compute the
 * sector offset inside the zone's region on that device.
 *
 * @mddev:         the array; supplies chunk_sectors, raid_disks and the conf
 * @zone:          the zone returned by find_zone() for this request
 * @sector:        the array sector being mapped (by value; modified locally)
 * @sector_offset: in:  offset of the sector inside @zone
 *                 out: (chunk index on the device) * chunk_sects
 *                      + offset inside the chunk
 *
 * Two flows are kept separate for the sake of performance: a shift/mask
 * flow for power-of-2 chunk sizes and a general division flow.
 *
 * NOTE: sector_div() is relied on here to divide its first argument in
 * place and evaluate to the remainder, so the order of statements matters.
 *
 * Returns the rdev taken from this zone's slice of conf->devlist.
 */
static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
sector_t sector, sector_t *sector_offset)
{
unsigned int sect_in_chunk;
sector_t chunk;
raid0_conf_t *conf = mddev->private;
unsigned int chunk_sects = mddev->chunk_sectors;
if (is_power_of_2(chunk_sects)) {
/* chunk_sects has a single bit set; this yields its bit index */
int chunksect_bits = ffz(~chunk_sects);
/* find the sector offset inside the chunk */
sect_in_chunk = sector & (chunk_sects - 1);
/* 'sector' now holds the chunk number of the array sector */
sector >>= chunksect_bits;
/* chunk in zone */
chunk = *sector_offset;
/* quotient is the chunk in real device*/
sector_div(chunk, zone->nb_dev << chunksect_bits);
} else{
/* remainder: offset inside the chunk; 'sector' becomes the chunk number */
sect_in_chunk = sector_div(sector, chunk_sects);
chunk = *sector_offset;
/* quotient (left in 'chunk') is the chunk index on the real device */
sector_div(chunk, chunk_sects * zone->nb_dev);
}
/*
* position the bio over the real device
* real sector = chunk in device + starting of zone
* + the position in the chunk
* (the start of the zone is not added here; raid0_make_request adds
* zone->dev_start and the rdev's data_offset afterwards)
*/
*sector_offset = (chunk * chunk_sects) + sect_in_chunk;
/*
 * The zone's devices start at index (zone number * raid_disks) in
 * devlist; the chunk number modulo nb_dev selects the device.
 */
return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks
+ sector_div(sector, zone->nb_dev)];
}
/*
 * Does the bio fit entirely inside one chunk?
 *
 * Returns non-zero when the bio's offset inside its chunk plus its length
 * (both in sectors) does not exceed @chunk_sects, zero when the I/O is
 * spread over more than one chunk.  @mddev is not used.
 */
static inline int is_io_in_chunk_boundary(mddev_t *mddev,
			unsigned int chunk_sects, struct bio *bio)
{
	sector_t sector;

	/* fast path: mask instead of divide for power-of-2 chunk sizes */
	if (likely(is_power_of_2(chunk_sects)))
		return chunk_sects >= ((bio->bi_sector & (chunk_sects - 1))
					+ (bio->bi_size >> 9));

	sector = bio->bi_sector;
	return chunk_sects >= (sector_div(sector, chunk_sects)
				+ (bio->bi_size >> 9));
}
static int raid0_make_request(struct request_queue *q, struct bio *bio)
{
mddev_t *mddev = q->queuedata;
unsigned int chunk_sects;
sector_t sector_offset;
struct strip_zone *zone;
mdk_rdev_t *tmp_dev;
const int rw = bio_data_dir(bio);
int cpu;
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
bio_sectors(bio));
part_stat_unlock();
chunk_sects = mddev->chunk_sectors;
if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
sector_t sector = bio->bi_sector;
struct bio_pair *bp;
/* Sanity check -- queue functions should prevent this happening */
if (bio->bi_vcnt != 1 ||
bio->bi_idx != 0)
goto bad_map;
/* This is a one page bio that upper layers
* refuse to split for us, so we need to split it.
*/
if (likely(is_power_of_2(chunk_sects)))
bp = bio_split(bio, chunk_sects - (sector &
(chunk_sects-1)));
else
bp = bio_split(bio, chunk_sects -
sector_div(sector, chunk_sects));
if (raid0_make_request(q, &bp->bio1))
generic_make_request(&bp->bio1);
if (raid0_make_request(q, &bp->bio2))
generic_make_request(&bp->bio2);
bio_pair_release(bp);
return 0;
}
sector_offset = bio->bi_sector;
zone = find_zone(mddev->private, &sector_offset);
tmp_dev = map_sector(mddev, zone, bio->bi_sector,
&sector_offset);
bio->bi_bdev = tmp_dev->bdev;
bio->bi_sector = sector_offset + zone->dev_start +
tmp_dev->data_offset;
/*
* Let the main block layer submit the IO and resolve recursion:
*/
return 1;
bad_map:
printk("raid0_make_request bug: can't convert block across chunks"
" or bigger than %dk %llu %d\n", chunk_sects / 2,
(unsigned long long)bio->bi_sector, bio->bi_size >> 10);
bio_io_error(bio);
return 0;
}
/*
 * raid0_status - the ->status hook: emit the chunk size (in KiB) to @seq.
 *
 * A per-zone dump exists below but is compiled out, because MD_DEBUG is
 * explicitly undefined right before it is tested.
 */
static void raid0_status(struct seq_file *seq, mddev_t *mddev)
{
#undef MD_DEBUG
#ifdef MD_DEBUG
	raid0_conf_t *conf = mddev->private;
	char name[BDEVNAME_SIZE];
	sector_t zone_start = 0;
	int zi;

	for (zi = 0; zi < conf->nr_strip_zones; zi++) {
		struct strip_zone *zone = &conf->strip_zone[zi];
		mdk_rdev_t **members = conf->devlist + zi * mddev->raid_disks;
		sector_t zone_size = zone->zone_end - zone_start;
		int di;

		seq_printf(seq, " z%d", zi);
		seq_printf(seq, "=[");
		for (di = 0; di < zone->nb_dev; di++)
			seq_printf(seq, "%s/",
				   bdevname(members[di]->bdev, name));

		seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n",
			   (unsigned long long)zone_start>>1,
			   (unsigned long long)zone->dev_start>>1,
			   (unsigned long long)zone_size>>1);
		zone_start = zone->zone_end;
	}
#endif
	seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
}
/*
 * Personality descriptor handed to the md core by raid0_init(): identity
 * first, then the callbacks implemented in this file.
 */
static struct mdk_personality raid0_personality = {
	.owner		= THIS_MODULE,
	.name		= "raid0",
	.level		= 0,
	.run		= raid0_run,
	.stop		= raid0_stop,
	.make_request	= raid0_make_request,
	.size		= raid0_size,
	.status		= raid0_status,
};
/* Module init: register the raid0 personality and pass on the result. */
static int __init raid0_init(void)
{
	int err = register_md_personality(&raid0_personality);

	return err;
}
/* Module exit: withdraw the raid0 personality registered at init time. */
static void raid0_exit(void)
{
	unregister_md_personality(&raid0_personality);
}
/*
 * Module plumbing: entry/exit hooks, licence and the alias names under
 * which this module can be requested.
 * NOTE(review): the aliases presumably match the strings the md core uses
 * when auto-loading a personality -- confirm against md.c.
 */
module_init(raid0_init);
module_exit(raid0_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-2"); /* RAID0 */
MODULE_ALIAS("md-raid0");
MODULE_ALIAS("md-level-0");

20
kernel/drivers/md/raid0.h Normal file
View File

@@ -0,0 +1,20 @@
#ifndef _RAID0_H
#define _RAID0_H
/*
 * One zone of a RAID0 array: a range of array sectors striped over a fixed
 * set of member devices.  Zones are stored in array order, so a zone starts
 * where the previous one ends (the first zone starts at sector 0).
 */
struct strip_zone
{
sector_t zone_end; /* First array sector past this zone, i.e. start of the next zone (in sectors) */
sector_t dev_start; /* Offset of this zone on each member device (in sectors) */
int nb_dev; /* # of devices attached to the zone */
};
/*
 * Per-array private data of the raid0 personality (stored in
 * mddev->private, also known as raid0_conf_t).
 */
struct raid0_private_data
{
struct strip_zone *strip_zone; /* array of nr_strip_zones zones */
/*
 * Flat table of nr_strip_zones * raid_disks rdev pointers.  The devices
 * of zone i occupy the entries starting at index i * raid_disks; only the
 * first strip_zone[i].nb_dev of them are used.
 */
mdk_rdev_t **devlist;
int nr_strip_zones; /* number of entries in strip_zone */
};
typedef struct raid0_private_data raid0_conf_t;
#endif

2332
kernel/drivers/md/raid1.c Normal file

File diff suppressed because it is too large Load Diff

126
kernel/drivers/md/raid1.h Normal file
View File

@@ -0,0 +1,126 @@
#ifndef _RAID1_H
#define _RAID1_H
/*
 * Per-mirror bookkeeping: one entry per member device of a RAID1 array.
 * NOTE(review): head_position is presumably the last known sector position
 * on that device, used when choosing a device to read from -- confirm in
 * raid1.c.
 */
typedef struct mirror_info mirror_info_t;
struct mirror_info {
mdk_rdev_t *rdev;
sector_t head_position;
};
/*
 * Memory pools need a pointer to the mddev, so that they can force an
 * unplug when memory is tight, and a count of the number of drives that the
 * pool was allocated for, so that they know how much to allocate and free.
 * mddev->raid_disks cannot be used for the latter, as it can change while
 * a pool is active.
 * These two items are stored together in a kmalloc'ed struct.
 */
struct pool_info {
mddev_t *mddev;
int raid_disks; /* drive count the pool was sized for */
};
typedef struct r1bio_s r1bio_t;
/*
 * Per-array private data of the RAID1 personality (also known as conf_t).
 * NOTE(review): the roles of fields without a comment are not visible from
 * this header; consult raid1.c before relying on them.
 */
struct r1_private_data_s {
mddev_t *mddev;
mirror_info_t *mirrors;
int raid_disks;
int last_used;
sector_t next_seq_sect;
spinlock_t device_lock;
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
/* queue of writes that have been unplugged */
struct bio_list flushing_bio_list;
/* for use when syncing mirrors: */
spinlock_t resync_lock;
int nr_pending;
int nr_waiting;
int nr_queued;
int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
wait_queue_head_t wait_barrier;
struct pool_info *poolinfo;
struct page *tmppage;
mempool_t *r1bio_pool;
mempool_t *r1buf_pool;
};
typedef struct r1_private_data_s conf_t;
/*
 * This is our 'private' RAID1 bio.
 *
 * It contains information about what kind of IO operations were started
 * for this RAID1 operation, and about their status.
 *
 * The structure ends in a variable-length array of bio pointers, so it
 * must be allocated with room for the wanted number of entries and no
 * member may ever be added after 'bios'.
 */
struct r1bio_s {
atomic_t remaining; /* 'have we finished' count,
* used from IRQ handlers
*/
atomic_t behind_remaining; /* number of write-behind ios remaining
* in this BehindIO request
*/
sector_t sector;
int sectors;
unsigned long state; /* bit field, see the R1BIO_* bit numbers */
mddev_t *mddev;
/*
* original bio going to /dev/mdx
*/
struct bio *master_bio;
/*
* if the IO is in READ direction, then this is where we read
*/
int read_disk;
struct list_head retry_list;
struct bitmap_update *bitmap_update;
/*
* if the IO is in WRITE direction, then multiple bios are used.
* We choose the number when they are allocated.
*/
struct bio *bios[0];
/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
};
/* When we get a read error on a read-only array, we redirect to another
* device without failing the first device, or trying to over-write to
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate 'bios' pointer.
* IO_BLOCKED is a sentinel value, not a pointer to a real bio, and must
* never be dereferenced.
*/
#define IO_BLOCKED ((struct bio*)1)
/* bits for r1bio.state (these are bit numbers, not masks) */
#define R1BIO_Uptodate 0
#define R1BIO_IsSync 1
#define R1BIO_Degraded 2
#define R1BIO_BehindIO 3
#define R1BIO_Barrier 4
#define R1BIO_BarrierRetry 5
/* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when
* any write-behind write succeeds, otherwise we call
* with failure when last write completes (and all failed).
* Record that bi_end_io was called with this flag...
*/
#define R1BIO_Returned 6
#endif

2344
kernel/drivers/md/raid10.c Normal file

File diff suppressed because it is too large Load Diff

115
kernel/drivers/md/raid10.h Normal file
View File

@@ -0,0 +1,115 @@
#ifndef _RAID10_H
#define _RAID10_H
/*
 * Per-device bookkeeping: one entry per member device of a RAID10 array.
 * NOTE(review): head_position is presumably the last known sector position
 * on that device, used when choosing a device to read from -- confirm in
 * raid10.c.
 */
typedef struct mirror_info mirror_info_t;
struct mirror_info {
mdk_rdev_t *rdev;
sector_t head_position;
};
typedef struct r10bio_s r10bio_t;
/*
 * Per-array private data of the RAID10 personality (also known as conf_t).
 * NOTE(review): the roles of fields without a comment are not visible from
 * this header; consult raid10.c before relying on them.
 */
struct r10_private_data_s {
mddev_t *mddev;
mirror_info_t *mirrors;
int raid_disks;
spinlock_t device_lock;
/* geometry */
int near_copies; /* number of copies laid out raid0 style */
int far_copies; /* number of copies laid out
* at large strides across drives
*/
int far_offset; /* far_copies are offset by 1 stripe
* instead of many
*/
int copies; /* near_copies * far_copies.
* must be <= raid_disks
*/
sector_t stride; /* distance between far copies.
* This is size / far_copies unless
* far_offset, in which case it is
* 1 stripe.
*/
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask;
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
spinlock_t resync_lock;
int nr_pending;
int nr_waiting;
int nr_queued;
int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
wait_queue_head_t wait_barrier;
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
struct page *tmppage;
};
typedef struct r10_private_data_s conf_t;
/*
 * This is our 'private' RAID10 bio.
 *
 * It contains information about what kind of IO operations were started
 * for this RAID10 operation, and about their status.
 *
 * The structure ends in a variable-length array (devs), so it must be
 * allocated with room for the wanted number of entries and 'devs' must
 * stay the last member.
 */
struct r10bio_s {
atomic_t remaining; /* 'have we finished' count,
* used from IRQ handlers
*/
sector_t sector; /* virtual sector number */
int sectors;
unsigned long state; /* bit field, see the R10BIO_* bit numbers */
mddev_t *mddev;
/*
* original bio going to /dev/mdx
*/
struct bio *master_bio;
/*
* if the IO is in READ direction, then this is where we read
*/
int read_slot;
struct list_head retry_list;
/*
* if the IO is in WRITE direction, then multiple bios are used,
* one for each copy.
* When resyncing we also use one for each copy.
* When reconstructing, we use 2 bios, one for read, one for write.
* We choose the number when they are allocated.
*/
struct {
struct bio *bio;
sector_t addr;
int devnum;
} devs[0];
};
/* When we get a read error on a read-only array, we redirect to another
* device without failing the first device, or trying to over-write to
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate bio pointer.
* NOTE(review): r10bio_s has no 'bios' array; the pointer meant here is
* presumably devs[].bio -- confirm in raid10.c.
* IO_BLOCKED is a sentinel value, not a pointer to a real bio, and must
* never be dereferenced.
*/
#define IO_BLOCKED ((struct bio*)1)
/* bits for r10bio.state (these are bit numbers, not masks) */
#define R10BIO_Uptodate 0
#define R10BIO_IsSync 1
#define R10BIO_IsRecover 2
#define R10BIO_Degraded 3
#endif

5873
kernel/drivers/md/raid5.c Normal file

File diff suppressed because it is too large Load Diff

500
kernel/drivers/md/raid5.h Normal file
View File

@@ -0,0 +1,500 @@
#ifndef _RAID5_H
#define _RAID5_H
#include <linux/raid/xor.h>
#include <linux/dmaengine.h>
/*
*
* Each stripe contains one buffer per disc. Each buffer can be in
* one of a number of states stored in "flags". Changes between
* these states happen *almost* exclusively under a per-stripe
* spinlock. Some very specific changes can happen in bi_end_io, and
* these are not protected by the spin lock.
*
* The flag bits that are used to represent these states are:
* R5_UPTODATE and R5_LOCKED
*
* State Empty == !UPTODATE, !LOCK
* We have no data, and there is no active request
* State Want == !UPTODATE, LOCK
* A read request is being submitted for this block
* State Dirty == UPTODATE, LOCK
* Some new data is in this buffer, and it is being written out
* State Clean == UPTODATE, !LOCK
* We have valid data which is the same as on disc
*
* The possible state transitions are:
*
* Empty -> Want - on read or write to get old data for parity calc
* Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
* Empty -> Clean - on compute_block when computing a block for failed drive
* Want -> Empty - on failed read
* Want -> Clean - on successful completion of read request
* Dirty -> Clean - on successful completion of write request
* Dirty -> Clean - on failed write
* Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
*
* The Want->Empty, Want->Clean, Dirty->Clean, transitions
* all happen in b_end_io at interrupt time.
* Each sets the Uptodate bit before releasing the Lock bit.
* This leaves one multi-stage transition:
* Want->Dirty->Clean
* This is safe because thinking that a Clean buffer is actually dirty
* will at worst delay some action, and the stripe will be scheduled
* for attention after the transition is complete.
*
* There is one possibility that is not covered by these states. That
* is if one drive has failed and there is a spare being rebuilt. We
* can't distinguish between a clean block that has been generated
* from parity calculations, and a clean block that has been
* successfully written to the spare ( or to parity when resyncing).
* To distinguish these states we have a stripe bit STRIPE_INSYNC that
* is set whenever a write is scheduled to the spare, or to the parity
* disc if there is no spare. A sync request clears this bit, and
* when we find it set with no buffers locked, we know the sync is
* complete.
*
* Buffers for the md device that arrive via make_request are attached
* to the appropriate stripe in one of two lists linked on b_reqnext.
* One list (bh_read) for read requests, one (bh_write) for write.
* There should never be more than one buffer on the two lists
* together, but we are not guaranteed of that so we allow for more.
*
* If a buffer is on the read list when the associated cache buffer is
* Uptodate, the data is copied into the read buffer and its b_end_io
* routine is called. This may happen in the end_request routine only
* if the buffer has just successfully been read. end_request should
* remove the buffers from the list and then set the Uptodate bit on
* the buffer. Other threads may do this only if they first check
* that the Uptodate bit is set. Once they have checked that they may
* take buffers off the read queue.
*
* When a buffer on the write list is committed for write it is copied
* into the cache buffer, which is then marked dirty, and moved onto a
* third list, the written list (bh_written). Once both the parity
* block and the cached buffer are successfully written, any buffer on
* a written list can be returned with b_end_io.
*
* The write list and read list both act as fifos. The read list is
* protected by the device_lock. The write and written lists are
* protected by the stripe lock. The device_lock, which can be
* claimed while the stripe lock is held, is only for list
* manipulations and will only be held for a very short time. It can
* be claimed from interrupts.
*
*
* Stripes in the stripe cache can be on one of two lists (or on
* neither). The "inactive_list" contains stripes which are not
* currently being used for any request. They can freely be reused
* for another stripe. The "handle_list" contains stripes that need
* to be handled in some way. Both of these are fifo queues. Each
* stripe is also (potentially) linked to a hash bucket in the hash
* table so that it can be found by sector number. Stripes that are
* not hashed must be on the inactive_list, and will normally be at
* the front. All stripes start life this way.
*
* The inactive_list, handle_list and hash bucket lists are all protected by the
* device_lock.
* - stripes on the inactive_list never have their stripe_lock held.
* - stripes have a reference counter. If count==0, they are on a list.
* - If a stripe might need handling, STRIPE_HANDLE is set.
* - When refcount reaches zero, then if STRIPE_HANDLE it is put on
* handle_list else inactive_list
*
* This, combined with the fact that STRIPE_HANDLE is only ever
* cleared while a stripe has a non-zero count means that if the
* refcount is 0 and STRIPE_HANDLE is set, then it is on the
* handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
* the stripe is on inactive_list.
*
* The possible transitions are:
* activate an unhashed/inactive stripe (get_active_stripe())
* lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
* activate a hashed, possibly active stripe (get_active_stripe())
* lockdev check-hash if(!cnt++)unlink-stripe unlockdev
* attach a request to an active stripe (add_stripe_bh())
* lockdev attach-buffer unlockdev
* handle a stripe (handle_stripe())
* lockstripe clrSTRIPE_HANDLE ...
* (lockdev check-buffers unlockdev) ..
* change-state ..
* record io/ops needed unlockstripe schedule io/ops
* release an active stripe (release_stripe())
* lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
*
* The refcount counts each thread that has activated the stripe,
* plus raid5d if it is handling it, plus one for each active request
* on a cached buffer, and plus one if the stripe is undergoing stripe
* operations.
*
* Stripe operations are performed outside the stripe lock,
* the stripe operations are:
* -copying data between the stripe cache and user application buffers
* -computing blocks to save a disk access, or to recover a missing block
* -updating the parity on a write operation (reconstruct write and
* read-modify-write)
* -checking parity correctness
* -running i/o to disk
* These operations are carried out by raid5_run_ops which uses the async_tx
* api to (optionally) offload operations to dedicated hardware engines.
* When requesting an operation handle_stripe sets the pending bit for the
* operation and increments the count. raid5_run_ops is then run whenever
* the count is non-zero.
* There are some critical dependencies between the operations that prevent some
* from being requested while another is in flight.
* 1/ Parity check operations destroy the in cache version of the parity block,
* so we prevent parity dependent operations like writes and compute_blocks
* from starting while a check is in progress. Some dma engines can perform
* the check without damaging the parity block, in these cases the parity
* block is re-marked up to date (assuming the check was successful) and is
* not re-read from disk.
* 2/ When a write operation is requested we immediately lock the affected
* blocks, and mark them as not up to date. This causes new read requests
* to be held off, as well as parity checks and compute block operations.
* 3/ Once a compute block operation has been requested handle_stripe treats
* that block as if it is up to date. raid5_run_ops guarantees that any
* operation that is dependent on the compute block result is initiated after
* the compute block completes.
*/
/*
* Operations state - intermediate states that are visible outside of sh->lock
* In general _idle indicates nothing is running, _run indicates a data
* processing operation is active, and _result means the data processing result
* is stable and can be acted upon. For simple operations like biofill and
* compute that only have an _idle and _run state they are indicated with
* sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
*/
/**
* enum check_states - handles syncing / repairing a stripe
* @check_state_idle - check operations are quiesced
* @check_state_run - xor parity check operation is running
* @check_state_run_q - q-parity check operation is running
* @check_state_run_pq - pq dual parity check operation is running
* @check_state_check_result - set outside lock when check result is valid
* @check_state_compute_run - check failed and we are repairing
* @check_state_compute_result - set outside lock when compute result is valid
*/
enum check_states {
check_state_idle = 0,
check_state_run, /* xor parity check */
check_state_run_q, /* q-parity check */
check_state_run_pq, /* pq dual parity check */
check_state_check_result,
check_state_compute_run, /* parity repair */
check_state_compute_result,
};
/**
* enum reconstruct_states - handles writing or expanding a stripe
* @reconstruct_state_idle - nothing is running
* @reconstruct_state_prexor_drain_run - prexor-write operation is running
* @reconstruct_state_drain_run - write operation is running
* @reconstruct_state_run - expand operation is running
* @reconstruct_state_prexor_drain_result - prexor-write result is stable
* @reconstruct_state_drain_result - write result is stable
* @reconstruct_state_result - expand result is stable
*
* As for the other operation states in this header: _run means a data
* processing operation is active, _result means its result is stable and
* can be acted upon.
*/
enum reconstruct_states {
reconstruct_state_idle = 0,
reconstruct_state_prexor_drain_run, /* prexor-write */
reconstruct_state_drain_run, /* write */
reconstruct_state_run, /* expand */
reconstruct_state_prexor_drain_result,
reconstruct_state_drain_result,
reconstruct_state_result,
};
/*
 * One entry of the stripe cache: the state of a single row (stripe) of the
 * array plus one r5dev buffer per disk.  The trailing dev[] array is
 * declared with one element but the structure is allocated with extra
 * space, so 'dev' must remain the last member.
 */
struct stripe_head {
struct hlist_node hash;
struct list_head lru; /* inactive_list or handle_list */
struct raid5_private_data *raid_conf;
short generation; /* increments with every
* reshape */
sector_t sector; /* sector of this row */
short pd_idx; /* parity disk index */
short qd_idx; /* 'Q' disk index for raid6 */
short ddf_layout;/* use DDF ordering to calculate Q */
unsigned long state; /* state flags (STRIPE_* bit numbers) */
atomic_t count; /* nr of active thread/requests */
spinlock_t lock;
int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */
enum check_states check_state;
enum reconstruct_states reconstruct_state;
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
* @target2 - 2nd compute target in the raid6 case
* @zero_sum_result - P and Q verification flags
* @request - async service request flags for raid_run_ops
*            (only present with CONFIG_MULTICORE_RAID456)
* @wait_for_ops - wait queue, only present with CONFIG_MULTICORE_RAID456
*/
struct stripe_operations {
int target, target2;
enum sum_check_flags zero_sum_result;
#ifdef CONFIG_MULTICORE_RAID456
unsigned long request;
wait_queue_head_t wait_for_ops;
#endif
} ops;
/* per-disk buffer and the requests attached to it */
struct r5dev {
struct bio req;
struct bio_vec vec;
struct page *page;
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags; /* R5_* bit numbers */
} dev[1]; /* allocated with extra space depending on RAID geometry */
};
/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
* for handle_stripe. It is only valid under spin_lock(sh->lock);
* NOTE(review): the fields look like per-stripe counters and flags gathered
* while scanning the stripe's devices -- confirm their exact meaning in
* raid5.c.
*/
struct stripe_head_state {
int syncing, expanding, expanded;
int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite;
int failed_num;
unsigned long ops_request;
};
/* r6_state - extra state data only relevant to r6.
* failed_num has two entries here, where stripe_head_state has a single
* failed_num.
*/
struct r6_state {
int p_failed, q_failed, failed_num[2];
};
/* Flags: bit numbers (not masks) for the 'flags' word of a stripe's
* per-disk buffer. Bit 6 is not assigned in this list.
*/
#define R5_UPTODATE 0 /* page contains current data */
#define R5_LOCKED 1 /* IO has been submitted on "req" */
#define R5_OVERWRITE 2 /* towrite covers whole page */
/* and some that are internal to handle_stripe */
#define R5_Insync 3 /* rdev && rdev->in_sync at start */
#define R5_Wantread 4 /* want to schedule a read */
#define R5_Wantwrite 5 /* presumably: want to schedule a write */
#define R5_Overlap 7 /* There is a pending overlapping request on this block */
#define R5_ReadError 8 /* seen a read error here recently */
#define R5_ReWrite 9 /* have tried to over-write the readerror */
#define R5_Expanded 10 /* This block now has post-expand data */
#define R5_Wantcompute 11 /* compute_block in progress treat as
* uptodate
*/
#define R5_Wantfill 12 /* dev->toread contains a bio that needs
* filling
*/
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
/*
* Write method (plain values, not bit numbers).
* NOTE(review): these are presumably the mode argument of compute_parity;
* confirm against raid5.c.
*/
#define RECONSTRUCT_WRITE 1
#define READ_MODIFY_WRITE 2
/* not a write method, but a compute_parity mode */
#define CHECK_PARITY 3
/* Additional compute_parity mode -- updates the parity w/o LOCKING */
#define UPDATE_PARITY 4
/*
* Stripe state: bit numbers (not masks) for stripe_head.state.
* Bits 0 and 1 are not assigned in this list.
*/
#define STRIPE_HANDLE 2
#define STRIPE_SYNCING 3
#define STRIPE_INSYNC 4
#define STRIPE_PREREAD_ACTIVE 5
#define STRIPE_DELAYED 6
#define STRIPE_DEGRADED 7
#define STRIPE_BIT_DELAY 8
#define STRIPE_EXPANDING 9
#define STRIPE_EXPAND_SOURCE 10
#define STRIPE_EXPAND_READY 11
#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
#define STRIPE_BIOFILL_RUN 14
#define STRIPE_COMPUTE_RUN 15
#define STRIPE_OPS_REQ_PENDING 16
/*
* Operation request flags.
* NOTE(review): presumably bit numbers for the ops_request / ops.request
* words declared in this header -- confirm against raid5.c.
*/
#define STRIPE_OP_BIOFILL 0
#define STRIPE_OP_COMPUTE_BLK 1
#define STRIPE_OP_PREXOR 2
#define STRIPE_OP_BIODRAIN 3
#define STRIPE_OP_RECONSTRUCT 4
#define STRIPE_OP_CHECK 5
/*
* Plugging:
*
* To improve write throughput, we need to delay the handling of some
* stripes until there has been a chance that several write requests
* for the one stripe have all been collected.
* In particular, any write request that would require pre-reading
* is put on a "delayed" queue until there are no stripes currently
* in a pre-read phase. Further, if the "delayed" queue is empty when
* a stripe is put on it then we "plug" the queue and do not process it
* until an unplug call is made. (the unplug_io_fn() is called).
*
* When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
* it to the count of prereading stripes.
* When write is initiated, or the stripe refcnt == 0 (just in case) we
* clear the PREREAD_ACTIVE flag and decrement the count
* Whenever the 'handle' queue is empty and the device is not plugged, we
* move any stripes from delayed to handle and clear the DELAYED flag and set
* PREREAD_ACTIVE.
* In stripe_handle, if we find pre-reading is necessary, we do it if
* PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
* HANDLE gets cleared if stripe_handle leaves nothing locked.
*/
/* Per-slot device entry; raid5_private_data.disks and .spare point at these. */
struct disk_info {
mdk_rdev_t *rdev;
};
/*
 * Per-array private data of the raid4/5/6 personality (also known as
 * raid5_conf_t): geometry, reshape progress, the stripe cache and its
 * lists, and per-cpu scratch space.
 */
struct raid5_private_data {
struct hlist_head *stripe_hashtbl;
mddev_t *mddev;
struct disk_info *spare;
int chunk_sectors;
int level, algorithm;
int max_degraded;
int raid_disks;
int max_nr_stripes;
/* reshape_progress is the leading edge of a 'reshape'
* It has value MaxSector when no reshape is happening
* If delta_disks < 0, it is the last sector we started work on,
* else it is the next sector to work on.
*/
sector_t reshape_progress;
/* reshape_safe is the trailing edge of a reshape. We know that
* before (or after) this address, all reshape has completed.
*/
sector_t reshape_safe;
int previous_raid_disks;
int prev_chunk_sectors;
int prev_algo;
short generation; /* increments with every reshape */
unsigned long reshape_checkpoint; /* Time we last updated
* metadata */
struct list_head handle_list; /* stripes needing handling */
struct list_head hold_list; /* preread ready stripes */
struct list_head delayed_list; /* stripes that have plugged requests */
struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
struct bio *retry_read_aligned; /* currently retrying aligned bios */
struct bio *retry_read_aligned_list; /* aligned bios retry list */
atomic_t preread_active_stripes; /* stripes with scheduled io */
atomic_t active_aligned_reads;
atomic_t pending_full_writes; /* full write backlog */
int bypass_count; /* bypassed prereads */
int bypass_threshold; /* preread nice */
struct list_head *last_hold; /* detect hold_list promotions */
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.
*/
int active_name;
char cache_name[2][20];
struct kmem_cache *slab_cache; /* for allocating stripes */
int seq_flush, seq_write;
int quiesce;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */
void *scribble; /* space for constructing buffer
* lists and performing address
* conversions
*/
} *percpu;
size_t scribble_len; /* size of scribble region must be
* associated with conf to handle
* cpu hotplug while reshaping
*/
#ifdef CONFIG_HOTPLUG_CPU
struct notifier_block cpu_notify;
#endif
/*
* Free stripes pool
*/
atomic_t active_stripes;
struct list_head inactive_list;
wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap;
int inactive_blocked; /* release of inactive stripes blocked,
* waiting for 25% to be free
*/
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
*/
struct mdk_thread_s *thread;
};
typedef struct raid5_private_data raid5_conf_t;
/*
* Our supported algorithms
*
* The numeric values are what algorithm_valid_raid5(),
* algorithm_valid_raid6() and algorithm_is_DDF() range-check:
* 0..5, 8..10 and 16..20.
*/
#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */
#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */
#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */
#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */
/* Define non-rotating (raid4) algorithms. These allow
* conversion of raid4 to raid5.
*/
#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
* Firstly, the exact positioning of the parity block is slightly
* different between the 'LEFT_*' modes of md and the "_N_*" modes
* of DDF.
* Secondly, the order of data blocks over which the Q syndrome is
* computed is different.
* Consequently we have different layouts for DDF/raid6 than md/raid6.
* These layouts are from the DDFv1.2 spec.
* Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
* leaves RLQ=3 as 'Vendor Specific'
*/
#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
#define ALGORITHM_ROTATING_N_CONTINUE 10 /* DDF PRL=6 RLQ=3 */
/* For every RAID5 algorithm we define a RAID6 algorithm
* with exactly the same layout for data and parity, and
* with the Q block always on the last device (N-1).
* This allows trivial conversion from RAID5 to RAID6
*/
#define ALGORITHM_LEFT_ASYMMETRIC_6 16
#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
#define ALGORITHM_LEFT_SYMMETRIC_6 18
#define ALGORITHM_RIGHT_SYMMETRIC_6 19
#define ALGORITHM_PARITY_0_6 20
/* PARITY_N needs no separate RAID6 value: it reuses ALGORITHM_PARITY_N (5) */
#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
/*
 * Return 1 when @layout is one of the layout numbers 0..5 (the rotating
 * and non-rotating single-parity layouts), 0 otherwise.
 */
static inline int algorithm_valid_raid5(int layout)
{
	if (layout < 0)
		return 0;

	return layout <= 5;
}
/*
 * Return 1 when @layout lies in one of the three accepted ranges:
 * 0..5, 8..10 or 16..20.  Return 0 for everything else.
 */
static inline int algorithm_valid_raid6(int layout)
{
	/* Outside the overall span of known layout numbers */
	if (layout < 0 || layout > 20)
		return 0;

	if (layout <= 5)
		return 1;

	if (layout >= 8 && layout <= 10)
		return 1;

	/* Only 16..20 remain valid; 6, 7 and 11..15 fall through as 0 */
	return layout >= 16;
}
/* Return 1 when @layout is one of the layout numbers 8..10, else 0. */
static inline int algorithm_is_DDF(int layout)
{
	return !(layout < 8 || layout > 10);
}
#endif

View File

@@ -0,0 +1,171 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6algos.c
*
* Algorithm list and algorithm selection for RAID-6
*/
#include <linux/raid/pq.h>
#ifndef __KERNEL__
#include <sys/mman.h>
#include <stdio.h>
#else
#if !RAID6_USE_EMPTY_ZERO_PAGE
/* In .bss so it's zeroed */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
EXPORT_SYMBOL(raid6_empty_zero_page);
#endif
#endif
/*
* The routine set currently in use.  raid6_select_algo() copies the
* winning entry of raid6_algos[] into it.
*/
struct raid6_calls raid6_call;
EXPORT_SYMBOL_GPL(raid6_call);
/* Various routine sets */
extern const struct raid6_calls raid6_intx1;
extern const struct raid6_calls raid6_intx2;
extern const struct raid6_calls raid6_intx4;
extern const struct raid6_calls raid6_intx8;
extern const struct raid6_calls raid6_intx16;
extern const struct raid6_calls raid6_intx32;
extern const struct raid6_calls raid6_mmxx1;
extern const struct raid6_calls raid6_mmxx2;
extern const struct raid6_calls raid6_sse1x1;
extern const struct raid6_calls raid6_sse1x2;
extern const struct raid6_calls raid6_sse2x1;
extern const struct raid6_calls raid6_sse2x2;
extern const struct raid6_calls raid6_sse2x4;
extern const struct raid6_calls raid6_altivec1;
extern const struct raid6_calls raid6_altivec2;
extern const struct raid6_calls raid6_altivec4;
extern const struct raid6_calls raid6_altivec8;
/*
* Candidate routine sets for this build, terminated by a NULL entry.
* The integer x1..x8 variants are always listed; the others are added
* by the architecture conditionals below.
*/
const struct raid6_calls * const raid6_algos[] = {
&raid6_intx1,
&raid6_intx2,
&raid6_intx4,
&raid6_intx8,
#if defined(__ia64__)
&raid6_intx16,
&raid6_intx32,
#endif
#if defined(__i386__) && !defined(__arch_um__)
&raid6_mmxx1,
&raid6_mmxx2,
&raid6_sse1x1,
&raid6_sse1x2,
&raid6_sse2x1,
&raid6_sse2x2,
#endif
#if defined(__x86_64__) && !defined(__arch_um__)
&raid6_sse2x1,
&raid6_sse2x2,
&raid6_sse2x4,
#endif
#ifdef CONFIG_ALTIVEC
&raid6_altivec1,
&raid6_altivec2,
&raid6_altivec4,
&raid6_altivec8,
#endif
NULL
};
/*
* RAID6_TIME_JIFFIES_LG2 is the base-2 logarithm of the number of jiffies
* each candidate is timed for in raid6_select_algo(): the timing loop runs
* until jiffies reaches start + (1 << RAID6_TIME_JIFFIES_LG2).
*/
#ifdef __KERNEL__
#define RAID6_TIME_JIFFIES_LG2 4
#else
/* Need more time to be stable in userspace */
#define RAID6_TIME_JIFFIES_LG2 9
/* Userspace stand-in: a plain comparison with no wraparound handling */
#define time_before(x, y) ((x) < (y))
#endif
/*
 * Try to pick the best algorithm.
 *
 * Every entry of raid6_algos[] whose valid() hook is absent or returns
 * non-zero is timed for 1 << RAID6_TIME_JIFFIES_LG2 jiffies.  The entry
 * with the highest 'prefer' value wins; among equal 'prefer' values the
 * one that completed the most gen_syndrome() calls wins.  The winner is
 * copied into raid6_call.
 *
 * This code uses the gfmul table as a convenient data set to abuse.
 *
 * Returns 0 on success, -ENOMEM if the scratch pages cannot be allocated
 * and -EINVAL if no algorithm was usable.
 */
int __init raid6_select_algo(void)
{
	const struct raid6_calls * const *candidate;
	const struct raid6_calls *winner = NULL;
	void *dptrs[(65536/PAGE_SIZE)+2];
	int disks = (65536/PAGE_SIZE)+2;
	unsigned long winner_perf = 0;
	int winner_prefer = 0;
	char *syndromes;
	int i;

	/* The data "disks" are consecutive pages of the gfmul table */
	for (i = 0; i < disks-2; i++)
		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;

	/* Normal code - use a 2-page allocation to avoid D$ conflict */
	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
	if (!syndromes) {
		printk("raid6: Yikes! No memory available.\n");
		return -ENOMEM;
	}

	/* P goes to the first scratch page, Q to the second */
	dptrs[disks-2] = syndromes;
	dptrs[disks-1] = syndromes + PAGE_SIZE;

	for (candidate = raid6_algos; *candidate; candidate++) {
		const struct raid6_calls *cur = *candidate;
		unsigned long perf, j0, j1;

		/* A missing valid() hook means "always usable" */
		if (cur->valid && !cur->valid())
			continue;

		perf = 0;

		preempt_disable();

		/* Wait for a jiffy boundary so the timed window is full length */
		j0 = jiffies;
		for (j1 = jiffies; j1 == j0; j1 = jiffies)
			cpu_relax();

		while (time_before(jiffies,
				   j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
			cur->gen_syndrome(disks, PAGE_SIZE, dptrs);
			perf++;
		}

		preempt_enable();

		if (cur->prefer > winner_prefer ||
		    (cur->prefer == winner_prefer && perf > winner_perf)) {
			winner = cur;
			winner_prefer = cur->prefer;
			winner_perf = perf;
		}

		printk("raid6: %-8s %5ld MB/s\n", cur->name,
		       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
	}

	if (!winner) {
		printk("raid6: Yikes! No algorithm found!\n");
	} else {
		printk("raid6: using algorithm %s (%ld MB/s)\n",
		       winner->name,
		       (winner_perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
		raid6_call = *winner;
	}

	free_pages((unsigned long)syndromes, 1);

	return winner ? 0 : -EINVAL;
}
/* Module exit hook; intentionally does nothing. */
static void raid6_exit(void)
{
}
subsys_initcall(raid6_select_algo);
module_exit(raid6_exit);
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,130 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6altivec$#.c
*
* $#-way unrolled Altivec RAID-6 instruction set
*
* This file is postprocessed using unroll.awk
*
* <benh> hpa: in process,
* you can just "steal" the vec unit with enable_kernel_altivec() (but
* bracket this with preempt_disable/enable or in a lock)
*/
#include <linux/raid/pq.h>
#ifdef CONFIG_ALTIVEC
#include <altivec.h>
#ifdef __KERNEL__
# include <asm/system.h>
# include <asm/cputable.h>
#endif
/*
* This is the C data type to use. We use a vector of
* signed char so vec_cmpgt() will generate the right
* instruction.
*
* NBYTES(x) builds a vector with x in each of its 16 byte lanes and
* NSIZE is the size of one such vector in bytes.
*/
typedef vector signed char unative_t;
#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
#define NSIZE sizeof(unative_t)
/*
 * SHLBYTE() shifts every byte lane left by one bit without carrying into
 * the neighbouring lane.  It is implemented as a lane-wise v + v.
 */
static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
{
	unative_t doubled = vec_add(v, v);

	return doubled;
}
/*
 * MASK() yields 0xFF in every byte lane whose high bit is set in @v and
 * 0x00 in every other lane.
 */
static inline __attribute_const__ unative_t MASK(unative_t v)
{
	const unative_t zeroes = NBYTES(0);
	unative_t hibits;

	/*
	 * With signed lanes, 0 > v holds exactly when the high bit is set.
	 * vec_cmpgt returns a vector bool char; thus the need for the cast.
	 */
	hibits = (unative_t)vec_cmpgt(zeroes, v);

	return hibits;
}
/*
* Compute the P (XOR parity) and Q (RS syndrome) blocks with Altivec.
*
* ptrs[0 .. disks-3] are the data blocks, ptrs[disks-2] receives P and
* ptrs[disks-1] receives Q; bytes is the length of each block.  The data
* blocks are walked from the highest index down to 0, multiplying the
* running Q by 2 (SHLBYTE plus a conditional XOR of 0x1d) before each new
* data word is folded in.
*
* NOTE(review): the per-lane template lines are presumably replicated by
* unroll.awk once for each unrolled lane - confirm against unroll.awk.
*
* This is noinline to make damned sure that gcc doesn't move any of the
* Altivec code around the enable/disable code
*/
static void noinline
raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
/* 0x1d in every byte lane: XORed into lanes that overflow on doubling */
unative_t x1d = NBYTES(0x1d);
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
/* Seed both P and Q with the highest data block */
wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
for ( z = z0-1 ; z >= 0 ; z-- ) {
wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
wp$$ = vec_xor(wp$$, wd$$);
w2$$ = MASK(wq$$);
w1$$ = SHLBYTE(wq$$);
w2$$ = vec_and(w2$$, x1d);
w1$$ = vec_xor(w1$$, w2$$);
wq$$ = vec_xor(w1$$, wd$$);
}
*(unative_t *)&p[d+NSIZE*$$] = wp$$;
*(unative_t *)&q[d+NSIZE*$$] = wq$$;
}
}
/*
* gen_syndrome entry point of this routine set: enables kernel use of
* Altivec with preemption disabled, then runs the real worker.
*/
static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
/* Preemption stays off for as long as the vector unit is borrowed */
preempt_disable();
enable_kernel_altivec();
raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs);
preempt_enable();
}
/* Declared for every unroll factor; defined only when the factor is 1. */
int raid6_have_altivec(void);
#if $# == 1
/*
 * valid() hook for the Altivec routine sets: non-zero when Altivec can be
 * used.  Userspace builds always report 1.
 */
int raid6_have_altivec(void)
{
# ifndef __KERNEL__
	return 1;
# else
	/* This assumes either all CPUs have Altivec or none does */
	return cpu_has_feature(CPU_FTR_ALTIVEC);
# endif
}
#endif
const struct raid6_calls raid6_altivec$# = {
raid6_altivec$#_gen_syndrome,
raid6_have_altivec,
"altivecx$#",
0
};
#endif /* CONFIG_ALTIVEC */

View File

@@ -0,0 +1,117 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6int$#.c
*
* $#-way unrolled portable integer math RAID-6 instruction set
*
* This file is postprocessed using unroll.awk
*/
#include <linux/raid/pq.h>
/*
* This is the C data type to use
*
* unative_t is the widest plain integer the code operates on at once:
* NSIZE bytes (1 << NSHIFT).  NBYTES(x) replicates the byte value x into
* every byte of a unative_t, and NSTRING is the width in bits as text,
* used when building the routine-set name.
*/
/* Change this from BITS_PER_LONG if there is something better... */
#if BITS_PER_LONG == 64
# define NBYTES(x) ((x) * 0x0101010101010101UL)
# define NSIZE 8
# define NSHIFT 3
# define NSTRING "64"
typedef u64 unative_t;
#else
# define NBYTES(x) ((x) * 0x01010101U)
# define NSIZE 4
# define NSHIFT 2
# define NSTRING "32"
typedef u32 unative_t;
#endif
/*
* IA-64 wants insane amounts of unrolling. On other architectures that
* is just a waste of space.
*/
#if ($# <= 8) || defined(__ia64__)
/*
* These sub-operations are separate inlines since they can sometimes be
* specially optimized using architecture-specific hacks.
*/
/*
 * SHLBYTE() shifts every byte of @v left by one bit.  The 0xfe mask clears
 * bit 0 of each byte, discarding the bit that the word-wide shift carried
 * in from the byte below.
 */
static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
{
	return (v << 1) & NBYTES(0xfe);
}
/*
 * MASK() returns 0xFF in every byte of @v whose high bit is 1 and 0x00 in
 * every byte whose high bit is 0.
 */
static inline __attribute_const__ unative_t MASK(unative_t v)
{
	/* Keep only the top bit of each byte */
	const unative_t hibits = v & NBYTES(0x80);

	/*
	 * Per byte: 0x100 - 0x01 = 0xFF when the top bit was set, 0 otherwise.
	 * Overflow on the top bit of the word is OK.
	 */
	return (hibits << 1) - (hibits >> 7);
}
/*
* Compute the P (XOR parity) and Q (RS syndrome) blocks with plain
* integer arithmetic, NSIZE bytes per word.
*
* ptrs[0 .. disks-3] are the data blocks, ptrs[disks-2] receives P and
* ptrs[disks-1] receives Q; bytes is the length of each block.  The data
* blocks are walked from the highest index down to 0, multiplying the
* running Q by 2 (SHLBYTE plus a conditional XOR of 0x1d) before each new
* data word is folded in.
*
* NOTE(review): the per-lane template lines are presumably replicated by
* unroll.awk once for each unrolled lane - confirm against unroll.awk.
*/
static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
/* Seed both P and Q with the highest data block */
wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
for ( z = z0-1 ; z >= 0 ; z-- ) {
wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
wp$$ ^= wd$$;
w2$$ = MASK(wq$$);
w1$$ = SHLBYTE(wq$$);
w2$$ &= NBYTES(0x1d);
w1$$ ^= w2$$;
wq$$ = w1$$ ^ wd$$;
}
*(unative_t *)&p[d+NSIZE*$$] = wp$$;
*(unative_t *)&q[d+NSIZE*$$] = wq$$;
}
}
const struct raid6_calls raid6_intx$# = {
raid6_int$#_gen_syndrome,
NULL, /* always valid */
"int" NSTRING "x$#",
0
};
#endif

View File

@@ -0,0 +1,142 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6mmx.c
*
* MMX implementation of RAID-6 syndrome functions
*/
#if defined(__i386__) && !defined(__arch_um__)
#include <linux/raid/pq.h>
#include "raid6x86.h"
/*
 * 0x1d replicated into all eight bytes of a quadword, loaded into %mm0 by
 * the syndrome routines.  Shared with raid6sse1.c.
 */
const struct raid6_mmx_constants {
	u64 x1d;
} raid6_mmx_constants = {
	.x1d = 0x1d1d1d1d1d1d1d1dULL,
};
/* valid() hook for the MMX routine sets: non-zero when MMX is available. */
static int raid6_have_mmx(void)
{
	/* Not really "boot_cpu" but "all_cpus" */
	const int mmx_present = boot_cpu_has(X86_FEATURE_MMX);

	return mmx_present;
}
/*
* Plain MMX implementation
*
* Computes the P (XOR parity) and Q (RS syndrome) blocks 8 bytes at a time.
* ptrs[0 .. disks-3] are the data blocks, ptrs[disks-2] receives P and
* ptrs[disks-1] receives Q.
*
* Register use: mm0 = 0x1d in every byte, mm2 = running P, mm4 = running Q,
* mm5 = scratch mask (zero between uses), mm6 = current data word.
*
* NOTE(review): d advances by 8, so bytes is presumably a multiple of 8.
*/
static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
/* MMX use is bracketed by kernel_fpu_begin()/kernel_fpu_end() */
kernel_fpu_begin();
asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
asm volatile("pxor %mm5,%mm5"); /* Zero temp */
for ( d = 0 ; d < bytes ; d += 8 ) {
asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
asm volatile("movq %mm2,%mm4"); /* Q[0] */
for ( z = z0-1 ; z >= 0 ; z-- ) {
asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d]));
/* mm5 = 0xff in each byte where Q has its top bit set (signed 0 > Q) */
asm volatile("pcmpgtb %mm4,%mm5");
/* Q += Q per byte, i.e. a per-byte shift left by one */
asm volatile("paddb %mm4,%mm4");
/* XOR 0x1d into the bytes that overflowed */
asm volatile("pand %mm0,%mm5");
asm volatile("pxor %mm5,%mm4");
/* Re-zero the mask for the next round */
asm volatile("pxor %mm5,%mm5");
/* Fold the data word into P and Q */
asm volatile("pxor %mm6,%mm2");
asm volatile("pxor %mm6,%mm4");
}
asm volatile("movq %%mm2,%0" : "=m" (p[d]));
asm volatile("pxor %mm2,%mm2");
asm volatile("movq %%mm4,%0" : "=m" (q[d]));
asm volatile("pxor %mm4,%mm4");
}
kernel_fpu_end();
}
const struct raid6_calls raid6_mmxx1 = {
raid6_mmx1_gen_syndrome,
raid6_have_mmx,
"mmxx1",
0
};
/*
* Unrolled-by-2 MMX implementation
*
* Same computation as the plain MMX version, 16 bytes per outer iteration
* using two independent 8-byte lanes.  ptrs[0 .. disks-3] are the data
* blocks, ptrs[disks-2] receives P and ptrs[disks-1] receives Q.
*
* Register use: mm0 = 0x1d in every byte; lane 0 keeps P in mm2 and Q in
* mm4, lane 1 keeps P in mm3 and Q in mm6.  mm5 and mm7 serve first as the
* overflow masks and then as the data words of lanes 0 and 1.
*
* NOTE(review): d advances by 16, so bytes is presumably a multiple of 16.
*/
static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
asm volatile("pxor %mm5,%mm5"); /* Zero temp */
asm volatile("pxor %mm7,%mm7"); /* Zero temp */
for ( d = 0 ; d < bytes ; d += 16 ) {
asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8]));
asm volatile("movq %mm2,%mm4"); /* Q[0] */
asm volatile("movq %mm3,%mm6"); /* Q[1] */
for ( z = z0-1 ; z >= 0 ; z-- ) {
/* Masks: 0xff in each byte where Q has its top bit set */
asm volatile("pcmpgtb %mm4,%mm5");
asm volatile("pcmpgtb %mm6,%mm7");
/* Q += Q per byte, i.e. a per-byte shift left by one */
asm volatile("paddb %mm4,%mm4");
asm volatile("paddb %mm6,%mm6");
/* XOR 0x1d into the bytes that overflowed */
asm volatile("pand %mm0,%mm5");
asm volatile("pand %mm0,%mm7");
asm volatile("pxor %mm5,%mm4");
asm volatile("pxor %mm7,%mm6");
/* The mask registers are reused to hold the data words */
asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d]));
asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
asm volatile("pxor %mm5,%mm2");
asm volatile("pxor %mm7,%mm3");
asm volatile("pxor %mm5,%mm4");
asm volatile("pxor %mm7,%mm6");
/* Re-zero them so the next pcmpgtb compares against zero */
asm volatile("pxor %mm5,%mm5");
asm volatile("pxor %mm7,%mm7");
}
asm volatile("movq %%mm2,%0" : "=m" (p[d]));
asm volatile("movq %%mm3,%0" : "=m" (p[d+8]));
asm volatile("movq %%mm4,%0" : "=m" (q[d]));
asm volatile("movq %%mm6,%0" : "=m" (q[d+8]));
}
kernel_fpu_end();
}
const struct raid6_calls raid6_mmxx2 = {
raid6_mmx2_gen_syndrome,
raid6_have_mmx,
"mmxx2",
0
};
#endif

View File

@@ -0,0 +1,132 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6recov.c
*
* RAID-6 data recovery in dual failure mode. In single failure mode,
* use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct
* the syndrome.)
*/
#include <linux/raid/pq.h>
/*
 * Recover two failed data blocks.
 *
 * @disks:  number of blocks in the stripe; ptrs[disks-2] is P and
 *          ptrs[disks-1] is Q
 * @bytes:  length of every block
 * @faila:  index of one failed data block
 * @failb:  index of the other failed data block
 * @ptrs:   block pointers; the buffers of the two failed blocks are
 *          overwritten with the recovered data
 *
 * The indices may be given in either order, but callers must still pass
 * two distinct data-block indices.  The pointer table is modified
 * temporarily and restored before the per-byte recovery loop runs.
 */
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
		       void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	u8 px, qx, db;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */

	/*
	 * raid6_gfexi[] is indexed by failb-faila below; with the indices
	 * reversed that would be a negative (out of bounds) index.  The
	 * recovery is symmetric in the two blocks, so just order them.
	 */
	if (faila > failb) {
		int tmp = faila;

		faila = failb;
		failb = tmp;
	}

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/* Compute syndrome with zero for the missing data pages
	   Use the dead data pages as temporary storage for
	   delta p and delta q */
	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dp;
	ptrs[failb] = dq;
	ptrs[disks-2] = p;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
	qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];

	/* Now do it... */
	while ( bytes-- ) {
		px = *p ^ *dp;
		qx = qmul[*q ^ *dq];
		*dq++ = db = pbmul[px] ^ qx;	/* Reconstructed B */
		*dp++ = db ^ px;		/* Reconstructed A */
		p++; q++;
	}
}
EXPORT_SYMBOL_GPL(raid6_2data_recov);
/*
 * Recover failure of one data block plus the P block.
 *
 * @disks:  number of blocks in the stripe; ptrs[disks-2] is P and
 *          ptrs[disks-1] is Q
 * @bytes:  length of every block
 * @faila:  index of the failed data block
 * @ptrs:   block pointers; the failed data buffer receives the recovered
 *          data and the P buffer is updated in place
 */
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
{
	u8 *const p = (u8 *)ptrs[disks-2];
	u8 *const q = (u8 *)ptrs[disks-1];
	u8 *const dq = (u8 *)ptrs[faila];
	const u8 *qmul;		/* Q multiplier table */
	size_t i;

	/*
	 * Compute syndrome with zero for the missing data page.
	 * The dead data page doubles as temporary storage for delta q.
	 */
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dq;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];

	/* Now do it... */
	for (i = 0; i < bytes; i++) {
		const u8 recovered = qmul[q[i] ^ dq[i]];

		dq[i] = recovered;
		p[i] ^= recovered;
	}
}
EXPORT_SYMBOL_GPL(raid6_datap_recov);
#ifndef __KERNEL__
/*
 * Testing only.
 *
 * Recover two failed blocks, choosing the method from which blocks failed
 * (ptrs[disks-2] is P, ptrs[disks-1] is Q).  The two indices may be given
 * in either order.
 */
void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
{
	const int lo = (faila > failb) ? failb : faila;
	const int hi = (faila > failb) ? faila : failb;

	if (hi == disks-1) {
		/* P+Q failure.  Just rebuild the syndrome. */
		if (lo == disks-2)
			raid6_call.gen_syndrome(disks, bytes, ptrs);
		/*
		 * Otherwise data+Q failure: reconstruct data from P, then
		 * rebuild syndrome.
		 * NOT IMPLEMENTED - equivalent to RAID-5
		 */
		return;
	}

	if (hi == disks-2) {
		/* data+P failure. */
		raid6_datap_recov(disks, bytes, lo, ptrs);
		return;
	}

	/* data+data failure. */
	raid6_2data_recov(disks, bytes, lo, hi, ptrs);
}
#endif

View File

@@ -0,0 +1,162 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6sse1.c
*
* SSE-1/MMXEXT implementation of RAID-6 syndrome functions
*
* This is really an MMX implementation, but it requires SSE-1 or
* AMD MMXEXT for prefetch support and a few other features. The
* support for nontemporal memory accesses is enough to make this
* worthwhile as a separate implementation.
*/
#if defined(__i386__) && !defined(__arch_um__)
#include <linux/raid/pq.h>
#include "raid6x86.h"
/* Defined in raid6mmx.c */
extern const struct raid6_mmx_constants {
u64 x1d;
} raid6_mmx_constants;
/*
 * valid() hook for the SSE-1/MMXEXT routine sets: requires MMX plus either
 * XMM (SSE-1) or MMXEXT.
 */
static int raid6_have_sse1_or_mmxext(void)
{
	/* Not really boot_cpu but "all_cpus" */
	if (!boot_cpu_has(X86_FEATURE_MMX))
		return 0;

	return boot_cpu_has(X86_FEATURE_XMM) ||
	       boot_cpu_has(X86_FEATURE_MMXEXT);
}
/*
* Plain SSE1 implementation
*
* Computes the P (XOR parity) and Q (RS syndrome) blocks 8 bytes at a time
* in MMX registers, adding prefetchnta hints for the data and non-temporal
* (movntq) stores for the results.  ptrs[0 .. disks-3] are the data
* blocks, ptrs[disks-2] receives P and ptrs[disks-1] receives Q.
*
* Register use: mm0 = 0x1d in every byte, mm2 = running P, mm4 = running Q,
* mm5 = scratch mask (zero between uses), mm6 = data word loaded one step
* ahead of its use.
*
* NOTE(review): dptr[z0-1] is read unconditionally, so this presumably
* requires at least two data blocks (disks >= 4) - confirm with callers.
*/
static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
asm volatile("pxor %mm5,%mm5"); /* Zero temp */
for ( d = 0 ; d < bytes ; d += 8 ) {
asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
asm volatile("movq %mm2,%mm4"); /* Q[0] */
/* Preload the next data word; each round consumes the previous load */
asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d]));
for ( z = z0-2 ; z >= 0 ; z-- ) {
asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
/* mm5 = 0xff in each byte where Q has its top bit set (signed 0 > Q) */
asm volatile("pcmpgtb %mm4,%mm5");
/* Q += Q per byte, i.e. a per-byte shift left by one */
asm volatile("paddb %mm4,%mm4");
/* XOR 0x1d into the bytes that overflowed */
asm volatile("pand %mm0,%mm5");
asm volatile("pxor %mm5,%mm4");
asm volatile("pxor %mm5,%mm5");
/* Fold the previously loaded data word into P and Q */
asm volatile("pxor %mm6,%mm2");
asm volatile("pxor %mm6,%mm4");
asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d]));
}
/* Final round for the last data word still waiting in mm6 */
asm volatile("pcmpgtb %mm4,%mm5");
asm volatile("paddb %mm4,%mm4");
asm volatile("pand %mm0,%mm5");
asm volatile("pxor %mm5,%mm4");
asm volatile("pxor %mm5,%mm5");
asm volatile("pxor %mm6,%mm2");
asm volatile("pxor %mm6,%mm4");
asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
}
/* Store fence after the non-temporal stores */
asm volatile("sfence" : : : "memory");
kernel_fpu_end();
}
const struct raid6_calls raid6_sse1x1 = {
raid6_sse11_gen_syndrome,
raid6_have_sse1_or_mmxext,
"sse1x1",
1 /* Has cache hints */
};
/*
* Unrolled-by-2 SSE1 implementation
*
* Computes P and Q 16 bytes per outer iteration using two 8-byte MMX
* lanes, with prefetchnta hints and non-temporal (movntq) stores.
* ptrs[0 .. disks-3] are the data blocks, ptrs[disks-2] receives P and
* ptrs[disks-1] receives Q.
*
* Register use: mm0 = 0x1d in every byte; lane 0 keeps P in mm2 and Q in
* mm4, lane 1 keeps P in mm3 and Q in mm6.  mm5 and mm7 serve first as the
* overflow masks and then as the data words of lanes 0 and 1.
*
* NOTE(review): d advances by 16, so bytes is presumably a multiple of 16.
*/
static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
asm volatile("pxor %mm5,%mm5"); /* Zero temp */
asm volatile("pxor %mm7,%mm7"); /* Zero temp */
/* We uniformly assume a single prefetch covers at least 16 bytes */
for ( d = 0 ; d < bytes ; d += 16 ) {
asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */
asm volatile("movq %mm2,%mm4"); /* Q[0] */
asm volatile("movq %mm3,%mm6"); /* Q[1] */
for ( z = z0-1 ; z >= 0 ; z-- ) {
asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
/* Masks: 0xff in each byte where Q has its top bit set */
asm volatile("pcmpgtb %mm4,%mm5");
asm volatile("pcmpgtb %mm6,%mm7");
/* Q += Q per byte, i.e. a per-byte shift left by one */
asm volatile("paddb %mm4,%mm4");
asm volatile("paddb %mm6,%mm6");
/* XOR 0x1d into the bytes that overflowed */
asm volatile("pand %mm0,%mm5");
asm volatile("pand %mm0,%mm7");
asm volatile("pxor %mm5,%mm4");
asm volatile("pxor %mm7,%mm6");
/* The mask registers are reused to hold the data words */
asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d]));
asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
asm volatile("pxor %mm5,%mm2");
asm volatile("pxor %mm7,%mm3");
asm volatile("pxor %mm5,%mm4");
asm volatile("pxor %mm7,%mm6");
/* Re-zero them so the next pcmpgtb compares against zero */
asm volatile("pxor %mm5,%mm5");
asm volatile("pxor %mm7,%mm7");
}
asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
asm volatile("movntq %%mm3,%0" : "=m" (p[d+8]));
asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
asm volatile("movntq %%mm6,%0" : "=m" (q[d+8]));
}
/* Store fence after the non-temporal stores */
asm volatile("sfence" : :: "memory");
kernel_fpu_end();
}
const struct raid6_calls raid6_sse1x2 = {
raid6_sse12_gen_syndrome,
raid6_have_sse1_or_mmxext,
"sse1x2",
1 /* Has cache hints */
};
#endif

View File

@@ -0,0 +1,262 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6sse2.c
*
* SSE-2 implementation of RAID-6 syndrome functions
*
*/
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
#include <linux/raid/pq.h>
#include "raid6x86.h"
/*
 * 0x1d replicated into all 16 bytes, loaded into %xmm0 by the syndrome
 * routines.  Declared 16-byte aligned; it is read with movdqa.
 */
static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants __attribute__((aligned(16))) = {
	.x1d = {
		[0] = 0x1d1d1d1d1d1d1d1dULL,
		[1] = 0x1d1d1d1d1d1d1d1dULL,
	},
};
/*
 * valid() hook for the SSE-2 routine sets: requires the MMX, FXSR, XMM
 * and XMM2 feature bits.
 */
static int raid6_have_sse2(void)
{
	/* Not really boot_cpu but "all_cpus" */
	if (!boot_cpu_has(X86_FEATURE_MMX))
		return 0;
	if (!boot_cpu_has(X86_FEATURE_FXSR))
		return 0;
	if (!boot_cpu_has(X86_FEATURE_XMM))
		return 0;
	if (!boot_cpu_has(X86_FEATURE_XMM2))
		return 0;

	return 1;
}
/*
* Plain SSE2 implementation
*
* Computes the P (XOR parity) and Q (RS syndrome) blocks 16 bytes at a
* time in XMM registers, with prefetchnta hints and non-temporal (movntdq)
* stores.  ptrs[0 .. disks-3] are the data blocks, ptrs[disks-2] receives
* P and ptrs[disks-1] receives Q.
*
* Register use: xmm0 = 0x1d in every byte, xmm2 = running P, xmm4 =
* running Q, xmm5 = scratch mask (zero between uses), xmm6 = data word
* loaded one step ahead of its use.
*
* NOTE(review): movdqa/movntdq are the aligned forms, so the buffers are
* presumably 16-byte aligned and bytes a multiple of 16; dptr[z0-1] is
* read unconditionally, so at least two data blocks are presumably
* required - confirm with callers.
*/
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
for ( d = 0 ; d < bytes ; d += 16 ) {
asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
/* Preload the next data word; each round consumes the previous load */
asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
for ( z = z0-2 ; z >= 0 ; z-- ) {
asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
/* xmm5 = 0xff in each byte where Q has its top bit set (signed 0 > Q) */
asm volatile("pcmpgtb %xmm4,%xmm5");
/* Q += Q per byte, i.e. a per-byte shift left by one */
asm volatile("paddb %xmm4,%xmm4");
/* XOR 0x1d into the bytes that overflowed */
asm volatile("pand %xmm0,%xmm5");
asm volatile("pxor %xmm5,%xmm4");
asm volatile("pxor %xmm5,%xmm5");
/* Fold the previously loaded data word into P and Q */
asm volatile("pxor %xmm6,%xmm2");
asm volatile("pxor %xmm6,%xmm4");
asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
}
/* Final round for the last data word still waiting in xmm6 */
asm volatile("pcmpgtb %xmm4,%xmm5");
asm volatile("paddb %xmm4,%xmm4");
asm volatile("pand %xmm0,%xmm5");
asm volatile("pxor %xmm5,%xmm4");
asm volatile("pxor %xmm5,%xmm5");
asm volatile("pxor %xmm6,%xmm2");
asm volatile("pxor %xmm6,%xmm4");
asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
asm volatile("pxor %xmm2,%xmm2");
asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
asm volatile("pxor %xmm4,%xmm4");
}
/* Store fence after the non-temporal stores */
asm volatile("sfence" : : : "memory");
kernel_fpu_end();
}
const struct raid6_calls raid6_sse2x1 = {
raid6_sse21_gen_syndrome,
raid6_have_sse2,
"sse2x1",
1 /* Has cache hints */
};
/*
* Unrolled-by-2 SSE2 implementation
*
* Computes P and Q 32 bytes per outer iteration using two 16-byte XMM
* lanes, with prefetchnta hints and non-temporal (movntdq) stores.
* ptrs[0 .. disks-3] are the data blocks, ptrs[disks-2] receives P and
* ptrs[disks-1] receives Q.
*
* Register use: xmm0 = 0x1d in every byte; lane 0 keeps P in xmm2 and Q in
* xmm4, lane 1 keeps P in xmm3 and Q in xmm6.  xmm5 and xmm7 serve first
* as the overflow masks and then as the data words of lanes 0 and 1.
*
* NOTE(review): movdqa/movntdq are the aligned forms, so the buffers are
* presumably 16-byte aligned and bytes a multiple of 32 - confirm.
*/
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
/* We uniformly assume a single prefetch covers at least 32 bytes */
for ( d = 0 ; d < bytes ; d += 32 ) {
asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
for ( z = z0-1 ; z >= 0 ; z-- ) {
asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
/* Masks: 0xff in each byte where Q has its top bit set */
asm volatile("pcmpgtb %xmm4,%xmm5");
asm volatile("pcmpgtb %xmm6,%xmm7");
/* Q += Q per byte, i.e. a per-byte shift left by one */
asm volatile("paddb %xmm4,%xmm4");
asm volatile("paddb %xmm6,%xmm6");
/* XOR 0x1d into the bytes that overflowed */
asm volatile("pand %xmm0,%xmm5");
asm volatile("pand %xmm0,%xmm7");
asm volatile("pxor %xmm5,%xmm4");
asm volatile("pxor %xmm7,%xmm6");
/* The mask registers are reused to hold the data words */
asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
asm volatile("pxor %xmm5,%xmm2");
asm volatile("pxor %xmm7,%xmm3");
asm volatile("pxor %xmm5,%xmm4");
asm volatile("pxor %xmm7,%xmm6");
/* Re-zero them so the next pcmpgtb compares against zero */
asm volatile("pxor %xmm5,%xmm5");
asm volatile("pxor %xmm7,%xmm7");
}
asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
}
/* Store fence after the non-temporal stores */
asm volatile("sfence" : : : "memory");
kernel_fpu_end();
}
const struct raid6_calls raid6_sse2x2 = {
raid6_sse22_gen_syndrome,
raid6_have_sse2,
"sse2x2",
1 /* Has cache hints */
};
#endif
#if defined(__x86_64__) && !defined(__arch_um__)
/*
* Unrolled-by-4 SSE2 implementation
*
* Computes P and Q 64 bytes per outer iteration using four 16-byte XMM
* lanes, with prefetchnta hints and non-temporal (movntdq) stores.
* ptrs[0 .. disks-3] are the data blocks, ptrs[disks-2] receives P and
* ptrs[disks-1] receives Q.
*
* Unlike the narrower variants, all P and Q registers start at zero and
* are cleared again after each store, so the inner loop treats every data
* block, including the highest one, the same way.
*
* Register use: xmm0 = 0x1d in every byte; P lanes are xmm2, xmm3, xmm10,
* xmm11; Q lanes are xmm4, xmm6, xmm12, xmm14; xmm5, xmm7, xmm13, xmm15
* serve first as overflow masks and then as the data words.
*
* NOTE(review): movdqa/movntdq are the aligned forms, so the buffers are
* presumably 16-byte aligned and bytes a multiple of 64 - confirm.
*/
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
int d, z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
kernel_fpu_begin();
asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
asm volatile("pxor %xmm2,%xmm2"); /* P[0] */
asm volatile("pxor %xmm3,%xmm3"); /* P[1] */
asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */
asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */
asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
asm volatile("pxor %xmm10,%xmm10"); /* P[2] */
asm volatile("pxor %xmm11,%xmm11"); /* P[3] */
asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */
asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */
asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */
asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */
for ( d = 0 ; d < bytes ; d += 64 ) {
for ( z = z0 ; z >= 0 ; z-- ) {
/* The second prefetch seems to improve performance... */
asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
/* Masks: 0xff in each byte where Q has its top bit set */
asm volatile("pcmpgtb %xmm4,%xmm5");
asm volatile("pcmpgtb %xmm6,%xmm7");
asm volatile("pcmpgtb %xmm12,%xmm13");
asm volatile("pcmpgtb %xmm14,%xmm15");
/* Q += Q per byte, i.e. a per-byte shift left by one */
asm volatile("paddb %xmm4,%xmm4");
asm volatile("paddb %xmm6,%xmm6");
asm volatile("paddb %xmm12,%xmm12");
asm volatile("paddb %xmm14,%xmm14");
/* XOR 0x1d into the bytes that overflowed */
asm volatile("pand %xmm0,%xmm5");
asm volatile("pand %xmm0,%xmm7");
asm volatile("pand %xmm0,%xmm13");
asm volatile("pand %xmm0,%xmm15");
asm volatile("pxor %xmm5,%xmm4");
asm volatile("pxor %xmm7,%xmm6");
asm volatile("pxor %xmm13,%xmm12");
asm volatile("pxor %xmm15,%xmm14");
/* The mask registers are reused to hold the data words */
asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
/* Fold the data into P ... */
asm volatile("pxor %xmm5,%xmm2");
asm volatile("pxor %xmm7,%xmm3");
asm volatile("pxor %xmm13,%xmm10");
asm volatile("pxor %xmm15,%xmm11");
/* ... and into Q */
asm volatile("pxor %xmm5,%xmm4");
asm volatile("pxor %xmm7,%xmm6");
asm volatile("pxor %xmm13,%xmm12");
asm volatile("pxor %xmm15,%xmm14");
/* Re-zero the temps so the next pcmpgtb compares against zero */
asm volatile("pxor %xmm5,%xmm5");
asm volatile("pxor %xmm7,%xmm7");
asm volatile("pxor %xmm13,%xmm13");
asm volatile("pxor %xmm15,%xmm15");
}
/* Store each accumulator, then clear it for the next 64-byte column */
asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
asm volatile("pxor %xmm2,%xmm2");
asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
asm volatile("pxor %xmm3,%xmm3");
asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
asm volatile("pxor %xmm10,%xmm10");
asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
asm volatile("pxor %xmm11,%xmm11");
asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
asm volatile("pxor %xmm4,%xmm4");
asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
asm volatile("pxor %xmm6,%xmm6");
asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
asm volatile("pxor %xmm12,%xmm12");
asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
asm volatile("pxor %xmm14,%xmm14");
}
/* Store fence after the non-temporal stores */
asm volatile("sfence" : : : "memory");
kernel_fpu_end();
}
/*
 * Algorithm descriptor exporting raid6_sse24_gen_syndrome() under the
 * name "sse2x4"; raid6_have_sse2 is supplied as its availability check.
 * The initializer is positional -- presumably the field order is
 * gen_syndrome, valid, name, then a preference flag; confirm against the
 * declaration of struct raid6_calls.
 */
const struct raid6_calls raid6_sse2x4 = {
raid6_sse24_gen_syndrome,
raid6_have_sse2,
"sse2x4",
1 /* Has cache hints */
};
#endif

View File

@@ -0,0 +1,75 @@
#
# This is a simple Makefile to test some of the RAID-6 code
# from userspace.
#
# Tools and flags used by the rules below; override on the make command
# line if a different toolchain is wanted.
CC = gcc
OPTFLAGS = -O2 # Adjust as desired
# Headers are looked up in the parent directory and in ../../../include;
# -g keeps debug information in the test binary.
CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
LD = ld
AWK = awk
AR = ar
RANLIB = ranlib
# Compile each C source into the object file of the same name.
.c.o:
$(CC) $(CFLAGS) -c -o $@ $<
# C sources that do not exist here are copied in from the parent directory.
%.c: ../%.c
cp -f $< $@
# Likewise for the .uc templates that unroll.awk expands.
%.uc: ../%.uc
cp -f $< $@
# Default target: build the static library and the test program.
all: raid6.a raid6test
# Static library holding every algorithm variant plus the recovery code,
# the algorithm selector and the generated tables.  The archive is removed
# first so that it is always rebuilt from scratch.
raid6.a: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \
raid6int32.o \
raid6mmx.o raid6sse1.o raid6sse2.o \
raid6altivec1.o raid6altivec2.o raid6altivec4.o raid6altivec8.o \
raid6recov.o raid6algos.o \
raid6tables.o
rm -f $@
$(AR) cq $@ $^
$(RANLIB) $@
# Test program: test.c linked against the library built above.
raid6test: test.c raid6.a
$(CC) $(CFLAGS) -o raid6test $^
# Generated sources.  Each raid6altivecN.c / raid6intN.c is produced by
# running the matching .uc template through ../unroll.awk with the unroll
# factor passed as N.
raid6altivec1.c: raid6altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=1 < raid6altivec.uc > $@
raid6altivec2.c: raid6altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=2 < raid6altivec.uc > $@
raid6altivec4.c: raid6altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=4 < raid6altivec.uc > $@
raid6altivec8.c: raid6altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=8 < raid6altivec.uc > $@
raid6int1.c: raid6int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=1 < raid6int.uc > $@
raid6int2.c: raid6int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=2 < raid6int.uc > $@
raid6int4.c: raid6int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=4 < raid6int.uc > $@
raid6int8.c: raid6int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=8 < raid6int.uc > $@
raid6int16.c: raid6int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=16 < raid6int.uc > $@
raid6int32.c: raid6int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=32 < raid6int.uc > $@
# The lookup tables are whatever the mktables helper prints on stdout.
raid6tables.c: mktables
./mktables > raid6tables.c
# Neither target produces a file of its own name; declare them phony so a
# stray file called "clean" or "spotless" cannot turn them into no-ops.
.PHONY: clean spotless
# Remove everything the build creates or copies in: objects, the archive,
# the mktables helper and its source, both .uc templates (raid6int.uc and
# raid6altivec.uc are each copied from the parent directory), the copied
# and generated raid6*.c sources, and the test binary.
clean:
	rm -f *.o *.a mktables mktables.c raid6int.uc raid6altivec.uc raid6*.c raid6test
# As clean, and additionally remove editor backup files.
spotless: clean
	rm -f *~

View File

@@ -0,0 +1,124 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2 or (at your
* option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6test.c
*
* Test RAID-6 recovery with various algorithms
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <linux/raid/pq.h>
/* Number of simulated disks; the last two slots hold the P and Q syndromes. */
#define NDISKS 16 /* Including P and Q */
/*
 * Page-sized, 256-byte-aligned buffer with no initializer, so it is
 * zero-filled at program start.
 */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
/* Algorithm currently under test; main() copies each candidate in here. */
struct raid6_calls raid6_call;
/*
 * Per-disk block pointers handed to gen_syndrome() and raid6_dual_recov().
 * Normally dataptrs[i] == data[i]; test_disks() temporarily redirects two
 * entries to the recovery buffers below.
 */
char *dataptrs[NDISKS];
/* Backing storage: one page per disk (data blocks followed by P and Q). */
char data[NDISKS][PAGE_SIZE];
/* Scratch pages that receive the reconstructed contents of the failed disks. */
char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
/*
 * Fill every disk page with pseudo-random bytes from rand() and point
 * each dataptrs[] entry at its own page in data[].
 */
static void makedata(void)
{
	int disk;

	for (disk = 0; disk < NDISKS; disk++) {
		char *page = data[disk];
		int off;

		dataptrs[disk] = page;
		for (off = 0; off < PAGE_SIZE; off++)
			page[off] = rand();
	}
}
/*
 * Classify disk index d for the report line: 'P' for the second-to-last
 * slot, 'Q' for the last slot, 'D' for everything else.
 */
static char disk_type(int d)
{
	if (d == NDISKS-2)
		return 'P';
	if (d == NDISKS-1)
		return 'Q';
	return 'D';
}
/*
 * Simulate the loss of disks i and j, run raid6_dual_recov() and compare
 * the reconstructed pages with the original contents.
 *
 * Prints one result line per pair, except for the data+Q case, which is
 * silently treated as a pass.  Returns nonzero if either page mismatched.
 */
static int test_disks(int i, int j)
{
	const char *verdict;
	int bad_i, bad_j;

	/* Poison the recovery pages so stale contents cannot pass the check */
	memset(recovi, 0xf0, PAGE_SIZE);
	memset(recovj, 0xba, PAGE_SIZE);

	/* Redirect the two "failed" disks to the recovery pages */
	dataptrs[i] = recovi;
	dataptrs[j] = recovj;

	raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs);

	bad_i = memcmp(data[i], recovi, PAGE_SIZE);
	bad_j = memcmp(data[j], recovj, PAGE_SIZE);

	/* Put the real pages back for the next test */
	dataptrs[i] = data[i];
	dataptrs[j] = data[j];

	/* We don't implement the DQ failure scenario, since it's
	   equivalent to a RAID-5 failure (XOR, then recompute Q) */
	if (i < NDISKS-2 && j == NDISKS-1)
		return 0;

	if (bad_i && bad_j)
		verdict = "ERRAB";
	else if (bad_i)
		verdict = "ERRA";
	else if (bad_j)
		verdict = "ERRB";
	else
		verdict = "OK";

	printf("algo=%-8s  faila=%3d(%c)  failb=%3d(%c)  %s\n",
	       raid6_call.name,
	       i, disk_type(i),
	       j, disk_type(j),
	       verdict);

	return bad_i || bad_j;
}
/*
 * Run the dual-failure recovery test against every usable algorithm in
 * raid6_algos, then invoke raid6_select_algo().
 *
 * Returns 0 when every tested disk pair was recovered correctly and 1
 * otherwise.  The failure count itself is deliberately not returned: an
 * exit status keeps only its low 8 bits, so a count that is a multiple
 * of 256 would be reported as success.
 */
int main(int argc, char *argv[])
{
	const struct raid6_calls *const *algo;
	int i, j;
	int err = 0;

	makedata();

	for (algo = raid6_algos; *algo; algo++) {
		/* An algorithm without a valid() hook is always usable */
		if (!(*algo)->valid || (*algo)->valid()) {
			raid6_call = **algo;

			/* Nuke syndromes (P and Q pages, one row at a time) */
			memset(data[NDISKS-2], 0xee, PAGE_SIZE);
			memset(data[NDISKS-1], 0xee, PAGE_SIZE);

			/* Generate assumed good syndrome */
			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
						(void **)&dataptrs);

			/* Fail every unordered pair of disks in turn */
			for (i = 0; i < NDISKS-1; i++)
				for (j = i+1; j < NDISKS; j++)
					err += test_disks(i, j);
		}
		printf("\n");
	}

	printf("\n");
	/* Pick the best algorithm test */
	raid6_select_algo();

	if (err)
		printf("\n*** ERRORS FOUND ***\n");

	return err != 0;
}

View File

@@ -0,0 +1,61 @@
/* ----------------------------------------------------------------------- *
*
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* raid6x86.h
*
* Definitions common to x86 and x86-64 RAID-6 code only
*/
#ifndef LINUX_RAID_RAID6X86_H
#define LINUX_RAID_RAID6X86_H
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
#ifdef __KERNEL__ /* Real code */
#include <asm/i387.h>
#else /* Dummy code for user space testing */
/*
 * No-op stand-in for kernel_fpu_begin(), used only when this header is
 * compiled without __KERNEL__ (user space testing).
 */
static inline void kernel_fpu_begin(void)
{
}
/*
 * No-op stand-in for kernel_fpu_end(), used only when this header is
 * compiled without __KERNEL__ (user space testing).
 */
static inline void kernel_fpu_end(void)
{
}
/*
 * CPU feature numbers for the user space build, encoded as
 * (word * 32 + bit).  boot_cpu_has() below uses the word to choose the
 * CPUID leaf and the bit to index the EDX result.
 */
#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions
* (fast save and restore) */
#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */
#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */
#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
/*
 * User space replacement for boot_cpu_has(): query CPUID directly and
 * report whether the requested feature bit is set in EDX.
 *
 * Features in word 0 are read from CPUID leaf 1; any other word is read
 * from leaf 0x80000001.  Should work well enough on modern CPUs for
 * testing.
 */
static inline int boot_cpu_has(int flag)
{
	u32 leaf;
	u32 features;

	if (flag >> 5)
		leaf = 0x80000001;
	else
		leaf = 1;

	asm volatile("cpuid"
		     : "+a" (leaf), "=d" (features)
		     : : "ecx", "ebx");

	return (features >> (flag & 31)) & 1;
}
#endif /* ndef __KERNEL__ */
#endif
#endif

View File

@@ -0,0 +1,20 @@
# This filter requires one command line option of form -vN=n
# where n must be a decimal number.
#
# Repeat each input line containing $$ n times, replacing $$ with 0...n-1.
# Replace each $# with n, and each $* with a single $.
# Take the repeat count from the command-line variable N; adding 0 forces
# it to be treated as a number.
BEGIN {
n = N + 0
}
# Per input line: lines containing $$ are emitted n times with $$ replaced
# by 0..n-1; all other lines are emitted once.  In every emitted copy $# is
# replaced by n and $* by a single $.
{
	copies = ($0 ~ /\$\$/) ? n : 1
	for (k = 0; k < copies; k++) {
		line = $0
		gsub(/\$\$/, k, line)
		gsub(/\$\#/, n, line)
		gsub(/\$\*/, "$", line)
		print line
	}
}