add idl4k kernel firmware version 1.13.0.105

2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions
--- a/kernel/drivers/md/.gitignore
+++ b/kernel/drivers/md/.gitignore
@@ -0,0 +1,4 @@
+mktables
+raid6altivec*.c
+raid6int*.c
+raid6tables.c
--- a/kernel/drivers/md/Kconfig
+++ b/kernel/drivers/md/Kconfig
@@ -0,0 +1,323 @@
+#
+# Block device driver configuration
+#
+
+menuconfig MD
+	bool "Multiple devices driver support (RAID and LVM)"
+	depends on BLOCK
+	help
+	  Support multiple physical spindles through a single logical device.
+	  Required for RAID and logical volume management.
+
+if MD
+
+config BLK_DEV_MD
+	tristate "RAID support"
+	---help---
+	  This driver lets you combine several hard disk partitions into one
+	  logical block device. This can be used to simply append one
+	  partition to another one or to combine several redundant hard disks
+	  into a RAID1/4/5 device so as to provide protection against hard
+	  disk failures. This is called "Software RAID" since the combining of
+	  the partitions is done by the kernel. "Hardware RAID" means that the
+	  combining is done by a dedicated controller; if you have such a
+	  controller, you do not need to say Y here.
+
+	  More information about Software RAID on Linux is contained in the
+	  Software RAID mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>. There you will also learn
+	  where to get the supporting user space utilities raidtools.
+
+	  If unsure, say N.
+
+config MD_AUTODETECT
+	bool "Autodetect RAID arrays during kernel boot"
+	depends on BLK_DEV_MD=y
+	default y
+	---help---
+	  If you say Y here, then the kernel will try to autodetect raid
+	  arrays as part of its boot process. 
+
+	  If you don't use raid and say Y, this autodetection can cause 
+	  a several-second delay in the boot time due to various
+	  synchronisation steps that are part of this step.
+
+	  If unsure, say Y.
+
+config MD_LINEAR
+	tristate "Linear (append) mode"
+	depends on BLK_DEV_MD
+	---help---
+	  If you say Y here, then your multiple devices driver will be able to
+	  use the so-called linear mode, i.e. it will combine the hard disk
+	  partitions by simply appending one to the other.
+
+	  To compile this as a module, choose M here: the module
+	  will be called linear.
+
+	  If unsure, say Y.
+
+config MD_RAID0
+	tristate "RAID-0 (striping) mode"
+	depends on BLK_DEV_MD
+	---help---
+	  If you say Y here, then your multiple devices driver will be able to
+	  use the so-called raid0 mode, i.e. it will combine the hard disk
+	  partitions into one logical device in such a fashion as to fill them
+	  up evenly, one chunk here and one chunk there. This will increase
+	  the throughput rate if the partitions reside on distinct disks.
+
+	  Information about Software RAID on Linux is contained in the
+	  Software-RAID mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>. There you will also
+	  learn where to get the supporting user space utilities raidtools.
+
+	  To compile this as a module, choose M here: the module
+	  will be called raid0.
+
+	  If unsure, say Y.
+
+config MD_RAID1
+	tristate "RAID-1 (mirroring) mode"
+	depends on BLK_DEV_MD
+	---help---
+	  A RAID-1 set consists of several disk drives which are exact copies
+	  of each other.  In the event of a mirror failure, the RAID driver
+	  will continue to use the operational mirrors in the set, providing
+	  an error free MD (multiple device) to the higher levels of the
+	  kernel.  In a set with N drives, the available space is the capacity
+	  of a single drive, and the set protects against a failure of (N - 1)
+	  drives.
+
+	  Information about Software RAID on Linux is contained in the
+	  Software-RAID mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>.  There you will also
+	  learn where to get the supporting user space utilities raidtools.
+
+	  If you want to use such a RAID-1 set, say Y.  To compile this code
+	  as a module, choose M here: the module will be called raid1.
+
+	  If unsure, say Y.
+
+config MD_RAID10
+	tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
+	depends on BLK_DEV_MD && EXPERIMENTAL
+	---help---
+	  RAID-10 provides a combination of striping (RAID-0) and
+	  mirroring (RAID-1) with easier configuration and more flexible
+	  layout.
+	  Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
+	  be the same size (or at least, only as much as the smallest device
+	  will be used).
+	  RAID-10 provides a variety of layouts that provide different levels
+	  of redundancy and performance.
+
+	  RAID-10 requires mdadm-1.7.0 or later, available at:
+
+	  ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
+
+	  If unsure, say Y.
+
+config MD_RAID456
+	tristate "RAID-4/RAID-5/RAID-6 mode"
+	depends on BLK_DEV_MD
+	select MD_RAID6_PQ
+	select ASYNC_MEMCPY
+	select ASYNC_XOR
+	select ASYNC_PQ
+	select ASYNC_RAID6_RECOV
+	---help---
+	  A RAID-5 set of N drives with a capacity of C MB per drive provides
+	  the capacity of C * (N - 1) MB, and protects against a failure
+	  of a single drive. For a given sector (row) number, (N - 1) drives
+	  contain data sectors, and one drive contains the parity protection.
+	  For a RAID-4 set, the parity blocks are present on a single drive,
+	  while a RAID-5 set distributes the parity across the drives in one
+	  of the available parity distribution methods.
+
+	  A RAID-6 set of N drives with a capacity of C MB per drive
+	  provides the capacity of C * (N - 2) MB, and protects
+	  against a failure of any two drives. For a given sector
+	  (row) number, (N - 2) drives contain data sectors, and two
+	  drives contains two independent redundancy syndromes.  Like
+	  RAID-5, RAID-6 distributes the syndromes across the drives
+	  in one of the available parity distribution methods.
+
+	  Information about Software RAID on Linux is contained in the
+	  Software-RAID mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>. There you will also
+	  learn where to get the supporting user space utilities raidtools.
+
+	  If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y.  To
+	  compile this code as a module, choose M here: the module
+	  will be called raid456.
+
+	  If unsure, say Y.
+
+config MULTICORE_RAID456
+	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
+	depends on MD_RAID456
+	depends on SMP
+	depends on EXPERIMENTAL
+	---help---
+	  Enable the raid456 module to dispatch per-stripe raid operations to a
+	  thread pool.
+
+	  If unsure, say N.
+
+config MD_RAID6_PQ
+	tristate
+
+config ASYNC_RAID6_TEST
+	tristate "Self test for hardware accelerated raid6 recovery"
+	depends on MD_RAID6_PQ
+	select ASYNC_RAID6_RECOV
+	---help---
+	  This is a one-shot self test that permutes through the
+	  recovery of all the possible two disk failure scenarios for a
+	  N-disk array.  Recovery is performed with the asynchronous
+	  raid6 recovery routines, and will optionally use an offload
+	  engine if one is available.
+
+	  If unsure, say N.
+
+config MD_MULTIPATH
+	tristate "Multipath I/O support"
+	depends on BLK_DEV_MD
+	help
+	  Multipath-IO is the ability of certain devices to address the same
+	  physical disk over multiple 'IO paths'. The code ensures that such
+	  paths can be defined and handled at runtime, and ensures that a
+	  transparent failover to the backup path(s) happens if a IO errors
+	  arrives on the primary path.
+
+	  If unsure, say N.
+
+config MD_FAULTY
+	tristate "Faulty test module for MD"
+	depends on BLK_DEV_MD
+	help
+	  The "faulty" module allows for a block device that occasionally returns
+	  read or write errors.  It is useful for testing.
+
+	  In unsure, say N.
+
+config BLK_DEV_DM
+	tristate "Device mapper support"
+	---help---
+	  Device-mapper is a low level volume manager.  It works by allowing
+	  people to specify mappings for ranges of logical sectors.  Various
+	  mapping types are available, in addition people may write their own
+	  modules containing custom mappings if they wish.
+
+	  Higher level volume managers such as LVM2 use this driver.
+
+	  To compile this as a module, choose M here: the module will be
+	  called dm-mod.
+
+	  If unsure, say N.
+
+config DM_DEBUG
+	boolean "Device mapper debugging support"
+	depends on BLK_DEV_DM
+	---help---
+	  Enable this for messages that may help debug device-mapper problems.
+
+	  If unsure, say N.
+
+config DM_CRYPT
+	tristate "Crypt target support"
+	depends on BLK_DEV_DM
+	select CRYPTO
+	select CRYPTO_CBC
+	---help---
+	  This device-mapper target allows you to create a device that
+	  transparently encrypts the data on it. You'll need to activate
+	  the ciphers you're going to use in the cryptoapi configuration.
+
+	  Information on how to use dm-crypt can be found on
+
+	  <http://www.saout.de/misc/dm-crypt/>
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-crypt.
+
+	  If unsure, say N.
+
+config DM_SNAPSHOT
+       tristate "Snapshot target"
+       depends on BLK_DEV_DM
+       ---help---
+         Allow volume managers to take writable snapshots of a device.
+
+config DM_MIRROR
+       tristate "Mirror target"
+       depends on BLK_DEV_DM
+       ---help---
+         Allow volume managers to mirror logical volumes, also
+         needed for live data migration tools such as 'pvmove'.
+
+config DM_LOG_USERSPACE
+	tristate "Mirror userspace logging (EXPERIMENTAL)"
+	depends on DM_MIRROR && EXPERIMENTAL && NET
+	select CONNECTOR
+	---help---
+	  The userspace logging module provides a mechanism for
+	  relaying the dm-dirty-log API to userspace.  Log designs
+	  which are more suited to userspace implementation (e.g.
+	  shared storage logs) or experimental logs can be implemented
+	  by leveraging this framework.
+
+config DM_ZERO
+	tristate "Zero target"
+	depends on BLK_DEV_DM
+	---help---
+	  A target that discards writes, and returns all zeroes for
+	  reads.  Useful in some recovery situations.
+
+config DM_MULTIPATH
+	tristate "Multipath target"
+	depends on BLK_DEV_DM
+	# nasty syntax but means make DM_MULTIPATH independent
+	# of SCSI_DH if the latter isn't defined but if
+	# it is, DM_MULTIPATH must depend on it.  We get a build
+	# error if SCSI_DH=m and DM_MULTIPATH=y
+	depends on SCSI_DH || !SCSI_DH
+	---help---
+	  Allow volume managers to support multipath hardware.
+
+config DM_MULTIPATH_QL
+	tristate "I/O Path Selector based on the number of in-flight I/Os"
+	depends on DM_MULTIPATH
+	---help---
+	  This path selector is a dynamic load balancer which selects
+	  the path with the least number of in-flight I/Os.
+
+	  If unsure, say N.
+
+config DM_MULTIPATH_ST
+	tristate "I/O Path Selector based on the service time"
+	depends on DM_MULTIPATH
+	---help---
+	  This path selector is a dynamic load balancer which selects
+	  the path expected to complete the incoming I/O in the shortest
+	  time.
+
+	  If unsure, say N.
+
+config DM_DELAY
+	tristate "I/O delaying target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	A target that delays reads and/or writes and can send
+	them to different devices.  Useful for testing.
+
+	If unsure, say N.
+
+config DM_UEVENT
+	bool "DM uevents (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	Generate udev events for DM events.
+
+endif # MD
--- a/kernel/drivers/md/Makefile
+++ b/kernel/drivers/md/Makefile
@@ -0,0 +1,119 @@
+#
+# Makefile for the kernel software RAID and LVM drivers.
+#
+
+dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
+dm-multipath-y	+= dm-path-selector.o dm-mpath.o
+dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
+		    dm-snap-persistent.o
+dm-mirror-y	+= dm-raid1.o
+dm-log-userspace-y \
+		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
+md-mod-y	+= md.o bitmap.o
+raid456-y	+= raid5.o
+raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
+		   raid6int1.o raid6int2.o raid6int4.o \
+		   raid6int8.o raid6int16.o raid6int32.o \
+		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
+		   raid6altivec8.o \
+		   raid6mmx.o raid6sse1.o raid6sse2.o
+hostprogs-y	+= mktables
+
+# Note: link order is important.  All raid personalities
+# and must come before md.o, as they each initialise 
+# themselves, and md.o may use the personalities when it 
+# auto-initialised.
+
+obj-$(CONFIG_MD_LINEAR)		+= linear.o
+obj-$(CONFIG_MD_RAID0)		+= raid0.o
+obj-$(CONFIG_MD_RAID1)		+= raid1.o
+obj-$(CONFIG_MD_RAID10)		+= raid10.o
+obj-$(CONFIG_MD_RAID6_PQ)	+= raid6_pq.o
+obj-$(CONFIG_MD_RAID456)	+= raid456.o
+obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
+obj-$(CONFIG_MD_FAULTY)		+= faulty.o
+obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
+obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
+obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
+obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
+obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
+obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
+obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
+obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
+obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
+obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+
+quiet_cmd_unroll = UNROLL  $@
+      cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \
+                   < $< > $@ || ( rm -f $@ && exit 1 )
+
+ifeq ($(CONFIG_ALTIVEC),y)
+altivec_flags := -maltivec -mabi=altivec
+endif
+
+ifeq ($(CONFIG_DM_UEVENT),y)
+dm-mod-objs			+= dm-uevent.o
+endif
+
+targets += raid6int1.c
+$(obj)/raid6int1.c:   UNROLL := 1
+$(obj)/raid6int1.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int2.c
+$(obj)/raid6int2.c:   UNROLL := 2
+$(obj)/raid6int2.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int4.c
+$(obj)/raid6int4.c:   UNROLL := 4
+$(obj)/raid6int4.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int8.c
+$(obj)/raid6int8.c:   UNROLL := 8
+$(obj)/raid6int8.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int16.c
+$(obj)/raid6int16.c:  UNROLL := 16
+$(obj)/raid6int16.c:  $(src)/raid6int.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int32.c
+$(obj)/raid6int32.c:  UNROLL := 32
+$(obj)/raid6int32.c:  $(src)/raid6int.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_raid6altivec1.o += $(altivec_flags)
+targets += raid6altivec1.c
+$(obj)/raid6altivec1.c:   UNROLL := 1
+$(obj)/raid6altivec1.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_raid6altivec2.o += $(altivec_flags)
+targets += raid6altivec2.c
+$(obj)/raid6altivec2.c:   UNROLL := 2
+$(obj)/raid6altivec2.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_raid6altivec4.o += $(altivec_flags)
+targets += raid6altivec4.c
+$(obj)/raid6altivec4.c:   UNROLL := 4
+$(obj)/raid6altivec4.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_raid6altivec8.o += $(altivec_flags)
+targets += raid6altivec8.c
+$(obj)/raid6altivec8.c:   UNROLL := 8
+$(obj)/raid6altivec8.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+quiet_cmd_mktable = TABLE   $@
+      cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
+
+targets += raid6tables.c
+$(obj)/raid6tables.c: $(obj)/mktables FORCE
+	$(call if_changed,mktable)
--- a/kernel/drivers/md/bitmap.c
+++ b/kernel/drivers/md/bitmap.c
--- a/kernel/drivers/md/bitmap.h
+++ b/kernel/drivers/md/bitmap.h
@@ -0,0 +1,291 @@
+/*
+ * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ */
+#ifndef BITMAP_H
+#define BITMAP_H 1
+
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ */
+#define BITMAP_MAJOR_HI 4
+#define	BITMAP_MAJOR_HOSTENDIAN 3
+
+#define BITMAP_MINOR 39
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed.  The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync |               counter                          |
+ * | needed | active |                                                |
+ * |  (0-1) |  (0-1) |              (0-16383)                         |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ *    a '1' bit is read from storage at startup.
+ *    a write request fails on some drives
+ *    a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ *    a resync is started on all drives, and resync_needed is set.
+ *       resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared aswell, thus setting the counter to 0.
+ * When we set a bit, or in the counter (to start a write), if the fields is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stipe is clean
+ * Anything that dirties the stipe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep find the counter at 2, it is decremented to 1.
+ * If the sweep find the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ *     page pointer (32-bit)
+ *
+ *     [ ] ------+
+ *               |
+ *               +-------> [   ][   ]..[   ] (4096 byte page == 2048 counters)
+ *                          c1   c2    c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ *     hijacked page pointer (32-bit)
+ *
+ *     [		  ][		  ] (no page memory allocated)
+ *      counter #1 (16-bit) counter #2 (16-bit)
+ *
+ */
+
+#ifdef __KERNEL__
+
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+typedef __u16 bitmap_counter_t;
+#define COUNTER_BITS 16
+#define COUNTER_BIT_SHIFT 4
+#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
+#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
+
+#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
+#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
+#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)
+
+#define BITMAP_BLOCK_SIZE 512
+#define BITMAP_BLOCK_SHIFT 9
+
+/* how many blocks per chunk? (this is variable) */
+#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
+
+/* when hijacked, the counters and bits represent even larger "chunks" */
+/* there will be 1024 chunks represented by each counter in the page pointers */
+#define PAGEPTR_BLOCK_RATIO(bitmap) \
+			(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
+#define PAGEPTR_BLOCK_SHIFT(bitmap) \
+			(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
+#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
+
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
+
+/* map chunks (bits) to file pages - offset by the size of the superblock */
+#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
+
+#endif
+
+/*
+ * bitmap structures:
+ */
+
+#define BITMAP_MAGIC 0x6d746962
+
+/* use these for bitmap->flags and bitmap->sb->state bit-fields */
+enum bitmap_state {
+	BITMAP_STALE  = 0x002,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
+	BITMAP_HOSTENDIAN = 0x8000,
+};
+
+/* the superblock at the front of the bitmap file -- little endian */
+typedef struct bitmap_super_s {
+	__le32 magic;        /*  0  BITMAP_MAGIC */
+	__le32 version;      /*  4  the bitmap major for now, could change... */
+	__u8  uuid[16];      /*  8  128 bit uuid - must match md device uuid */
+	__le64 events;       /* 24  event counter for the bitmap (1)*/
+	__le64 events_cleared;/*32  event counter when last bit cleared (2) */
+	__le64 sync_size;    /* 40  the size of the md device's sync range(3) */
+	__le32 state;        /* 48  bitmap state information */
+	__le32 chunksize;    /* 52  the bitmap chunk size in bytes */
+	__le32 daemon_sleep; /* 56  seconds between disk flushes */
+	__le32 write_behind; /* 60  number of outstanding write-behind writes */
+
+	__u8  pad[256 - 64]; /* set to zero */
+} bitmap_super_t;
+
+/* notes:
+ * (1) This event counter is updated before the eventcounter in the md superblock
+ *    When a bitmap is loaded, it is only accepted if this event counter is equal
+ *    to, or one greater than, the event counter in the superblock.
+ * (2) This event counter is updated when the other one is *if*and*only*if* the
+ *    array is not degraded.  As bits are not cleared when the array is degraded,
+ *    this represents the last time that any bits were cleared.
+ *    If a device is being added that has an event count with this value or
+ *    higher, it is accepted as conforming to the bitmap.
+ * (3)This is the number of sectors represented by the bitmap, and is the range that
+ *    resync happens across.  For raid1 and raid5/6 it is the size of individual
+ *    devices.  For raid10 it is the size of the array.
+ */
+
+#ifdef __KERNEL__
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+	/*
+	 * map points to the actual memory page
+	 */
+	char *map;
+	/*
+	 * in emergencies (when map cannot be alloced), hijack the map
+	 * pointer and use it as two counters itself
+	 */
+	unsigned int hijacked:1;
+	/*
+	 * count of dirty bits on the page
+	 */
+	unsigned int  count:31;
+};
+
+/* keep track of bitmap file pages that have pending writes on them */
+struct page_list {
+	struct list_head list;
+	struct page *page;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+	struct bitmap_page *bp;
+	unsigned long pages; /* total number of pages in the bitmap */
+	unsigned long missing_pages; /* number of pages not yet allocated */
+
+	mddev_t *mddev; /* the md device that the bitmap is for */
+
+	int counter_bits; /* how many bits per block counter */
+
+	/* bitmap chunksize -- how much data does each bit represent? */
+	unsigned long chunksize;
+	unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+	unsigned long chunks; /* total number of data chunks for the array */
+
+	/* We hold a count on the chunk currently being synced, and drop
+	 * it when the last block is started.  If the resync is aborted
+	 * midway, we need to be able to drop that count, so we remember
+	 * the counted chunk..
+	 */
+	unsigned long syncchunk;
+
+	__u64	events_cleared;
+	int need_sync;
+
+	/* bitmap spinlock */
+	spinlock_t lock;
+
+	long offset; /* offset from superblock if file is NULL */
+	struct file *file; /* backing disk file */
+	struct page *sb_page; /* cached copy of the bitmap file superblock */
+	struct page **filemap; /* list of cache pages for the file */
+	unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
+	unsigned long file_pages; /* number of pages in the file */
+	int last_page_size; /* bytes in the last page */
+
+	unsigned long flags;
+
+	int allclean;
+
+	unsigned long max_write_behind; /* write-behind mode */
+	atomic_t behind_writes;
+
+	/*
+	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
+	 * file, cleaning up bits and flushing out pages to disk as necessary
+	 */
+	unsigned long daemon_lastrun; /* jiffies of last run */
+	unsigned long daemon_sleep; /* how many seconds between updates? */
+	unsigned long last_end_sync; /* when we lasted called end_sync to
+				      * update bitmap with resync progress */
+
+	atomic_t pending_writes; /* pending writes to the bitmap file */
+	wait_queue_head_t write_wait;
+	wait_queue_head_t overflow_wait;
+
+#ifndef __GENKSYMS__
+	wait_queue_head_t behind_wait;
+#endif
+};
+
+/* the bitmap API */
+
+/* these are used only by md/bitmap */
+int  bitmap_create(mddev_t *mddev);
+void bitmap_flush(mddev_t *mddev);
+void bitmap_destroy(mddev_t *mddev);
+
+void bitmap_print_sb(struct bitmap *bitmap);
+void bitmap_update_sb(struct bitmap *bitmap);
+
+int  bitmap_setallbits(struct bitmap *bitmap);
+void bitmap_write_all(struct bitmap *bitmap);
+
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
+
+/* these are exported */
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
+			unsigned long sectors, int behind);
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
+			unsigned long sectors, int success, int behind);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
+void bitmap_close_sync(struct bitmap *bitmap);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+
+void bitmap_unplug(struct bitmap *bitmap);
+void bitmap_daemon_work(mddev_t *mddev);
+#endif
+
+#endif
--- a/kernel/drivers/md/dm-bio-record.h
+++ b/kernel/drivers/md/dm-bio-record.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BIO_RECORD_H
+#define DM_BIO_RECORD_H
+
+#include <linux/bio.h>
+
+/*
+ * There are lots of mutable fields in the bio struct that get
+ * changed by the lower levels of the block layer.  Some targets,
+ * such as multipath, may wish to resubmit a bio on error.  The
+ * functions in this file help the target record and restore the
+ * original bio state.
+ */
+
+struct dm_bio_vec_details {
+#if PAGE_SIZE < 65536
+	__u16 bv_len;
+	__u16 bv_offset;
+#else
+	unsigned bv_len;
+	unsigned bv_offset;
+#endif
+};
+
+struct dm_bio_details {
+	sector_t bi_sector;
+	struct block_device *bi_bdev;
+	unsigned int bi_size;
+	unsigned short bi_idx;
+	unsigned long bi_flags;
+	struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
+};
+
+static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
+{
+	unsigned i;
+
+	bd->bi_sector = bio->bi_sector;
+	bd->bi_bdev = bio->bi_bdev;
+	bd->bi_size = bio->bi_size;
+	bd->bi_idx = bio->bi_idx;
+	bd->bi_flags = bio->bi_flags;
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len;
+		bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset;
+	}
+}
+
+static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
+{
+	unsigned i;
+
+	bio->bi_sector = bd->bi_sector;
+	bio->bi_bdev = bd->bi_bdev;
+	bio->bi_size = bd->bi_size;
+	bio->bi_idx = bd->bi_idx;
+	bio->bi_flags = bd->bi_flags;
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len;
+		bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset;
+	}
+}
+
+#endif
--- a/kernel/drivers/md/dm-crypt.c
+++ b/kernel/drivers/md/dm-crypt.c
--- a/kernel/drivers/md/dm-delay.c
+++ b/kernel/drivers/md/dm-delay.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (C) 2005-2007 Red Hat GmbH
+ *
+ * A target that delays reads and/or writes and can send
+ * them to different devices.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "delay"
+
+struct delay_c {
+	struct timer_list delay_timer;
+	struct mutex timer_lock;
+	struct work_struct flush_expired_bios;
+	struct list_head delayed_bios;
+	atomic_t may_delay;
+	mempool_t *delayed_pool;
+
+	struct dm_dev *dev_read;
+	sector_t start_read;
+	unsigned read_delay;
+	unsigned reads;
+
+	struct dm_dev *dev_write;
+	sector_t start_write;
+	unsigned write_delay;
+	unsigned writes;
+};
+
+struct dm_delay_info {
+	struct delay_c *context;
+	struct list_head list;
+	struct bio *bio;
+	unsigned long expires;
+};
+
+static DEFINE_MUTEX(delayed_bios_lock);
+
+static struct workqueue_struct *kdelayd_wq;
+static struct kmem_cache *delayed_cache;
+
+static void handle_delayed_timer(unsigned long data)
+{
+	struct delay_c *dc = (struct delay_c *)data;
+
+	queue_work(kdelayd_wq, &dc->flush_expired_bios);
+}
+
+static void queue_timeout(struct delay_c *dc, unsigned long expires)
+{
+	mutex_lock(&dc->timer_lock);
+
+	if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
+		mod_timer(&dc->delay_timer, expires);
+
+	mutex_unlock(&dc->timer_lock);
+}
+
+static void flush_bios(struct bio *bio)
+{
+	struct bio *n;
+
+	while (bio) {
+		n = bio->bi_next;
+		bio->bi_next = NULL;
+		generic_make_request(bio);
+		bio = n;
+	}
+}
+
+static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
+{
+	struct dm_delay_info *delayed, *next;
+	unsigned long next_expires = 0;
+	int start_timer = 0;
+	struct bio_list flush_bios = { };
+
+	mutex_lock(&delayed_bios_lock);
+	list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
+			list_del(&delayed->list);
+			bio_list_add(&flush_bios, delayed->bio);
+			if ((bio_data_dir(delayed->bio) == WRITE))
+				delayed->context->writes--;
+			else
+				delayed->context->reads--;
+			mempool_free(delayed, dc->delayed_pool);
+			continue;
+		}
+
+		if (!start_timer) {
+			start_timer = 1;
+			next_expires = delayed->expires;
+		} else
+			next_expires = min(next_expires, delayed->expires);
+	}
+
+	mutex_unlock(&delayed_bios_lock);
+
+	if (start_timer)
+		queue_timeout(dc, next_expires);
+
+	return bio_list_get(&flush_bios);
+}
+
+static void flush_expired_bios(struct work_struct *work)
+{
+	struct delay_c *dc;
+
+	dc = container_of(work, struct delay_c, flush_expired_bios);
+	flush_bios(flush_delayed_bios(dc, 0));
+}
+
+/*
+ * Mapping parameters:
+ *    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+ *
+ * With separate write parameters, the first set is only used for reads.
+ * Delays are specified in milliseconds.
+ */
+static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct delay_c *dc;
+	unsigned long long tmpll;
+
+	if (argc != 3 && argc != 6) {
+		ti->error = "requires exactly 3 or 6 arguments";
+		return -EINVAL;
+	}
+
+	dc = kmalloc(sizeof(*dc), GFP_KERNEL);
+	if (!dc) {
+		ti->error = "Cannot allocate context";
+		return -ENOMEM;
+	}
+
+	dc->reads = dc->writes = 0;
+
+	if (sscanf(argv[1], "%llu", &tmpll) != 1) {
+		ti->error = "Invalid device sector";
+		goto bad;
+	}
+	dc->start_read = tmpll;
+
+	if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
+		ti->error = "Invalid delay";
+		goto bad;
+	}
+
+	if (dm_get_device(ti, argv[0], dc->start_read, ti->len,
+			  dm_table_get_mode(ti->table), &dc->dev_read)) {
+		ti->error = "Device lookup failed";
+		goto bad;
+	}
+
+	dc->dev_write = NULL;
+	if (argc == 3)
+		goto out;
+
+	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+		ti->error = "Invalid write device sector";
+		goto bad_dev_read;
+	}
+	dc->start_write = tmpll;
+
+	if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
+		ti->error = "Invalid write delay";
+		goto bad_dev_read;
+	}
+
+	if (dm_get_device(ti, argv[3], dc->start_write, ti->len,
+			  dm_table_get_mode(ti->table), &dc->dev_write)) {
+		ti->error = "Write device lookup failed";
+		goto bad_dev_read;
+	}
+
+out:
+	dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
+	if (!dc->delayed_pool) {
+		DMERR("Couldn't create delayed bio pool.");
+		goto bad_dev_write;
+	}
+
+	setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
+
+	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
+	INIT_LIST_HEAD(&dc->delayed_bios);
+	mutex_init(&dc->timer_lock);
+	atomic_set(&dc->may_delay, 1);
+
+	ti->num_flush_requests = 1;
+	ti->private = dc;
+	return 0;
+
+bad_dev_write:
+	if (dc->dev_write)
+		dm_put_device(ti, dc->dev_write);
+bad_dev_read:
+	dm_put_device(ti, dc->dev_read);
+bad:
+	kfree(dc);
+	return -EINVAL;
+}
+
+static void delay_dtr(struct dm_target *ti)
+{
+	struct delay_c *dc = ti->private;
+
+	flush_workqueue(kdelayd_wq);
+
+	dm_put_device(ti, dc->dev_read);
+
+	if (dc->dev_write)
+		dm_put_device(ti, dc->dev_write);
+
+	mempool_destroy(dc->delayed_pool);
+	kfree(dc);
+}
+
+static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
+{
+	struct dm_delay_info *delayed;
+	unsigned long expires = 0;
+
+	if (!delay || !atomic_read(&dc->may_delay))
+		return 1;
+
+	delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);
+
+	delayed->context = dc;
+	delayed->bio = bio;
+	delayed->expires = expires = jiffies + (delay * HZ / 1000);
+
+	mutex_lock(&delayed_bios_lock);
+
+	if (bio_data_dir(bio) == WRITE)
+		dc->writes++;
+	else
+		dc->reads++;
+
+	list_add_tail(&delayed->list, &dc->delayed_bios);
+
+	mutex_unlock(&delayed_bios_lock);
+
+	queue_timeout(dc, expires);
+
+	return 0;
+}
+
+static void delay_presuspend(struct dm_target *ti)
+{
+	struct delay_c *dc = ti->private;
+
+	atomic_set(&dc->may_delay, 0);
+	del_timer_sync(&dc->delay_timer);
+	flush_bios(flush_delayed_bios(dc, 1));
+}
+
+static void delay_resume(struct dm_target *ti)
+{
+	struct delay_c *dc = ti->private;
+
+	atomic_set(&dc->may_delay, 1);
+}
+
+static int delay_map(struct dm_target *ti, struct bio *bio,
+		     union map_info *map_context)
+{
+	struct delay_c *dc = ti->private;
+
+	if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
+		bio->bi_bdev = dc->dev_write->bdev;
+		if (bio_sectors(bio))
+			bio->bi_sector = dc->start_write +
+					 (bio->bi_sector - ti->begin);
+
+		return delay_bio(dc, dc->write_delay, bio);
+	}
+
+	bio->bi_bdev = dc->dev_read->bdev;
+	bio->bi_sector = dc->start_read +
+			 (bio->bi_sector - ti->begin);
+
+	return delay_bio(dc, dc->read_delay, bio);
+}
+
+static int delay_status(struct dm_target *ti, status_type_t type,
+			char *result, unsigned maxlen)
+{
+	struct delay_c *dc = ti->private;
+	int sz = 0;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%u %u", dc->reads, dc->writes);
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%s %llu %u", dc->dev_read->name,
+		       (unsigned long long) dc->start_read,
+		       dc->read_delay);
+		if (dc->dev_write)
+			DMEMIT(" %s %llu %u", dc->dev_write->name,
+			       (unsigned long long) dc->start_write,
+			       dc->write_delay);
+		break;
+	}
+
+	return 0;
+}
+
+static int delay_iterate_devices(struct dm_target *ti,
+				 iterate_devices_callout_fn fn, void *data)
+{
+	struct delay_c *dc = ti->private;
+	int ret = 0;
+
+	ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
+	if (ret)
+		goto out;
+
+	if (dc->dev_write)
+		ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
+
+out:
+	return ret;
+}
+
+static struct target_type delay_target = {
+	.name	     = "delay",
+	.version     = {1, 1, 0},
+	.module      = THIS_MODULE,
+	.ctr	     = delay_ctr,
+	.dtr	     = delay_dtr,
+	.map	     = delay_map,
+	.presuspend  = delay_presuspend,
+	.resume	     = delay_resume,
+	.status	     = delay_status,
+	.iterate_devices = delay_iterate_devices,
+};
+
+static int __init dm_delay_init(void)
+{
+	int r = -ENOMEM;
+
+	kdelayd_wq = create_workqueue("kdelayd");
+	if (!kdelayd_wq) {
+		DMERR("Couldn't start kdelayd");
+		goto bad_queue;
+	}
+
+	delayed_cache = KMEM_CACHE(dm_delay_info, 0);
+	if (!delayed_cache) {
+		DMERR("Couldn't create delayed bio cache.");
+		goto bad_memcache;
+	}
+
+	r = dm_register_target(&delay_target);
+	if (r < 0) {
+		DMERR("register failed %d", r);
+		goto bad_register;
+	}
+
+	return 0;
+
+bad_register:
+	kmem_cache_destroy(delayed_cache);
+bad_memcache:
+	destroy_workqueue(kdelayd_wq);
+bad_queue:
+	return r;
+}
+
+static void __exit dm_delay_exit(void)
+{
+	dm_unregister_target(&delay_target);
+	kmem_cache_destroy(delayed_cache);
+	destroy_workqueue(kdelayd_wq);
+}
+
+/* Module hooks */
+module_init(dm_delay_init);
+module_exit(dm_delay_exit);
+
+MODULE_DESCRIPTION(DM_NAME " delay target");
+MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/dm-exception-store.c
+++ b/kernel/drivers/md/dm-exception-store.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX "snapshot exception stores"
+
+static LIST_HEAD(_exception_store_types);
+static DEFINE_SPINLOCK(_lock);
+
+static struct dm_exception_store_type *__find_exception_store_type(const char *name)
+{
+	struct dm_exception_store_type *type;
+
+	list_for_each_entry(type, &_exception_store_types, list)
+		if (!strcmp(name, type->name))
+			return type;
+
+	return NULL;
+}
+
+static struct dm_exception_store_type *_get_exception_store_type(const char *name)
+{
+	struct dm_exception_store_type *type;
+
+	spin_lock(&_lock);
+
+	type = __find_exception_store_type(name);
+
+	if (type && !try_module_get(type->module))
+		type = NULL;
+
+	spin_unlock(&_lock);
+
+	return type;
+}
+
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dm_exception_store_type by name.  If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Exstore modules are named "dm-exstore-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-exstore-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-shared", it would search
+ * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'.
+ *
+ * 'dm-exception-store-<type_name>' is too long of a name in my
+ * opinion, which is why I've chosen to have the files
+ * containing exception store implementations be 'dm-exstore-<type_name>'.
+ * If you want your module to be autoloaded, you will follow this
+ * naming convention.
+ *
+ * Returns: dm_exception_store_type* on success, NULL on failure
+ */
+static struct dm_exception_store_type *get_type(const char *type_name)
+{
+	char *p, *type_name_dup;
+	struct dm_exception_store_type *type;
+
+	type = _get_exception_store_type(type_name);
+	if (type)
+		return type;
+
+	type_name_dup = kstrdup(type_name, GFP_KERNEL);
+	if (!type_name_dup) {
+		DMERR("No memory left to attempt load for \"%s\"", type_name);
+		return NULL;
+	}
+
+	while (request_module("dm-exstore-%s", type_name_dup) ||
+	       !(type = _get_exception_store_type(type_name))) {
+		p = strrchr(type_name_dup, '-');
+		if (!p)
+			break;
+		p[0] = '\0';
+	}
+
+	if (!type)
+		DMWARN("Module for exstore type \"%s\" not found.", type_name);
+
+	kfree(type_name_dup);
+
+	return type;
+}
+
+static void put_type(struct dm_exception_store_type *type)
+{
+	spin_lock(&_lock);
+	module_put(type->module);
+	spin_unlock(&_lock);
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type)
+{
+	int r = 0;
+
+	spin_lock(&_lock);
+	if (!__find_exception_store_type(type->name))
+		list_add(&type->list, &_exception_store_types);
+	else
+		r = -EEXIST;
+	spin_unlock(&_lock);
+
+	return r;
+}
+EXPORT_SYMBOL(dm_exception_store_type_register);
+
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
+{
+	spin_lock(&_lock);
+
+	if (!__find_exception_store_type(type->name)) {
+		spin_unlock(&_lock);
+		return -EINVAL;
+	}
+
+	list_del(&type->list);
+
+	spin_unlock(&_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(dm_exception_store_type_unregister);
+
+static int set_chunk_size(struct dm_exception_store *store,
+			  const char *chunk_size_arg, char **error)
+{
+	unsigned long chunk_size_ulong;
+	char *value;
+
+	chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
+	if (*chunk_size_arg == '\0' || *value != '\0' ||
+	    chunk_size_ulong > UINT_MAX) {
+		*error = "Invalid chunk size";
+		return -EINVAL;
+	}
+
+	if (!chunk_size_ulong) {
+		store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
+		return 0;
+	}
+
+	return dm_exception_store_set_chunk_size(store,
+						 (unsigned) chunk_size_ulong,
+						 error);
+}
+
+int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
+				      unsigned chunk_size,
+				      char **error)
+{
+	/* Check chunk_size is a power of 2 */
+	if (!is_power_of_2(chunk_size)) {
+		*error = "Chunk size is not a power of 2";
+		return -EINVAL;
+	}
+
+	/* Validate the chunk size against the device block size */
+	if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
+		*error = "Chunk size is not a multiple of device blocksize";
+		return -EINVAL;
+	}
+
+	if (chunk_size > INT_MAX >> SECTOR_SHIFT) {
+		*error = "Chunk size is too high";
+		return -EINVAL;
+	}
+
+	store->chunk_size = chunk_size;
+	store->chunk_mask = chunk_size - 1;
+	store->chunk_shift = ffs(chunk_size) - 1;
+
+	return 0;
+}
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+			      unsigned *args_used,
+			      struct dm_exception_store **store)
+{
+	int r = 0;
+	struct dm_exception_store_type *type = NULL;
+	struct dm_exception_store *tmp_store;
+	char persistent;
+
+	if (argc < 3) {
+		ti->error = "Insufficient exception store arguments";
+		return -EINVAL;
+	}
+
+	tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL);
+	if (!tmp_store) {
+		ti->error = "Exception store allocation failed";
+		return -ENOMEM;
+	}
+
+	persistent = toupper(*argv[1]);
+	if (persistent == 'P')
+		type = get_type("P");
+	else if (persistent == 'N')
+		type = get_type("N");
+	else {
+		ti->error = "Persistent flag is not P or N";
+		r = -EINVAL;
+		goto bad_type;
+	}
+
+	if (!type) {
+		ti->error = "Exception store type not recognised";
+		r = -EINVAL;
+		goto bad_type;
+	}
+
+	tmp_store->type = type;
+	tmp_store->ti = ti;
+
+	r = dm_get_device(ti, argv[0], 0, 0,
+			  FMODE_READ | FMODE_WRITE, &tmp_store->cow);
+	if (r) {
+		ti->error = "Cannot get COW device";
+		goto bad_cow;
+	}
+
+	r = set_chunk_size(tmp_store, argv[2], &ti->error);
+	if (r)
+		goto bad_ctr;
+
+	r = type->ctr(tmp_store, 0, NULL);
+	if (r) {
+		ti->error = "Exception store type constructor failed";
+		goto bad_ctr;
+	}
+
+	*args_used = 3;
+	*store = tmp_store;
+	return 0;
+
+bad_ctr:
+	dm_put_device(ti, tmp_store->cow);
+bad_cow:
+	put_type(type);
+bad_type:
+	kfree(tmp_store);
+	return r;
+}
+EXPORT_SYMBOL(dm_exception_store_create);
+
+void dm_exception_store_destroy(struct dm_exception_store *store)
+{
+	store->type->dtr(store);
+	dm_put_device(store->ti, store->cow);
+	put_type(store->type);
+	kfree(store);
+}
+EXPORT_SYMBOL(dm_exception_store_destroy);
+
+int dm_exception_store_init(void)
+{
+	int r;
+
+	r = dm_transient_snapshot_init();
+	if (r) {
+		DMERR("Unable to register transient exception store type.");
+		goto transient_fail;
+	}
+
+	r = dm_persistent_snapshot_init();
+	if (r) {
+		DMERR("Unable to register persistent exception store type");
+		goto persistent_fail;
+	}
+
+	return 0;
+
+persistent_fail:
+	dm_persistent_snapshot_exit();
+transient_fail:
+	return r;
+}
+
+void dm_exception_store_exit(void)
+{
+	dm_persistent_snapshot_exit();
+	dm_transient_snapshot_exit();
+}
--- a/kernel/drivers/md/dm-exception-store.h
+++ b/kernel/drivers/md/dm-exception-store.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * Device-mapper snapshot exception store.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_EXCEPTION_STORE
+#define _LINUX_DM_EXCEPTION_STORE
+
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 32k - 512k.
+ */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
+ * of chunks that follow contiguously.  Remaining bits hold the number of the
+ * chunk within the device.
+ */
+struct dm_snap_exception {
+	struct list_head hash_list;
+
+	chunk_t old_chunk;
+	chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the meta/layout of exception stores (the
+ * COW device).
+ */
+struct dm_exception_store;
+struct dm_exception_store_type {
+	const char *name;
+	struct module *module;
+
+	int (*ctr) (struct dm_exception_store *store,
+		    unsigned argc, char **argv);
+
+	/*
+	 * Destroys this object when you've finished with it.
+	 */
+	void (*dtr) (struct dm_exception_store *store);
+
+	/*
+	 * The target shouldn't read the COW device until this is
+	 * called.  As exceptions are read from the COW, they are
+	 * reported back via the callback.
+	 */
+	int (*read_metadata) (struct dm_exception_store *store,
+			      int (*callback)(void *callback_context,
+					      chunk_t old, chunk_t new),
+			      void *callback_context);
+
+	/*
+	 * Find somewhere to store the next exception.
+	 */
+	int (*prepare_exception) (struct dm_exception_store *store,
+				  struct dm_snap_exception *e);
+
+	/*
+	 * Update the metadata with this exception.
+	 */
+	void (*commit_exception) (struct dm_exception_store *store,
+				  struct dm_snap_exception *e,
+				  void (*callback) (void *, int success),
+				  void *callback_context);
+
+	/*
+	 * The snapshot is invalid, note this in the metadata.
+	 */
+	void (*drop_snapshot) (struct dm_exception_store *store);
+
+	unsigned (*status) (struct dm_exception_store *store,
+			    status_type_t status, char *result,
+			    unsigned maxlen);
+
+	/*
+	 * Return how full the snapshot is.
+	 */
+	void (*fraction_full) (struct dm_exception_store *store,
+			       sector_t *numerator,
+			       sector_t *denominator);
+
+	/* For internal device-mapper use only. */
+	struct list_head list;
+};
+
+struct dm_exception_store {
+	struct dm_exception_store_type *type;
+	struct dm_target *ti;
+
+	struct dm_dev *cow;
+
+	/* Size of data blocks saved - must be a power of 2 */
+	unsigned chunk_size;
+	unsigned chunk_mask;
+	unsigned chunk_shift;
+
+	void *context;
+};
+
+/*
+ * Funtions to manipulate consecutive chunks
+ */
+#  if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
+#    define DM_CHUNK_CONSECUTIVE_BITS 8
+#    define DM_CHUNK_NUMBER_BITS 56
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
+
+	BUG_ON(!dm_consecutive_chunk_count(e));
+}
+
+#  else
+#    define DM_CHUNK_CONSECUTIVE_BITS 0
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk;
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return 0;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+}
+
+#  endif
+
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(struct block_device *bdev)
+{
+	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
+				      sector_t sector)
+{
+	return sector >> store->chunk_shift;
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type);
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
+
+int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
+				      unsigned chunk_size,
+				      char **error);
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+			      unsigned *args_used,
+			      struct dm_exception_store **store);
+void dm_exception_store_destroy(struct dm_exception_store *store);
+
+int dm_exception_store_init(void);
+void dm_exception_store_exit(void);
+
+/*
+ * Two exception store implementations.
+ */
+int dm_persistent_snapshot_init(void);
+void dm_persistent_snapshot_exit(void);
+
+int dm_transient_snapshot_init(void);
+void dm_transient_snapshot_exit(void);
+
+#endif /* _LINUX_DM_EXCEPTION_STORE */
--- a/kernel/drivers/md/dm-io.c
+++ b/kernel/drivers/md/dm-io.c
@@ -0,0 +1,474 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ * Copyright (C) 2006 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/bio.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+
+struct dm_io_client {
+	mempool_t *pool;
+	struct bio_set *bios;
+};
+
+/* FIXME: can we shrink this ? */
+struct io {
+	unsigned long error_bits;
+	unsigned long eopnotsupp_bits;
+	atomic_t count;
+	struct task_struct *sleeper;
+	struct dm_io_client *client;
+	io_notify_fn callback;
+	void *context;
+};
+
+/*
+ * io contexts are only dynamically allocated for asynchronous
+ * io.  Since async io is likely to be the majority of io we'll
+ * have the same number of io contexts as bios! (FIXME: must reduce this).
+ */
+
+static unsigned int pages_to_ios(unsigned int pages)
+{
+	return 4 * pages;	/* too many ? */
+}
+
+/*
+ * Create a client with mempool and bioset.
+ */
+struct dm_io_client *dm_io_client_create(unsigned num_pages)
+{
+	unsigned ios = pages_to_ios(num_pages);
+	struct dm_io_client *client;
+
+	client = kmalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return ERR_PTR(-ENOMEM);
+
+	client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
+	if (!client->pool)
+		goto bad;
+
+	client->bios = bioset_create(16, 0);
+	if (!client->bios)
+		goto bad;
+
+	return client;
+
+   bad:
+	if (client->pool)
+		mempool_destroy(client->pool);
+	kfree(client);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(dm_io_client_create);
+
+int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
+{
+	return mempool_resize(client->pool, pages_to_ios(num_pages),
+			      GFP_KERNEL);
+}
+EXPORT_SYMBOL(dm_io_client_resize);
+
+void dm_io_client_destroy(struct dm_io_client *client)
+{
+	mempool_destroy(client->pool);
+	bioset_free(client->bios);
+	kfree(client);
+}
+EXPORT_SYMBOL(dm_io_client_destroy);
+
+/*-----------------------------------------------------------------
+ * We need to keep track of which region a bio is doing io for.
+ * In order to save a memory allocation we store this the last
+ * bvec which we know is unused (blech).
+ * XXX This is ugly and can OOPS with some configs... find another way.
+ *---------------------------------------------------------------*/
+static inline void bio_set_region(struct bio *bio, unsigned region)
+{
+	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
+}
+
+static inline unsigned bio_get_region(struct bio *bio)
+{
+	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
+}
+
+/*-----------------------------------------------------------------
+ * We need an io object to keep track of the number of bios that
+ * have been dispatched for a particular io.
+ *---------------------------------------------------------------*/
+static void dec_count(struct io *io, unsigned int region, int error)
+{
+	if (error) {
+		set_bit(region, &io->error_bits);
+		if (error == -EOPNOTSUPP)
+			set_bit(region, &io->eopnotsupp_bits);
+	}
+
+	if (atomic_dec_and_test(&io->count)) {
+		if (io->sleeper)
+			wake_up_process(io->sleeper);
+
+		else {
+			unsigned long r = io->error_bits;
+			io_notify_fn fn = io->callback;
+			void *context = io->context;
+
+			mempool_free(io, io->client->pool);
+			fn(r, context);
+		}
+	}
+}
+
+static void endio(struct bio *bio, int error)
+{
+	struct io *io;
+	unsigned region;
+
+	if (error && bio_data_dir(bio) == READ)
+		zero_fill_bio(bio);
+
+	/*
+	 * The bio destructor in bio_put() may use the io object.
+	 */
+	io = bio->bi_private;
+	region = bio_get_region(bio);
+
+	bio->bi_max_vecs++;
+	bio_put(bio);
+
+	dec_count(io, region, error);
+}
+
+/*-----------------------------------------------------------------
+ * These little objects provide an abstraction for getting a new
+ * destination page for io.
+ *---------------------------------------------------------------*/
+struct dpages {
+	void (*get_page)(struct dpages *dp,
+			 struct page **p, unsigned long *len, unsigned *offset);
+	void (*next_page)(struct dpages *dp);
+
+	unsigned context_u;
+	void *context_ptr;
+};
+
+/*
+ * Functions for getting the pages from a list.
+ */
+static void list_get_page(struct dpages *dp,
+		  struct page **p, unsigned long *len, unsigned *offset)
+{
+	unsigned o = dp->context_u;
+	struct page_list *pl = (struct page_list *) dp->context_ptr;
+
+	*p = pl->page;
+	*len = PAGE_SIZE - o;
+	*offset = o;
+}
+
+static void list_next_page(struct dpages *dp)
+{
+	struct page_list *pl = (struct page_list *) dp->context_ptr;
+	dp->context_ptr = pl->next;
+	dp->context_u = 0;
+}
+
+static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
+{
+	dp->get_page = list_get_page;
+	dp->next_page = list_next_page;
+	dp->context_u = offset;
+	dp->context_ptr = pl;
+}
+
+/*
+ * Functions for getting the pages from a bvec.
+ */
+static void bvec_get_page(struct dpages *dp,
+		  struct page **p, unsigned long *len, unsigned *offset)
+{
+	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
+	*p = bvec->bv_page;
+	*len = bvec->bv_len;
+	*offset = bvec->bv_offset;
+}
+
+static void bvec_next_page(struct dpages *dp)
+{
+	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
+	dp->context_ptr = bvec + 1;
+}
+
+static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
+{
+	dp->get_page = bvec_get_page;
+	dp->next_page = bvec_next_page;
+	dp->context_ptr = bvec;
+}
+
+/*
+ * Functions for getting the pages from a VMA.
+ */
+static void vm_get_page(struct dpages *dp,
+		 struct page **p, unsigned long *len, unsigned *offset)
+{
+	*p = vmalloc_to_page(dp->context_ptr);
+	*offset = dp->context_u;
+	*len = PAGE_SIZE - dp->context_u;
+}
+
+static void vm_next_page(struct dpages *dp)
+{
+	dp->context_ptr += PAGE_SIZE - dp->context_u;
+	dp->context_u = 0;
+}
+
+static void vm_dp_init(struct dpages *dp, void *data)
+{
+	dp->get_page = vm_get_page;
+	dp->next_page = vm_next_page;
+	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
+	dp->context_ptr = data;
+}
+
+static void dm_bio_destructor(struct bio *bio)
+{
+	struct io *io = bio->bi_private;
+
+	bio_free(bio, io->client->bios);
+}
+
+/*
+ * Functions for getting the pages from kernel memory.
+ */
+static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
+			unsigned *offset)
+{
+	*p = virt_to_page(dp->context_ptr);
+	*offset = dp->context_u;
+	*len = PAGE_SIZE - dp->context_u;
+}
+
+static void km_next_page(struct dpages *dp)
+{
+	dp->context_ptr += PAGE_SIZE - dp->context_u;
+	dp->context_u = 0;
+}
+
+static void km_dp_init(struct dpages *dp, void *data)
+{
+	dp->get_page = km_get_page;
+	dp->next_page = km_next_page;
+	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
+	dp->context_ptr = data;
+}
+
+/*-----------------------------------------------------------------
+ * IO routines that accept a list of pages.
+ *---------------------------------------------------------------*/
+static void do_region(int rw, unsigned region, struct dm_io_region *where,
+		      struct dpages *dp, struct io *io)
+{
+	struct bio *bio;
+	struct page *page;
+	unsigned long len;
+	unsigned offset;
+	unsigned num_bvecs;
+	sector_t remaining = where->count;
+
+	while (remaining) {
+		/*
+		 * Allocate a suitably sized-bio: we add an extra
+		 * bvec for bio_get/set_region() and decrement bi_max_vecs
+		 * to hide it from bio_add_page().
+		 */
+		num_bvecs = dm_sector_div_up(remaining,
+					     (PAGE_SIZE >> SECTOR_SHIFT));
+		num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
+				      num_bvecs);
+		if (unlikely(num_bvecs > BIO_MAX_PAGES))
+			num_bvecs = BIO_MAX_PAGES;
+		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
+		bio->bi_sector = where->sector + (where->count - remaining);
+		bio->bi_bdev = where->bdev;
+		bio->bi_end_io = endio;
+		bio->bi_private = io;
+		bio->bi_destructor = dm_bio_destructor;
+		bio->bi_max_vecs--;
+		bio_set_region(bio, region);
+
+		/*
+		 * Try and add as many pages as possible.
+		 */
+		while (remaining) {
+			dp->get_page(dp, &page, &len, &offset);
+			len = min(len, to_bytes(remaining));
+			if (!bio_add_page(bio, page, len, offset))
+				break;
+
+			offset = 0;
+			remaining -= to_sector(len);
+			dp->next_page(dp);
+		}
+
+		atomic_inc(&io->count);
+		submit_bio(rw, bio);
+	}
+}
+
+static void dispatch_io(int rw, unsigned int num_regions,
+			struct dm_io_region *where, struct dpages *dp,
+			struct io *io, int sync)
+{
+	int i;
+	struct dpages old_pages = *dp;
+
+	if (sync)
+		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+
+	/*
+	 * For multiple regions we need to be careful to rewind
+	 * the dp object for each call to do_region.
+	 */
+	for (i = 0; i < num_regions; i++) {
+		*dp = old_pages;
+		if (where[i].count)
+			do_region(rw, i, where + i, dp, io);
+	}
+
+	/*
+	 * Drop the extra reference that we were holding to avoid
+	 * the io being completed too early.
+	 */
+	dec_count(io, 0, 0);
+}
+
+static int sync_io(struct dm_io_client *client, unsigned int num_regions,
+		   struct dm_io_region *where, int rw, struct dpages *dp,
+		   unsigned long *error_bits)
+{
+	struct io io;
+
+	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
+		WARN_ON(1);
+		return -EIO;
+	}
+
+retry:
+	io.error_bits = 0;
+	io.eopnotsupp_bits = 0;
+	atomic_set(&io.count, 1); /* see dispatch_io() */
+	io.sleeper = current;
+	io.client = client;
+
+	dispatch_io(rw, num_regions, where, dp, &io, 1);
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+
+		if (!atomic_read(&io.count))
+			break;
+
+		io_schedule();
+	}
+	set_current_state(TASK_RUNNING);
+
+	if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
+		rw &= ~(1 << BIO_RW_BARRIER);
+		goto retry;
+	}
+
+	if (error_bits)
+		*error_bits = io.error_bits;
+
+	return io.error_bits ? -EIO : 0;
+}
+
+static int async_io(struct dm_io_client *client, unsigned int num_regions,
+		    struct dm_io_region *where, int rw, struct dpages *dp,
+		    io_notify_fn fn, void *context)
+{
+	struct io *io;
+
+	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
+		WARN_ON(1);
+		fn(1, context);
+		return -EIO;
+	}
+
+	io = mempool_alloc(client->pool, GFP_NOIO);
+	io->error_bits = 0;
+	io->eopnotsupp_bits = 0;
+	atomic_set(&io->count, 1); /* see dispatch_io() */
+	io->sleeper = NULL;
+	io->client = client;
+	io->callback = fn;
+	io->context = context;
+
+	dispatch_io(rw, num_regions, where, dp, io, 0);
+	return 0;
+}
+
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
+{
+	/* Set up dpages based on memory type */
+	switch (io_req->mem.type) {
+	case DM_IO_PAGE_LIST:
+		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
+		break;
+
+	case DM_IO_BVEC:
+		bvec_dp_init(dp, io_req->mem.ptr.bvec);
+		break;
+
+	case DM_IO_VMA:
+		vm_dp_init(dp, io_req->mem.ptr.vma);
+		break;
+
+	case DM_IO_KMEM:
+		km_dp_init(dp, io_req->mem.ptr.addr);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * New collapsed (a)synchronous interface.
+ *
+ * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
+ * the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in
+ * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
+ * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
+ */
+int dm_io(struct dm_io_request *io_req, unsigned num_regions,
+	  struct dm_io_region *where, unsigned long *sync_error_bits)
+{
+	int r;
+	struct dpages dp;
+
+	r = dp_init(io_req, &dp);
+	if (r)
+		return r;
+
+	if (!io_req->notify.fn)
+		return sync_io(io_req->client, num_regions, where,
+			       io_req->bi_rw, &dp, sync_error_bits);
+
+	return async_io(io_req->client, num_regions, where, io_req->bi_rw,
+			&dp, io_req->notify.fn, io_req->notify.context);
+}
+EXPORT_SYMBOL(dm_io);
--- a/kernel/drivers/md/dm-ioctl.c
+++ b/kernel/drivers/md/dm-ioctl.c
--- a/kernel/drivers/md/dm-kcopyd.c
+++ b/kernel/drivers/md/dm-kcopyd.c
@@ -0,0 +1,673 @@
+/*
+ * Copyright (C) 2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ *
+ * Kcopyd provides a simple interface for copying an area of one
+ * block-device to one or more other block-devices, with an asynchronous
+ * completion notification.
+ */
+
+#include <linux/types.h>
+#include <asm/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-kcopyd.h>
+
+#include "dm.h"
+
+/*-----------------------------------------------------------------
+ * Each kcopyd client has its own little pool of preallocated
+ * pages for kcopyd io.
+ *---------------------------------------------------------------*/
+struct dm_kcopyd_client {
+	spinlock_t lock;
+	struct page_list *pages;
+	unsigned int nr_pages;
+	unsigned int nr_free_pages;
+
+	struct dm_io_client *io_client;
+
+	wait_queue_head_t destroyq;
+	atomic_t nr_jobs;
+
+	mempool_t *job_pool;
+
+	struct workqueue_struct *kcopyd_wq;
+	struct work_struct kcopyd_work;
+
+/*
+ * We maintain three lists of jobs:
+ *
+ * i)   jobs waiting for pages
+ * ii)  jobs that have pages, and are waiting for the io to be issued.
+ * iii) jobs that have completed.
+ *
+ * All three of these are protected by job_lock.
+ */
+	spinlock_t job_lock;
+	struct list_head complete_jobs;
+	struct list_head io_jobs;
+	struct list_head pages_jobs;
+};
+
+static void wake(struct dm_kcopyd_client *kc)
+{
+	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
+}
+
+static struct page_list *alloc_pl(void)
+{
+	struct page_list *pl;
+
+	pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+	if (!pl)
+		return NULL;
+
+	pl->page = alloc_page(GFP_KERNEL);
+	if (!pl->page) {
+		kfree(pl);
+		return NULL;
+	}
+
+	return pl;
+}
+
+static void free_pl(struct page_list *pl)
+{
+	__free_page(pl->page);
+	kfree(pl);
+}
+
+static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
+			    unsigned int nr, struct page_list **pages)
+{
+	struct page_list *pl;
+
+	spin_lock(&kc->lock);
+	if (kc->nr_free_pages < nr) {
+		spin_unlock(&kc->lock);
+		return -ENOMEM;
+	}
+
+	kc->nr_free_pages -= nr;
+	for (*pages = pl = kc->pages; --nr; pl = pl->next)
+		;
+
+	kc->pages = pl->next;
+	pl->next = NULL;
+
+	spin_unlock(&kc->lock);
+
+	return 0;
+}
+
+static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
+{
+	struct page_list *cursor;
+
+	spin_lock(&kc->lock);
+	for (cursor = pl; cursor->next; cursor = cursor->next)
+		kc->nr_free_pages++;
+
+	kc->nr_free_pages++;
+	cursor->next = kc->pages;
+	kc->pages = pl;
+	spin_unlock(&kc->lock);
+}
+
+/*
+ * These three functions resize the page pool.
+ */
+static void drop_pages(struct page_list *pl)
+{
+	struct page_list *next;
+
+	while (pl) {
+		next = pl->next;
+		free_pl(pl);
+		pl = next;
+	}
+}
+
+static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr)
+{
+	unsigned int i;
+	struct page_list *pl = NULL, *next;
+
+	for (i = 0; i < nr; i++) {
+		next = alloc_pl();
+		if (!next) {
+			if (pl)
+				drop_pages(pl);
+			return -ENOMEM;
+		}
+		next->next = pl;
+		pl = next;
+	}
+
+	kcopyd_put_pages(kc, pl);
+	kc->nr_pages += nr;
+	return 0;
+}
+
+static void client_free_pages(struct dm_kcopyd_client *kc)
+{
+	BUG_ON(kc->nr_free_pages != kc->nr_pages);
+	drop_pages(kc->pages);
+	kc->pages = NULL;
+	kc->nr_free_pages = kc->nr_pages = 0;
+}
+
+/*-----------------------------------------------------------------
+ * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
+ * for this reason we use a mempool to prevent the client from
+ * ever having to do io (which could cause a deadlock).
+ *---------------------------------------------------------------*/
+struct kcopyd_job {
+	struct dm_kcopyd_client *kc;
+	struct list_head list;
+	unsigned long flags;
+
+	/*
+	 * Error state of the job.
+	 */
+	int read_err;
+	unsigned long write_err;
+
+	/*
+	 * Either READ or WRITE
+	 */
+	int rw;
+	struct dm_io_region source;
+
+	/*
+	 * The destinations for the transfer.
+	 */
+	unsigned int num_dests;
+	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
+
+	sector_t offset;
+	unsigned int nr_pages;
+	struct page_list *pages;
+
+	/*
+	 * Set this to ensure you are notified when the job has
+	 * completed.  'context' is for callback to use.
+	 */
+	dm_kcopyd_notify_fn fn;
+	void *context;
+
+	/*
+	 * These fields are only used if the job has been split
+	 * into more manageable parts.
+	 */
+	struct mutex lock;
+	atomic_t sub_jobs;
+	sector_t progress;
+};
+
+/* FIXME: this should scale with the number of pages */
+#define MIN_JOBS 512
+
+static struct kmem_cache *_job_cache;
+
+int __init dm_kcopyd_init(void)
+{
+	_job_cache = KMEM_CACHE(kcopyd_job, 0);
+	if (!_job_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void dm_kcopyd_exit(void)
+{
+	kmem_cache_destroy(_job_cache);
+	_job_cache = NULL;
+}
+
+/*
+ * Functions to push and pop a job onto the head of a given job
+ * list.
+ */
+static struct kcopyd_job *pop(struct list_head *jobs,
+			      struct dm_kcopyd_client *kc)
+{
+	struct kcopyd_job *job = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&kc->job_lock, flags);
+
+	if (!list_empty(jobs)) {
+		job = list_entry(jobs->next, struct kcopyd_job, list);
+		list_del(&job->list);
+	}
+	spin_unlock_irqrestore(&kc->job_lock, flags);
+
+	return job;
+}
+
+static void push(struct list_head *jobs, struct kcopyd_job *job)
+{
+	unsigned long flags;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	spin_lock_irqsave(&kc->job_lock, flags);
+	list_add_tail(&job->list, jobs);
+	spin_unlock_irqrestore(&kc->job_lock, flags);
+}
+
+
+static void push_head(struct list_head *jobs, struct kcopyd_job *job)
+{
+	unsigned long flags;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	spin_lock_irqsave(&kc->job_lock, flags);
+	list_add(&job->list, jobs);
+	spin_unlock_irqrestore(&kc->job_lock, flags);
+}
+
+/*
+ * These three functions process 1 item from the corresponding
+ * job list.
+ *
+ * They return:
+ * < 0: error
+ *   0: success
+ * > 0: can't process yet.
+ */
+static int run_complete_job(struct kcopyd_job *job)
+{
+	void *context = job->context;
+	int read_err = job->read_err;
+	unsigned long write_err = job->write_err;
+	dm_kcopyd_notify_fn fn = job->fn;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	if (job->pages)
+		kcopyd_put_pages(kc, job->pages);
+	mempool_free(job, kc->job_pool);
+	fn(read_err, write_err, context);
+
+	if (atomic_dec_and_test(&kc->nr_jobs))
+		wake_up(&kc->destroyq);
+
+	return 0;
+}
+
+static void complete_io(unsigned long error, void *context)
+{
+	struct kcopyd_job *job = (struct kcopyd_job *) context;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	if (error) {
+		if (job->rw == WRITE)
+			job->write_err |= error;
+		else
+			job->read_err = 1;
+
+		if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
+			push(&kc->complete_jobs, job);
+			wake(kc);
+			return;
+		}
+	}
+
+	if (job->rw == WRITE)
+		push(&kc->complete_jobs, job);
+
+	else {
+		job->rw = WRITE;
+		push(&kc->io_jobs, job);
+	}
+
+	wake(kc);
+}
+
+/*
+ * Request io on as many buffer heads as we can currently get for
+ * a particular job.
+ */
+static int run_io_job(struct kcopyd_job *job)
+{
+	int r;
+	struct dm_io_request io_req = {
+		.bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG),
+		.mem.type = DM_IO_PAGE_LIST,
+		.mem.ptr.pl = job->pages,
+		.mem.offset = job->offset,
+		.notify.fn = complete_io,
+		.notify.context = job,
+		.client = job->kc->io_client,
+	};
+
+	if (job->rw == READ)
+		r = dm_io(&io_req, 1, &job->source, NULL);
+	else
+		r = dm_io(&io_req, job->num_dests, job->dests, NULL);
+
+	return r;
+}
+
+static int run_pages_job(struct kcopyd_job *job)
+{
+	int r;
+
+	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
+				  PAGE_SIZE >> 9);
+	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+	if (!r) {
+		/* this job is ready for io */
+		push(&job->kc->io_jobs, job);
+		return 0;
+	}
+
+	if (r == -ENOMEM)
+		/* can't complete now */
+		return 1;
+
+	return r;
+}
+
+/*
+ * Run through a list for as long as possible.  Returns the count
+ * of successful jobs.
+ */
+static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
+			int (*fn) (struct kcopyd_job *))
+{
+	struct kcopyd_job *job;
+	int r, count = 0;
+
+	while ((job = pop(jobs, kc))) {
+
+		r = fn(job);
+
+		if (r < 0) {
+			/* error this rogue job */
+			if (job->rw == WRITE)
+				job->write_err = (unsigned long) -1L;
+			else
+				job->read_err = 1;
+			push(&kc->complete_jobs, job);
+			break;
+		}
+
+		if (r > 0) {
+			/*
+			 * We couldn't service this job ATM, so
+			 * push this job back onto the list.
+			 */
+			push_head(jobs, job);
+			break;
+		}
+
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * kcopyd does this every time it's woken up.
+ */
+static void do_work(struct work_struct *work)
+{
+	struct dm_kcopyd_client *kc = container_of(work,
+					struct dm_kcopyd_client, kcopyd_work);
+
+	/*
+	 * The order that these are called is *very* important.
+	 * complete jobs can free some pages for pages jobs.
+	 * Pages jobs when successful will jump onto the io jobs
+	 * list.  io jobs call wake when they complete and it all
+	 * starts again.
+	 */
+	process_jobs(&kc->complete_jobs, kc, run_complete_job);
+	process_jobs(&kc->pages_jobs, kc, run_pages_job);
+	process_jobs(&kc->io_jobs, kc, run_io_job);
+}
+
+/*
+ * If we are copying a small region we just dispatch a single job
+ * to do the copy, otherwise the io has to be split up into many
+ * jobs.
+ */
+static void dispatch_job(struct kcopyd_job *job)
+{
+	struct dm_kcopyd_client *kc = job->kc;
+	atomic_inc(&kc->nr_jobs);
+	push(&kc->pages_jobs, job);
+	wake(kc);
+}
+
+#define SUB_JOB_SIZE 128
+static void segment_complete(int read_err, unsigned long write_err,
+			     void *context)
+{
+	/* FIXME: tidy this function */
+	sector_t progress = 0;
+	sector_t count = 0;
+	struct kcopyd_job *job = (struct kcopyd_job *) context;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	mutex_lock(&job->lock);
+
+	/* update the error */
+	if (read_err)
+		job->read_err = 1;
+
+	if (write_err)
+		job->write_err |= write_err;
+
+	/*
+	 * Only dispatch more work if there hasn't been an error.
+	 */
+	if ((!job->read_err && !job->write_err) ||
+	    test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
+		/* get the next chunk of work */
+		progress = job->progress;
+		count = job->source.count - progress;
+		if (count) {
+			if (count > SUB_JOB_SIZE)
+				count = SUB_JOB_SIZE;
+
+			job->progress += count;
+		}
+	}
+	mutex_unlock(&job->lock);
+
+	if (count) {
+		int i;
+		struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
+							   GFP_NOIO);
+
+		*sub_job = *job;
+		sub_job->source.sector += progress;
+		sub_job->source.count = count;
+
+		for (i = 0; i < job->num_dests; i++) {
+			sub_job->dests[i].sector += progress;
+			sub_job->dests[i].count = count;
+		}
+
+		sub_job->fn = segment_complete;
+		sub_job->context = job;
+		dispatch_job(sub_job);
+
+	} else if (atomic_dec_and_test(&job->sub_jobs)) {
+
+		/*
+		 * Queue the completion callback to the kcopyd thread.
+		 *
+		 * Some callers assume that all the completions are called
+		 * from a single thread and don't race with each other.
+		 *
+		 * We must not call the callback directly here because this
+		 * code may not be executing in the thread.
+		 */
+		push(&kc->complete_jobs, job);
+		wake(kc);
+	}
+}
+
+/*
+ * Create some little jobs that will do the move between
+ * them.
+ */
+#define SPLIT_COUNT 8
+static void split_job(struct kcopyd_job *job)
+{
+	int i;
+
+	atomic_inc(&job->kc->nr_jobs);
+
+	atomic_set(&job->sub_jobs, SPLIT_COUNT);
+	for (i = 0; i < SPLIT_COUNT; i++)
+		segment_complete(0, 0u, job);
+}
+
+int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
+		   unsigned int num_dests, struct dm_io_region *dests,
+		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
+{
+	struct kcopyd_job *job;
+
+	/*
+	 * Allocate a new job.
+	 */
+	job = mempool_alloc(kc->job_pool, GFP_NOIO);
+
+	/*
+	 * set up for the read.
+	 */
+	job->kc = kc;
+	job->flags = flags;
+	job->read_err = 0;
+	job->write_err = 0;
+	job->rw = READ;
+
+	job->source = *from;
+
+	job->num_dests = num_dests;
+	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
+
+	job->offset = 0;
+	job->nr_pages = 0;
+	job->pages = NULL;
+
+	job->fn = fn;
+	job->context = context;
+
+	if (job->source.count < SUB_JOB_SIZE)
+		dispatch_job(job);
+
+	else {
+		mutex_init(&job->lock);
+		job->progress = 0;
+		split_job(job);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(dm_kcopyd_copy);
+
+/*
+ * Cancels a kcopyd job, eg. someone might be deactivating a
+ * mirror.
+ */
+#if 0
+int kcopyd_cancel(struct kcopyd_job *job, int block)
+{
+	/* FIXME: finish */
+	return -1;
+}
+#endif  /*  0  */
+
+/*-----------------------------------------------------------------
+ * Client setup
+ *---------------------------------------------------------------*/
+int dm_kcopyd_client_create(unsigned int nr_pages,
+			    struct dm_kcopyd_client **result)
+{
+	int r = -ENOMEM;
+	struct dm_kcopyd_client *kc;
+
+	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+	if (!kc)
+		return -ENOMEM;
+
+	spin_lock_init(&kc->lock);
+	spin_lock_init(&kc->job_lock);
+	INIT_LIST_HEAD(&kc->complete_jobs);
+	INIT_LIST_HEAD(&kc->io_jobs);
+	INIT_LIST_HEAD(&kc->pages_jobs);
+
+	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
+	if (!kc->job_pool)
+		goto bad_slab;
+
+	INIT_WORK(&kc->kcopyd_work, do_work);
+	kc->kcopyd_wq = create_singlethread_workqueue("kcopyd");
+	if (!kc->kcopyd_wq)
+		goto bad_workqueue;
+
+	kc->pages = NULL;
+	kc->nr_pages = kc->nr_free_pages = 0;
+	r = client_alloc_pages(kc, nr_pages);
+	if (r)
+		goto bad_client_pages;
+
+	kc->io_client = dm_io_client_create(nr_pages);
+	if (IS_ERR(kc->io_client)) {
+		r = PTR_ERR(kc->io_client);
+		goto bad_io_client;
+	}
+
+	init_waitqueue_head(&kc->destroyq);
+	atomic_set(&kc->nr_jobs, 0);
+
+	*result = kc;
+	return 0;
+
+bad_io_client:
+	client_free_pages(kc);
+bad_client_pages:
+	destroy_workqueue(kc->kcopyd_wq);
+bad_workqueue:
+	mempool_destroy(kc->job_pool);
+bad_slab:
+	kfree(kc);
+
+	return r;
+}
+EXPORT_SYMBOL(dm_kcopyd_client_create);
+
+void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
+{
+	/* Wait for completion of all jobs submitted by this client. */
+	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
+
+	BUG_ON(!list_empty(&kc->complete_jobs));
+	BUG_ON(!list_empty(&kc->io_jobs));
+	BUG_ON(!list_empty(&kc->pages_jobs));
+	destroy_workqueue(kc->kcopyd_wq);
+	dm_io_client_destroy(kc->io_client);
+	client_free_pages(kc);
+	mempool_destroy(kc->job_pool);
+	kfree(kc);
+}
+EXPORT_SYMBOL(dm_kcopyd_client_destroy);
--- a/kernel/drivers/md/dm-linear.c
+++ b/kernel/drivers/md/dm-linear.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2001-2003 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "linear"
+
+/*
+ * Linear: maps a linear range of a device.
+ */
+struct linear_c {
+	struct dm_dev *dev;
+	sector_t start;
+};
+
+/*
+ * Construct a linear mapping: <dev_path> <offset>
+ */
+static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct linear_c *lc;
+	unsigned long long tmp;
+
+	if (argc != 2) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+	if (lc == NULL) {
+		ti->error = "dm-linear: Cannot allocate linear context";
+		return -ENOMEM;
+	}
+
+	if (sscanf(argv[1], "%llu", &tmp) != 1) {
+		ti->error = "dm-linear: Invalid device sector";
+		goto bad;
+	}
+	lc->start = tmp;
+
+	if (dm_get_device(ti, argv[0], lc->start, ti->len,
+			  dm_table_get_mode(ti->table), &lc->dev)) {
+		ti->error = "dm-linear: Device lookup failed";
+		goto bad;
+	}
+
+	ti->num_flush_requests = 1;
+	ti->private = lc;
+	return 0;
+
+      bad:
+	kfree(lc);
+	return -EINVAL;
+}
+
+static void linear_dtr(struct dm_target *ti)
+{
+	struct linear_c *lc = (struct linear_c *) ti->private;
+
+	dm_put_device(ti, lc->dev);
+	kfree(lc);
+}
+
+static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
+{
+	struct linear_c *lc = ti->private;
+
+	return lc->start + (bi_sector - ti->begin);
+}
+
+static void linear_map_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct linear_c *lc = ti->private;
+
+	bio->bi_bdev = lc->dev->bdev;
+	if (bio_sectors(bio))
+		bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
+}
+
+static int linear_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	linear_map_bio(ti, bio);
+
+	return DM_MAPIO_REMAPPED;
+}
+
+static int linear_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	struct linear_c *lc = (struct linear_c *) ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s %llu", lc->dev->name,
+				(unsigned long long)lc->start);
+		break;
+	}
+	return 0;
+}
+
+static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
+			unsigned long arg)
+{
+	struct linear_c *lc = (struct linear_c *) ti->private;
+	return __blkdev_driver_ioctl(lc->dev->bdev, lc->dev->mode, cmd, arg);
+}
+
+static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+			struct bio_vec *biovec, int max_size)
+{
+	struct linear_c *lc = ti->private;
+	struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = lc->dev->bdev;
+	bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector);
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int linear_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct linear_c *lc = ti->private;
+
+	return fn(ti, lc->dev, lc->start, ti->len, data);
+}
+
+static struct target_type linear_target = {
+	.name   = "linear",
+	.version = {1, 1, 0},
+	.module = THIS_MODULE,
+	.ctr    = linear_ctr,
+	.dtr    = linear_dtr,
+	.map    = linear_map,
+	.status = linear_status,
+	.ioctl  = linear_ioctl,
+	.merge  = linear_merge,
+	.iterate_devices = linear_iterate_devices,
+};
+
+int __init dm_linear_init(void)
+{
+	int r = dm_register_target(&linear_target);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	return r;
+}
+
+void dm_linear_exit(void)
+{
+	dm_unregister_target(&linear_target);
+}
--- a/kernel/drivers/md/dm-log-userspace-base.c
+++ b/kernel/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,705 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/bio.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+
+#include "dm-log-userspace-transfer.h"
+
+struct flush_entry {
+	int type;
+	region_t region;
+	struct list_head list;
+};
+
+struct log_c {
+	struct dm_target *ti;
+	uint32_t region_size;
+	region_t region_count;
+	uint64_t luid;
+	char uuid[DM_UUID_LEN];
+
+	char *usr_argv_str;
+	uint32_t usr_argc;
+
+	/*
+	 * in_sync_hint gets set when doing is_remote_recovering.  It
+	 * represents the first region that needs recovery.  IOW, the
+	 * first zero bit of sync_bits.  This can be useful for to limit
+	 * traffic for calls like is_remote_recovering and get_resync_work,
+	 * but be take care in its use for anything else.
+	 */
+	uint64_t in_sync_hint;
+
+	spinlock_t flush_lock;
+	struct list_head flush_list;  /* only for clear and mark requests */
+};
+
+static mempool_t *flush_entry_pool;
+
+static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct flush_entry), gfp_mask);
+}
+
+static void flush_entry_free(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
+static int userspace_do_request(struct log_c *lc, const char *uuid,
+				int request_type, char *data, size_t data_size,
+				char *rdata, size_t *rdata_size)
+{
+	int r;
+
+	/*
+	 * If the server isn't there, -ESRCH is returned,
+	 * and we must keep trying until the server is
+	 * restored.
+	 */
+retry:
+	r = dm_consult_userspace(uuid, lc->luid, request_type, data,
+				 data_size, rdata, rdata_size);
+
+	if (r != -ESRCH)
+		return r;
+
+	DMERR(" Userspace log server not found.");
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(2*HZ);
+		DMWARN("Attempting to contact userspace log server...");
+		r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,
+					 lc->usr_argv_str,
+					 strlen(lc->usr_argv_str) + 1,
+					 NULL, NULL);
+		if (!r)
+			break;
+	}
+	DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
+	r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,
+				 0, NULL, NULL);
+	if (!r)
+		goto retry;
+
+	DMERR("Error trying to resume userspace log: %d", r);
+
+	return -ESRCH;
+}
+
+static int build_constructor_string(struct dm_target *ti,
+				    unsigned argc, char **argv,
+				    char **ctr_str)
+{
+	int i, str_size;
+	char *str = NULL;
+
+	*ctr_str = NULL;
+
+	for (i = 0, str_size = 0; i < argc; i++)
+		str_size += strlen(argv[i]) + 1; /* +1 for space between args */
+
+	str_size += 20; /* Max number of chars in a printed u64 number */
+
+	str = kzalloc(str_size, GFP_KERNEL);
+	if (!str) {
+		DMWARN("Unable to allocate memory for constructor string");
+		return -ENOMEM;
+	}
+
+	str_size = sprintf(str, "%llu", (unsigned long long)ti->len);
+	for (i = 0; i < argc; i++)
+		str_size += sprintf(str + str_size, " %s", argv[i]);
+
+	*ctr_str = str;
+	return str_size;
+}
+
+/*
+ * userspace_ctr
+ *
+ * argv contains:
+ *	<UUID> <other args>
+ * Where 'other args' is the userspace implementation specific log
+ * arguments.  An example might be:
+ *	<UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
+ *
+ * So, this module will strip off the <UUID> for identification purposes
+ * when communicating with userspace about a log; but will pass on everything
+ * else.
+ */
+static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+			 unsigned argc, char **argv)
+{
+	int r = 0;
+	int str_size;
+	char *ctr_str = NULL;
+	struct log_c *lc = NULL;
+	uint64_t rdata;
+	size_t rdata_size = sizeof(rdata);
+
+	if (argc < 3) {
+		DMWARN("Too few arguments to userspace dirty log");
+		return -EINVAL;
+	}
+
+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc) {
+		DMWARN("Unable to allocate userspace log context.");
+		return -ENOMEM;
+	}
+
+	/* The ptr value is sufficient for local unique id */
+	lc->luid = (unsigned long)lc;
+
+	lc->ti = ti;
+
+	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
+		DMWARN("UUID argument too long.");
+		kfree(lc);
+		return -EINVAL;
+	}
+
+	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+	spin_lock_init(&lc->flush_lock);
+	INIT_LIST_HEAD(&lc->flush_list);
+
+	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
+	if (str_size < 0) {
+		kfree(lc);
+		return str_size;
+	}
+
+	/* Send table string */
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
+				 ctr_str, str_size, NULL, NULL);
+
+	if (r == -ESRCH) {
+		DMERR("Userspace log server not found");
+		goto out;
+	}
+
+	/* Since the region size does not change, get it now */
+	rdata_size = sizeof(rdata);
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,
+				 NULL, 0, (char *)&rdata, &rdata_size);
+
+	if (r) {
+		DMERR("Failed to get region size of dirty log");
+		goto out;
+	}
+
+	lc->region_size = (uint32_t)rdata;
+	lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
+
+out:
+	if (r) {
+		kfree(lc);
+		kfree(ctr_str);
+	} else {
+		lc->usr_argv_str = ctr_str;
+		lc->usr_argc = argc;
+		log->context = lc;
+	}
+
+	return r;
+}
+
+static void userspace_dtr(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
+				 NULL, 0,
+				 NULL, NULL);
+
+	kfree(lc->usr_argv_str);
+	kfree(lc);
+
+	return;
+}
+
+static int userspace_presuspend(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static int userspace_postsuspend(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static int userspace_resume(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	lc->in_sync_hint = 0;
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
+{
+	struct log_c *lc = log->context;
+
+	return lc->region_size;
+}
+
+/*
+ * userspace_is_clean
+ *
+ * Check whether a region is clean.  If there is any sort of
+ * failure when consulting the server, we return not clean.
+ *
+ * Returns: 1 if clean, 0 otherwise
+ */
+static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
+{
+	int r;
+	uint64_t region64 = (uint64_t)region;
+	int64_t is_clean;
+	size_t rdata_size;
+	struct log_c *lc = log->context;
+
+	rdata_size = sizeof(is_clean);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&is_clean, &rdata_size);
+
+	return (r) ? 0 : (int)is_clean;
+}
+
+/*
+ * userspace_in_sync
+ *
+ * Check if the region is in-sync.  If there is any sort
+ * of failure when consulting the server, we assume that
+ * the region is not in sync.
+ *
+ * If 'can_block' is set, return immediately
+ *
+ * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
+ */
+static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
+			     int can_block)
+{
+	int r;
+	uint64_t region64 = region;
+	int64_t in_sync;
+	size_t rdata_size;
+	struct log_c *lc = log->context;
+
+	/*
+	 * We can never respond directly - even if in_sync_hint is
+	 * set.  This is because another machine could see a device
+	 * failure and mark the region out-of-sync.  If we don't go
+	 * to userspace to ask, we might think the region is in-sync
+	 * and allow a read to pick up data that is stale.  (This is
+	 * very unlikely if a device actually fails; but it is very
+	 * likely if a connection to one device from one machine fails.)
+	 *
+	 * There still might be a problem if the mirror caches the region
+	 * state as in-sync... but then this call would not be made.  So,
+	 * that is a mirror problem.
+	 */
+	if (!can_block)
+		return -EWOULDBLOCK;
+
+	rdata_size = sizeof(in_sync);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&in_sync, &rdata_size);
+	return (r) ? 0 : (int)in_sync;
+}
+
+/*
+ * userspace_flush
+ *
+ * This function is ok to block.
+ * The flush happens in two stages.  First, it sends all
+ * clear/mark requests that are on the list.  Then it
+ * tells the server to commit them.  This gives the
+ * server a chance to optimise the commit, instead of
+ * doing it for every request.
+ *
+ * Additionally, we could implement another thread that
+ * sends the requests up to the server - reducing the
+ * load on flush.  Then the flush would have less in
+ * the list and be responsible for the finishing commit.
+ *
+ * Returns: 0 on success, < 0 on failure
+ */
+static int userspace_flush(struct dm_dirty_log *log)
+{
+	int r = 0;
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	LIST_HEAD(flush_list);
+	struct flush_entry *fe, *tmp_fe;
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	list_splice_init(&lc->flush_list, &flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	if (list_empty(&flush_list))
+		return 0;
+
+	/*
+	 * FIXME: Count up requests, group request types,
+	 * allocate memory to stick all requests in and
+	 * send to server in one go.  Failing the allocation,
+	 * do it one by one.
+	 */
+
+	list_for_each_entry(fe, &flush_list, list) {
+		r = userspace_do_request(lc, lc->uuid, fe->type,
+					 (char *)&fe->region,
+					 sizeof(fe->region),
+					 NULL, NULL);
+		if (r)
+			goto fail;
+	}
+
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+				 NULL, 0, NULL, NULL);
+
+fail:
+	/*
+	 * We can safely remove these entries, even if failure.
+	 * Calling code will receive an error and will know that
+	 * the log facility has failed.
+	 */
+	list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
+		list_del(&fe->list);
+		mempool_free(fe, flush_entry_pool);
+	}
+
+	if (r)
+		dm_table_event(lc->ti->table);
+
+	return r;
+}
+
+/*
+ * userspace_mark_region
+ *
+ * This function should avoid blocking unless absolutely required.
+ * (Memory allocation is valid for blocking.)
+ */
+static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
+{
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	struct flush_entry *fe;
+
+	/* Wait for an allocation, but _never_ fail */
+	fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
+	BUG_ON(!fe);
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	fe->type = DM_ULOG_MARK_REGION;
+	fe->region = region;
+	list_add(&fe->list, &lc->flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	return;
+}
+
+/*
+ * userspace_clear_region
+ *
+ * This function must not block.
+ * So, the alloc can't block.  In the worst case, it is ok to
+ * fail.  It would simply mean we can't clear the region.
+ * Does nothing to current sync context, but does mean
+ * the region will be re-sync'ed on a reload of the mirror
+ * even though it is in-sync.
+ */
+static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
+{
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	struct flush_entry *fe;
+
+	/*
+	 * If we fail to allocate, we skip the clearing of
+	 * the region.  This doesn't hurt us in any way, except
+	 * to cause the region to be resync'ed when the
+	 * device is activated next time.
+	 */
+	fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
+	if (!fe) {
+		DMERR("Failed to allocate memory to clear region.");
+		return;
+	}
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	fe->type = DM_ULOG_CLEAR_REGION;
+	fe->region = region;
+	list_add(&fe->list, &lc->flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	return;
+}
+
+/*
+ * userspace_get_resync_work
+ *
+ * Get a region that needs recovery.  It is valid to return
+ * an error for this function.
+ *
+ * Returns: 1 if region filled, 0 if no work, <0 on error
+ */
+static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
+{
+	int r;
+	size_t rdata_size;
+	struct log_c *lc = log->context;
+	struct {
+		int64_t i; /* 64-bit for mix arch compatibility */
+		region_t r;
+	} pkg;
+
+	if (lc->in_sync_hint >= lc->region_count)
+		return 0;
+
+	rdata_size = sizeof(pkg);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
+				 NULL, 0,
+				 (char *)&pkg, &rdata_size);
+
+	*region = pkg.r;
+	return (r) ? r : (int)pkg.i;
+}
+
+/*
+ * userspace_set_region_sync
+ *
+ * Set the sync status of a given region.  This function
+ * must not fail.
+ */
+static void userspace_set_region_sync(struct dm_dirty_log *log,
+				      region_t region, int in_sync)
+{
+	int r;
+	struct log_c *lc = log->context;
+	struct {
+		region_t r;
+		int64_t i;
+	} pkg;
+
+	pkg.r = region;
+	pkg.i = (int64_t)in_sync;
+
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+				 (char *)&pkg, sizeof(pkg),
+				 NULL, NULL);
+
+	/*
+	 * It would be nice to be able to report failures.
+	 * However, it is easy emough to detect and resolve.
+	 */
+	return;
+}
+
+/*
+ * userspace_get_sync_count
+ *
+ * If there is any sort of failure when consulting the server,
+ * we assume that the sync count is zero.
+ *
+ * Returns: sync count on success, 0 on failure
+ */
+static region_t userspace_get_sync_count(struct dm_dirty_log *log)
+{
+	int r;
+	size_t rdata_size;
+	uint64_t sync_count;
+	struct log_c *lc = log->context;
+
+	rdata_size = sizeof(sync_count);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
+				 NULL, 0,
+				 (char *)&sync_count, &rdata_size);
+
+	if (r)
+		return 0;
+
+	if (sync_count >= lc->region_count)
+		lc->in_sync_hint = lc->region_count;
+
+	return (region_t)sync_count;
+}
+
+/*
+ * userspace_status
+ *
+ * Returns: amount of space consumed
+ */
+static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
+			    char *result, unsigned maxlen)
+{
+	int r = 0;
+	char *table_args;
+	size_t sz = (size_t)maxlen;
+	struct log_c *lc = log->context;
+
+	switch (status_type) {
+	case STATUSTYPE_INFO:
+		r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
+					 NULL, 0,
+					 result, &sz);
+
+		if (r) {
+			sz = 0;
+			DMEMIT("%s 1 COM_FAILURE", log->type->name);
+		}
+		break;
+	case STATUSTYPE_TABLE:
+		sz = 0;
+		table_args = strchr(lc->usr_argv_str, ' ');
+		BUG_ON(!table_args); /* There will always be a ' ' */
+		table_args++;
+
+		DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
+		       lc->uuid, table_args);
+		break;
+	}
+	return (r) ? 0 : (int)sz;
+}
+
+/*
+ * userspace_is_remote_recovering
+ *
+ * Returns: 1 if region recovering, 0 otherwise
+ */
+static int userspace_is_remote_recovering(struct dm_dirty_log *log,
+					  region_t region)
+{
+	int r;
+	uint64_t region64 = region;
+	struct log_c *lc = log->context;
+	static unsigned long long limit;
+	struct {
+		int64_t is_recovering;
+		uint64_t in_sync_hint;
+	} pkg;
+	size_t rdata_size = sizeof(pkg);
+
+	/*
+	 * Once the mirror has been reported to be in-sync,
+	 * it will never again ask for recovery work.  So,
+	 * we can safely say there is not a remote machine
+	 * recovering if the device is in-sync.  (in_sync_hint
+	 * must be reset at resume time.)
+	 */
+	if (region < lc->in_sync_hint)
+		return 0;
+	else if (jiffies < limit)
+		return 1;
+
+	limit = jiffies + (HZ / 4);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&pkg, &rdata_size);
+	if (r)
+		return 1;
+
+	lc->in_sync_hint = pkg.in_sync_hint;
+
+	return (int)pkg.is_recovering;
+}
+
+static struct dm_dirty_log_type _userspace_type = {
+	.name = "userspace",
+	.module = THIS_MODULE,
+	.ctr = userspace_ctr,
+	.dtr = userspace_dtr,
+	.presuspend = userspace_presuspend,
+	.postsuspend = userspace_postsuspend,
+	.resume = userspace_resume,
+	.get_region_size = userspace_get_region_size,
+	.is_clean = userspace_is_clean,
+	.in_sync = userspace_in_sync,
+	.flush = userspace_flush,
+	.mark_region = userspace_mark_region,
+	.clear_region = userspace_clear_region,
+	.get_resync_work = userspace_get_resync_work,
+	.set_region_sync = userspace_set_region_sync,
+	.get_sync_count = userspace_get_sync_count,
+	.status = userspace_status,
+	.is_remote_recovering = userspace_is_remote_recovering,
+};
+
+static int __init userspace_dirty_log_init(void)
+{
+	int r = 0;
+
+	flush_entry_pool = mempool_create(100, flush_entry_alloc,
+					  flush_entry_free, NULL);
+
+	if (!flush_entry_pool) {
+		DMWARN("Unable to create flush_entry_pool:  No memory.");
+		return -ENOMEM;
+	}
+
+	r = dm_ulog_tfr_init();
+	if (r) {
+		DMWARN("Unable to initialize userspace log communications");
+		mempool_destroy(flush_entry_pool);
+		return r;
+	}
+
+	r = dm_dirty_log_type_register(&_userspace_type);
+	if (r) {
+		DMWARN("Couldn't register userspace dirty log type");
+		dm_ulog_tfr_exit();
+		mempool_destroy(flush_entry_pool);
+		return r;
+	}
+
+	DMINFO("version 1.0.0 loaded");
+	return 0;
+}
+
+static void __exit userspace_dirty_log_exit(void)
+{
+	dm_dirty_log_type_unregister(&_userspace_type);
+	dm_ulog_tfr_exit();
+	mempool_destroy(flush_entry_pool);
+
+	DMINFO("version 1.0.0 unloaded");
+	return;
+}
+
+module_init(userspace_dirty_log_init);
+module_exit(userspace_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
+MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/dm-log-userspace-transfer.c
+++ b/kernel/drivers/md/dm-log-userspace-transfer.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/workqueue.h>
+#include <linux/connector.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+
+#include "dm-log-userspace-transfer.h"
+
+static uint32_t dm_ulog_seq;
+
+/*
+ * Netlink/Connector is an unreliable protocol.  How long should
+ * we wait for a response before assuming it was lost and retrying?
+ * (If we do receive a response after this time, it will be discarded
+ * and the response to the resent request will be waited for.
+ */
+#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
+
+/*
+ * Pre-allocated space for speed
+ */
+#define DM_ULOG_PREALLOCED_SIZE 512
+static struct cn_msg *prealloced_cn_msg;
+static struct dm_ulog_request *prealloced_ulog_tfr;
+
+static struct cb_id ulog_cn_id = {
+	.idx = CN_IDX_DM,
+	.val = CN_VAL_DM_USERSPACE_LOG
+};
+
+static DEFINE_MUTEX(dm_ulog_lock);
+
+struct receiving_pkg {
+	struct list_head list;
+	struct completion complete;
+
+	uint32_t seq;
+
+	int error;
+	size_t *data_size;
+	char *data;
+};
+
+static DEFINE_SPINLOCK(receiving_list_lock);
+static struct list_head receiving_list;
+
+static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
+{
+	int r;
+	struct cn_msg *msg = prealloced_cn_msg;
+
+	memset(msg, 0, sizeof(struct cn_msg));
+
+	msg->id.idx = ulog_cn_id.idx;
+	msg->id.val = ulog_cn_id.val;
+	msg->ack = 0;
+	msg->seq = tfr->seq;
+	msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
+
+	r = cn_netlink_send(msg, 0, gfp_any());
+
+	return r;
+}
+
+/*
+ * Parameters for this function can be either msg or tfr, but not
+ * both.  This function fills in the reply for a waiting request.
+ * If just msg is given, then the reply is simply an ACK from userspace
+ * that the request was received.
+ *
+ * Returns: 0 on success, -ENOENT on failure
+ */
+static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
+{
+	uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
+	struct receiving_pkg *pkg;
+
+	/*
+	 * The 'receiving_pkg' entries in this list are statically
+	 * allocated on the stack in 'dm_consult_userspace'.
+	 * Each process that is waiting for a reply from the user
+	 * space server will have an entry in this list.
+	 *
+	 * We are safe to do it this way because the stack space
+	 * is unique to each process, but still addressable by
+	 * other processes.
+	 */
+	list_for_each_entry(pkg, &receiving_list, list) {
+		if (rtn_seq != pkg->seq)
+			continue;
+
+		if (msg) {
+			pkg->error = -msg->ack;
+			/*
+			 * If we are trying again, we will need to know our
+			 * storage capacity.  Otherwise, along with the
+			 * error code, we make explicit that we have no data.
+			 */
+			if (pkg->error != -EAGAIN)
+				*(pkg->data_size) = 0;
+		} else if (tfr->data_size > *(pkg->data_size)) {
+			DMERR("Insufficient space to receive package [%u] "
+			      "(%u vs %zu)", tfr->request_type,
+			      tfr->data_size, *(pkg->data_size));
+
+			*(pkg->data_size) = 0;
+			pkg->error = -ENOSPC;
+		} else {
+			pkg->error = tfr->error;
+			memcpy(pkg->data, tfr->data, tfr->data_size);
+			*(pkg->data_size) = tfr->data_size;
+		}
+		complete(&pkg->complete);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * This is the connector callback that delivers data
+ * that was sent from userspace.
+ */
+static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+{
+	struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
+
+	if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN))
+		return;
+
+	spin_lock(&receiving_list_lock);
+	if (msg->len == 0)
+		fill_pkg(msg, NULL);
+	else if (msg->len < sizeof(*tfr))
+		DMERR("Incomplete message received (expected %u, got %u): [%u]",
+		      (unsigned)sizeof(*tfr), msg->len, msg->seq);
+	else
+		fill_pkg(NULL, tfr);
+	spin_unlock(&receiving_list_lock);
+}
+
+/**
+ * dm_consult_userspace
+ * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size)
+ * @luid: log's local unique identifier
+ * @request_type:  found in include/linux/dm-log-userspace.h
+ * @data: data to tx to the server
+ * @data_size: size of data in bytes
+ * @rdata: place to put return data from server
+ * @rdata_size: value-result (amount of space given/amount of space used)
+ *
+ * rdata_size is undefined on failure.
+ *
+ * Memory used to communicate with userspace is zero'ed
+ * before populating to ensure that no unwanted bits leak
+ * from kernel space to user-space.  All userspace log communications
+ * between kernel and user space go through this function.
+ *
+ * Returns: 0 on success, -EXXX on failure
+ **/
+int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
+			 char *data, size_t data_size,
+			 char *rdata, size_t *rdata_size)
+{
+	int r = 0;
+	size_t dummy = 0;
+	int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
+	struct dm_ulog_request *tfr = prealloced_ulog_tfr;
+	struct receiving_pkg pkg;
+
+	/*
+	 * Given the space needed to hold the 'struct cn_msg' and
+	 * 'struct dm_ulog_request' - do we have enough payload
+	 * space remaining?
+	 */
+	if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
+		DMINFO("Size of tfr exceeds preallocated size");
+		return -EINVAL;
+	}
+
+	if (!rdata_size)
+		rdata_size = &dummy;
+resend:
+	/*
+	 * We serialize the sending of requests so we can
+	 * use the preallocated space.
+	 */
+	mutex_lock(&dm_ulog_lock);
+
+	memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
+	memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+	tfr->luid = luid;
+	tfr->seq = dm_ulog_seq++;
+
+	/*
+	 * Must be valid request type (all other bits set to
+	 * zero).  This reserves other bits for possible future
+	 * use.
+	 */
+	tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
+
+	tfr->data_size = data_size;
+	if (data && data_size)
+		memcpy(tfr->data, data, data_size);
+
+	memset(&pkg, 0, sizeof(pkg));
+	init_completion(&pkg.complete);
+	pkg.seq = tfr->seq;
+	pkg.data_size = rdata_size;
+	pkg.data = rdata;
+	spin_lock(&receiving_list_lock);
+	list_add(&(pkg.list), &receiving_list);
+	spin_unlock(&receiving_list_lock);
+
+	r = dm_ulog_sendto_server(tfr);
+
+	mutex_unlock(&dm_ulog_lock);
+
+	if (r) {
+		DMERR("Unable to send log request [%u] to userspace: %d",
+		      request_type, r);
+		spin_lock(&receiving_list_lock);
+		list_del_init(&(pkg.list));
+		spin_unlock(&receiving_list_lock);
+
+		goto out;
+	}
+
+	r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
+	spin_lock(&receiving_list_lock);
+	list_del_init(&(pkg.list));
+	spin_unlock(&receiving_list_lock);
+	if (!r) {
+		DMWARN("[%s] Request timed out: [%u/%u] - retrying",
+		       (strlen(uuid) > 8) ?
+		       (uuid + (strlen(uuid) - 8)) : (uuid),
+		       request_type, pkg.seq);
+		goto resend;
+	}
+
+	r = pkg.error;
+	if (r == -EAGAIN)
+		goto resend;
+
+out:
+	return r;
+}
+
+int dm_ulog_tfr_init(void)
+{
+	int r;
+	void *prealloced;
+
+	INIT_LIST_HEAD(&receiving_list);
+
+	prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
+	if (!prealloced)
+		return -ENOMEM;
+
+	prealloced_cn_msg = prealloced;
+	prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
+
+	r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
+	if (r) {
+		cn_del_callback(&ulog_cn_id);
+		return r;
+	}
+
+	return 0;
+}
+
+void dm_ulog_tfr_exit(void)
+{
+	cn_del_callback(&ulog_cn_id);
+	kfree(prealloced_cn_msg);
+}
--- a/kernel/drivers/md/dm-log-userspace-transfer.h
+++ b/kernel/drivers/md/dm-log-userspace-transfer.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
+#define __DM_LOG_USERSPACE_TRANSFER_H__
+
+#define DM_MSG_PREFIX "dm-log-userspace"
+
+int dm_ulog_tfr_init(void);
+void dm_ulog_tfr_exit(void);
+int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
+			 char *data, size_t data_size,
+			 char *rdata, size_t *rdata_size);
+
+#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */
--- a/kernel/drivers/md/dm-log.c
+++ b/kernel/drivers/md/dm-log.c
@@ -0,0 +1,843 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
+
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "dirty region log"
+
+static LIST_HEAD(_log_types);
+static DEFINE_SPINLOCK(_lock);
+
+static struct dm_dirty_log_type *__find_dirty_log_type(const char *name)
+{
+	struct dm_dirty_log_type *log_type;
+
+	list_for_each_entry(log_type, &_log_types, list)
+		if (!strcmp(name, log_type->name))
+			return log_type;
+
+	return NULL;
+}
+
+static struct dm_dirty_log_type *_get_dirty_log_type(const char *name)
+{
+	struct dm_dirty_log_type *log_type;
+
+	spin_lock(&_lock);
+
+	log_type = __find_dirty_log_type(name);
+	if (log_type && !try_module_get(log_type->module))
+		log_type = NULL;
+
+	spin_unlock(&_lock);
+
+	return log_type;
+}
+
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dm_dirty_log_type by name.  If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Log modules are named "dm-log-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-log-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-disk", it would search
+ * 'dm-log-clustered-disk' then 'dm-log-clustered'.
+ *
+ * Returns: dirty_log_type* on success, NULL on failure
+ */
+static struct dm_dirty_log_type *get_type(const char *type_name)
+{
+	char *p, *type_name_dup;
+	struct dm_dirty_log_type *log_type;
+
+	if (!type_name)
+		return NULL;
+
+	log_type = _get_dirty_log_type(type_name);
+	if (log_type)
+		return log_type;
+
+	type_name_dup = kstrdup(type_name, GFP_KERNEL);
+	if (!type_name_dup) {
+		DMWARN("No memory left to attempt log module load for \"%s\"",
+		       type_name);
+		return NULL;
+	}
+
+	while (request_module("dm-log-%s", type_name_dup) ||
+	       !(log_type = _get_dirty_log_type(type_name))) {
+		p = strrchr(type_name_dup, '-');
+		if (!p)
+			break;
+		p[0] = '\0';
+	}
+
+	if (!log_type)
+		DMWARN("Module for logging type \"%s\" not found.", type_name);
+
+	kfree(type_name_dup);
+
+	return log_type;
+}
+
+static void put_type(struct dm_dirty_log_type *type)
+{
+	if (!type)
+		return;
+
+	spin_lock(&_lock);
+	if (!__find_dirty_log_type(type->name))
+		goto out;
+
+	module_put(type->module);
+
+out:
+	spin_unlock(&_lock);
+}
+
+int dm_dirty_log_type_register(struct dm_dirty_log_type *type)
+{
+	int r = 0;
+
+	spin_lock(&_lock);
+	if (!__find_dirty_log_type(type->name))
+		list_add(&type->list, &_log_types);
+	else
+		r = -EEXIST;
+	spin_unlock(&_lock);
+
+	return r;
+}
+EXPORT_SYMBOL(dm_dirty_log_type_register);
+
+int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
+{
+	spin_lock(&_lock);
+
+	if (!__find_dirty_log_type(type->name)) {
+		spin_unlock(&_lock);
+		return -EINVAL;
+	}
+
+	list_del(&type->list);
+
+	spin_unlock(&_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(dm_dirty_log_type_unregister);
+
+struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
+					 struct dm_target *ti,
+					 unsigned int argc, char **argv)
+{
+	struct dm_dirty_log_type *type;
+	struct dm_dirty_log *log;
+
+	log = kmalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return NULL;
+
+	type = get_type(type_name);
+	if (!type) {
+		kfree(log);
+		return NULL;
+	}
+
+	log->type = type;
+	if (type->ctr(log, ti, argc, argv)) {
+		kfree(log);
+		put_type(type);
+		return NULL;
+	}
+
+	return log;
+}
+EXPORT_SYMBOL(dm_dirty_log_create);
+
+void dm_dirty_log_destroy(struct dm_dirty_log *log)
+{
+	log->type->dtr(log);
+	put_type(log->type);
+	kfree(log);
+}
+EXPORT_SYMBOL(dm_dirty_log_destroy);
+
+/*-----------------------------------------------------------------
+ * Persistent and core logs share a lot of their implementation.
+ * FIXME: need a reload method to be called from a resume
+ *---------------------------------------------------------------*/
+/*
+ * Magic for persistent mirrors: "MiRr"
+ */
+#define MIRROR_MAGIC 0x4D695272
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define MIRROR_DISK_VERSION 2
+#define LOG_OFFSET 2
+
+struct log_header {
+	uint32_t magic;
+
+	/*
+	 * Simple, incrementing version. no backward
+	 * compatibility.
+	 */
+	uint32_t version;
+	sector_t nr_regions;
+};
+
+struct log_c {
+	struct dm_target *ti;
+	int touched;
+	uint32_t region_size;
+	unsigned int region_count;
+	region_t sync_count;
+
+	unsigned bitset_uint32_count;
+	uint32_t *clean_bits;
+	uint32_t *sync_bits;
+	uint32_t *recovering_bits;	/* FIXME: this seems excessive */
+
+	int sync_search;
+
+	/* Resync flag */
+	enum sync {
+		DEFAULTSYNC,	/* Synchronize if necessary */
+		NOSYNC,		/* Devices known to be already in sync */
+		FORCESYNC,	/* Force a sync to happen */
+	} sync;
+
+	struct dm_io_request io_req;
+
+	/*
+	 * Disk log fields
+	 */
+	int log_dev_failed;
+	struct dm_dev *log_dev;
+	struct log_header header;
+
+	struct dm_io_region header_location;
+	struct log_header *disk_header;
+};
+
+/*
+ * The touched member needs to be updated every time we access
+ * one of the bitsets.
+ */
+static inline int log_test_bit(uint32_t *bs, unsigned bit)
+{
+	return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0;
+}
+
+static inline void log_set_bit(struct log_c *l,
+			       uint32_t *bs, unsigned bit)
+{
+	ext2_set_bit(bit, (unsigned long *) bs);
+	l->touched = 1;
+}
+
+static inline void log_clear_bit(struct log_c *l,
+				 uint32_t *bs, unsigned bit)
+{
+	ext2_clear_bit(bit, (unsigned long *) bs);
+	l->touched = 1;
+}
+
+/*----------------------------------------------------------------
+ * Header IO
+ *--------------------------------------------------------------*/
+static void header_to_disk(struct log_header *core, struct log_header *disk)
+{
+	disk->magic = cpu_to_le32(core->magic);
+	disk->version = cpu_to_le32(core->version);
+	disk->nr_regions = cpu_to_le64(core->nr_regions);
+}
+
+static void header_from_disk(struct log_header *core, struct log_header *disk)
+{
+	core->magic = le32_to_cpu(disk->magic);
+	core->version = le32_to_cpu(disk->version);
+	core->nr_regions = le64_to_cpu(disk->nr_regions);
+}
+
+static int rw_header(struct log_c *lc, int rw)
+{
+	lc->io_req.bi_rw = rw;
+
+	return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
+}
+
+static int read_header(struct log_c *log)
+{
+	int r;
+
+	r = rw_header(log, READ);
+	if (r)
+		return r;
+
+	header_from_disk(&log->header, log->disk_header);
+
+	/* New log required? */
+	if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) {
+		log->header.magic = MIRROR_MAGIC;
+		log->header.version = MIRROR_DISK_VERSION;
+		log->header.nr_regions = 0;
+	}
+
+#ifdef __LITTLE_ENDIAN
+	if (log->header.version == 1)
+		log->header.version = 2;
+#endif
+
+	if (log->header.version != MIRROR_DISK_VERSION) {
+		DMWARN("incompatible disk log version");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int _check_region_size(struct dm_target *ti, uint32_t region_size)
+{
+	if (region_size < 2 || region_size > ti->len)
+		return 0;
+
+	if (!is_power_of_2(region_size))
+		return 0;
+
+	return 1;
+}
+
+/*----------------------------------------------------------------
+ * core log constructor/destructor
+ *
+ * argv contains region_size followed optionally by [no]sync
+ *--------------------------------------------------------------*/
+#define BYTE_SHIFT 3
+static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
+			      unsigned int argc, char **argv,
+			      struct dm_dev *dev)
+{
+	enum sync sync = DEFAULTSYNC;
+
+	struct log_c *lc;
+	uint32_t region_size;
+	unsigned int region_count;
+	size_t bitset_size, buf_size;
+	int r;
+
+	if (argc < 1 || argc > 2) {
+		DMWARN("wrong number of arguments to dirty region log");
+		return -EINVAL;
+	}
+
+	if (argc > 1) {
+		if (!strcmp(argv[1], "sync"))
+			sync = FORCESYNC;
+		else if (!strcmp(argv[1], "nosync"))
+			sync = NOSYNC;
+		else {
+			DMWARN("unrecognised sync argument to "
+			       "dirty region log: %s", argv[1]);
+			return -EINVAL;
+		}
+	}
+
+	if (sscanf(argv[0], "%u", &region_size) != 1 ||
+	    !_check_region_size(ti, region_size)) {
+		DMWARN("invalid region size %s", argv[0]);
+		return -EINVAL;
+	}
+
+	region_count = dm_sector_div_up(ti->len, region_size);
+
+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc) {
+		DMWARN("couldn't allocate core log");
+		return -ENOMEM;
+	}
+
+	lc->ti = ti;
+	lc->touched = 0;
+	lc->region_size = region_size;
+	lc->region_count = region_count;
+	lc->sync = sync;
+
+	/*
+	 * Work out how many "unsigned long"s we need to hold the bitset.
+	 */
+	bitset_size = dm_round_up(region_count,
+				  sizeof(*lc->clean_bits) << BYTE_SHIFT);
+	bitset_size >>= BYTE_SHIFT;
+
+	lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
+
+	/*
+	 * Disk log?
+	 */
+	if (!dev) {
+		lc->clean_bits = vmalloc(bitset_size);
+		if (!lc->clean_bits) {
+			DMWARN("couldn't allocate clean bitset");
+			kfree(lc);
+			return -ENOMEM;
+		}
+		lc->disk_header = NULL;
+	} else {
+		lc->log_dev = dev;
+		lc->log_dev_failed = 0;
+		lc->header_location.bdev = lc->log_dev->bdev;
+		lc->header_location.sector = 0;
+
+		/*
+		 * Buffer holds both header and bitset.
+		 */
+		buf_size =
+		    dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
+				bdev_logical_block_size(lc->header_location.
+							    bdev));
+
+		if (buf_size > i_size_read(dev->bdev->bd_inode)) {
+			DMWARN("log device %s too small: need %llu bytes",
+				dev->name, (unsigned long long)buf_size);
+			kfree(lc);
+			return -EINVAL;
+		}
+
+		lc->header_location.count = buf_size >> SECTOR_SHIFT;
+
+		lc->io_req.mem.type = DM_IO_VMA;
+		lc->io_req.notify.fn = NULL;
+		lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
+								   PAGE_SIZE));
+		if (IS_ERR(lc->io_req.client)) {
+			r = PTR_ERR(lc->io_req.client);
+			DMWARN("couldn't allocate disk io client");
+			kfree(lc);
+			return -ENOMEM;
+		}
+
+		lc->disk_header = vmalloc(buf_size);
+		if (!lc->disk_header) {
+			DMWARN("couldn't allocate disk log buffer");
+			dm_io_client_destroy(lc->io_req.client);
+			kfree(lc);
+			return -ENOMEM;
+		}
+
+		lc->io_req.mem.ptr.vma = lc->disk_header;
+		lc->clean_bits = (void *)lc->disk_header +
+				 (LOG_OFFSET << SECTOR_SHIFT);
+	}
+
+	memset(lc->clean_bits, -1, bitset_size);
+
+	lc->sync_bits = vmalloc(bitset_size);
+	if (!lc->sync_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		if (!dev)
+			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
+		vfree(lc->disk_header);
+		kfree(lc);
+		return -ENOMEM;
+	}
+	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
+	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
+
+	lc->recovering_bits = vmalloc(bitset_size);
+	if (!lc->recovering_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		vfree(lc->sync_bits);
+		if (!dev)
+			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
+		vfree(lc->disk_header);
+		kfree(lc);
+		return -ENOMEM;
+	}
+	memset(lc->recovering_bits, 0, bitset_size);
+	lc->sync_search = 0;
+	log->context = lc;
+
+	return 0;
+}
+
+static int core_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+		    unsigned int argc, char **argv)
+{
+	return create_log_context(log, ti, argc, argv, NULL);
+}
+
+static void destroy_log_context(struct log_c *lc)
+{
+	vfree(lc->sync_bits);
+	vfree(lc->recovering_bits);
+	kfree(lc);
+}
+
+static void core_dtr(struct dm_dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	vfree(lc->clean_bits);
+	destroy_log_context(lc);
+}
+
+/*----------------------------------------------------------------
+ * disk log constructor/destructor
+ *
+ * argv contains log_device region_size followed optionally by [no]sync
+ *--------------------------------------------------------------*/
+static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+		    unsigned int argc, char **argv)
+{
+	int r;
+	struct dm_dev *dev;
+
+	if (argc < 2 || argc > 3) {
+		DMWARN("wrong number of arguments to disk dirty region log");
+		return -EINVAL;
+	}
+
+	r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */,
+			  FMODE_READ | FMODE_WRITE, &dev);
+	if (r)
+		return r;
+
+	r = create_log_context(log, ti, argc - 1, argv + 1, dev);
+	if (r) {
+		dm_put_device(ti, dev);
+		return r;
+	}
+
+	return 0;
+}
+
+static void disk_dtr(struct dm_dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	dm_put_device(lc->ti, lc->log_dev);
+	vfree(lc->disk_header);
+	dm_io_client_destroy(lc->io_req.client);
+	destroy_log_context(lc);
+}
+
+static int count_bits32(uint32_t *addr, unsigned size)
+{
+	int count = 0, i;
+
+	for (i = 0; i < size; i++) {
+		count += hweight32(*(addr+i));
+	}
+	return count;
+}
+
+static void fail_log_device(struct log_c *lc)
+{
+	if (lc->log_dev_failed)
+		return;
+
+	lc->log_dev_failed = 1;
+	dm_table_event(lc->ti->table);
+}
+
+static int disk_resume(struct dm_dirty_log *log)
+{
+	int r;
+	unsigned i;
+	struct log_c *lc = (struct log_c *) log->context;
+	size_t size = lc->bitset_uint32_count * sizeof(uint32_t);
+
+	/* read the disk header */
+	r = read_header(lc);
+	if (r) {
+		DMWARN("%s: Failed to read header on dirty region log device",
+		       lc->log_dev->name);
+		fail_log_device(lc);
+		/*
+		 * If the log device cannot be read, we must assume
+		 * all regions are out-of-sync.  If we simply return
+		 * here, the state will be uninitialized and could
+		 * lead us to return 'in-sync' status for regions
+		 * that are actually 'out-of-sync'.
+		 */
+		lc->header.nr_regions = 0;
+	}
+
+	/* set or clear any new bits -- device has grown */
+	if (lc->sync == NOSYNC)
+		for (i = lc->header.nr_regions; i < lc->region_count; i++)
+			/* FIXME: amazingly inefficient */
+			log_set_bit(lc, lc->clean_bits, i);
+	else
+		for (i = lc->header.nr_regions; i < lc->region_count; i++)
+			/* FIXME: amazingly inefficient */
+			log_clear_bit(lc, lc->clean_bits, i);
+
+	/* clear any old bits -- device has shrunk */
+	for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++)
+		log_clear_bit(lc, lc->clean_bits, i);
+
+	/* copy clean across to sync */
+	memcpy(lc->sync_bits, lc->clean_bits, size);
+	lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
+	lc->sync_search = 0;
+
+	/* set the correct number of regions in the header */
+	lc->header.nr_regions = lc->region_count;
+
+	header_to_disk(&lc->header, lc->disk_header);
+
+	/* write the new header */
+	r = rw_header(lc, WRITE);
+	if (r) {
+		DMWARN("%s: Failed to write header on dirty region log device",
+		       lc->log_dev->name);
+		fail_log_device(lc);
+	}
+
+	return r;
+}
+
+static uint32_t core_get_region_size(struct dm_dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	return lc->region_size;
+}
+
+static int core_resume(struct dm_dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	lc->sync_search = 0;
+	return 0;
+}
+
+static int core_is_clean(struct dm_dirty_log *log, region_t region)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	return log_test_bit(lc->clean_bits, region);
+}
+
+static int core_in_sync(struct dm_dirty_log *log, region_t region, int block)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	return log_test_bit(lc->sync_bits, region);
+}
+
+static int core_flush(struct dm_dirty_log *log)
+{
+	/* no op */
+	return 0;
+}
+
+static int disk_flush(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	/* only write if the log has changed */
+	if (!lc->touched)
+		return 0;
+
+	r = rw_header(lc, WRITE);
+	if (r)
+		fail_log_device(lc);
+	else
+		lc->touched = 0;
+
+	return r;
+}
+
+static void core_mark_region(struct dm_dirty_log *log, region_t region)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	log_clear_bit(lc, lc->clean_bits, region);
+}
+
+static void core_clear_region(struct dm_dirty_log *log, region_t region)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	log_set_bit(lc, lc->clean_bits, region);
+}
+
+static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	if (lc->sync_search >= lc->region_count)
+		return 0;
+
+	do {
+		*region = ext2_find_next_zero_bit(
+					     (unsigned long *) lc->sync_bits,
+					     lc->region_count,
+					     lc->sync_search);
+		lc->sync_search = *region + 1;
+
+		if (*region >= lc->region_count)
+			return 0;
+
+	} while (log_test_bit(lc->recovering_bits, *region));
+
+	log_set_bit(lc, lc->recovering_bits, *region);
+	return 1;
+}
+
+static void core_set_region_sync(struct dm_dirty_log *log, region_t region,
+				 int in_sync)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	log_clear_bit(lc, lc->recovering_bits, region);
+	if (in_sync) {
+		log_set_bit(lc, lc->sync_bits, region);
+                lc->sync_count++;
+        } else if (log_test_bit(lc->sync_bits, region)) {
+		lc->sync_count--;
+		log_clear_bit(lc, lc->sync_bits, region);
+	}
+}
+
+static region_t core_get_sync_count(struct dm_dirty_log *log)
+{
+        struct log_c *lc = (struct log_c *) log->context;
+
+        return lc->sync_count;
+}
+
+#define	DMEMIT_SYNC \
+	if (lc->sync != DEFAULTSYNC) \
+		DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "")
+
+static int core_status(struct dm_dirty_log *log, status_type_t status,
+		       char *result, unsigned int maxlen)
+{
+	int sz = 0;
+	struct log_c *lc = log->context;
+
+	switch(status) {
+	case STATUSTYPE_INFO:
+		DMEMIT("1 %s", log->type->name);
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%s %u %u ", log->type->name,
+		       lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size);
+		DMEMIT_SYNC;
+	}
+
+	return sz;
+}
+
+static int disk_status(struct dm_dirty_log *log, status_type_t status,
+		       char *result, unsigned int maxlen)
+{
+	int sz = 0;
+	struct log_c *lc = log->context;
+
+	switch(status) {
+	case STATUSTYPE_INFO:
+		DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
+		       lc->log_dev_failed ? 'D' : 'A');
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%s %u %s %u ", log->type->name,
+		       lc->sync == DEFAULTSYNC ? 2 : 3, lc->log_dev->name,
+		       lc->region_size);
+		DMEMIT_SYNC;
+	}
+
+	return sz;
+}
+
+static struct dm_dirty_log_type _core_type = {
+	.name = "core",
+	.module = THIS_MODULE,
+	.ctr = core_ctr,
+	.dtr = core_dtr,
+	.resume = core_resume,
+	.get_region_size = core_get_region_size,
+	.is_clean = core_is_clean,
+	.in_sync = core_in_sync,
+	.flush = core_flush,
+	.mark_region = core_mark_region,
+	.clear_region = core_clear_region,
+	.get_resync_work = core_get_resync_work,
+	.set_region_sync = core_set_region_sync,
+	.get_sync_count = core_get_sync_count,
+	.status = core_status,
+};
+
+static struct dm_dirty_log_type _disk_type = {
+	.name = "disk",
+	.module = THIS_MODULE,
+	.ctr = disk_ctr,
+	.dtr = disk_dtr,
+	.postsuspend = disk_flush,
+	.resume = disk_resume,
+	.get_region_size = core_get_region_size,
+	.is_clean = core_is_clean,
+	.in_sync = core_in_sync,
+	.flush = disk_flush,
+	.mark_region = core_mark_region,
+	.clear_region = core_clear_region,
+	.get_resync_work = core_get_resync_work,
+	.set_region_sync = core_set_region_sync,
+	.get_sync_count = core_get_sync_count,
+	.status = disk_status,
+};
+
+static int __init dm_dirty_log_init(void)
+{
+	int r;
+
+	r = dm_dirty_log_type_register(&_core_type);
+	if (r)
+		DMWARN("couldn't register core log");
+
+	r = dm_dirty_log_type_register(&_disk_type);
+	if (r) {
+		DMWARN("couldn't register disk type");
+		dm_dirty_log_type_unregister(&_core_type);
+	}
+
+	return r;
+}
+
+static void __exit dm_dirty_log_exit(void)
+{
+	dm_dirty_log_type_unregister(&_disk_type);
+	dm_dirty_log_type_unregister(&_core_type);
+}
+
+module_init(dm_dirty_log_init);
+module_exit(dm_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dirty region log");
+MODULE_AUTHOR("Joe Thornber, Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/dm-mpath.c
+++ b/kernel/drivers/md/dm-mpath.c
--- a/kernel/drivers/md/dm-mpath.h
+++ b/kernel/drivers/md/dm-mpath.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Multipath.
+ */
+
+#ifndef	DM_MPATH_H
+#define	DM_MPATH_H
+
+struct dm_dev;
+
+struct dm_path {
+	struct dm_dev *dev;	/* Read-only */
+	void *pscontext;	/* For path-selector use */
+};
+
+/* Callback for hwh_pg_init_fn to use when complete */
+void dm_pg_init_complete(struct dm_path *path, unsigned err_flags);
+
+#endif
--- a/kernel/drivers/md/dm-path-selector.c
+++ b/kernel/drivers/md/dm-path-selector.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2003 Sistina Software.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen
+ *
+ * This file is released under the GPL.
+ *
+ * Path selector registration.
+ */
+
+#include <linux/device-mapper.h>
+
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+
+struct ps_internal {
+	struct path_selector_type pst;
+	struct list_head list;
+};
+
+#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
+
+static LIST_HEAD(_path_selectors);
+static DECLARE_RWSEM(_ps_lock);
+
+static struct ps_internal *__find_path_selector_type(const char *name)
+{
+	struct ps_internal *psi;
+
+	list_for_each_entry(psi, &_path_selectors, list) {
+		if (!strcmp(name, psi->pst.name))
+			return psi;
+	}
+
+	return NULL;
+}
+
+static struct ps_internal *get_path_selector(const char *name)
+{
+	struct ps_internal *psi;
+
+	down_read(&_ps_lock);
+	psi = __find_path_selector_type(name);
+	if (psi && !try_module_get(psi->pst.module))
+		psi = NULL;
+	up_read(&_ps_lock);
+
+	return psi;
+}
+
+struct path_selector_type *dm_get_path_selector(const char *name)
+{
+	struct ps_internal *psi;
+
+	if (!name)
+		return NULL;
+
+	psi = get_path_selector(name);
+	if (!psi) {
+		request_module("dm-%s", name);
+		psi = get_path_selector(name);
+	}
+
+	return psi ? &psi->pst : NULL;
+}
+
+void dm_put_path_selector(struct path_selector_type *pst)
+{
+	struct ps_internal *psi;
+
+	if (!pst)
+		return;
+
+	down_read(&_ps_lock);
+	psi = __find_path_selector_type(pst->name);
+	if (!psi)
+		goto out;
+
+	module_put(psi->pst.module);
+out:
+	up_read(&_ps_lock);
+}
+
+static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst)
+{
+	struct ps_internal *psi = kzalloc(sizeof(*psi), GFP_KERNEL);
+
+	if (psi)
+		psi->pst = *pst;
+
+	return psi;
+}
+
+int dm_register_path_selector(struct path_selector_type *pst)
+{
+	int r = 0;
+	struct ps_internal *psi = _alloc_path_selector(pst);
+
+	if (!psi)
+		return -ENOMEM;
+
+	down_write(&_ps_lock);
+
+	if (__find_path_selector_type(pst->name)) {
+		kfree(psi);
+		r = -EEXIST;
+	} else
+		list_add(&psi->list, &_path_selectors);
+
+	up_write(&_ps_lock);
+
+	return r;
+}
+
+int dm_unregister_path_selector(struct path_selector_type *pst)
+{
+	struct ps_internal *psi;
+
+	down_write(&_ps_lock);
+
+	psi = __find_path_selector_type(pst->name);
+	if (!psi) {
+		up_write(&_ps_lock);
+		return -EINVAL;
+	}
+
+	list_del(&psi->list);
+
+	up_write(&_ps_lock);
+
+	kfree(psi);
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dm_register_path_selector);
+EXPORT_SYMBOL_GPL(dm_unregister_path_selector);
--- a/kernel/drivers/md/dm-path-selector.h
+++ b/kernel/drivers/md/dm-path-selector.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2003 Sistina Software.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen
+ *
+ * This file is released under the GPL.
+ *
+ * Path-Selector registration.
+ */
+
+#ifndef	DM_PATH_SELECTOR_H
+#define	DM_PATH_SELECTOR_H
+
+#include <linux/device-mapper.h>
+
+#include "dm-mpath.h"
+
+/*
+ * We provide an abstraction for the code that chooses which path
+ * to send some io down.
+ */
+struct path_selector_type;
+struct path_selector {
+	struct path_selector_type *type;
+	void *context;
+};
+
+/* Information about a path selector type */
+struct path_selector_type {
+	char *name;
+	struct module *module;
+
+	unsigned int table_args;
+	unsigned int info_args;
+
+	/*
+	 * Constructs a path selector object, takes custom arguments
+	 */
+	int (*create) (struct path_selector *ps, unsigned argc, char **argv);
+	void (*destroy) (struct path_selector *ps);
+
+	/*
+	 * Add an opaque path object, along with some selector specific
+	 * path args (eg, path priority).
+	 */
+	int (*add_path) (struct path_selector *ps, struct dm_path *path,
+			 int argc, char **argv, char **error);
+
+	/*
+	 * Chooses a path for this io, if no paths are available then
+	 * NULL will be returned.
+	 *
+	 * repeat_count is the number of times to use the path before
+	 * calling the function again.  0 means don't call it again unless
+	 * the path fails.
+	 */
+	struct dm_path *(*select_path) (struct path_selector *ps,
+					unsigned *repeat_count,
+					size_t nr_bytes);
+
+	/*
+	 * Notify the selector that a path has failed.
+	 */
+	void (*fail_path) (struct path_selector *ps, struct dm_path *p);
+
+	/*
+	 * Ask selector to reinstate a path.
+	 */
+	int (*reinstate_path) (struct path_selector *ps, struct dm_path *p);
+
+	/*
+	 * Table content based on parameters added in ps_add_path_fn
+	 * or path selector status
+	 */
+	int (*status) (struct path_selector *ps, struct dm_path *path,
+		       status_type_t type, char *result, unsigned int maxlen);
+
+	int (*start_io) (struct path_selector *ps, struct dm_path *path,
+			 size_t nr_bytes);
+	int (*end_io) (struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes);
+};
+
+/* Register a path selector */
+int dm_register_path_selector(struct path_selector_type *type);
+
+/* Unregister a path selector */
+int dm_unregister_path_selector(struct path_selector_type *type);
+
+/* Returns a registered path selector type */
+struct path_selector_type *dm_get_path_selector(const char *name);
+
+/* Releases a path selector  */
+void dm_put_path_selector(struct path_selector_type *pst);
+
+#endif
--- a/kernel/drivers/md/dm-queue-length.c
+++ b/kernel/drivers/md/dm-queue-length.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2004-2005 IBM Corp.  All Rights Reserved.
+ * Copyright (C) 2006-2009 NEC Corporation.
+ *
+ * dm-queue-length.c
+ *
+ * Module Author: Stefan Bader, IBM
+ * Modified by: Kiyoshi Ueda, NEC
+ *
+ * This file is released under the GPL.
+ *
+ * queue-length path selector - choose a path with the least number of
+ * in-flight I/Os.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+
+#define DM_MSG_PREFIX	"multipath queue-length"
+#define QL_MIN_IO	128
+#define QL_VERSION	"0.1.0"
+
+struct selector {
+	struct list_head	valid_paths;
+	struct list_head	failed_paths;
+};
+
+struct path_info {
+	struct list_head	list;
+	struct dm_path		*path;
+	unsigned		repeat_count;
+	atomic_t		qlen;	/* the number of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s) {
+		INIT_LIST_HEAD(&s->valid_paths);
+		INIT_LIST_HEAD(&s->failed_paths);
+	}
+
+	return s;
+}
+
+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *s = alloc_selector();
+
+	if (!s)
+		return -ENOMEM;
+
+	ps->context = s;
+	return 0;
+}
+
+static void ql_free_paths(struct list_head *paths)
+{
+	struct path_info *pi, *next;
+
+	list_for_each_entry_safe(pi, next, paths, list) {
+		list_del(&pi->list);
+		kfree(pi);
+	}
+}
+
+static void ql_destroy(struct path_selector *ps)
+{
+	struct selector *s = ps->context;
+
+	ql_free_paths(&s->valid_paths);
+	ql_free_paths(&s->failed_paths);
+	kfree(s);
+	ps->context = NULL;
+}
+
+static int ql_status(struct path_selector *ps, struct dm_path *path,
+		     status_type_t type, char *result, unsigned maxlen)
+{
+	unsigned sz = 0;
+	struct path_info *pi;
+
+	/* When called with NULL path, return selector status/args. */
+	if (!path)
+		DMEMIT("0 ");
+	else {
+		pi = path->pscontext;
+
+		switch (type) {
+		case STATUSTYPE_INFO:
+			DMEMIT("%d ", atomic_read(&pi->qlen));
+			break;
+		case STATUSTYPE_TABLE:
+			DMEMIT("%u ", pi->repeat_count);
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
+		       int argc, char **argv, char **error)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi;
+	unsigned repeat_count = QL_MIN_IO;
+
+	/*
+	 * Arguments: [<repeat_count>]
+	 * 	<repeat_count>: The number of I/Os before switching path.
+	 * 			If not given, default (QL_MIN_IO) is used.
+	 */
+	if (argc > 1) {
+		*error = "queue-length ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+		*error = "queue-length ps: invalid repeat count";
+		return -EINVAL;
+	}
+
+	/* Allocate the path information structure */
+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+	if (!pi) {
+		*error = "queue-length ps: Error allocating path information";
+		return -ENOMEM;
+	}
+
+	pi->path = path;
+	pi->repeat_count = repeat_count;
+	atomic_set(&pi->qlen, 0);
+
+	path->pscontext = pi;
+
+	list_add_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move(&pi->list, &s->failed_paths);
+}
+
+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+/*
+ * Select a path having the minimum number of in-flight I/Os
+ */
+static struct dm_path *ql_select_path(struct path_selector *ps,
+				      unsigned *repeat_count, size_t nr_bytes)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = NULL, *best = NULL;
+
+	if (list_empty(&s->valid_paths))
+		return NULL;
+
+	/* Change preferred (first in list) path to evenly balance. */
+	list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+	list_for_each_entry(pi, &s->valid_paths, list) {
+		if (!best ||
+		    (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
+			best = pi;
+
+		if (!atomic_read(&best->qlen))
+			break;
+	}
+
+	if (!best)
+		return NULL;
+
+	*repeat_count = best->repeat_count;
+
+	return best->path;
+}
+
+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_inc(&pi->qlen);
+
+	return 0;
+}
+
+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
+		     size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_dec(&pi->qlen);
+
+	return 0;
+}
+
+static struct path_selector_type ql_ps = {
+	.name		= "queue-length",
+	.module		= THIS_MODULE,
+	.table_args	= 1,
+	.info_args	= 1,
+	.create		= ql_create,
+	.destroy	= ql_destroy,
+	.status		= ql_status,
+	.add_path	= ql_add_path,
+	.fail_path	= ql_fail_path,
+	.reinstate_path	= ql_reinstate_path,
+	.select_path	= ql_select_path,
+	.start_io	= ql_start_io,
+	.end_io		= ql_end_io,
+};
+
+static int __init dm_ql_init(void)
+{
+	int r = dm_register_path_selector(&ql_ps);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	DMINFO("version " QL_VERSION " loaded");
+
+	return r;
+}
+
+static void __exit dm_ql_exit(void)
+{
+	int r = dm_unregister_path_selector(&ql_ps);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_ql_init);
+module_exit(dm_ql_exit);
+
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+	"(C) Copyright IBM Corp. 2004,2005   All Rights Reserved.\n"
+	DM_NAME " path selector to balance the number of in-flight I/Os"
+);
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/dm-raid1.c
+++ b/kernel/drivers/md/dm-raid1.c
--- a/kernel/drivers/md/dm-region-hash.c
+++ b/kernel/drivers/md/dm-region-hash.c
@@ -0,0 +1,702 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "dm.h"
+
+#define	DM_MSG_PREFIX	"region hash"
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *
+ * The mirror splits itself up into discrete regions.  Each
+ * region can be in one of three states: clean, dirty,
+ * nosync.  There is no need to put clean regions in the hash.
+ *
+ * In addition to being present in the hash table a region _may_
+ * be present on one of three lists.
+ *
+ *   clean_regions: Regions on this list have no io pending to
+ *   them, they are in sync, we are no longer interested in them,
+ *   they are dull.  dm_rh_update_states() will remove them from the
+ *   hash table.
+ *
+ *   quiesced_regions: These regions have been spun down, ready
+ *   for recovery.  rh_recovery_start() will remove regions from
+ *   this list and hand them to kmirrord, which will schedule the
+ *   recovery io with kcopyd.
+ *
+ *   recovered_regions: Regions that kcopyd has successfully
+ *   recovered.  dm_rh_update_states() will now schedule any delayed
+ *   io, up the recovery_count, and remove the region from the
+ *   hash.
+ *
+ * There are 2 locks:
+ *   A rw spin lock 'hash_lock' protects just the hash table,
+ *   this is never held in write mode from interrupt context,
+ *   which I believe means that we only have to disable irqs when
+ *   doing a write lock.
+ *
+ *   An ordinary spin lock 'region_lock' that protects the three
+ *   lists in the region_hash, with the 'state', 'list' and
+ *   'delayed_bios' fields of the regions.  This is used from irq
+ *   context, so all other uses will have to suspend local irqs.
+ *---------------------------------------------------------------*/
+struct dm_region_hash {
+	uint32_t region_size;
+	unsigned region_shift;
+
+	/* holds persistent region state */
+	struct dm_dirty_log *log;
+
+	/* hash table */
+	rwlock_t hash_lock;
+	mempool_t *region_pool;
+	unsigned mask;
+	unsigned nr_buckets;
+	unsigned prime;
+	unsigned shift;
+	struct list_head *buckets;
+
+	unsigned max_recovery; /* Max # of regions to recover in parallel */
+
+	spinlock_t region_lock;
+	atomic_t recovery_in_flight;
+	struct semaphore recovery_count;
+	struct list_head clean_regions;
+	struct list_head quiesced_regions;
+	struct list_head recovered_regions;
+	struct list_head failed_recovered_regions;
+
+	void *context;
+	sector_t target_begin;
+
+	/* Callback function to schedule bios writes */
+	void (*dispatch_bios)(void *context, struct bio_list *bios);
+
+	/* Callback function to wakeup callers worker thread. */
+	void (*wakeup_workers)(void *context);
+
+	/* Callback function to wakeup callers recovery waiters. */
+	void (*wakeup_all_recovery_waiters)(void *context);
+};
+
+struct dm_region {
+	struct dm_region_hash *rh;	/* FIXME: can we get rid of this ? */
+	region_t key;
+	int state;
+
+	struct list_head hash_list;
+	struct list_head list;
+
+	atomic_t pending;
+	struct bio_list delayed_bios;
+};
+
+/*
+ * Conversion fns
+ */
+static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
+{
+	return sector >> rh->region_shift;
+}
+
+sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
+{
+	return region << rh->region_shift;
+}
+EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
+
+region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
+{
+	return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
+}
+EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
+
+void *dm_rh_region_context(struct dm_region *reg)
+{
+	return reg->rh->context;
+}
+EXPORT_SYMBOL_GPL(dm_rh_region_context);
+
+region_t dm_rh_get_region_key(struct dm_region *reg)
+{
+	return reg->key;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
+
+sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
+{
+	return rh->region_size;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
+
+/*
+ * FIXME: shall we pass in a structure instead of all these args to
+ * dm_region_hash_create()????
+ */
+#define RH_HASH_MULT 2654435387U
+#define RH_HASH_SHIFT 12
+
+#define MIN_REGIONS 64
+struct dm_region_hash *dm_region_hash_create(
+		void *context, void (*dispatch_bios)(void *context,
+						     struct bio_list *bios),
+		void (*wakeup_workers)(void *context),
+		void (*wakeup_all_recovery_waiters)(void *context),
+		sector_t target_begin, unsigned max_recovery,
+		struct dm_dirty_log *log, uint32_t region_size,
+		region_t nr_regions)
+{
+	struct dm_region_hash *rh;
+	unsigned nr_buckets, max_buckets;
+	size_t i;
+
+	/*
+	 * Calculate a suitable number of buckets for our hash
+	 * table.
+	 */
+	max_buckets = nr_regions >> 6;
+	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
+		;
+	nr_buckets >>= 1;
+
+	rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+	if (!rh) {
+		DMERR("unable to allocate region hash memory");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	rh->context = context;
+	rh->dispatch_bios = dispatch_bios;
+	rh->wakeup_workers = wakeup_workers;
+	rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
+	rh->target_begin = target_begin;
+	rh->max_recovery = max_recovery;
+	rh->log = log;
+	rh->region_size = region_size;
+	rh->region_shift = ffs(region_size) - 1;
+	rwlock_init(&rh->hash_lock);
+	rh->mask = nr_buckets - 1;
+	rh->nr_buckets = nr_buckets;
+
+	rh->shift = RH_HASH_SHIFT;
+	rh->prime = RH_HASH_MULT;
+
+	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+	if (!rh->buckets) {
+		DMERR("unable to allocate region hash bucket memory");
+		kfree(rh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < nr_buckets; i++)
+		INIT_LIST_HEAD(rh->buckets + i);
+
+	spin_lock_init(&rh->region_lock);
+	sema_init(&rh->recovery_count, 0);
+	atomic_set(&rh->recovery_in_flight, 0);
+	INIT_LIST_HEAD(&rh->clean_regions);
+	INIT_LIST_HEAD(&rh->quiesced_regions);
+	INIT_LIST_HEAD(&rh->recovered_regions);
+	INIT_LIST_HEAD(&rh->failed_recovered_regions);
+
+	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
+						      sizeof(struct dm_region));
+	if (!rh->region_pool) {
+		vfree(rh->buckets);
+		kfree(rh);
+		rh = ERR_PTR(-ENOMEM);
+	}
+
+	return rh;
+}
+EXPORT_SYMBOL_GPL(dm_region_hash_create);
+
+void dm_region_hash_destroy(struct dm_region_hash *rh)
+{
+	unsigned h;
+	struct dm_region *reg, *nreg;
+
+	BUG_ON(!list_empty(&rh->quiesced_regions));
+	for (h = 0; h < rh->nr_buckets; h++) {
+		list_for_each_entry_safe(reg, nreg, rh->buckets + h,
+					 hash_list) {
+			BUG_ON(atomic_read(&reg->pending));
+			mempool_free(reg, rh->region_pool);
+		}
+	}
+
+	if (rh->log)
+		dm_dirty_log_destroy(rh->log);
+
+	if (rh->region_pool)
+		mempool_destroy(rh->region_pool);
+
+	vfree(rh->buckets);
+	kfree(rh);
+}
+EXPORT_SYMBOL_GPL(dm_region_hash_destroy);
+
+struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
+{
+	return rh->log;
+}
+EXPORT_SYMBOL_GPL(dm_rh_dirty_log);
+
+static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
+{
+	return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
+}
+
+static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg;
+	struct list_head *bucket = rh->buckets + rh_hash(rh, region);
+
+	list_for_each_entry(reg, bucket, hash_list)
+		if (reg->key == region)
+			return reg;
+
+	return NULL;
+}
+
+static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
+{
+	list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
+}
+
+static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg, *nreg;
+
+	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+	if (unlikely(!nreg))
+		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
+
+	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+		      DM_RH_CLEAN : DM_RH_NOSYNC;
+	nreg->rh = rh;
+	nreg->key = region;
+	INIT_LIST_HEAD(&nreg->list);
+	atomic_set(&nreg->pending, 0);
+	bio_list_init(&nreg->delayed_bios);
+
+	write_lock_irq(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	if (reg)
+		/* We lost the race. */
+		mempool_free(nreg, rh->region_pool);
+	else {
+		__rh_insert(rh, nreg);
+		if (nreg->state == DM_RH_CLEAN) {
+			spin_lock(&rh->region_lock);
+			list_add(&nreg->list, &rh->clean_regions);
+			spin_unlock(&rh->region_lock);
+		}
+
+		reg = nreg;
+	}
+	write_unlock_irq(&rh->hash_lock);
+
+	return reg;
+}
+
+static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg;
+
+	reg = __rh_lookup(rh, region);
+	if (!reg) {
+		read_unlock(&rh->hash_lock);
+		reg = __rh_alloc(rh, region);
+		read_lock(&rh->hash_lock);
+	}
+
+	return reg;
+}
+
+int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
+{
+	int r;
+	struct dm_region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	if (reg)
+		return reg->state;
+
+	/*
+	 * The region wasn't in the hash, so we fall back to the
+	 * dirty log.
+	 */
+	r = rh->log->type->in_sync(rh->log, region, may_block);
+
+	/*
+	 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
+	 * taken as a DM_RH_NOSYNC
+	 */
+	return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_state);
+
+static void complete_resync_work(struct dm_region *reg, int success)
+{
+	struct dm_region_hash *rh = reg->rh;
+
+	rh->log->type->set_region_sync(rh->log, reg->key, success);
+
+	/*
+	 * Dispatch the bios before we call 'wake_up_all'.
+	 * This is important because if we are suspending,
+	 * we want to know that recovery is complete and
+	 * the work queue is flushed.  If we wake_up_all
+	 * before we dispatch_bios (queue bios and call wake()),
+	 * then we risk suspending before the work queue
+	 * has been properly flushed.
+	 */
+	rh->dispatch_bios(rh->context, &reg->delayed_bios);
+	if (atomic_dec_and_test(&rh->recovery_in_flight))
+		rh->wakeup_all_recovery_waiters(rh->context);
+	up(&rh->recovery_count);
+}
+
+/* dm_rh_mark_nosync
+ * @ms
+ * @bio
+ * @done
+ * @error
+ *
+ * The bio was written on some mirror(s) but failed on other mirror(s).
+ * We can successfully endio the bio but should avoid the region being
+ * marked clean by setting the state DM_RH_NOSYNC.
+ *
+ * This function is _not_ safe in interrupt context!
+ */
+void dm_rh_mark_nosync(struct dm_region_hash *rh,
+		       struct bio *bio, unsigned done, int error)
+{
+	unsigned long flags;
+	struct dm_dirty_log *log = rh->log;
+	struct dm_region *reg;
+	region_t region = dm_rh_bio_to_region(rh, bio);
+	int recovering = 0;
+
+	/* We must inform the log that the sync count has changed. */
+	log->type->set_region_sync(log, region, 0);
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	/* region hash entry should exist because write was in-flight */
+	BUG_ON(!reg);
+	BUG_ON(!list_empty(&reg->list));
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	/*
+	 * Possible cases:
+	 *   1) DM_RH_DIRTY
+	 *   2) DM_RH_NOSYNC: was dirty, other preceeding writes failed
+	 *   3) DM_RH_RECOVERING: flushing pending writes
+	 * Either case, the region should have not been connected to list.
+	 */
+	recovering = (reg->state == DM_RH_RECOVERING);
+	reg->state = DM_RH_NOSYNC;
+	BUG_ON(!list_empty(&reg->list));
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	bio_endio(bio, error);
+	if (recovering)
+		complete_resync_work(reg, 0);
+}
+EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);
+
+void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
+{
+	struct dm_region *reg, *next;
+
+	LIST_HEAD(clean);
+	LIST_HEAD(recovered);
+	LIST_HEAD(failed_recovered);
+
+	/*
+	 * Quickly grab the lists.
+	 */
+	write_lock_irq(&rh->hash_lock);
+	spin_lock(&rh->region_lock);
+	if (!list_empty(&rh->clean_regions)) {
+		list_splice_init(&rh->clean_regions, &clean);
+
+		list_for_each_entry(reg, &clean, list)
+			list_del(&reg->hash_list);
+	}
+
+	if (!list_empty(&rh->recovered_regions)) {
+		list_splice_init(&rh->recovered_regions, &recovered);
+
+		list_for_each_entry(reg, &recovered, list)
+			list_del(&reg->hash_list);
+	}
+
+	if (!list_empty(&rh->failed_recovered_regions)) {
+		list_splice_init(&rh->failed_recovered_regions,
+				 &failed_recovered);
+
+		list_for_each_entry(reg, &failed_recovered, list)
+			list_del(&reg->hash_list);
+	}
+
+	spin_unlock(&rh->region_lock);
+	write_unlock_irq(&rh->hash_lock);
+
+	/*
+	 * All the regions on the recovered and clean lists have
+	 * now been pulled out of the system, so no need to do
+	 * any more locking.
+	 */
+	list_for_each_entry_safe(reg, next, &recovered, list) {
+		rh->log->type->clear_region(rh->log, reg->key);
+		complete_resync_work(reg, 1);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	list_for_each_entry_safe(reg, next, &failed_recovered, list) {
+		complete_resync_work(reg, errors_handled ? 0 : 1);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	list_for_each_entry_safe(reg, next, &clean, list) {
+		rh->log->type->clear_region(rh->log, reg->key);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL_GPL(dm_rh_update_states);
+
+static void rh_inc(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+
+	spin_lock_irq(&rh->region_lock);
+	atomic_inc(&reg->pending);
+
+	if (reg->state == DM_RH_CLEAN) {
+		reg->state = DM_RH_DIRTY;
+		list_del_init(&reg->list);	/* take off the clean list */
+		spin_unlock_irq(&rh->region_lock);
+
+		rh->log->type->mark_region(rh->log, reg->key);
+	} else
+		spin_unlock_irq(&rh->region_lock);
+
+
+	read_unlock(&rh->hash_lock);
+}
+
+void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
+{
+	struct bio *bio;
+
+	for (bio = bios->head; bio; bio = bio->bi_next)
+		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
+}
+EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
+
+void dm_rh_dec(struct dm_region_hash *rh, region_t region)
+{
+	unsigned long flags;
+	struct dm_region *reg;
+	int should_wake = 0;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	if (atomic_dec_and_test(&reg->pending)) {
+		/*
+		 * There is no pending I/O for this region.
+		 * We can move the region to corresponding list for next action.
+		 * At this point, the region is not yet connected to any list.
+		 *
+		 * If the state is DM_RH_NOSYNC, the region should be kept off
+		 * from clean list.
+		 * The hash entry for DM_RH_NOSYNC will remain in memory
+		 * until the region is recovered or the map is reloaded.
+		 */
+
+		/* do nothing for DM_RH_NOSYNC */
+		if (reg->state == DM_RH_RECOVERING) {
+			list_add_tail(&reg->list, &rh->quiesced_regions);
+		} else if (reg->state == DM_RH_DIRTY) {
+			reg->state = DM_RH_CLEAN;
+			list_add(&reg->list, &rh->clean_regions);
+		}
+		should_wake = 1;
+	}
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	if (should_wake)
+		rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_dec);
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ */
+static int __rh_recovery_prepare(struct dm_region_hash *rh)
+{
+	int r;
+	region_t region;
+	struct dm_region *reg;
+
+	/*
+	 * Ask the dirty log what's next.
+	 */
+	r = rh->log->type->get_resync_work(rh->log, &region);
+	if (r <= 0)
+		return r;
+
+	/*
+	 * Get this region, and start it quiescing by setting the
+	 * recovering flag.
+	 */
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irq(&rh->region_lock);
+	reg->state = DM_RH_RECOVERING;
+
+	/* Already quiesced ? */
+	if (atomic_read(&reg->pending))
+		list_del_init(&reg->list);
+	else
+		list_move(&reg->list, &rh->quiesced_regions);
+
+	spin_unlock_irq(&rh->region_lock);
+
+	return 1;
+}
+
+void dm_rh_recovery_prepare(struct dm_region_hash *rh)
+{
+	/* Extra reference to avoid race with dm_rh_stop_recovery */
+	atomic_inc(&rh->recovery_in_flight);
+
+	while (!down_trylock(&rh->recovery_count)) {
+		atomic_inc(&rh->recovery_in_flight);
+		if (__rh_recovery_prepare(rh) <= 0) {
+			atomic_dec(&rh->recovery_in_flight);
+			up(&rh->recovery_count);
+			break;
+		}
+	}
+
+	/* Drop the extra reference */
+	if (atomic_dec_and_test(&rh->recovery_in_flight))
+		rh->wakeup_all_recovery_waiters(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
+
+/*
+ * Returns any quiesced regions.
+ */
+struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
+{
+	struct dm_region *reg = NULL;
+
+	spin_lock_irq(&rh->region_lock);
+	if (!list_empty(&rh->quiesced_regions)) {
+		reg = list_entry(rh->quiesced_regions.next,
+				 struct dm_region, list);
+		list_del_init(&reg->list);  /* remove from the quiesced list */
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	return reg;
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
+
+void dm_rh_recovery_end(struct dm_region *reg, int success)
+{
+	struct dm_region_hash *rh = reg->rh;
+
+	spin_lock_irq(&rh->region_lock);
+	if (success)
+		list_add(&reg->list, &reg->rh->recovered_regions);
+	else
+		list_add(&reg->list, &reg->rh->failed_recovered_regions);
+
+	spin_unlock_irq(&rh->region_lock);
+
+	rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
+
+/* Return recovery in flight count. */
+int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
+{
+	return atomic_read(&rh->recovery_in_flight);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
+
+int dm_rh_flush(struct dm_region_hash *rh)
+{
+	return rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL_GPL(dm_rh_flush);
+
+void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
+{
+	struct dm_region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio));
+	bio_list_add(&reg->delayed_bios, bio);
+	read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL_GPL(dm_rh_delay);
+
+void dm_rh_stop_recovery(struct dm_region_hash *rh)
+{
+	int i;
+
+	/* wait for any recovering regions */
+	for (i = 0; i < rh->max_recovery; i++)
+		down(&rh->recovery_count);
+}
+EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
+
+void dm_rh_start_recovery(struct dm_region_hash *rh)
+{
+	int i;
+
+	for (i = 0; i < rh->max_recovery; i++)
+		up(&rh->recovery_count);
+
+	rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
+
+MODULE_DESCRIPTION(DM_NAME " region hash");
+MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/dm-round-robin.c
+++ b/kernel/drivers/md/dm-round-robin.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2003 Sistina Software.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen
+ *
+ * This file is released under the GPL.
+ *
+ * Round-robin path selector.
+ */
+
+#include <linux/device-mapper.h>
+
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX "multipath round-robin"
+
+/*-----------------------------------------------------------------
+ * Path-handling code, paths are held in lists
+ *---------------------------------------------------------------*/
+struct path_info {
+	struct list_head list;
+	struct dm_path *path;
+	unsigned repeat_count;
+};
+
+static void free_paths(struct list_head *paths)
+{
+	struct path_info *pi, *next;
+
+	list_for_each_entry_safe(pi, next, paths, list) {
+		list_del(&pi->list);
+		kfree(pi);
+	}
+}
+
+/*-----------------------------------------------------------------
+ * Round-robin selector
+ *---------------------------------------------------------------*/
+
+#define RR_MIN_IO		1000
+
+struct selector {
+	struct list_head valid_paths;
+	struct list_head invalid_paths;
+};
+
+static struct selector *alloc_selector(void)
+{
+	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s) {
+		INIT_LIST_HEAD(&s->valid_paths);
+		INIT_LIST_HEAD(&s->invalid_paths);
+	}
+
+	return s;
+}
+
+static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *s;
+
+	s = alloc_selector();
+	if (!s)
+		return -ENOMEM;
+
+	ps->context = s;
+	return 0;
+}
+
+static void rr_destroy(struct path_selector *ps)
+{
+	struct selector *s = (struct selector *) ps->context;
+
+	free_paths(&s->valid_paths);
+	free_paths(&s->invalid_paths);
+	kfree(s);
+	ps->context = NULL;
+}
+
+static int rr_status(struct path_selector *ps, struct dm_path *path,
+		     status_type_t type, char *result, unsigned int maxlen)
+{
+	struct path_info *pi;
+	int sz = 0;
+
+	if (!path)
+		DMEMIT("0 ");
+	else {
+		switch(type) {
+		case STATUSTYPE_INFO:
+			break;
+		case STATUSTYPE_TABLE:
+			pi = path->pscontext;
+			DMEMIT("%u ", pi->repeat_count);
+			break;
+		}
+	}
+
+	return sz;
+}
+
+/*
+ * Called during initialisation to register each path with an
+ * optional repeat_count.
+ */
+static int rr_add_path(struct path_selector *ps, struct dm_path *path,
+		       int argc, char **argv, char **error)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi;
+	unsigned repeat_count = RR_MIN_IO;
+
+	if (argc > 1) {
+		*error = "round-robin ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	/* First path argument is number of I/Os before switching path */
+	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+		*error = "round-robin ps: invalid repeat count";
+		return -EINVAL;
+	}
+
+	/* allocate the path */
+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+	if (!pi) {
+		*error = "round-robin ps: Error allocating path context";
+		return -ENOMEM;
+	}
+
+	pi->path = path;
+	pi->repeat_count = repeat_count;
+
+	path->pscontext = pi;
+
+	list_add_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = p->pscontext;
+
+	list_move(&pi->list, &s->invalid_paths);
+}
+
+static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = p->pscontext;
+
+	list_move(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static struct dm_path *rr_select_path(struct path_selector *ps,
+				      unsigned *repeat_count, size_t nr_bytes)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = NULL;
+
+	if (!list_empty(&s->valid_paths)) {
+		pi = list_entry(s->valid_paths.next, struct path_info, list);
+		list_move_tail(&pi->list, &s->valid_paths);
+		*repeat_count = pi->repeat_count;
+	}
+
+	return pi ? pi->path : NULL;
+}
+
+static struct path_selector_type rr_ps = {
+	.name = "round-robin",
+	.module = THIS_MODULE,
+	.table_args = 1,
+	.info_args = 0,
+	.create = rr_create,
+	.destroy = rr_destroy,
+	.status = rr_status,
+	.add_path = rr_add_path,
+	.fail_path = rr_fail_path,
+	.reinstate_path = rr_reinstate_path,
+	.select_path = rr_select_path,
+};
+
+static int __init dm_rr_init(void)
+{
+	int r = dm_register_path_selector(&rr_ps);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	DMINFO("version 1.0.0 loaded");
+
+	return r;
+}
+
+static void __exit dm_rr_exit(void)
+{
+	int r = dm_unregister_path_selector(&rr_ps);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_rr_init);
+module_exit(dm_rr_exit);
+
+MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector");
+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/dm-service-time.c
+++ b/kernel/drivers/md/dm-service-time.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Throughput oriented path selector.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#define DM_MSG_PREFIX	"multipath service-time"
+#define ST_MIN_IO	1
+#define ST_MAX_RELATIVE_THROUGHPUT	100
+#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7
+#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
+#define ST_VERSION	"0.2.0"
+
+struct selector {
+	struct list_head valid_paths;
+	struct list_head failed_paths;
+};
+
+struct path_info {
+	struct list_head list;
+	struct dm_path *path;
+	unsigned repeat_count;
+	unsigned relative_throughput;
+	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s) {
+		INIT_LIST_HEAD(&s->valid_paths);
+		INIT_LIST_HEAD(&s->failed_paths);
+	}
+
+	return s;
+}
+
+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *s = alloc_selector();
+
+	if (!s)
+		return -ENOMEM;
+
+	ps->context = s;
+	return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+	struct path_info *pi, *next;
+
+	list_for_each_entry_safe(pi, next, paths, list) {
+		list_del(&pi->list);
+		kfree(pi);
+	}
+}
+
+static void st_destroy(struct path_selector *ps)
+{
+	struct selector *s = ps->context;
+
+	free_paths(&s->valid_paths);
+	free_paths(&s->failed_paths);
+	kfree(s);
+	ps->context = NULL;
+}
+
+static int st_status(struct path_selector *ps, struct dm_path *path,
+		     status_type_t type, char *result, unsigned maxlen)
+{
+	unsigned sz = 0;
+	struct path_info *pi;
+
+	if (!path)
+		DMEMIT("0 ");
+	else {
+		pi = path->pscontext;
+
+		switch (type) {
+		case STATUSTYPE_INFO:
+			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
+			       pi->relative_throughput);
+			break;
+		case STATUSTYPE_TABLE:
+			DMEMIT("%u %u ", pi->repeat_count,
+			       pi->relative_throughput);
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static int st_add_path(struct path_selector *ps, struct dm_path *path,
+		       int argc, char **argv, char **error)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi;
+	unsigned repeat_count = ST_MIN_IO;
+	unsigned relative_throughput = 1;
+
+	/*
+	 * Arguments: [<repeat_count> [<relative_throughput>]]
+	 * 	<repeat_count>: The number of I/Os before switching path.
+	 * 			If not given, default (ST_MIN_IO) is used.
+	 * 	<relative_throughput>: The relative throughput value of
+	 *			the path among all paths in the path-group.
+	 * 			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
+	 *			If not given, minimum value '1' is used.
+	 *			If '0' is given, the path isn't selected while
+	 * 			other paths having a positive value are
+	 * 			available.
+	 */
+	if (argc > 2) {
+		*error = "service-time ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+		*error = "service-time ps: invalid repeat count";
+		return -EINVAL;
+	}
+
+	if ((argc == 2) &&
+	    (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
+	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
+		*error = "service-time ps: invalid relative_throughput value";
+		return -EINVAL;
+	}
+
+	/* allocate the path */
+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+	if (!pi) {
+		*error = "service-time ps: Error allocating path context";
+		return -ENOMEM;
+	}
+
+	pi->path = path;
+	pi->repeat_count = repeat_count;
+	pi->relative_throughput = relative_throughput;
+	atomic_set(&pi->in_flight_size, 0);
+
+	path->pscontext = pi;
+
+	list_add_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move(&pi->list, &s->failed_paths);
+}
+
+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+/*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
+ * Returns:
+ * < 0 : pi1 is better
+ * 0   : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ *
+ * Description:
+ * Basically, the service time is estimated by:
+ *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
+ * To reduce the calculation, some optimizations are made.
+ * (See comments inline)
+ */
+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
+			   size_t incoming)
+{
+	size_t sz1, sz2, st1, st2;
+
+	sz1 = atomic_read(&pi1->in_flight_size);
+	sz2 = atomic_read(&pi2->in_flight_size);
+
+	/*
+	 * Case 1: Both have same throughput value. Choose less loaded path.
+	 */
+	if (pi1->relative_throughput == pi2->relative_throughput)
+		return sz1 - sz2;
+
+	/*
+	 * Case 2a: Both have same load. Choose higher throughput path.
+	 * Case 2b: One path has no throughput value. Choose the other one.
+	 */
+	if (sz1 == sz2 ||
+	    !pi1->relative_throughput || !pi2->relative_throughput)
+		return pi2->relative_throughput - pi1->relative_throughput;
+
+	/*
+	 * Case 3: Calculate service time. Choose faster path.
+	 *         Service time using pi1:
+	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
+	 *         Service time using pi2:
+	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
+	 *
+	 *         To avoid the division, transform the expression to use
+	 *         multiplication.
+	 *         Because ->relative_throughput > 0 here, if st1 < st2,
+	 *         the expressions below are the same meaning:
+	 *             (sz1 + incoming) / pi1->relative_throughput <
+	 *                 (sz2 + incoming) / pi2->relative_throughput
+	 *             (sz1 + incoming) * pi2->relative_throughput <
+	 *                 (sz2 + incoming) * pi1->relative_throughput
+	 *         So use the later one.
+	 */
+	sz1 += incoming;
+	sz2 += incoming;
+	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
+		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
+		/*
+		 * Size may be too big for multiplying pi->relative_throughput
+		 * and overflow.
+		 * To avoid the overflow and mis-selection, shift down both.
+		 */
+		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+	}
+	st1 = sz1 * pi2->relative_throughput;
+	st2 = sz2 * pi1->relative_throughput;
+	if (st1 != st2)
+		return st1 - st2;
+
+	/*
+	 * Case 4: Service time is equal. Choose higher throughput path.
+	 */
+	return pi2->relative_throughput - pi1->relative_throughput;
+}
+
+static struct dm_path *st_select_path(struct path_selector *ps,
+				      unsigned *repeat_count, size_t nr_bytes)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = NULL, *best = NULL;
+
+	if (list_empty(&s->valid_paths))
+		return NULL;
+
+	/* Change preferred (first in list) path to evenly balance. */
+	list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+	list_for_each_entry(pi, &s->valid_paths, list)
+		if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
+			best = pi;
+
+	if (!best)
+		return NULL;
+
+	*repeat_count = best->repeat_count;
+
+	return best->path;
+}
+
+static int st_start_io(struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_add(nr_bytes, &pi->in_flight_size);
+
+	return 0;
+}
+
+static int st_end_io(struct path_selector *ps, struct dm_path *path,
+		     size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_sub(nr_bytes, &pi->in_flight_size);
+
+	return 0;
+}
+
+static struct path_selector_type st_ps = {
+	.name		= "service-time",
+	.module		= THIS_MODULE,
+	.table_args	= 2,
+	.info_args	= 2,
+	.create		= st_create,
+	.destroy	= st_destroy,
+	.status		= st_status,
+	.add_path	= st_add_path,
+	.fail_path	= st_fail_path,
+	.reinstate_path	= st_reinstate_path,
+	.select_path	= st_select_path,
+	.start_io	= st_start_io,
+	.end_io		= st_end_io,
+};
+
+static int __init dm_st_init(void)
+{
+	int r = dm_register_path_selector(&st_ps);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	DMINFO("version " ST_VERSION " loaded");
+
+	return r;
+}
+
+static void __exit dm_st_exit(void)
+{
+	int r = dm_unregister_path_selector(&st_ps);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_st_init);
+module_exit(dm_st_exit);
+
+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/dm-snap-persistent.c
+++ b/kernel/drivers/md/dm-snap-persistent.c
@@ -0,0 +1,787 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+
+#define DM_MSG_PREFIX "persistent snapshot"
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
+
+/*-----------------------------------------------------------------
+ * Persistent snapshots, by persistent we mean that the snapshot
+ * will survive a reboot.
+ *---------------------------------------------------------------*/
+
+/*
+ * We need to store a record of which parts of the origin have
+ * been copied to the snapshot device.  The snapshot code
+ * requires that we copy exception chunks to chunk aligned areas
+ * of the COW store.  It makes sense therefore, to store the
+ * metadata in chunk size blocks.
+ *
+ * There is no backward or forward compatibility implemented,
+ * snapshots with different disk versions than the kernel will
+ * not be usable.  It is expected that "lvcreate" will blank out
+ * the start of a fresh COW device before calling the snapshot
+ * constructor.
+ *
+ * The first chunk of the COW device just contains the header.
+ * After this there is a chunk filled with exception metadata,
+ * followed by as many exception chunks as can fit in the
+ * metadata areas.
+ *
+ * All on disk structures are in little-endian format.  The end
+ * of the exceptions info is indicated by an exception with a
+ * new_chunk of 0, which is invalid since it would point to the
+ * header chunk.
+ */
+
+/*
+ * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
+ */
+#define SNAP_MAGIC 0x70416e53
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define SNAPSHOT_DISK_VERSION 1
+
+struct disk_header {
+	uint32_t magic;
+
+	/*
+	 * Is this snapshot valid.  There is no way of recovering
+	 * an invalid snapshot.
+	 */
+	uint32_t valid;
+
+	/*
+	 * Simple, incrementing version. no backward
+	 * compatibility.
+	 */
+	uint32_t version;
+
+	/* In sectors */
+	uint32_t chunk_size;
+};
+
+struct disk_exception {
+	uint64_t old_chunk;
+	uint64_t new_chunk;
+};
+
+struct commit_callback {
+	void (*callback)(void *, int success);
+	void *context;
+};
+
+/*
+ * The top level structure for a persistent exception store.
+ */
+struct pstore {
+	struct dm_exception_store *store;
+	int version;
+	int valid;
+	uint32_t exceptions_per_area;
+
+	/*
+	 * Now that we have an asynchronous kcopyd there is no
+	 * need for large chunk sizes, so it wont hurt to have a
+	 * whole chunks worth of metadata in memory at once.
+	 */
+	void *area;
+
+	/*
+	 * An area of zeros used to clear the next area.
+	 */
+	void *zero_area;
+
+	/*
+	 * An area used for header. The header can be written
+	 * concurrently with metadata (when invalidating the snapshot),
+	 * so it needs a separate buffer.
+	 */
+	void *header_area;
+
+	/*
+	 * Used to keep track of which metadata area the data in
+	 * 'chunk' refers to.
+	 */
+	chunk_t current_area;
+
+	/*
+	 * The next free chunk for an exception.
+	 */
+	chunk_t next_free;
+
+	/*
+	 * The index of next free exception in the current
+	 * metadata area.
+	 */
+	uint32_t current_committed;
+
+	atomic_t pending_count;
+	uint32_t callback_count;
+	struct commit_callback *callbacks;
+	struct dm_io_client *io_client;
+
+	struct workqueue_struct *metadata_wq;
+};
+
+static unsigned sectors_to_pages(unsigned sectors)
+{
+	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
+}
+
+static int alloc_area(struct pstore *ps)
+{
+	int r = -ENOMEM;
+	size_t len;
+
+	len = ps->store->chunk_size << SECTOR_SHIFT;
+
+	/*
+	 * Allocate the chunk_size block of memory that will hold
+	 * a single metadata area.
+	 */
+	ps->area = vmalloc(len);
+	if (!ps->area)
+		goto err_area;
+
+	ps->zero_area = vmalloc(len);
+	if (!ps->zero_area)
+		goto err_zero_area;
+	memset(ps->zero_area, 0, len);
+
+	ps->header_area = vmalloc(len);
+	if (!ps->header_area)
+		goto err_header_area;
+
+	return 0;
+
+err_header_area:
+	vfree(ps->zero_area);
+
+err_zero_area:
+	vfree(ps->area);
+
+err_area:
+	return r;
+}
+
+static void free_area(struct pstore *ps)
+{
+	if (ps->area)
+		vfree(ps->area);
+	ps->area = NULL;
+
+	if (ps->zero_area)
+		vfree(ps->zero_area);
+	ps->zero_area = NULL;
+
+	if (ps->header_area)
+		vfree(ps->header_area);
+	ps->header_area = NULL;
+}
+
+struct mdata_req {
+	struct dm_io_region *where;
+	struct dm_io_request *io_req;
+	struct work_struct work;
+	int result;
+};
+
+static void do_metadata(struct work_struct *work)
+{
+	struct mdata_req *req = container_of(work, struct mdata_req, work);
+
+	req->result = dm_io(req->io_req, 1, req->where, NULL);
+}
+
+/*
+ * Read or write a chunk aligned and sized block of data from a device.
+ */
+static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
+		    int metadata)
+{
+	struct dm_io_region where = {
+		.bdev = ps->store->cow->bdev,
+		.sector = ps->store->chunk_size * chunk,
+		.count = ps->store->chunk_size,
+	};
+	struct dm_io_request io_req = {
+		.bi_rw = rw,
+		.mem.type = DM_IO_VMA,
+		.mem.ptr.vma = area,
+		.client = ps->io_client,
+		.notify.fn = NULL,
+	};
+	struct mdata_req req;
+
+	if (!metadata)
+		return dm_io(&io_req, 1, &where, NULL);
+
+	req.where = &where;
+	req.io_req = &io_req;
+
+	/*
+	 * Issue the synchronous I/O from a different thread
+	 * to avoid generic_make_request recursion.
+	 */
+	INIT_WORK(&req.work, do_metadata);
+	queue_work(ps->metadata_wq, &req.work);
+	flush_workqueue(ps->metadata_wq);
+
+	return req.result;
+}
+
+/*
+ * Convert a metadata area index to a chunk index.
+ */
+static chunk_t area_location(struct pstore *ps, chunk_t area)
+{
+	return 1 + ((ps->exceptions_per_area + 1) * area);
+}
+
+/*
+ * Read or write a metadata area.  Remembering to skip the first
+ * chunk which holds the header.
+ */
+static int area_io(struct pstore *ps, int rw)
+{
+	int r;
+	chunk_t chunk;
+
+	chunk = area_location(ps, ps->current_area);
+
+	r = chunk_io(ps, ps->area, chunk, rw, 0);
+	if (r)
+		return r;
+
+	return 0;
+}
+
+static void zero_memory_area(struct pstore *ps)
+{
+	memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
+}
+
+static int zero_disk_area(struct pstore *ps, chunk_t area)
+{
+	return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0);
+}
+
+static int read_header(struct pstore *ps, int *new_snapshot)
+{
+	int r;
+	struct disk_header *dh;
+	unsigned chunk_size;
+	int chunk_size_supplied = 1;
+	char *chunk_err;
+
+	/*
+	 * Use default chunk size (or logical_block_size, if larger)
+	 * if none supplied
+	 */
+	if (!ps->store->chunk_size) {
+		ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+		    bdev_logical_block_size(ps->store->cow->bdev) >> 9);
+		ps->store->chunk_mask = ps->store->chunk_size - 1;
+		ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
+		chunk_size_supplied = 0;
+	}
+
+	ps->io_client = dm_io_client_create(sectors_to_pages(ps->store->
+							     chunk_size));
+	if (IS_ERR(ps->io_client))
+		return PTR_ERR(ps->io_client);
+
+	r = alloc_area(ps);
+	if (r)
+		return r;
+
+	r = chunk_io(ps, ps->header_area, 0, READ, 1);
+	if (r)
+		goto bad;
+
+	dh = ps->header_area;
+
+	if (le32_to_cpu(dh->magic) == 0) {
+		*new_snapshot = 1;
+		return 0;
+	}
+
+	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
+		DMWARN("Invalid or corrupt snapshot");
+		r = -ENXIO;
+		goto bad;
+	}
+
+	*new_snapshot = 0;
+	ps->valid = le32_to_cpu(dh->valid);
+	ps->version = le32_to_cpu(dh->version);
+	chunk_size = le32_to_cpu(dh->chunk_size);
+
+	if (ps->store->chunk_size == chunk_size)
+		return 0;
+
+	if (chunk_size_supplied)
+		DMWARN("chunk size %u in device metadata overrides "
+		       "table chunk size of %u.",
+		       chunk_size, ps->store->chunk_size);
+
+	/* We had a bogus chunk_size. Fix stuff up. */
+	free_area(ps);
+
+	r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
+					      &chunk_err);
+	if (r) {
+		DMERR("invalid on-disk chunk size %u: %s.",
+		      chunk_size, chunk_err);
+		return r;
+	}
+
+	r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
+				ps->io_client);
+	if (r)
+		return r;
+
+	r = alloc_area(ps);
+	return r;
+
+bad:
+	free_area(ps);
+	return r;
+}
+
+static int write_header(struct pstore *ps)
+{
+	struct disk_header *dh;
+
+	memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT);
+
+	dh = ps->header_area;
+	dh->magic = cpu_to_le32(SNAP_MAGIC);
+	dh->valid = cpu_to_le32(ps->valid);
+	dh->version = cpu_to_le32(ps->version);
+	dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
+
+	return chunk_io(ps, ps->header_area, 0, WRITE, 1);
+}
+
+/*
+ * Access functions for the disk exceptions, these do the endian conversions.
+ */
+static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
+{
+	BUG_ON(index >= ps->exceptions_per_area);
+
+	return ((struct disk_exception *) ps->area) + index;
+}
+
+static void read_exception(struct pstore *ps,
+			   uint32_t index, struct disk_exception *result)
+{
+	struct disk_exception *e = get_exception(ps, index);
+
+	/* copy it */
+	result->old_chunk = le64_to_cpu(e->old_chunk);
+	result->new_chunk = le64_to_cpu(e->new_chunk);
+}
+
+static void write_exception(struct pstore *ps,
+			    uint32_t index, struct disk_exception *de)
+{
+	struct disk_exception *e = get_exception(ps, index);
+
+	/* copy it */
+	e->old_chunk = cpu_to_le64(de->old_chunk);
+	e->new_chunk = cpu_to_le64(de->new_chunk);
+}
+
+/*
+ * Registers the exceptions that are present in the current area.
+ * 'full' is filled in to indicate if the area has been
+ * filled.
+ */
+static int insert_exceptions(struct pstore *ps,
+			     int (*callback)(void *callback_context,
+					     chunk_t old, chunk_t new),
+			     void *callback_context,
+			     int *full)
+{
+	int r;
+	unsigned int i;
+	struct disk_exception de;
+
+	/* presume the area is full */
+	*full = 1;
+
+	for (i = 0; i < ps->exceptions_per_area; i++) {
+		read_exception(ps, i, &de);
+
+		/*
+		 * If the new_chunk is pointing at the start of
+		 * the COW device, where the first metadata area
+		 * is we know that we've hit the end of the
+		 * exceptions.  Therefore the area is not full.
+		 */
+		if (de.new_chunk == 0LL) {
+			ps->current_committed = i;
+			*full = 0;
+			break;
+		}
+
+		/*
+		 * Keep track of the start of the free chunks.
+		 */
+		if (ps->next_free <= de.new_chunk)
+			ps->next_free = de.new_chunk + 1;
+
+		/*
+		 * Otherwise we add the exception to the snapshot.
+		 */
+		r = callback(callback_context, de.old_chunk, de.new_chunk);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int read_exceptions(struct pstore *ps,
+			   int (*callback)(void *callback_context, chunk_t old,
+					   chunk_t new),
+			   void *callback_context)
+{
+	int r, full = 1;
+
+	/*
+	 * Keeping reading chunks and inserting exceptions until
+	 * we find a partially full area.
+	 */
+	for (ps->current_area = 0; full; ps->current_area++) {
+		r = area_io(ps, READ);
+		if (r)
+			return r;
+
+		r = insert_exceptions(ps, callback, callback_context, &full);
+		if (r)
+			return r;
+	}
+
+	ps->current_area--;
+
+	return 0;
+}
+
+static struct pstore *get_info(struct dm_exception_store *store)
+{
+	return (struct pstore *) store->context;
+}
+
+static void persistent_fraction_full(struct dm_exception_store *store,
+				     sector_t *numerator, sector_t *denominator)
+{
+	*numerator = get_info(store)->next_free * store->chunk_size;
+	*denominator = get_dev_size(store->cow->bdev);
+}
+
+static void persistent_dtr(struct dm_exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	destroy_workqueue(ps->metadata_wq);
+
+	/* Created in read_header */
+	if (ps->io_client)
+		dm_io_client_destroy(ps->io_client);
+	free_area(ps);
+
+	/* Allocated in persistent_read_metadata */
+	if (ps->callbacks)
+		vfree(ps->callbacks);
+
+	kfree(ps);
+}
+
+static int persistent_read_metadata(struct dm_exception_store *store,
+				    int (*callback)(void *callback_context,
+						    chunk_t old, chunk_t new),
+				    void *callback_context)
+{
+	int r, uninitialized_var(new_snapshot);
+	struct pstore *ps = get_info(store);
+
+	/*
+	 * Read the snapshot header.
+	 */
+	r = read_header(ps, &new_snapshot);
+	if (r)
+		return r;
+
+	/*
+	 * Now we know correct chunk_size, complete the initialisation.
+	 */
+	ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
+				  sizeof(struct disk_exception);
+	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
+			sizeof(*ps->callbacks));
+	if (!ps->callbacks)
+		return -ENOMEM;
+
+	/*
+	 * Do we need to setup a new snapshot ?
+	 */
+	if (new_snapshot) {
+		r = write_header(ps);
+		if (r) {
+			DMWARN("write_header failed");
+			return r;
+		}
+
+		ps->current_area = 0;
+		zero_memory_area(ps);
+		r = zero_disk_area(ps, 0);
+		if (r) {
+			DMWARN("zero_disk_area(0) failed");
+			return r;
+		}
+	} else {
+		/*
+		 * Sanity checks.
+		 */
+		if (ps->version != SNAPSHOT_DISK_VERSION) {
+			DMWARN("unable to handle snapshot disk version %d",
+			       ps->version);
+			return -EINVAL;
+		}
+
+		/*
+		 * Metadata are valid, but snapshot is invalidated
+		 */
+		if (!ps->valid)
+			return 1;
+
+		/*
+		 * Read the metadata.
+		 */
+		r = read_exceptions(ps, callback, callback_context);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int persistent_prepare_exception(struct dm_exception_store *store,
+					struct dm_snap_exception *e)
+{
+	struct pstore *ps = get_info(store);
+	uint32_t stride;
+	chunk_t next_free;
+	sector_t size = get_dev_size(store->cow->bdev);
+
+	/* Is there enough room ? */
+	if (size < ((ps->next_free + 1) * store->chunk_size))
+		return -ENOSPC;
+
+	e->new_chunk = ps->next_free;
+
+	/*
+	 * Move onto the next free pending, making sure to take
+	 * into account the location of the metadata chunks.
+	 */
+	stride = (ps->exceptions_per_area + 1);
+	next_free = ++ps->next_free;
+	if (sector_div(next_free, stride) == 1)
+		ps->next_free++;
+
+	atomic_inc(&ps->pending_count);
+	return 0;
+}
+
+static void persistent_commit_exception(struct dm_exception_store *store,
+					struct dm_snap_exception *e,
+					void (*callback) (void *, int success),
+					void *callback_context)
+{
+	unsigned int i;
+	struct pstore *ps = get_info(store);
+	struct disk_exception de;
+	struct commit_callback *cb;
+
+	de.old_chunk = e->old_chunk;
+	de.new_chunk = e->new_chunk;
+	write_exception(ps, ps->current_committed++, &de);
+
+	/*
+	 * Add the callback to the back of the array.  This code
+	 * is the only place where the callback array is
+	 * manipulated, and we know that it will never be called
+	 * multiple times concurrently.
+	 */
+	cb = ps->callbacks + ps->callback_count++;
+	cb->callback = callback;
+	cb->context = callback_context;
+
+	/*
+	 * If there are exceptions in flight and we have not yet
+	 * filled this metadata area there's nothing more to do.
+	 */
+	if (!atomic_dec_and_test(&ps->pending_count) &&
+	    (ps->current_committed != ps->exceptions_per_area))
+		return;
+
+	/*
+	 * If we completely filled the current area, then wipe the next one.
+	 */
+	if ((ps->current_committed == ps->exceptions_per_area) &&
+	     zero_disk_area(ps, ps->current_area + 1))
+		ps->valid = 0;
+
+	/*
+	 * Commit exceptions to disk.
+	 */
+	if (ps->valid && area_io(ps, WRITE_BARRIER))
+		ps->valid = 0;
+
+	/*
+	 * Advance to the next area if this one is full.
+	 */
+	if (ps->current_committed == ps->exceptions_per_area) {
+		ps->current_committed = 0;
+		ps->current_area++;
+		zero_memory_area(ps);
+	}
+
+	for (i = 0; i < ps->callback_count; i++) {
+		cb = ps->callbacks + i;
+		cb->callback(cb->context, ps->valid);
+	}
+
+	ps->callback_count = 0;
+}
+
+static void persistent_drop_snapshot(struct dm_exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	ps->valid = 0;
+	if (write_header(ps))
+		DMWARN("write header failed");
+}
+
+static int persistent_ctr(struct dm_exception_store *store,
+			  unsigned argc, char **argv)
+{
+	struct pstore *ps;
+
+	/* allocate the pstore */
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	ps->store = store;
+	ps->valid = 1;
+	ps->version = SNAPSHOT_DISK_VERSION;
+	ps->area = NULL;
+	ps->zero_area = NULL;
+	ps->header_area = NULL;
+	ps->next_free = 2;	/* skipping the header and first area */
+	ps->current_committed = 0;
+
+	ps->callback_count = 0;
+	atomic_set(&ps->pending_count, 0);
+	ps->callbacks = NULL;
+
+	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
+	if (!ps->metadata_wq) {
+		kfree(ps);
+		DMERR("couldn't start header metadata update thread");
+		return -ENOMEM;
+	}
+
+	store->context = ps;
+
+	return 0;
+}
+
+static unsigned persistent_status(struct dm_exception_store *store,
+				  status_type_t status, char *result,
+				  unsigned maxlen)
+{
+	unsigned sz = 0;
+
+	switch (status) {
+	case STATUSTYPE_INFO:
+		break;
+	case STATUSTYPE_TABLE:
+		DMEMIT(" %s P %llu", store->cow->name,
+		       (unsigned long long)store->chunk_size);
+	}
+
+	return sz;
+}
+
+static struct dm_exception_store_type _persistent_type = {
+	.name = "persistent",
+	.module = THIS_MODULE,
+	.ctr = persistent_ctr,
+	.dtr = persistent_dtr,
+	.read_metadata = persistent_read_metadata,
+	.prepare_exception = persistent_prepare_exception,
+	.commit_exception = persistent_commit_exception,
+	.drop_snapshot = persistent_drop_snapshot,
+	.fraction_full = persistent_fraction_full,
+	.status = persistent_status,
+};
+
+static struct dm_exception_store_type _persistent_compat_type = {
+	.name = "P",
+	.module = THIS_MODULE,
+	.ctr = persistent_ctr,
+	.dtr = persistent_dtr,
+	.read_metadata = persistent_read_metadata,
+	.prepare_exception = persistent_prepare_exception,
+	.commit_exception = persistent_commit_exception,
+	.drop_snapshot = persistent_drop_snapshot,
+	.fraction_full = persistent_fraction_full,
+	.status = persistent_status,
+};
+
+int dm_persistent_snapshot_init(void)
+{
+	int r;
+
+	r = dm_exception_store_type_register(&_persistent_type);
+	if (r) {
+		DMERR("Unable to register persistent exception store type");
+		return r;
+	}
+
+	r = dm_exception_store_type_register(&_persistent_compat_type);
+	if (r) {
+		DMERR("Unable to register old-style persistent exception "
+		      "store type");
+		dm_exception_store_type_unregister(&_persistent_type);
+		return r;
+	}
+
+	return r;
+}
+
+void dm_persistent_snapshot_exit(void)
+{
+	dm_exception_store_type_unregister(&_persistent_type);
+	dm_exception_store_type_unregister(&_persistent_compat_type);
+}
--- a/kernel/drivers/md/dm-snap-transient.c
+++ b/kernel/drivers/md/dm-snap-transient.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+
+#define DM_MSG_PREFIX "transient snapshot"
+
+/*-----------------------------------------------------------------
+ * Implementation of the store for non-persistent snapshots.
+ *---------------------------------------------------------------*/
+struct transient_c {
+	sector_t next_free;
+};
+
+static void transient_dtr(struct dm_exception_store *store)
+{
+	kfree(store->context);
+}
+
+static int transient_read_metadata(struct dm_exception_store *store,
+				   int (*callback)(void *callback_context,
+						   chunk_t old, chunk_t new),
+				   void *callback_context)
+{
+	return 0;
+}
+
+static int transient_prepare_exception(struct dm_exception_store *store,
+				       struct dm_snap_exception *e)
+{
+	struct transient_c *tc = store->context;
+	sector_t size = get_dev_size(store->cow->bdev);
+
+	if (size < (tc->next_free + store->chunk_size))
+		return -1;
+
+	e->new_chunk = sector_to_chunk(store, tc->next_free);
+	tc->next_free += store->chunk_size;
+
+	return 0;
+}
+
+static void transient_commit_exception(struct dm_exception_store *store,
+				       struct dm_snap_exception *e,
+				       void (*callback) (void *, int success),
+				       void *callback_context)
+{
+	/* Just succeed */
+	callback(callback_context, 1);
+}
+
+static void transient_fraction_full(struct dm_exception_store *store,
+				    sector_t *numerator, sector_t *denominator)
+{
+	*numerator = ((struct transient_c *) store->context)->next_free;
+	*denominator = get_dev_size(store->cow->bdev);
+}
+
+static int transient_ctr(struct dm_exception_store *store,
+			 unsigned argc, char **argv)
+{
+	struct transient_c *tc;
+
+	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
+	if (!tc)
+		return -ENOMEM;
+
+	tc->next_free = 0;
+	store->context = tc;
+
+	return 0;
+}
+
+static unsigned transient_status(struct dm_exception_store *store,
+				 status_type_t status, char *result,
+				 unsigned maxlen)
+{
+	unsigned sz = 0;
+
+	switch (status) {
+	case STATUSTYPE_INFO:
+		break;
+	case STATUSTYPE_TABLE:
+		DMEMIT(" %s N %llu", store->cow->name,
+		       (unsigned long long)store->chunk_size);
+	}
+
+	return sz;
+}
+
+static struct dm_exception_store_type _transient_type = {
+	.name = "transient",
+	.module = THIS_MODULE,
+	.ctr = transient_ctr,
+	.dtr = transient_dtr,
+	.read_metadata = transient_read_metadata,
+	.prepare_exception = transient_prepare_exception,
+	.commit_exception = transient_commit_exception,
+	.fraction_full = transient_fraction_full,
+	.status = transient_status,
+};
+
+static struct dm_exception_store_type _transient_compat_type = {
+	.name = "N",
+	.module = THIS_MODULE,
+	.ctr = transient_ctr,
+	.dtr = transient_dtr,
+	.read_metadata = transient_read_metadata,
+	.prepare_exception = transient_prepare_exception,
+	.commit_exception = transient_commit_exception,
+	.fraction_full = transient_fraction_full,
+	.status = transient_status,
+};
+
+int dm_transient_snapshot_init(void)
+{
+	int r;
+
+	r = dm_exception_store_type_register(&_transient_type);
+	if (r) {
+		DMWARN("Unable to register transient exception store type");
+		return r;
+	}
+
+	r = dm_exception_store_type_register(&_transient_compat_type);
+	if (r) {
+		DMWARN("Unable to register old-style transient "
+		       "exception store type");
+		dm_exception_store_type_unregister(&_transient_type);
+		return r;
+	}
+
+	return r;
+}
+
+void dm_transient_snapshot_exit(void)
+{
+	dm_exception_store_type_unregister(&_transient_type);
+	dm_exception_store_type_unregister(&_transient_compat_type);
+}
--- a/kernel/drivers/md/dm-snap.c
+++ b/kernel/drivers/md/dm-snap.c
--- a/kernel/drivers/md/dm-stripe.c
+++ b/kernel/drivers/md/dm-stripe.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (C) 2001-2003 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+
+#define DM_MSG_PREFIX "striped"
+#define DM_IO_ERROR_THRESHOLD 15
+
+struct stripe {
+	struct dm_dev *dev;
+	sector_t physical_start;
+
+	atomic_t error_count;
+};
+
+struct stripe_c {
+	uint32_t stripes;
+
+	/* The size of this target / num. stripes */
+	sector_t stripe_width;
+
+	/* stripe chunk size */
+	uint32_t chunk_shift;
+	sector_t chunk_mask;
+
+	/* Needed for handling events */
+	struct dm_target *ti;
+
+	/* Work struct used for triggering events*/
+	struct work_struct kstriped_ws;
+
+	struct stripe stripe[0];
+};
+
+static struct workqueue_struct *kstriped;
+
+/*
+ * An event is triggered whenever a drive
+ * drops out of a stripe volume.
+ */
+static void trigger_event(struct work_struct *work)
+{
+	struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws);
+
+	dm_table_event(sc->ti->table);
+
+}
+
+static inline struct stripe_c *alloc_context(unsigned int stripes)
+{
+	size_t len;
+
+	if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
+			     stripes))
+		return NULL;
+
+	len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
+
+	return kmalloc(len, GFP_KERNEL);
+}
+
+/*
+ * Parse a single <dev> <sector> pair
+ */
+static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
+		      unsigned int stripe, char **argv)
+{
+	unsigned long long start;
+
+	if (sscanf(argv[1], "%llu", &start) != 1)
+		return -EINVAL;
+
+	if (dm_get_device(ti, argv[0], start, sc->stripe_width,
+			  dm_table_get_mode(ti->table),
+			  &sc->stripe[stripe].dev))
+		return -ENXIO;
+
+	sc->stripe[stripe].physical_start = start;
+
+	return 0;
+}
+
+/*
+ * Construct a striped mapping.
+ * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
+ */
+static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct stripe_c *sc;
+	sector_t width;
+	uint32_t stripes;
+	uint32_t chunk_size;
+	char *end;
+	int r;
+	unsigned int i;
+
+	if (argc < 2) {
+		ti->error = "Not enough arguments";
+		return -EINVAL;
+	}
+
+	stripes = simple_strtoul(argv[0], &end, 10);
+	if (!stripes || *end) {
+		ti->error = "Invalid stripe count";
+		return -EINVAL;
+	}
+
+	chunk_size = simple_strtoul(argv[1], &end, 10);
+	if (*end) {
+		ti->error = "Invalid chunk_size";
+		return -EINVAL;
+	}
+
+	/*
+	 * chunk_size is a power of two
+	 */
+	if (!is_power_of_2(chunk_size) ||
+	    (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
+		ti->error = "Invalid chunk size";
+		return -EINVAL;
+	}
+
+	if (ti->len & (chunk_size - 1)) {
+		ti->error = "Target length not divisible by "
+		    "chunk size";
+		return -EINVAL;
+	}
+
+	width = ti->len;
+	if (sector_div(width, stripes)) {
+		ti->error = "Target length not divisible by "
+		    "number of stripes";
+		return -EINVAL;
+	}
+
+	/*
+	 * Do we have enough arguments for that many stripes ?
+	 */
+	if (argc != (2 + 2 * stripes)) {
+		ti->error = "Not enough destinations "
+			"specified";
+		return -EINVAL;
+	}
+
+	sc = alloc_context(stripes);
+	if (!sc) {
+		ti->error = "Memory allocation for striped context "
+		    "failed";
+		return -ENOMEM;
+	}
+
+	INIT_WORK(&sc->kstriped_ws, trigger_event);
+
+	/* Set pointer to dm target; used in trigger_event */
+	sc->ti = ti;
+
+	sc->stripes = stripes;
+	sc->stripe_width = width;
+	ti->split_io = chunk_size;
+	ti->num_flush_requests = stripes;
+
+	sc->chunk_mask = ((sector_t) chunk_size) - 1;
+	for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
+		chunk_size >>= 1;
+	sc->chunk_shift--;
+
+	/*
+	 * Get the stripe destinations.
+	 */
+	for (i = 0; i < stripes; i++) {
+		argv += 2;
+
+		r = get_stripe(ti, sc, i, argv);
+		if (r < 0) {
+			ti->error = "Couldn't parse stripe destination";
+			while (i--)
+				dm_put_device(ti, sc->stripe[i].dev);
+			kfree(sc);
+			return r;
+		}
+		atomic_set(&(sc->stripe[i].error_count), 0);
+	}
+
+	ti->private = sc;
+
+	return 0;
+}
+
+static void stripe_dtr(struct dm_target *ti)
+{
+	unsigned int i;
+	struct stripe_c *sc = (struct stripe_c *) ti->private;
+
+	for (i = 0; i < sc->stripes; i++)
+		dm_put_device(ti, sc->stripe[i].dev);
+
+	flush_workqueue(kstriped);
+	kfree(sc);
+}
+
+static int stripe_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct stripe_c *sc = (struct stripe_c *) ti->private;
+	sector_t offset, chunk;
+	uint32_t stripe;
+
+	if (unlikely(bio_empty_barrier(bio))) {
+		BUG_ON(map_context->flush_request >= sc->stripes);
+		bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
+		return DM_MAPIO_REMAPPED;
+	}
+
+	offset = bio->bi_sector - ti->begin;
+	chunk = offset >> sc->chunk_shift;
+	stripe = sector_div(chunk, sc->stripes);
+
+	bio->bi_bdev = sc->stripe[stripe].dev->bdev;
+	bio->bi_sector = sc->stripe[stripe].physical_start +
+	    (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * Stripe status:
+ *
+ * INFO
+ * #stripes [stripe_name <stripe_name>] [group word count]
+ * [error count 'A|D' <error count 'A|D'>]
+ *
+ * TABLE
+ * #stripes [stripe chunk size]
+ * [stripe_name physical_start <stripe_name physical_start>]
+ *
+ */
+
+static int stripe_status(struct dm_target *ti,
+			 status_type_t type, char *result, unsigned int maxlen)
+{
+	struct stripe_c *sc = (struct stripe_c *) ti->private;
+	char buffer[sc->stripes + 1];
+	unsigned int sz = 0;
+	unsigned int i;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%d ", sc->stripes);
+		for (i = 0; i < sc->stripes; i++)  {
+			DMEMIT("%s ", sc->stripe[i].dev->name);
+			buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ?
+				'D' : 'A';
+		}
+		buffer[i] = '\0';
+		DMEMIT("1 %s", buffer);
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%d %llu", sc->stripes,
+			(unsigned long long)sc->chunk_mask + 1);
+		for (i = 0; i < sc->stripes; i++)
+			DMEMIT(" %s %llu", sc->stripe[i].dev->name,
+			    (unsigned long long)sc->stripe[i].physical_start);
+		break;
+	}
+	return 0;
+}
+
+static int stripe_end_io(struct dm_target *ti, struct bio *bio,
+			 int error, union map_info *map_context)
+{
+	unsigned i;
+	char major_minor[16];
+	struct stripe_c *sc = ti->private;
+
+	if (!error)
+		return 0; /* I/O complete */
+
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
+		return error;
+
+	if (error == -EOPNOTSUPP)
+		return error;
+
+	memset(major_minor, 0, sizeof(major_minor));
+	sprintf(major_minor, "%d:%d",
+		MAJOR(disk_devt(bio->bi_bdev->bd_disk)),
+		MINOR(disk_devt(bio->bi_bdev->bd_disk)));
+
+	/*
+	 * Test to see which stripe drive triggered the event
+	 * and increment error count for all stripes on that device.
+	 * If the error count for a given device exceeds the threshold
+	 * value we will no longer trigger any further events.
+	 */
+	for (i = 0; i < sc->stripes; i++)
+		if (!strcmp(sc->stripe[i].dev->name, major_minor)) {
+			atomic_inc(&(sc->stripe[i].error_count));
+			if (atomic_read(&(sc->stripe[i].error_count)) <
+			    DM_IO_ERROR_THRESHOLD)
+				queue_work(kstriped, &sc->kstriped_ws);
+		}
+
+	return error;
+}
+
+static int stripe_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct stripe_c *sc = ti->private;
+	int ret = 0;
+	unsigned i = 0;
+
+	do {
+		ret = fn(ti, sc->stripe[i].dev,
+			 sc->stripe[i].physical_start,
+			 sc->stripe_width, data);
+	} while (!ret && ++i < sc->stripes);
+
+	return ret;
+}
+
+static void stripe_io_hints(struct dm_target *ti,
+			    struct queue_limits *limits)
+{
+	struct stripe_c *sc = ti->private;
+	unsigned chunk_size = (sc->chunk_mask + 1) << 9;
+
+	blk_limits_io_min(limits, chunk_size);
+	blk_limits_io_opt(limits, chunk_size * sc->stripes);
+}
+
+static struct target_type stripe_target = {
+	.name   = "striped",
+	.version = {1, 3, 0},
+	.module = THIS_MODULE,
+	.ctr    = stripe_ctr,
+	.dtr    = stripe_dtr,
+	.map    = stripe_map,
+	.end_io = stripe_end_io,
+	.status = stripe_status,
+	.iterate_devices = stripe_iterate_devices,
+	.io_hints = stripe_io_hints,
+};
+
+int __init dm_stripe_init(void)
+{
+	int r;
+
+	r = dm_register_target(&stripe_target);
+	if (r < 0) {
+		DMWARN("target registration failed");
+		return r;
+	}
+
+	kstriped = create_singlethread_workqueue("kstriped");
+	if (!kstriped) {
+		DMERR("failed to create workqueue kstriped");
+		dm_unregister_target(&stripe_target);
+		return -ENOMEM;
+	}
+
+	return r;
+}
+
+void dm_stripe_exit(void)
+{
+	dm_unregister_target(&stripe_target);
+	destroy_workqueue(kstriped);
+
+	return;
+}
--- a/kernel/drivers/md/dm-sysfs.c
+++ b/kernel/drivers/md/dm-sysfs.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/sysfs.h>
+#include <linux/dm-ioctl.h>
+#include "dm.h"
+
+struct dm_sysfs_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct mapped_device *, char *);
+	ssize_t (*store)(struct mapped_device *, char *);
+};
+
+#define DM_ATTR_RO(_name) \
+struct dm_sysfs_attr dm_attr_##_name = \
+	__ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL)
+
+static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
+			    char *page)
+{
+	struct dm_sysfs_attr *dm_attr;
+	struct mapped_device *md;
+	ssize_t ret;
+
+	dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
+	if (!dm_attr->show)
+		return -EIO;
+
+	md = dm_get_from_kobject(kobj);
+	if (!md)
+		return -EINVAL;
+
+	ret = dm_attr->show(md, page);
+	dm_put(md);
+
+	return ret;
+}
+
+static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
+{
+	if (dm_copy_name_and_uuid(md, buf, NULL))
+		return -EIO;
+
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
+{
+	if (dm_copy_name_and_uuid(md, NULL, buf))
+		return -EIO;
+
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
+{
+	sprintf(buf, "%d\n", dm_suspended(md));
+
+	return strlen(buf);
+}
+
+static DM_ATTR_RO(name);
+static DM_ATTR_RO(uuid);
+static DM_ATTR_RO(suspended);
+
+static struct attribute *dm_attrs[] = {
+	&dm_attr_name.attr,
+	&dm_attr_uuid.attr,
+	&dm_attr_suspended.attr,
+	NULL,
+};
+
+static struct sysfs_ops dm_sysfs_ops = {
+	.show	= dm_attr_show,
+};
+
+/*
+ * dm kobject is embedded in mapped_device structure
+ * no need to define release function here
+ */
+static struct kobj_type dm_ktype = {
+	.sysfs_ops	= &dm_sysfs_ops,
+	.default_attrs	= dm_attrs,
+};
+
+/*
+ * Initialize kobj
+ * because nobody using md yet, no need to call explicit dm_get/put
+ */
+int dm_sysfs_init(struct mapped_device *md)
+{
+	return kobject_init_and_add(dm_kobject(md), &dm_ktype,
+				    &disk_to_dev(dm_disk(md))->kobj,
+				    "%s", "dm");
+}
+
+/*
+ * Remove kobj, called after all references removed
+ */
+void dm_sysfs_exit(struct mapped_device *md)
+{
+	kobject_put(dm_kobject(md));
+}
--- a/kernel/drivers/md/dm-table.c
+++ b/kernel/drivers/md/dm-table.c
--- a/kernel/drivers/md/dm-target.c
+++ b/kernel/drivers/md/dm-target.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX "target"
+
+static LIST_HEAD(_targets);
+static DECLARE_RWSEM(_lock);
+
+#define DM_MOD_NAME_SIZE 32
+
+static inline struct target_type *__find_target_type(const char *name)
+{
+	struct target_type *tt;
+
+	list_for_each_entry(tt, &_targets, list)
+		if (!strcmp(name, tt->name))
+			return tt;
+
+	return NULL;
+}
+
+static struct target_type *get_target_type(const char *name)
+{
+	struct target_type *tt;
+
+	down_read(&_lock);
+
+	tt = __find_target_type(name);
+	if (tt && !try_module_get(tt->module))
+		tt = NULL;
+
+	up_read(&_lock);
+	return tt;
+}
+
+static void load_module(const char *name)
+{
+	request_module("dm-%s", name);
+}
+
+struct target_type *dm_get_target_type(const char *name)
+{
+	struct target_type *tt = get_target_type(name);
+
+	if (!tt) {
+		load_module(name);
+		tt = get_target_type(name);
+	}
+
+	return tt;
+}
+
+void dm_put_target_type(struct target_type *tt)
+{
+	down_read(&_lock);
+	module_put(tt->module);
+	up_read(&_lock);
+}
+
+int dm_target_iterate(void (*iter_func)(struct target_type *tt,
+					void *param), void *param)
+{
+	struct target_type *tt;
+
+	down_read(&_lock);
+	list_for_each_entry(tt, &_targets, list)
+		iter_func(tt, param);
+	up_read(&_lock);
+
+	return 0;
+}
+
+int dm_register_target(struct target_type *tt)
+{
+	int rv = 0;
+
+	down_write(&_lock);
+	if (__find_target_type(tt->name))
+		rv = -EEXIST;
+	else
+		list_add(&tt->list, &_targets);
+
+	up_write(&_lock);
+	return rv;
+}
+
+void dm_unregister_target(struct target_type *tt)
+{
+	down_write(&_lock);
+	if (!__find_target_type(tt->name)) {
+		DMCRIT("Unregistering unrecognised target: %s", tt->name);
+		BUG();
+	}
+
+	list_del(&tt->list);
+
+	up_write(&_lock);
+}
+
+/*
+ * io-err: always fails an io, useful for bringing
+ * up LVs that have holes in them.
+ */
+static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
+{
+	return 0;
+}
+
+static void io_err_dtr(struct dm_target *tt)
+{
+	/* empty */
+}
+
+static int io_err_map(struct dm_target *tt, struct bio *bio,
+		      union map_info *map_context)
+{
+	return -EIO;
+}
+
+static struct target_type error_target = {
+	.name = "error",
+	.version = {1, 0, 1},
+	.ctr  = io_err_ctr,
+	.dtr  = io_err_dtr,
+	.map  = io_err_map,
+};
+
+int __init dm_target_init(void)
+{
+	return dm_register_target(&error_target);
+}
+
+void dm_target_exit(void)
+{
+	dm_unregister_target(&error_target);
+}
+
+EXPORT_SYMBOL(dm_register_target);
+EXPORT_SYMBOL(dm_unregister_target);
--- a/kernel/drivers/md/dm-uevent.c
+++ b/kernel/drivers/md/dm-uevent.c
@@ -0,0 +1,221 @@
+/*
+ * Device Mapper Uevent Support (dm-uevent)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2007
+ * 	Author: Mike Anderson <andmike@linux.vnet.ibm.com>
+ */
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/kobject.h>
+#include <linux/dm-ioctl.h>
+
+#include "dm.h"
+#include "dm-uevent.h"
+
+#define DM_MSG_PREFIX "uevent"
+
+static const struct {
+	enum dm_uevent_type type;
+	enum kobject_action action;
+	char *name;
+} _dm_uevent_type_names[] = {
+	{DM_UEVENT_PATH_FAILED, KOBJ_CHANGE, "PATH_FAILED"},
+	{DM_UEVENT_PATH_REINSTATED, KOBJ_CHANGE, "PATH_REINSTATED"},
+};
+
+static struct kmem_cache *_dm_event_cache;
+
+struct dm_uevent {
+	struct mapped_device *md;
+	enum kobject_action action;
+	struct kobj_uevent_env ku_env;
+	struct list_head elist;
+	char name[DM_NAME_LEN];
+	char uuid[DM_UUID_LEN];
+};
+
+static void dm_uevent_free(struct dm_uevent *event)
+{
+	kmem_cache_free(_dm_event_cache, event);
+}
+
+static struct dm_uevent *dm_uevent_alloc(struct mapped_device *md)
+{
+	struct dm_uevent *event;
+
+	event = kmem_cache_zalloc(_dm_event_cache, GFP_ATOMIC);
+	if (!event)
+		return NULL;
+
+	INIT_LIST_HEAD(&event->elist);
+	event->md = md;
+
+	return event;
+}
+
+static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md,
+					      struct dm_target *ti,
+					      enum kobject_action action,
+					      const char *dm_action,
+					      const char *path,
+					      unsigned nr_valid_paths)
+{
+	struct dm_uevent *event;
+
+	event = dm_uevent_alloc(md);
+	if (!event) {
+		DMERR("%s: dm_uevent_alloc() failed", __func__);
+		goto err_nomem;
+	}
+
+	event->action = action;
+
+	if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) {
+		DMERR("%s: add_uevent_var() for DM_TARGET failed",
+		      __func__);
+		goto err_add;
+	}
+
+	if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) {
+		DMERR("%s: add_uevent_var() for DM_ACTION failed",
+		      __func__);
+		goto err_add;
+	}
+
+	if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u",
+			   dm_next_uevent_seq(md))) {
+		DMERR("%s: add_uevent_var() for DM_SEQNUM failed",
+		      __func__);
+		goto err_add;
+	}
+
+	if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) {
+		DMERR("%s: add_uevent_var() for DM_PATH failed", __func__);
+		goto err_add;
+	}
+
+	if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d",
+			   nr_valid_paths)) {
+		DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed",
+		      __func__);
+		goto err_add;
+	}
+
+	return event;
+
+err_add:
+	dm_uevent_free(event);
+err_nomem:
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * dm_send_uevents - send uevents for given list
+ *
+ * @events:	list of events to send
+ * @kobj:	kobject generating event
+ *
+ */
+void dm_send_uevents(struct list_head *events, struct kobject *kobj)
+{
+	int r;
+	struct dm_uevent *event, *next;
+
+	list_for_each_entry_safe(event, next, events, elist) {
+		list_del_init(&event->elist);
+
+		/*
+		 * When a device is being removed this copy fails and we
+		 * discard these unsent events.
+		 */
+		if (dm_copy_name_and_uuid(event->md, event->name,
+					  event->uuid)) {
+			DMINFO("%s: skipping sending uevent for lost device",
+			       __func__);
+			goto uevent_free;
+		}
+
+		if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) {
+			DMERR("%s: add_uevent_var() for DM_NAME failed",
+			      __func__);
+			goto uevent_free;
+		}
+
+		if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) {
+			DMERR("%s: add_uevent_var() for DM_UUID failed",
+			      __func__);
+			goto uevent_free;
+		}
+
+		r = kobject_uevent_env(kobj, event->action, event->ku_env.envp);
+		if (r)
+			DMERR("%s: kobject_uevent_env failed", __func__);
+uevent_free:
+		dm_uevent_free(event);
+	}
+}
+EXPORT_SYMBOL_GPL(dm_send_uevents);
+
+/**
+ * dm_path_uevent - called to create a new path event and queue it
+ *
+ * @event_type:	path event type enum
+ * @ti:			pointer to a dm_target
+ * @path:		string containing pathname
+ * @nr_valid_paths:	number of valid paths remaining
+ *
+ */
+void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
+		   const char *path, unsigned nr_valid_paths)
+{
+	struct mapped_device *md = dm_table_get_md(ti->table);
+	struct dm_uevent *event;
+
+	if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) {
+		DMERR("%s: Invalid event_type %d", __func__, event_type);
+		goto out;
+	}
+
+	event = dm_build_path_uevent(md, ti,
+				     _dm_uevent_type_names[event_type].action,
+				     _dm_uevent_type_names[event_type].name,
+				     path, nr_valid_paths);
+	if (IS_ERR(event))
+		goto out;
+
+	dm_uevent_add(md, &event->elist);
+
+out:
+	dm_put(md);
+}
+EXPORT_SYMBOL_GPL(dm_path_uevent);
+
+int dm_uevent_init(void)
+{
+	_dm_event_cache = KMEM_CACHE(dm_uevent, 0);
+	if (!_dm_event_cache)
+		return -ENOMEM;
+
+	DMINFO("version 1.0.3");
+
+	return 0;
+}
+
+void dm_uevent_exit(void)
+{
+	kmem_cache_destroy(_dm_event_cache);
+}
--- a/kernel/drivers/md/dm-uevent.h
+++ b/kernel/drivers/md/dm-uevent.h
@@ -0,0 +1,59 @@
+/*
+ * Device Mapper Uevent Support
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2007
+ * 	Author: Mike Anderson <andmike@linux.vnet.ibm.com>
+ */
+#ifndef DM_UEVENT_H
+#define DM_UEVENT_H
+
+enum dm_uevent_type {
+	DM_UEVENT_PATH_FAILED,
+	DM_UEVENT_PATH_REINSTATED,
+};
+
+#ifdef CONFIG_DM_UEVENT
+
+extern int dm_uevent_init(void);
+extern void dm_uevent_exit(void);
+extern void dm_send_uevents(struct list_head *events, struct kobject *kobj);
+extern void dm_path_uevent(enum dm_uevent_type event_type,
+			   struct dm_target *ti, const char *path,
+			   unsigned nr_valid_paths);
+
+#else
+
+static inline int dm_uevent_init(void)
+{
+	return 0;
+}
+static inline void dm_uevent_exit(void)
+{
+}
+static inline void dm_send_uevents(struct list_head *events,
+				   struct kobject *kobj)
+{
+}
+static inline void dm_path_uevent(enum dm_uevent_type event_type,
+				  struct dm_target *ti, const char *path,
+				  unsigned nr_valid_paths)
+{
+}
+
+#endif	/* CONFIG_DM_UEVENT */
+
+#endif	/* DM_UEVENT_H */
--- a/kernel/drivers/md/dm-zero.c
+++ b/kernel/drivers/md/dm-zero.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+
+#define DM_MSG_PREFIX "zero"
+
+/*
+ * Construct a dummy mapping that only returns zeros
+ */
+static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	if (argc != 0) {
+		ti->error = "No arguments required";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Return zeros only on reads
+ */
+static int zero_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	switch(bio_rw(bio)) {
+	case READ:
+		zero_fill_bio(bio);
+		break;
+	case READA:
+		/* readahead of null bytes only wastes buffer cache */
+		return -EIO;
+	case WRITE:
+		/* writes get silently dropped */
+		break;
+	}
+
+	bio_endio(bio, 0);
+
+	/* accepted bio, don't make new request */
+	return DM_MAPIO_SUBMITTED;
+}
+
+static struct target_type zero_target = {
+	.name   = "zero",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr    = zero_ctr,
+	.map    = zero_map,
+};
+
+static int __init dm_zero_init(void)
+{
+	int r = dm_register_target(&zero_target);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_zero_exit(void)
+{
+	dm_unregister_target(&zero_target);
+}
+
+module_init(dm_zero_init)
+module_exit(dm_zero_exit)
+
+MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
+MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros");
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/dm.c
+++ b/kernel/drivers/md/dm.c
--- a/kernel/drivers/md/dm.h
+++ b/kernel/drivers/md/dm.h
@@ -0,0 +1,130 @@
+/*
+ * Internal header file for device mapper
+ *
+ * Copyright (C) 2001, 2002 Sistina Software
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_INTERNAL_H
+#define DM_INTERNAL_H
+
+#include <linux/fs.h>
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+
+/*
+ * Suspend feature flags
+ */
+#define DM_SUSPEND_LOCKFS_FLAG		(1 << 0)
+#define DM_SUSPEND_NOFLUSH_FLAG		(1 << 1)
+
+/*
+ * Type of table and mapped_device's mempool
+ */
+#define DM_TYPE_NONE		0
+#define DM_TYPE_BIO_BASED	1
+#define DM_TYPE_REQUEST_BASED	2
+
+/*
+ * List of devices that a metadevice uses and should open/close.
+ */
+struct dm_dev_internal {
+	struct list_head list;
+	atomic_t count;
+	struct dm_dev dm_dev;
+};
+
+struct dm_table;
+struct dm_md_mempools;
+
+/*-----------------------------------------------------------------
+ * Internal table functions.
+ *---------------------------------------------------------------*/
+void dm_table_destroy(struct dm_table *t);
+void dm_table_event_callback(struct dm_table *t,
+			     void (*fn)(void *), void *context);
+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
+int dm_calculate_queue_limits(struct dm_table *table,
+			      struct queue_limits *limits);
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+			       struct queue_limits *limits);
+struct list_head *dm_table_get_devices(struct dm_table *t);
+void dm_table_presuspend_targets(struct dm_table *t);
+void dm_table_postsuspend_targets(struct dm_table *t);
+int dm_table_resume_targets(struct dm_table *t);
+int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+int dm_table_any_busy_target(struct dm_table *t);
+int dm_table_set_type(struct dm_table *t);
+unsigned dm_table_get_type(struct dm_table *t);
+bool dm_table_request_based(struct dm_table *t);
+int dm_table_alloc_md_mempools(struct dm_table *t);
+void dm_table_free_md_mempools(struct dm_table *t);
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+
+/*
+ * To check the return value from dm_table_find_target().
+ */
+#define dm_target_is_valid(t) ((t)->table)
+
+/*
+ * To check whether the target type is request-based or not (bio-based).
+ */
+#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
+
+/*-----------------------------------------------------------------
+ * A registry of target types.
+ *---------------------------------------------------------------*/
+int dm_target_init(void);
+void dm_target_exit(void);
+struct target_type *dm_get_target_type(const char *name);
+void dm_put_target_type(struct target_type *tt);
+int dm_target_iterate(void (*iter_func)(struct target_type *tt,
+					void *param), void *param);
+
+int dm_split_args(int *argc, char ***argvp, char *input);
+
+/*
+ * The device-mapper can be driven through one of two interfaces;
+ * ioctl or filesystem, depending which patch you have applied.
+ */
+int dm_interface_init(void);
+void dm_interface_exit(void);
+
+/*
+ * sysfs interface
+ */
+int dm_sysfs_init(struct mapped_device *md);
+void dm_sysfs_exit(struct mapped_device *md);
+struct kobject *dm_kobject(struct mapped_device *md);
+struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
+
+/*
+ * Targets for linear and striped mappings
+ */
+int dm_linear_init(void);
+void dm_linear_exit(void);
+
+int dm_stripe_init(void);
+void dm_stripe_exit(void);
+
+int dm_open_count(struct mapped_device *md);
+int dm_lock_for_deletion(struct mapped_device *md);
+
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+		       unsigned cookie);
+
+int dm_kcopyd_init(void);
+void dm_kcopyd_exit(void);
+
+/*
+ * Mempool operations
+ */
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
+void dm_free_md_mempools(struct dm_md_mempools *pools);
+
+#endif
--- a/kernel/drivers/md/faulty.c
+++ b/kernel/drivers/md/faulty.c
@@ -0,0 +1,365 @@
+/*
+ * faulty.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 2004 Neil Brown
+ *
+ * fautly-device-simulator personality for md
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+/*
+ * The "faulty" personality causes some requests to fail.
+ *
+ * Possible failure modes are:
+ *   reads fail "randomly" but succeed on retry
+ *   writes fail "randomly" but succeed on retry
+ *   reads for some address fail and then persist until a write
+ *   reads for some address fail and then persist irrespective of write
+ *   writes for some address fail and persist
+ *   all writes fail
+ *
+ * Different modes can be active at a time, but only
+ * one can be set at array creation.  Others can be added later.
+ * A mode can be one-shot or recurrent with the recurrance being
+ * once in every N requests.
+ * The bottom 5 bits of the "layout" indicate the mode.  The
+ * remainder indicate a period, or 0 for one-shot.
+ *
+ * There is an implementation limit on the number of concurrently
+ * persisting-faulty blocks. When a new fault is requested that would
+ * exceed the limit, it is ignored.
+ * All current faults can be clear using a layout of "0".
+ *
+ * Requests are always sent to the device.  If they are to fail,
+ * we clone the bio and insert a new b_end_io into the chain.
+ */
+
+#define	WriteTransient	0
+#define	ReadTransient	1
+#define	WritePersistent	2
+#define	ReadPersistent	3
+#define	WriteAll	4 /* doesn't go to device */
+#define	ReadFixable	5
+#define	Modes	6
+
+#define	ClearErrors	31
+#define	ClearFaults	30
+
+#define AllPersist	100 /* internal use only */
+#define	NoPersist	101
+
+#define	ModeMask	0x1f
+#define	ModeShift	5
+
+#define MaxFault	50
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include <linux/seq_file.h>
+
+
+static void faulty_fail(struct bio *bio, int error)
+{
+	struct bio *b = bio->bi_private;
+
+	b->bi_size = bio->bi_size;
+	b->bi_sector = bio->bi_sector;
+
+	bio_put(bio);
+
+	bio_io_error(b);
+}
+
+typedef struct faulty_conf {
+	int period[Modes];
+	atomic_t counters[Modes];
+	sector_t faults[MaxFault];
+	int	modes[MaxFault];
+	int nfaults;
+	mdk_rdev_t *rdev;
+} conf_t;
+
+static int check_mode(conf_t *conf, int mode)
+{
+	if (conf->period[mode] == 0 &&
+	    atomic_read(&conf->counters[mode]) <= 0)
+		return 0; /* no failure, no decrement */
+
+
+	if (atomic_dec_and_test(&conf->counters[mode])) {
+		if (conf->period[mode])
+			atomic_set(&conf->counters[mode], conf->period[mode]);
+		return 1;
+	}
+	return 0;
+}
+
+static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
+{
+	/* If we find a ReadFixable sector, we fix it ... */
+	int i;
+	for (i=0; i<conf->nfaults; i++)
+		if (conf->faults[i] >= start &&
+		    conf->faults[i] < end) {
+			/* found it ... */
+			switch (conf->modes[i] * 2 + dir) {
+			case WritePersistent*2+WRITE: return 1;
+			case ReadPersistent*2+READ: return 1;
+			case ReadFixable*2+READ: return 1;
+			case ReadFixable*2+WRITE:
+				conf->modes[i] = NoPersist;
+				return 0;
+			case AllPersist*2+READ:
+			case AllPersist*2+WRITE: return 1;
+			default:
+				return 0;
+			}
+		}
+	return 0;
+}
+
+static void add_sector(conf_t *conf, sector_t start, int mode)
+{
+	int i;
+	int n = conf->nfaults;
+	for (i=0; i<conf->nfaults; i++)
+		if (conf->faults[i] == start) {
+			switch(mode) {
+			case NoPersist: conf->modes[i] = mode; return;
+			case WritePersistent:
+				if (conf->modes[i] == ReadPersistent ||
+				    conf->modes[i] == ReadFixable)
+					conf->modes[i] = AllPersist;
+				else
+					conf->modes[i] = WritePersistent;
+				return;
+			case ReadPersistent:
+				if (conf->modes[i] == WritePersistent)
+					conf->modes[i] = AllPersist;
+				else
+					conf->modes[i] = ReadPersistent;
+				return;
+			case ReadFixable:
+				if (conf->modes[i] == WritePersistent ||
+				    conf->modes[i] == ReadPersistent)
+					conf->modes[i] = AllPersist;
+				else
+					conf->modes[i] = ReadFixable;
+				return;
+			}
+		} else if (conf->modes[i] == NoPersist)
+			n = i;
+
+	if (n >= MaxFault)
+		return;
+	conf->faults[n] = start;
+	conf->modes[n] = mode;
+	if (conf->nfaults == n)
+		conf->nfaults = n+1;
+}
+
+static int make_request(struct request_queue *q, struct bio *bio)
+{
+	mddev_t *mddev = q->queuedata;
+	conf_t *conf = (conf_t*)mddev->private;
+	int failit = 0;
+
+	if (bio_data_dir(bio) == WRITE) {
+		/* write request */
+		if (atomic_read(&conf->counters[WriteAll])) {
+			/* special case - don't decrement, don't generic_make_request,
+			 * just fail immediately
+			 */
+			bio_endio(bio, -EIO);
+			return 0;
+		}
+
+		if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
+				 WRITE))
+			failit = 1;
+		if (check_mode(conf, WritePersistent)) {
+			add_sector(conf, bio->bi_sector, WritePersistent);
+			failit = 1;
+		}
+		if (check_mode(conf, WriteTransient))
+			failit = 1;
+	} else {
+		/* read request */
+		if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
+				 READ))
+			failit = 1;
+		if (check_mode(conf, ReadTransient))
+			failit = 1;
+		if (check_mode(conf, ReadPersistent)) {
+			add_sector(conf, bio->bi_sector, ReadPersistent);
+			failit = 1;
+		}
+		if (check_mode(conf, ReadFixable)) {
+			add_sector(conf, bio->bi_sector, ReadFixable);
+			failit = 1;
+		}
+	}
+	if (failit) {
+		struct bio *b = bio_clone(bio, GFP_NOIO);
+		b->bi_bdev = conf->rdev->bdev;
+		b->bi_private = bio;
+		b->bi_end_io = faulty_fail;
+		generic_make_request(b);
+		return 0;
+	} else {
+		bio->bi_bdev = conf->rdev->bdev;
+		return 1;
+	}
+}
+
+static void status(struct seq_file *seq, mddev_t *mddev)
+{
+	conf_t *conf = (conf_t*)mddev->private;
+	int n;
+
+	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
+		seq_printf(seq, " WriteTransient=%d(%d)",
+			   n, conf->period[WriteTransient]);
+
+	if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
+		seq_printf(seq, " ReadTransient=%d(%d)",
+			   n, conf->period[ReadTransient]);
+
+	if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
+		seq_printf(seq, " WritePersistent=%d(%d)",
+			   n, conf->period[WritePersistent]);
+
+	if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
+		seq_printf(seq, " ReadPersistent=%d(%d)",
+			   n, conf->period[ReadPersistent]);
+
+
+	if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
+		seq_printf(seq, " ReadFixable=%d(%d)",
+			   n, conf->period[ReadFixable]);
+
+	if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
+		seq_printf(seq, " WriteAll");
+
+	seq_printf(seq, " nfaults=%d", conf->nfaults);
+}
+
+
+static int reshape(mddev_t *mddev)
+{
+	int mode = mddev->new_layout & ModeMask;
+	int count = mddev->new_layout >> ModeShift;
+	conf_t *conf = mddev->private;
+
+	if (mddev->new_layout < 0)
+		return 0;
+
+	/* new layout */
+	if (mode == ClearFaults)
+		conf->nfaults = 0;
+	else if (mode == ClearErrors) {
+		int i;
+		for (i=0 ; i < Modes ; i++) {
+			conf->period[i] = 0;
+			atomic_set(&conf->counters[i], 0);
+		}
+	} else if (mode < Modes) {
+		conf->period[mode] = count;
+		if (!count) count++;
+		atomic_set(&conf->counters[mode], count);
+	} else
+		return -EINVAL;
+	mddev->new_layout = -1;
+	mddev->layout = -1; /* makes sure further changes come through */
+	return 0;
+}
+
+static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	WARN_ONCE(raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	if (sectors == 0)
+		return mddev->dev_sectors;
+
+	return sectors;
+}
+
+static int run(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+	int i;
+	conf_t *conf;
+
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+
+	conf = kmalloc(sizeof(*conf), GFP_KERNEL);
+	if (!conf)
+		return -ENOMEM;
+
+	for (i=0; i<Modes; i++) {
+		atomic_set(&conf->counters[i], 0);
+		conf->period[i] = 0;
+	}
+	conf->nfaults = 0;
+
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		conf->rdev = rdev;
+
+	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
+	mddev->private = conf;
+
+	reshape(mddev);
+
+	return 0;
+}
+
+static int stop(mddev_t *mddev)
+{
+	conf_t *conf = (conf_t *)mddev->private;
+
+	kfree(conf);
+	mddev->private = NULL;
+	return 0;
+}
+
+static struct mdk_personality faulty_personality =
+{
+	.name		= "faulty",
+	.level		= LEVEL_FAULTY,
+	.owner		= THIS_MODULE,
+	.make_request	= make_request,
+	.run		= run,
+	.stop		= stop,
+	.status		= status,
+	.check_reshape	= reshape,
+	.size		= faulty_size,
+};
+
+static int __init raid_init(void)
+{
+	return register_md_personality(&faulty_personality);
+}
+
+static void raid_exit(void)
+{
+	unregister_md_personality(&faulty_personality);
+}
+
+module_init(raid_init);
+module_exit(raid_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("md-personality-10"); /* faulty */
+MODULE_ALIAS("md-faulty");
+MODULE_ALIAS("md-level--5");
--- a/kernel/drivers/md/linear.c
+++ b/kernel/drivers/md/linear.c
@@ -0,0 +1,390 @@
+/*
+   linear.c : Multiple Devices driver for Linux
+	      Copyright (C) 1994-96 Marc ZYNGIER
+	      <zyngier@ufr-info-p7.ibp.fr> or
+	      <maz@gloups.fdn.fr>
+
+   Linear mode management functions.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+   
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+*/
+
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "linear.h"
+
+/*
+ * find which device holds a particular offset 
+ */
+static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
+{
+	int lo, mid, hi;
+	linear_conf_t *conf;
+
+	lo = 0;
+	hi = mddev->raid_disks - 1;
+	conf = rcu_dereference(mddev->private);
+
+	/*
+	 * Binary Search
+	 */
+
+	while (hi > lo) {
+
+		mid = (hi + lo) / 2;
+		if (sector < conf->disks[mid].end_sector)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	return conf->disks + lo;
+}
+
+/**
+ *	linear_mergeable_bvec -- tell bio layer if two requests can be merged
+ *	@q: request queue
+ *	@bvm: properties of new bio
+ *	@biovec: the request that could be merged to it.
+ *
+ *	Return amount of bytes we can take at this offset
+ */
+static int linear_mergeable_bvec(struct request_queue *q,
+				 struct bvec_merge_data *bvm,
+				 struct bio_vec *biovec)
+{
+	mddev_t *mddev = q->queuedata;
+	dev_info_t *dev0;
+	unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
+	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+
+	rcu_read_lock();
+	dev0 = which_dev(mddev, sector);
+	maxsectors = dev0->end_sector - sector;
+	rcu_read_unlock();
+
+	if (maxsectors < bio_sectors)
+		maxsectors = 0;
+	else
+		maxsectors -= bio_sectors;
+
+	if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
+		return biovec->bv_len;
+	/* The bytes available at this offset could be really big,
+	 * so we cap at 2^31 to avoid overflow */
+	if (maxsectors > (1 << (31-9)))
+		return 1<<31;
+	return maxsectors << 9;
+}
+
+static void linear_unplug(struct request_queue *q)
+{
+	mddev_t *mddev = q->queuedata;
+	linear_conf_t *conf;
+	int i;
+
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
+
+	for (i=0; i < mddev->raid_disks; i++) {
+		struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
+		blk_unplug(r_queue);
+	}
+	rcu_read_unlock();
+}
+
+static int linear_congested(void *data, int bits)
+{
+	mddev_t *mddev = data;
+	linear_conf_t *conf;
+	int i, ret = 0;
+
+	if (mddev_congested(mddev, bits))
+		return 1;
+
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
+
+	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+		struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
+		ret |= bdi_congested(&q->backing_dev_info, bits);
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	linear_conf_t *conf;
+	sector_t array_sectors;
+
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+	array_sectors = conf->array_sectors;
+	rcu_read_unlock();
+
+	return array_sectors;
+}
+
+static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
+{
+	linear_conf_t *conf;
+	mdk_rdev_t *rdev;
+	int i, cnt;
+
+	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
+			GFP_KERNEL);
+	if (!conf)
+		return NULL;
+
+	cnt = 0;
+	conf->array_sectors = 0;
+
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		int j = rdev->raid_disk;
+		dev_info_t *disk = conf->disks + j;
+		sector_t sectors;
+
+		if (j < 0 || j >= raid_disks || disk->rdev) {
+			printk("linear: disk numbering problem. Aborting!\n");
+			goto out;
+		}
+
+		disk->rdev = rdev;
+		if (mddev->chunk_sectors) {
+			sectors = rdev->sectors;
+			sector_div(sectors, mddev->chunk_sectors);
+			rdev->sectors = sectors * mddev->chunk_sectors;
+		}
+
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, so limit max_phys_segments to 1 lying within
+		 * a single page.
+		 */
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_phys_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
+
+		conf->array_sectors += rdev->sectors;
+		cnt++;
+
+	}
+	if (cnt != raid_disks) {
+		printk("linear: not enough drives present. Aborting!\n");
+		goto out;
+	}
+
+	/*
+	 * Here we calculate the device offsets.
+	 */
+	conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
+
+	for (i = 1; i < raid_disks; i++)
+		conf->disks[i].end_sector =
+			conf->disks[i-1].end_sector +
+			conf->disks[i].rdev->sectors;
+
+	return conf;
+
+out:
+	kfree(conf);
+	return NULL;
+}
+
+static int linear_run (mddev_t *mddev)
+{
+	linear_conf_t *conf;
+
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
+	conf = linear_conf(mddev, mddev->raid_disks);
+
+	if (!conf)
+		return 1;
+	mddev->private = conf;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+
+	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
+	mddev->queue->unplug_fn = linear_unplug;
+	mddev->queue->backing_dev_info.congested_fn = linear_congested;
+	mddev->queue->backing_dev_info.congested_data = mddev;
+	md_integrity_register(mddev);
+	return 0;
+}
+
+static void free_conf(struct rcu_head *head)
+{
+	linear_conf_t *conf = container_of(head, linear_conf_t, rcu);
+	kfree(conf);
+}
+
+static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	/* Adding a drive to a linear array allows the array to grow.
+	 * It is permitted if the new drive has a matching superblock
+	 * already on it, with raid_disk equal to raid_disks.
+	 * It is achieved by creating a new linear_private_data structure
+	 * and swapping it in in-place of the current one.
+	 * The current one is never freed until the array is stopped.
+	 * This avoids races.
+	 */
+	linear_conf_t *newconf, *oldconf;
+
+	if (rdev->saved_raid_disk != mddev->raid_disks)
+		return -EINVAL;
+
+	rdev->raid_disk = rdev->saved_raid_disk;
+
+	newconf = linear_conf(mddev,mddev->raid_disks+1);
+
+	if (!newconf)
+		return -ENOMEM;
+
+	oldconf = rcu_dereference(mddev->private);
+	mddev->raid_disks++;
+	rcu_assign_pointer(mddev->private, newconf);
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+	set_capacity(mddev->gendisk, mddev->array_sectors);
+	revalidate_disk(mddev->gendisk);
+	call_rcu(&oldconf->rcu, free_conf);
+	return 0;
+}
+
+static int linear_stop (mddev_t *mddev)
+{
+	linear_conf_t *conf = mddev->private;
+
+	/*
+	 * We do not require rcu protection here since
+	 * we hold reconfig_mutex for both linear_add and
+	 * linear_stop, so they cannot race.
+	 * We should make sure any old 'conf's are properly
+	 * freed though.
+	 */
+	rcu_barrier();
+	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+	kfree(conf);
+
+	return 0;
+}
+
+static int linear_make_request (struct request_queue *q, struct bio *bio)
+{
+	const int rw = bio_data_dir(bio);
+	mddev_t *mddev = q->queuedata;
+	dev_info_t *tmp_dev;
+	sector_t start_sector;
+	int cpu;
+
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
+	}
+
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
+
+	rcu_read_lock();
+	tmp_dev = which_dev(mddev, bio->bi_sector);
+	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+
+
+	if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
+		     || (bio->bi_sector < start_sector))) {
+		char b[BDEVNAME_SIZE];
+
+		printk("linear_make_request: Sector %llu out of bounds on "
+			"dev %s: %llu sectors, offset %llu\n",
+			(unsigned long long)bio->bi_sector,
+			bdevname(tmp_dev->rdev->bdev, b),
+			(unsigned long long)tmp_dev->rdev->sectors,
+			(unsigned long long)start_sector);
+		rcu_read_unlock();
+		bio_io_error(bio);
+		return 0;
+	}
+	if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
+		     tmp_dev->end_sector)) {
+		/* This bio crosses a device boundary, so we have to
+		 * split it.
+		 */
+		struct bio_pair *bp;
+		sector_t end_sector = tmp_dev->end_sector;
+
+		rcu_read_unlock();
+
+		bp = bio_split(bio, end_sector - bio->bi_sector);
+
+		if (linear_make_request(q, &bp->bio1))
+			generic_make_request(&bp->bio1);
+		if (linear_make_request(q, &bp->bio2))
+			generic_make_request(&bp->bio2);
+		bio_pair_release(bp);
+		return 0;
+	}
+		    
+	bio->bi_bdev = tmp_dev->rdev->bdev;
+	bio->bi_sector = bio->bi_sector - start_sector
+		+ tmp_dev->rdev->data_offset;
+	rcu_read_unlock();
+
+	return 1;
+}
+
+static void linear_status (struct seq_file *seq, mddev_t *mddev)
+{
+
+	seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
+}
+
+
+static struct mdk_personality linear_personality =
+{
+	.name		= "linear",
+	.level		= LEVEL_LINEAR,
+	.owner		= THIS_MODULE,
+	.make_request	= linear_make_request,
+	.run		= linear_run,
+	.stop		= linear_stop,
+	.status		= linear_status,
+	.hot_add_disk	= linear_add,
+	.size		= linear_size,
+};
+
+static int __init linear_init (void)
+{
+	return register_md_personality (&linear_personality);
+}
+
+static void linear_exit (void)
+{
+	unregister_md_personality (&linear_personality);
+}
+
+
+module_init(linear_init);
+module_exit(linear_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
+MODULE_ALIAS("md-level--1");
--- a/kernel/drivers/md/linear.h
+++ b/kernel/drivers/md/linear.h
@@ -0,0 +1,21 @@
+#ifndef _LINEAR_H
+#define _LINEAR_H
+
+struct dev_info {
+	mdk_rdev_t	*rdev;
+	sector_t	end_sector;
+};
+
+typedef struct dev_info dev_info_t;
+
+struct linear_private_data
+{
+	sector_t		array_sectors;
+	dev_info_t		disks[0];
+	struct rcu_head		rcu;
+};
+
+
+typedef struct linear_private_data linear_conf_t;
+
+#endif
--- a/kernel/drivers/md/md.c
+++ b/kernel/drivers/md/md.c
--- a/kernel/drivers/md/md.h
+++ b/kernel/drivers/md/md.h
@@ -0,0 +1,449 @@
+/*
+   md_k.h : kernel internal structure of the Linux MD driver
+          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+	  
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+   
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+*/
+
+#ifndef _MD_MD_H
+#define _MD_MD_H
+
+#include <linux/blkdev.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+
+#define MaxSector (~(sector_t)0)
+
+typedef struct mddev_s mddev_t;
+typedef struct mdk_rdev_s mdk_rdev_t;
+
+/*
+ * MD's 'extended' device
+ */
+struct mdk_rdev_s
+{
+	struct list_head same_set;	/* RAID devices within the same set */
+
+	sector_t sectors;		/* Device size (in 512bytes sectors) */
+	mddev_t *mddev;			/* RAID array if running */
+	int last_events;		/* IO event timestamp */
+
+	struct block_device *bdev;	/* block device handle */
+
+	struct page	*sb_page;
+	int		sb_loaded;
+	__u64		sb_events;
+	sector_t	data_offset;	/* start of data in array */
+	sector_t 	sb_start;	/* offset of the super block (in 512byte sectors) */
+	int		sb_size;	/* bytes in the superblock */
+	int		preferred_minor;	/* autorun support */
+
+	struct kobject	kobj;
+
+	/* A device can be in one of three states based on two flags:
+	 * Not working:   faulty==1 in_sync==0
+	 * Fully working: faulty==0 in_sync==1
+	 * Working, but not
+	 * in sync with array
+	 *                faulty==0 in_sync==0
+	 *
+	 * It can never have faulty==1, in_sync==1
+	 * This reduces the burden of testing multiple flags in many cases
+	 */
+
+	unsigned long	flags;
+#define	Faulty		1		/* device is known to have a fault */
+#define	In_sync		2		/* device is in_sync with rest of array */
+#define	WriteMostly	4		/* Avoid reading if at all possible */
+#define	BarriersNotsupp	5		/* BIO_RW_BARRIER is not supported */
+#define	AllReserved	6		/* If whole device is reserved for
+					 * one array */
+#define	AutoDetected	7		/* added by auto-detect */
+#define Blocked		8		/* An error occured on an externally
+					 * managed array, don't allow writes
+					 * until it is cleared */
+#define StateChanged	9		/* Faulty or Blocked has changed during
+					 * interrupt, so it needs to be
+					 * notified by the thread */
+	wait_queue_head_t blocked_wait;
+
+	int desc_nr;			/* descriptor index in the superblock */
+	int raid_disk;			/* role of device in array */
+	int saved_raid_disk;		/* role that device used to have in the
+					 * array and could again if we did a partial
+					 * resync from the bitmap
+					 */
+	sector_t	recovery_offset;/* If this device has been partially
+					 * recovered, this is where we were
+					 * up to.
+					 */
+
+	atomic_t	nr_pending;	/* number of pending requests.
+					 * only maintained for arrays that
+					 * support hot removal
+					 */
+	atomic_t	read_errors;	/* number of consecutive read errors that
+					 * we have tried to ignore.
+					 */
+	atomic_t	corrected_errors; /* number of corrected read errors,
+					   * for reporting to userspace and storing
+					   * in superblock.
+					   */
+	struct work_struct del_work;	/* used for delayed sysfs removal */
+
+	struct sysfs_dirent *sysfs_state; /* handle for 'state'
+					   * sysfs entry */
+};
+
+struct mddev_s
+{
+	void				*private;
+	struct mdk_personality		*pers;
+	dev_t				unit;
+	int				md_minor;
+	struct list_head 		disks;
+	unsigned long			flags;
+#define MD_CHANGE_DEVS	0	/* Some device status has changed */
+#define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
+#define MD_CHANGE_PENDING 2	/* superblock update in progress */
+
+	int				suspended;
+	atomic_t			active_io;
+	int				ro;
+
+	struct gendisk			*gendisk;
+
+	struct kobject			kobj;
+	int				hold_active;
+#define	UNTIL_IOCTL	1
+#define	UNTIL_STOP	2
+
+	/* Superblock information */
+	int				major_version,
+					minor_version,
+					patch_version;
+	int				persistent;
+	int 				external;	/* metadata is
+							 * managed externally */
+	char				metadata_type[17]; /* externally set*/
+	int				chunk_sectors;
+	time_t				ctime, utime;
+	int				level, layout;
+	char				clevel[16];
+	int				raid_disks;
+	int				max_disks;
+	sector_t			dev_sectors; 	/* used size of
+							 * component devices */
+	sector_t			array_sectors; /* exported array size */
+	int				external_size; /* size managed
+							* externally */
+	__u64				events;
+
+	char				uuid[16];
+
+	/* If the array is being reshaped, we need to record the
+	 * new shape and an indication of where we are up to.
+	 * This is written to the superblock.
+	 * If reshape_position is MaxSector, then no reshape is happening (yet).
+	 */
+	sector_t			reshape_position;
+	int				delta_disks, new_level, new_layout;
+	int				new_chunk_sectors;
+
+	struct mdk_thread_s		*thread;	/* management thread */
+	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
+	sector_t			curr_resync;	/* last block scheduled */
+	/* As resync requests can complete out of order, we cannot easily track
+	 * how much resync has been completed.  So we occasionally pause until
+	 * everything completes, then set curr_resync_completed to curr_resync.
+	 * As such it may be well behind the real resync mark, but it is a value
+	 * we are certain of.
+	 */
+	sector_t			curr_resync_completed;
+	unsigned long			resync_mark;	/* a recent timestamp */
+	sector_t			resync_mark_cnt;/* blocks written at resync_mark */
+	sector_t			curr_mark_cnt; /* blocks scheduled now */
+
+	sector_t			resync_max_sectors; /* may be set by personality */
+
+	sector_t			resync_mismatches; /* count of sectors where
+							    * parity/replica mismatch found
+							    */
+
+	/* allow user-space to request suspension of IO to regions of the array */
+	sector_t			suspend_lo;
+	sector_t			suspend_hi;
+	/* if zero, use the system-wide default */
+	int				sync_speed_min;
+	int				sync_speed_max;
+
+	/* resync even though the same disks are shared among md-devices */
+	int				parallel_resync;
+
+	int				ok_start_degraded;
+	/* recovery/resync flags 
+	 * NEEDED:   we might need to start a resync/recover
+	 * RUNNING:  a thread is running, or about to be started
+	 * SYNC:     actually doing a resync, not a recovery
+	 * RECOVER:  doing recovery, or need to try it.
+	 * INTR:     resync needs to be aborted for some reason
+	 * DONE:     thread is done and is waiting to be reaped
+	 * REQUEST:  user-space has requested a sync (used with SYNC)
+	 * CHECK:    user-space request for check-only, no repair
+	 * RESHAPE:  A reshape is happening
+	 *
+	 * If neither SYNC or RESHAPE are set, then it is a recovery.
+	 */
+#define	MD_RECOVERY_RUNNING	0
+#define	MD_RECOVERY_SYNC	1
+#define	MD_RECOVERY_RECOVER	2
+#define	MD_RECOVERY_INTR	3
+#define	MD_RECOVERY_DONE	4
+#define	MD_RECOVERY_NEEDED	5
+#define	MD_RECOVERY_REQUESTED	6
+#define	MD_RECOVERY_CHECK	7
+#define MD_RECOVERY_RESHAPE	8
+#define	MD_RECOVERY_FROZEN	9
+
+	unsigned long			recovery;
+	int				recovery_disabled; /* if we detect that recovery
+							    * will always fail, set this
+							    * so we don't loop trying */
+
+	int				in_sync;	/* know to not need resync */
+	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+	 * that we are never stopping an array while it is open.
+	 * 'reconfig_mutex' protects all other reconfiguration.
+	 * These locks are separate due to conflicting interactions
+	 * with bdev->bd_mutex.
+	 * Lock ordering is:
+	 *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
+	 */
+	struct mutex			open_mutex;
+	struct mutex			reconfig_mutex;
+	atomic_t			active;		/* general refcount */
+	atomic_t			openers;	/* number of active opens */
+
+	int				changed;	/* true if we might need to reread partition info */
+	int				degraded;	/* whether md should consider
+							 * adding a spare
+							 */
+	int				barriers_work;	/* initialised to true, cleared as soon
+							 * as a barrier request to slave
+							 * fails.  Only supported
+							 */
+	struct bio			*biolist; 	/* bios that need to be retried
+							 * because BIO_RW_BARRIER is not supported
+							 */
+
+	atomic_t			recovery_active; /* blocks scheduled, but not written */
+	wait_queue_head_t		recovery_wait;
+	sector_t			recovery_cp;
+	sector_t			resync_min;	/* user requested sync
+							 * starts here */
+	sector_t			resync_max;	/* resync should pause
+							 * when it gets here */
+
+	struct sysfs_dirent		*sysfs_state;	/* handle for 'array_state'
+							 * file in sysfs.
+							 */
+	struct sysfs_dirent		*sysfs_action;  /* handle for 'sync_action' */
+
+	struct work_struct del_work;	/* used for delayed sysfs removal */
+
+	spinlock_t			write_lock;
+	wait_queue_head_t		sb_wait;	/* for waiting on superblock updates */
+	atomic_t			pending_writes;	/* number of active superblock writes */
+
+	unsigned int			safemode;	/* if set, update "clean" superblock
+							 * when no writes pending.
+							 */ 
+	unsigned int			safemode_delay;
+	struct timer_list		safemode_timer;
+	atomic_t			writes_pending; 
+	struct request_queue		*queue;	/* for plugging ... */
+
+	atomic_t                        write_behind; /* outstanding async IO */
+	unsigned int                    max_write_behind; /* 0 = sync */
+
+	struct bitmap                   *bitmap; /* the bitmap for the device */
+	struct file			*bitmap_file; /* the bitmap file */
+	long				bitmap_offset; /* offset from superblock of
+							* start of bitmap. May be
+							* negative, but not '0'
+							*/
+	long				default_bitmap_offset; /* this is the offset to use when
+								* hot-adding a bitmap.  It should
+								* eventually be settable by sysfs.
+								*/
+	struct mutex			bitmap_mutex;
+
+	struct list_head		all_mddevs;
+};
+
+
+static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+	int faulty = test_bit(Faulty, &rdev->flags);
+	if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+}
+
+static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
+{
+        atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
+}
+
+struct mdk_personality
+{
+	char *name;
+	int level;
+	struct list_head list;
+	struct module *owner;
+	int (*make_request)(struct request_queue *q, struct bio *bio);
+	int (*run)(mddev_t *mddev);
+	int (*stop)(mddev_t *mddev);
+	void (*status)(struct seq_file *seq, mddev_t *mddev);
+	/* error_handler must set ->faulty and clear ->in_sync
+	 * if appropriate, and should abort recovery if needed 
+	 */
+	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
+	int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
+	int (*hot_remove_disk) (mddev_t *mddev, int number);
+	int (*spare_active) (mddev_t *mddev);
+	sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
+	int (*resize) (mddev_t *mddev, sector_t sectors);
+	sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
+	int (*check_reshape) (mddev_t *mddev);
+	int (*start_reshape) (mddev_t *mddev);
+	void (*finish_reshape) (mddev_t *mddev);
+	/* quiesce moves between quiescence states
+	 * 0 - fully active
+	 * 1 - no new requests allowed
+	 * others - reserved
+	 */
+	void (*quiesce) (mddev_t *mddev, int state);
+	/* takeover is used to transition an array from one
+	 * personality to another.  The new personality must be able
+	 * to handle the data in the current layout.
+	 * e.g. 2drive raid1 -> 2drive raid5
+	 *      ndrive raid5 -> degraded n+1drive raid6 with special layout
+	 * If the takeover succeeds, a new 'private' structure is returned.
+	 * This needs to be installed and then ->run used to activate the
+	 * array.
+	 */
+	void *(*takeover) (mddev_t *mddev);
+};
+
+
+struct md_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(mddev_t *, char *);
+	ssize_t (*store)(mddev_t *, const char *, size_t);
+};
+
+
+static inline char * mdname (mddev_t * mddev)
+{
+	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
+}
+
+/*
+ * iterates through some rdev ringlist. It's safe to remove the
+ * current 'rdev'. Dont touch 'tmp' though.
+ */
+#define rdev_for_each_list(rdev, tmp, head)				\
+	list_for_each_entry_safe(rdev, tmp, head, same_set)
+
+/*
+ * iterates through the 'same array disks' ringlist
+ */
+#define rdev_for_each(rdev, tmp, mddev)				\
+	list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
+
+#define rdev_for_each_rcu(rdev, mddev)				\
+	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
+
+typedef struct mdk_thread_s {
+	void			(*run) (mddev_t *mddev);
+	mddev_t			*mddev;
+	wait_queue_head_t	wqueue;
+	unsigned long           flags;
+	struct task_struct	*tsk;
+	unsigned long		timeout;
+} mdk_thread_t;
+
+#define THREAD_WAKEUP  0
+
+#define __wait_event_lock_irq(wq, condition, lock, cmd) 		\
+do {									\
+	wait_queue_t __wait;						\
+	init_waitqueue_entry(&__wait, current);				\
+									\
+	add_wait_queue(&wq, &__wait);					\
+	for (;;) {							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (condition)						\
+			break;						\
+		spin_unlock_irq(&lock);					\
+		cmd;							\
+		schedule();						\
+		spin_lock_irq(&lock);					\
+	}								\
+	current->state = TASK_RUNNING;					\
+	remove_wait_queue(&wq, &__wait);				\
+} while (0)
+
+#define wait_event_lock_irq(wq, condition, lock, cmd) 			\
+do {									\
+	if (condition)	 						\
+		break;							\
+	__wait_event_lock_irq(wq, condition, lock, cmd);		\
+} while (0)
+
+static inline void safe_put_page(struct page *p)
+{
+	if (p) put_page(p);
+}
+
+extern int register_md_personality(struct mdk_personality *p);
+extern int unregister_md_personality(struct mdk_personality *p);
+extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
+				mddev_t *mddev, const char *name);
+extern void md_unregister_thread(mdk_thread_t *thread);
+extern void md_wakeup_thread(mdk_thread_t *thread);
+extern void md_check_recovery(mddev_t *mddev);
+extern void md_write_start(mddev_t *mddev, struct bio *bi);
+extern void md_write_end(mddev_t *mddev);
+extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
+extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
+
+extern int mddev_congested(mddev_t *mddev, int bits);
+extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+			   sector_t sector, int size, struct page *page);
+extern void md_super_wait(mddev_t *mddev);
+extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+			struct page *page, int rw);
+extern void md_do_sync(mddev_t *mddev);
+extern void md_new_event(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
+extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
+extern int md_check_no_bitmap(mddev_t *mddev);
+extern int md_integrity_register(mddev_t *mddev);
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+
+#endif /* _MD_MD_H */
--- a/kernel/drivers/md/mktables.c
+++ b/kernel/drivers/md/mktables.c
@@ -0,0 +1,132 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * mktables.c
+ *
+ * Make RAID-6 tables.  This is a host user space program to be run at
+ * compile time.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <time.h>
+
+static uint8_t gfmul(uint8_t a, uint8_t b)
+{
+	uint8_t v = 0;
+
+	while (b) {
+		if (b & 1)
+			v ^= a;
+		a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
+		b >>= 1;
+	}
+
+	return v;
+}
+
+static uint8_t gfpow(uint8_t a, int b)
+{
+	uint8_t v = 1;
+
+	b %= 255;
+	if (b < 0)
+		b += 255;
+
+	while (b) {
+		if (b & 1)
+			v = gfmul(v, a);
+		a = gfmul(a, a);
+		b >>= 1;
+	}
+
+	return v;
+}
+
+int main(int argc, char *argv[])
+{
+	int i, j, k;
+	uint8_t v;
+	uint8_t exptbl[256], invtbl[256];
+
+	printf("#include <linux/raid/pq.h>\n");
+
+	/* Compute multiplication table */
+	printf("\nconst u8  __attribute__((aligned(256)))\n"
+		"raid6_gfmul[256][256] =\n"
+		"{\n");
+	for (i = 0; i < 256; i++) {
+		printf("\t{\n");
+		for (j = 0; j < 256; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, j + k),
+				       (k == 7) ? '\n' : ' ');
+		}
+		printf("\t},\n");
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfmul);\n");
+	printf("#endif\n");
+
+	/* Compute power-of-2 table (exponent) */
+	v = 1;
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gfexp[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			exptbl[i + j] = v;
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+			v = gfmul(v, 2);
+			if (v == 1)
+				v = 0;	/* For entry 255, not a real entry */
+		}
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
+	printf("#endif\n");
+
+	/* Compute inverse table x^-1 == x^254 */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gfinv[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			invtbl[i + j] = v = gfpow(i + j, 254);
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+		}
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfinv);\n");
+	printf("#endif\n");
+
+	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gfexi[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++)
+			printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1],
+			       (j == 7) ? '\n' : ' ');
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfexi);\n");
+	printf("#endif\n");
+
+	return 0;
+}
--- a/kernel/drivers/md/multipath.c
+++ b/kernel/drivers/md/multipath.c
@@ -0,0 +1,590 @@
+/*
+ * multipath.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * MULTIPATH management functions.
+ *
+ * derived from raid1.c.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "multipath.h"
+
+#define MAX_WORK_PER_DISK 128
+
+#define	NR_RESERVED_BUFS	32
+
+
+static int multipath_map (multipath_conf_t *conf)
+{
+	int i, disks = conf->raid_disks;
+
+	/*
+	 * Later we do read balancing on the read side 
+	 * now we use the first available disk.
+	 */
+
+	rcu_read_lock();
+	for (i = 0; i < disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+		if (rdev && test_bit(In_sync, &rdev->flags)) {
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
+			return i;
+		}
+	}
+	rcu_read_unlock();
+
+	printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+	return (-1);
+}
+
+static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
+{
+	unsigned long flags;
+	mddev_t *mddev = mp_bh->mddev;
+	multipath_conf_t *conf = mddev->private;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	list_add(&mp_bh->retry_list, &conf->retry_list);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	md_wakeup_thread(mddev->thread);
+}
+
+
+/*
+ * multipath_end_bh_io() is called when we have finished servicing a multipathed
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
+{
+	struct bio *bio = mp_bh->master_bio;
+	multipath_conf_t *conf = mp_bh->mddev->private;
+
+	bio_endio(bio, err);
+	mempool_free(mp_bh, conf->pool);
+}
+
+static void multipath_end_request(struct bio *bio, int error)
+{
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
+	multipath_conf_t *conf = mp_bh->mddev->private;
+	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
+
+	if (uptodate)
+		multipath_end_bh_io(mp_bh, 0);
+	else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) {
+		/*
+		 * oops, IO error:
+		 */
+		char b[BDEVNAME_SIZE];
+		md_error (mp_bh->mddev, rdev);
+		printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", 
+		       bdevname(rdev->bdev,b), 
+		       (unsigned long long)bio->bi_sector);
+		multipath_reschedule_retry(mp_bh);
+	} else
+		multipath_end_bh_io(mp_bh, error);
+	rdev_dec_pending(rdev, conf->mddev);
+}
+
+static void unplug_slaves(mddev_t *mddev)
+{
+	multipath_conf_t *conf = mddev->private;
+	int i;
+
+	rcu_read_lock();
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+		if (rdev && !test_bit(Faulty, &rdev->flags)
+		    && atomic_read(&rdev->nr_pending)) {
+			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
+
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
+
+			blk_unplug(r_queue);
+
+			rdev_dec_pending(rdev, mddev);
+			rcu_read_lock();
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void multipath_unplug(struct request_queue *q)
+{
+	unplug_slaves(q->queuedata);
+}
+
+
+static int multipath_make_request (struct request_queue *q, struct bio * bio)
+{
+	mddev_t *mddev = q->queuedata;
+	multipath_conf_t *conf = mddev->private;
+	struct multipath_bh * mp_bh;
+	struct multipath_info *multipath;
+	const int rw = bio_data_dir(bio);
+	int cpu;
+
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
+	}
+
+	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
+
+	mp_bh->master_bio = bio;
+	mp_bh->mddev = mddev;
+
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
+
+	mp_bh->path = multipath_map(conf);
+	if (mp_bh->path < 0) {
+		bio_endio(bio, -EIO);
+		mempool_free(mp_bh, conf->pool);
+		return 0;
+	}
+	multipath = conf->multipaths + mp_bh->path;
+
+	mp_bh->bio = *bio;
+	mp_bh->bio.bi_sector += multipath->rdev->data_offset;
+	mp_bh->bio.bi_bdev = multipath->rdev->bdev;
+	mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
+	mp_bh->bio.bi_end_io = multipath_end_request;
+	mp_bh->bio.bi_private = mp_bh;
+	generic_make_request(&mp_bh->bio);
+	return 0;
+}
+
+static void multipath_status (struct seq_file *seq, mddev_t *mddev)
+{
+	multipath_conf_t *conf = mddev->private;
+	int i;
+	
+	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
+						 conf->working_disks);
+	for (i = 0; i < conf->raid_disks; i++)
+		seq_printf (seq, "%s",
+			       conf->multipaths[i].rdev && 
+			       test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_");
+	seq_printf (seq, "]");
+}
+
+static int multipath_congested(void *data, int bits)
+{
+	mddev_t *mddev = data;
+	multipath_conf_t *conf = mddev->private;
+	int i, ret = 0;
+
+	if (mddev_congested(mddev, bits))
+		return 1;
+
+	rcu_read_lock();
+	for (i = 0; i < mddev->raid_disks ; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+		if (rdev && !test_bit(Faulty, &rdev->flags)) {
+			struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+			ret |= bdi_congested(&q->backing_dev_info, bits);
+			/* Just like multipath_map, we just check the
+			 * first available device
+			 */
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Careful, this can execute in IRQ contexts as well!
+ */
+static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	multipath_conf_t *conf = mddev->private;
+
+	if (conf->working_disks <= 1) {
+		/*
+		 * Uh oh, we can do nothing if this is our last path, but
+		 * first check if this is a queued request for a device
+		 * which has just failed.
+		 */
+		printk(KERN_ALERT 
+			"multipath: only one IO path left and IO error.\n");
+		/* leave it active... it's all we have */
+	} else {
+		/*
+		 * Mark disk as unusable
+		 */
+		if (!test_bit(Faulty, &rdev->flags)) {
+			char b[BDEVNAME_SIZE];
+			clear_bit(In_sync, &rdev->flags);
+			set_bit(Faulty, &rdev->flags);
+			set_bit(MD_CHANGE_DEVS, &mddev->flags);
+			conf->working_disks--;
+			mddev->degraded++;
+			printk(KERN_ALERT "multipath: IO failure on %s,"
+				" disabling IO path.\n"
+				"multipath: Operation continuing"
+				" on %d IO paths.\n",
+				bdevname (rdev->bdev,b),
+				conf->working_disks);
+		}
+	}
+}
+
+static void print_multipath_conf (multipath_conf_t *conf)
+{
+	int i;
+	struct multipath_info *tmp;
+
+	printk("MULTIPATH conf printout:\n");
+	if (!conf) {
+		printk("(conf==NULL)\n");
+		return;
+	}
+	printk(" --- wd:%d rd:%d\n", conf->working_disks,
+			 conf->raid_disks);
+
+	for (i = 0; i < conf->raid_disks; i++) {
+		char b[BDEVNAME_SIZE];
+		tmp = conf->multipaths + i;
+		if (tmp->rdev)
+			printk(" disk%d, o:%d, dev:%s\n",
+				i,!test_bit(Faulty, &tmp->rdev->flags),
+			       bdevname(tmp->rdev->bdev,b));
+	}
+}
+
+
+static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	multipath_conf_t *conf = mddev->private;
+	struct request_queue *q;
+	int err = -EEXIST;
+	int path;
+	struct multipath_info *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
+
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
+
+	print_multipath_conf(conf);
+
+	for (path = first; path <= last; path++)
+		if ((p=conf->multipaths+path)->rdev == NULL) {
+			q = rdev->bdev->bd_disk->queue;
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, so limit ->max_phys_segments to one, lying
+		 * within a single page.
+		 * (Note: it is very unlikely that a device with
+		 * merge_bvec_fn will be involved in multipath.)
+		 */
+			if (q->merge_bvec_fn) {
+				blk_queue_max_phys_segments(mddev->queue, 1);
+				blk_queue_segment_boundary(mddev->queue,
+							   PAGE_CACHE_SIZE - 1);
+			}
+
+			conf->working_disks++;
+			mddev->degraded--;
+			rdev->raid_disk = path;
+			set_bit(In_sync, &rdev->flags);
+			rcu_assign_pointer(p->rdev, rdev);
+			err = 0;
+			md_integrity_add_rdev(rdev, mddev);
+			break;
+		}
+
+	print_multipath_conf(conf);
+
+	return err;
+}
+
+static int multipath_remove_disk(mddev_t *mddev, int number)
+{
+	multipath_conf_t *conf = mddev->private;
+	int err = 0;
+	mdk_rdev_t *rdev;
+	struct multipath_info *p = conf->multipaths + number;
+
+	print_multipath_conf(conf);
+
+	rdev = p->rdev;
+	if (rdev) {
+		if (test_bit(In_sync, &rdev->flags) ||
+		    atomic_read(&rdev->nr_pending)) {
+			printk(KERN_ERR "hot-remove-disk, slot %d is identified"
+			       " but is still operational!\n", number);
+			err = -EBUSY;
+			goto abort;
+		}
+		p->rdev = NULL;
+		synchronize_rcu();
+		if (atomic_read(&rdev->nr_pending)) {
+			/* lost the race, try later */
+			err = -EBUSY;
+			p->rdev = rdev;
+			goto abort;
+		}
+		md_integrity_register(mddev);
+	}
+abort:
+
+	print_multipath_conf(conf);
+	return err;
+}
+
+
+
+/*
+ * This is a kernel thread which:
+ *
+ *	1.	Retries failed read operations on working multipaths.
+ *	2.	Updates the raid superblock when problems encounter.
+ *	3.	Performs writes following reads for array syncronising.
+ */
+
+static void multipathd (mddev_t *mddev)
+{
+	struct multipath_bh *mp_bh;
+	struct bio *bio;
+	unsigned long flags;
+	multipath_conf_t *conf = mddev->private;
+	struct list_head *head = &conf->retry_list;
+
+	md_check_recovery(mddev);
+	for (;;) {
+		char b[BDEVNAME_SIZE];
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (list_empty(head))
+			break;
+		mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
+		list_del(head->prev);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+
+		bio = &mp_bh->bio;
+		bio->bi_sector = mp_bh->master_bio->bi_sector;
+		
+		if ((mp_bh->path = multipath_map (conf))<0) {
+			printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
+				" error for block %llu\n",
+				bdevname(bio->bi_bdev,b),
+				(unsigned long long)bio->bi_sector);
+			multipath_end_bh_io(mp_bh, -EIO);
+		} else {
+			printk(KERN_ERR "multipath: %s: redirecting sector %llu"
+				" to another IO path\n",
+				bdevname(bio->bi_bdev,b),
+				(unsigned long long)bio->bi_sector);
+			*bio = *(mp_bh->master_bio);
+			bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
+			bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
+			bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
+			bio->bi_end_io = multipath_end_request;
+			bio->bi_private = mp_bh;
+			generic_make_request(bio);
+		}
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+
+static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	return mddev->dev_sectors;
+}
+
+static int multipath_run (mddev_t *mddev)
+{
+	multipath_conf_t *conf;
+	int disk_idx;
+	struct multipath_info *disk;
+	mdk_rdev_t *rdev;
+
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+
+	if (mddev->level != LEVEL_MULTIPATH) {
+		printk("multipath: %s: raid level not set to multipath IO (%d)\n",
+		       mdname(mddev), mddev->level);
+		goto out;
+	}
+	/*
+	 * copy the already verified devices into our private MULTIPATH
+	 * bookkeeping area. [whatever we allocate in multipath_run(),
+	 * should be freed in multipath_stop()]
+	 */
+	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
+
+	conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
+	mddev->private = conf;
+	if (!conf) {
+		printk(KERN_ERR 
+			"multipath: couldn't allocate memory for %s\n",
+			mdname(mddev));
+		goto out;
+	}
+
+	conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
+				   GFP_KERNEL);
+	if (!conf->multipaths) {
+		printk(KERN_ERR 
+			"multipath: couldn't allocate memory for %s\n",
+			mdname(mddev));
+		goto out_free_conf;
+	}
+
+	conf->working_disks = 0;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		disk_idx = rdev->raid_disk;
+		if (disk_idx < 0 ||
+		    disk_idx >= mddev->raid_disks)
+			continue;
+
+		disk = conf->multipaths + disk_idx;
+		disk->rdev = rdev;
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
+
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, not that we ever expect a device with
+		 * a merge_bvec_fn to be involved in multipath */
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_phys_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
+
+		if (!test_bit(Faulty, &rdev->flags))
+			conf->working_disks++;
+	}
+
+	conf->raid_disks = mddev->raid_disks;
+	conf->mddev = mddev;
+	spin_lock_init(&conf->device_lock);
+	INIT_LIST_HEAD(&conf->retry_list);
+
+	if (!conf->working_disks) {
+		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
+			mdname(mddev));
+		goto out_free_conf;
+	}
+	mddev->degraded = conf->raid_disks - conf->working_disks;
+
+	conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
+						 sizeof(struct multipath_bh));
+	if (conf->pool == NULL) {
+		printk(KERN_ERR 
+			"multipath: couldn't allocate memory for %s\n",
+			mdname(mddev));
+		goto out_free_conf;
+	}
+
+	{
+		mddev->thread = md_register_thread(multipathd, mddev, NULL);
+		if (!mddev->thread) {
+			printk(KERN_ERR "multipath: couldn't allocate thread"
+				" for %s\n", mdname(mddev));
+			goto out_free_conf;
+		}
+	}
+
+	printk(KERN_INFO 
+		"multipath: array %s active with %d out of %d IO paths\n",
+		mdname(mddev), conf->working_disks, mddev->raid_disks);
+	/*
+	 * Ok, everything is just fine now
+	 */
+	md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
+
+	mddev->queue->unplug_fn = multipath_unplug;
+	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
+	mddev->queue->backing_dev_info.congested_data = mddev;
+	md_integrity_register(mddev);
+	return 0;
+
+out_free_conf:
+	if (conf->pool)
+		mempool_destroy(conf->pool);
+	kfree(conf->multipaths);
+	kfree(conf);
+	mddev->private = NULL;
+out:
+	return -EIO;
+}
+
+
+static int multipath_stop (mddev_t *mddev)
+{
+	multipath_conf_t *conf = mddev->private;
+
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
+	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+	mempool_destroy(conf->pool);
+	kfree(conf->multipaths);
+	kfree(conf);
+	mddev->private = NULL;
+	return 0;
+}
+
+static struct mdk_personality multipath_personality =
+{
+	.name		= "multipath",
+	.level		= LEVEL_MULTIPATH,
+	.owner		= THIS_MODULE,
+	.make_request	= multipath_make_request,
+	.run		= multipath_run,
+	.stop		= multipath_stop,
+	.status		= multipath_status,
+	.error_handler	= multipath_error,
+	.hot_add_disk	= multipath_add_disk,
+	.hot_remove_disk= multipath_remove_disk,
+	.size		= multipath_size,
+};
+
+static int __init multipath_init (void)
+{
+	return register_md_personality (&multipath_personality);
+}
+
+static void __exit multipath_exit (void)
+{
+	unregister_md_personality (&multipath_personality);
+}
+
+module_init(multipath_init);
+module_exit(multipath_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
+MODULE_ALIAS("md-multipath");
+MODULE_ALIAS("md-level--4");
--- a/kernel/drivers/md/multipath.h
+++ b/kernel/drivers/md/multipath.h
@@ -0,0 +1,34 @@
+#ifndef _MULTIPATH_H
+#define _MULTIPATH_H
+
+struct multipath_info {
+	mdk_rdev_t	*rdev;
+};
+
+struct multipath_private_data {
+	mddev_t			*mddev;
+	struct multipath_info	*multipaths;
+	int			raid_disks;
+	int			working_disks;
+	spinlock_t		device_lock;
+	struct list_head	retry_list;
+
+	mempool_t		*pool;
+};
+
+typedef struct multipath_private_data multipath_conf_t;
+
+/*
+ * this is our 'private' 'collective' MULTIPATH buffer head.
+ * it contains information about what kind of IO operations were started
+ * for this MULTIPATH operation, and about their status:
+ */
+
+struct multipath_bh {
+	mddev_t			*mddev;
+	struct bio		*master_bio;
+	struct bio		bio;
+	int			path;
+	struct list_head	retry_list;
+};
+#endif
--- a/kernel/drivers/md/raid0.c
+++ b/kernel/drivers/md/raid0.c
@@ -0,0 +1,573 @@
+/*
+   raid0.c : Multiple Devices driver for Linux
+             Copyright (C) 1994-96 Marc ZYNGIER
+	     <zyngier@ufr-info-p7.ibp.fr> or
+	     <maz@gloups.fdn.fr>
+             Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
+
+
+   RAID-0 management functions.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+   
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+*/
+
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid0.h"
+
+static void raid0_unplug(struct request_queue *q)
+{
+	mddev_t *mddev = q->queuedata;
+	raid0_conf_t *conf = mddev->private;
+	mdk_rdev_t **devlist = conf->devlist;
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
+
+		blk_unplug(r_queue);
+	}
+}
+
+static int raid0_congested(void *data, int bits)
+{
+	mddev_t *mddev = data;
+	raid0_conf_t *conf = mddev->private;
+	mdk_rdev_t **devlist = conf->devlist;
+	int i, ret = 0;
+
+	if (mddev_congested(mddev, bits))
+		return 1;
+
+	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+		struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
+
+		ret |= bdi_congested(&q->backing_dev_info, bits);
+	}
+	return ret;
+}
+
+/*
+ * inform the user of the raid configuration
+*/
+static void dump_zones(mddev_t *mddev)
+{
+	int j, k, h;
+	sector_t zone_size = 0;
+	sector_t zone_start = 0;
+	char b[BDEVNAME_SIZE];
+	raid0_conf_t *conf = mddev->private;
+	printk(KERN_INFO "******* %s configuration *********\n",
+		mdname(mddev));
+	h = 0;
+	for (j = 0; j < conf->nr_strip_zones; j++) {
+		printk(KERN_INFO "zone%d=[", j);
+		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+			printk("%s/",
+			bdevname(conf->devlist[j*mddev->raid_disks
+						+ k]->bdev, b));
+		printk("]\n");
+
+		zone_size  = conf->strip_zone[j].zone_end - zone_start;
+		printk(KERN_INFO "        zone offset=%llukb "
+				"device offset=%llukb size=%llukb\n",
+			(unsigned long long)zone_start>>1,
+			(unsigned long long)conf->strip_zone[j].dev_start>>1,
+			(unsigned long long)zone_size>>1);
+		zone_start = conf->strip_zone[j].zone_end;
+	}
+	printk(KERN_INFO "**********************************\n\n");
+}
+
+static int create_strip_zones(mddev_t *mddev)
+{
+	int i, c, err;
+	sector_t curr_zone_end, sectors;
+	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
+	struct strip_zone *zone;
+	int cnt;
+	char b[BDEVNAME_SIZE];
+	raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+
+	if (!conf)
+		return -ENOMEM;
+	list_for_each_entry(rdev1, &mddev->disks, same_set) {
+		printk(KERN_INFO "raid0: looking at %s\n",
+			bdevname(rdev1->bdev,b));
+		c = 0;
+
+		/* round size to chunk_size */
+		sectors = rdev1->sectors;
+		sector_div(sectors, mddev->chunk_sectors);
+		rdev1->sectors = sectors * mddev->chunk_sectors;
+
+		list_for_each_entry(rdev2, &mddev->disks, same_set) {
+			printk(KERN_INFO "raid0:   comparing %s(%llu)",
+			       bdevname(rdev1->bdev,b),
+			       (unsigned long long)rdev1->sectors);
+			printk(KERN_INFO " with %s(%llu)\n",
+			       bdevname(rdev2->bdev,b),
+			       (unsigned long long)rdev2->sectors);
+			if (rdev2 == rdev1) {
+				printk(KERN_INFO "raid0:   END\n");
+				break;
+			}
+			if (rdev2->sectors == rdev1->sectors) {
+				/*
+				 * Not unique, don't count it as a new
+				 * group
+				 */
+				printk(KERN_INFO "raid0:   EQUAL\n");
+				c = 1;
+				break;
+			}
+			printk(KERN_INFO "raid0:   NOT EQUAL\n");
+		}
+		if (!c) {
+			printk(KERN_INFO "raid0:   ==> UNIQUE\n");
+			conf->nr_strip_zones++;
+			printk(KERN_INFO "raid0: %d zones\n",
+				conf->nr_strip_zones);
+		}
+	}
+	printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
+	err = -ENOMEM;
+	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
+				conf->nr_strip_zones, GFP_KERNEL);
+	if (!conf->strip_zone)
+		goto abort;
+	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
+				conf->nr_strip_zones*mddev->raid_disks,
+				GFP_KERNEL);
+	if (!conf->devlist)
+		goto abort;
+
+	/* The first zone must contain all devices, so here we check that
+	 * there is a proper alignment of slots to devices and find them all
+	 */
+	zone = &conf->strip_zone[0];
+	cnt = 0;
+	smallest = NULL;
+	dev = conf->devlist;
+	err = -EINVAL;
+	list_for_each_entry(rdev1, &mddev->disks, same_set) {
+		int j = rdev1->raid_disk;
+
+		if (j < 0 || j >= mddev->raid_disks) {
+			printk(KERN_ERR "raid0: bad disk number %d - "
+				"aborting!\n", j);
+			goto abort;
+		}
+		if (dev[j]) {
+			printk(KERN_ERR "raid0: multiple devices for %d - "
+				"aborting!\n", j);
+			goto abort;
+		}
+		dev[j] = rdev1;
+
+		disk_stack_limits(mddev->gendisk, rdev1->bdev,
+				  rdev1->data_offset << 9);
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, so limit ->max_phys_segments to 1, lying within
+		 * a single page.
+		 */
+
+		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_phys_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
+		if (!smallest || (rdev1->sectors < smallest->sectors))
+			smallest = rdev1;
+		cnt++;
+	}
+	if (cnt != mddev->raid_disks) {
+		printk(KERN_ERR "raid0: too few disks (%d of %d) - "
+			"aborting!\n", cnt, mddev->raid_disks);
+		goto abort;
+	}
+	zone->nb_dev = cnt;
+	zone->zone_end = smallest->sectors * cnt;
+
+	curr_zone_end = zone->zone_end;
+
+	/* now do the other zones */
+	for (i = 1; i < conf->nr_strip_zones; i++)
+	{
+		int j;
+
+		zone = conf->strip_zone + i;
+		dev = conf->devlist + i * mddev->raid_disks;
+
+		printk(KERN_INFO "raid0: zone %d\n", i);
+		zone->dev_start = smallest->sectors;
+		smallest = NULL;
+		c = 0;
+
+		for (j=0; j<cnt; j++) {
+			rdev = conf->devlist[j];
+			printk(KERN_INFO "raid0: checking %s ...",
+				bdevname(rdev->bdev, b));
+			if (rdev->sectors <= zone->dev_start) {
+				printk(KERN_INFO " nope.\n");
+				continue;
+			}
+			printk(KERN_INFO " contained as device %d\n", c);
+			dev[c] = rdev;
+			c++;
+			if (!smallest || rdev->sectors < smallest->sectors) {
+				smallest = rdev;
+				printk(KERN_INFO "  (%llu) is smallest!.\n",
+					(unsigned long long)rdev->sectors);
+			}
+		}
+
+		zone->nb_dev = c;
+		sectors = (smallest->sectors - zone->dev_start) * c;
+		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
+			zone->nb_dev, (unsigned long long)sectors);
+
+		curr_zone_end += sectors;
+		zone->zone_end = curr_zone_end;
+
+		printk(KERN_INFO "raid0: current zone start: %llu\n",
+			(unsigned long long)smallest->sectors);
+	}
+	mddev->queue->unplug_fn = raid0_unplug;
+	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
+	mddev->queue->backing_dev_info.congested_data = mddev;
+
+	/*
+	 * now since we have the hard sector sizes, we can make sure
+	 * chunk size is a multiple of that sector size
+	 */
+	if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
+		printk(KERN_ERR "%s chunk_size of %d not valid\n",
+		       mdname(mddev),
+		       mddev->chunk_sectors << 9);
+		goto abort;
+	}
+
+	blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+	blk_queue_io_opt(mddev->queue,
+			 (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
+	printk(KERN_INFO "raid0: done.\n");
+	mddev->private = conf;
+	return 0;
+abort:
+	kfree(conf->strip_zone);
+	kfree(conf->devlist);
+	kfree(conf);
+	mddev->private = NULL;
+	return err;
+}
+
+/**
+ *	raid0_mergeable_bvec -- tell bio layer if a two requests can be merged
+ *	@q: request queue
+ *	@bvm: properties of new bio
+ *	@biovec: the request that could be merged to it.
+ *
+ *	Return amount of bytes we can accept at this offset
+ */
+static int raid0_mergeable_bvec(struct request_queue *q,
+				struct bvec_merge_data *bvm,
+				struct bio_vec *biovec)
+{
+	mddev_t *mddev = q->queuedata;
+	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+	int max;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
+	unsigned int bio_sectors = bvm->bi_size >> 9;
+
+	if (is_power_of_2(chunk_sectors))
+		max =  (chunk_sectors - ((sector & (chunk_sectors-1))
+						+ bio_sectors)) << 9;
+	else
+		max =  (chunk_sectors - (sector_div(sector, chunk_sectors)
+						+ bio_sectors)) << 9;
+	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
+	if (max <= biovec->bv_len && bio_sectors == 0)
+		return biovec->bv_len;
+	else 
+		return max;
+}
+
+static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	sector_t array_sectors = 0;
+	mdk_rdev_t *rdev;
+
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		array_sectors += rdev->sectors;
+
+	return array_sectors;
+}
+
+static int raid0_run(mddev_t *mddev)
+{
+	int ret;
+
+	if (mddev->chunk_sectors == 0) {
+		printk(KERN_ERR "md/raid0: chunk size must be set.\n");
+		return -EINVAL;
+	}
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+	blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors);
+	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
+
+	ret = create_strip_zones(mddev);
+	if (ret < 0)
+		return ret;
+
+	/* calculate array device size */
+	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
+
+	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
+		(unsigned long long)mddev->array_sectors);
+	/* calculate the max read-ahead size.
+	 * For read-ahead of large files to be effective, we need to
+	 * readahead at least twice a whole stripe. i.e. number of devices
+	 * multiplied by chunk size times 2.
+	 * If an individual device has an ra_pages greater than the
+	 * chunk size, then we will not drive that device as hard as it
+	 * wants.  We consider this a configuration error: a larger
+	 * chunksize should be used in that case.
+	 */
+	{
+		int stripe = mddev->raid_disks *
+			(mddev->chunk_sectors << 9) / PAGE_SIZE;
+		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
+			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
+	}
+
+	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
+	dump_zones(mddev);
+	md_integrity_register(mddev);
+	return 0;
+}
+
+static int raid0_stop(mddev_t *mddev)
+{
+	raid0_conf_t *conf = mddev->private;
+
+	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+	kfree(conf->strip_zone);
+	kfree(conf->devlist);
+	kfree(conf);
+	mddev->private = NULL;
+	return 0;
+}
+
+/* Find the zone which holds a particular offset
+ * Update *sectorp to be an offset in that zone
+ */
+static struct strip_zone *find_zone(struct raid0_private_data *conf,
+				    sector_t *sectorp)
+{
+	int i;
+	struct strip_zone *z = conf->strip_zone;
+	sector_t sector = *sectorp;
+
+	for (i = 0; i < conf->nr_strip_zones; i++)
+		if (sector < z[i].zone_end) {
+			if (i)
+				*sectorp = sector - z[i-1].zone_end;
+			return z + i;
+		}
+	BUG();
+}
+
+/*
+ * remaps the bio to the target device. we separate two flows.
+ * power 2 flow and a general flow for the sake of perfromance
+*/
+static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
+				sector_t sector, sector_t *sector_offset)
+{
+	unsigned int sect_in_chunk;
+	sector_t chunk;
+	raid0_conf_t *conf = mddev->private;
+	unsigned int chunk_sects = mddev->chunk_sectors;
+
+	if (is_power_of_2(chunk_sects)) {
+		int chunksect_bits = ffz(~chunk_sects);
+		/* find the sector offset inside the chunk */
+		sect_in_chunk  = sector & (chunk_sects - 1);
+		sector >>= chunksect_bits;
+		/* chunk in zone */
+		chunk = *sector_offset;
+		/* quotient is the chunk in real device*/
+		sector_div(chunk, zone->nb_dev << chunksect_bits);
+	} else{
+		sect_in_chunk = sector_div(sector, chunk_sects);
+		chunk = *sector_offset;
+		sector_div(chunk, chunk_sects * zone->nb_dev);
+	}
+	/*
+	*  position the bio over the real device
+	*  real sector = chunk in device + starting of zone
+	*	+ the position in the chunk
+	*/
+	*sector_offset = (chunk * chunk_sects) + sect_in_chunk;
+	return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks
+			     + sector_div(sector, zone->nb_dev)];
+}
+
+/*
+ * Is io distribute over 1 or more chunks ?
+*/
+static inline int is_io_in_chunk_boundary(mddev_t *mddev,
+			unsigned int chunk_sects, struct bio *bio)
+{
+	if (likely(is_power_of_2(chunk_sects))) {
+		return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
+					+ (bio->bi_size >> 9));
+	} else{
+		sector_t sector = bio->bi_sector;
+		return chunk_sects >= (sector_div(sector, chunk_sects)
+						+ (bio->bi_size >> 9));
+	}
+}
+
+static int raid0_make_request(struct request_queue *q, struct bio *bio)
+{
+	mddev_t *mddev = q->queuedata;
+	unsigned int chunk_sects;
+	sector_t sector_offset;
+	struct strip_zone *zone;
+	mdk_rdev_t *tmp_dev;
+	const int rw = bio_data_dir(bio);
+	int cpu;
+
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
+	}
+
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
+
+	chunk_sects = mddev->chunk_sectors;
+	if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
+		sector_t sector = bio->bi_sector;
+		struct bio_pair *bp;
+		/* Sanity check -- queue functions should prevent this happening */
+		if (bio->bi_vcnt != 1 ||
+		    bio->bi_idx != 0)
+			goto bad_map;
+		/* This is a one page bio that upper layers
+		 * refuse to split for us, so we need to split it.
+		 */
+		if (likely(is_power_of_2(chunk_sects)))
+			bp = bio_split(bio, chunk_sects - (sector &
+							   (chunk_sects-1)));
+		else
+			bp = bio_split(bio, chunk_sects -
+				       sector_div(sector, chunk_sects));
+		if (raid0_make_request(q, &bp->bio1))
+			generic_make_request(&bp->bio1);
+		if (raid0_make_request(q, &bp->bio2))
+			generic_make_request(&bp->bio2);
+
+		bio_pair_release(bp);
+		return 0;
+	}
+
+	sector_offset = bio->bi_sector;
+	zone =  find_zone(mddev->private, &sector_offset);
+	tmp_dev = map_sector(mddev, zone, bio->bi_sector,
+			     &sector_offset);
+	bio->bi_bdev = tmp_dev->bdev;
+	bio->bi_sector = sector_offset + zone->dev_start +
+		tmp_dev->data_offset;
+	/*
+	 * Let the main block layer submit the IO and resolve recursion:
+	 */
+	return 1;
+
+bad_map:
+	printk("raid0_make_request bug: can't convert block across chunks"
+		" or bigger than %dk %llu %d\n", chunk_sects / 2,
+		(unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+
+	bio_io_error(bio);
+	return 0;
+}
+
+static void raid0_status(struct seq_file *seq, mddev_t *mddev)
+{
+#undef MD_DEBUG
+#ifdef MD_DEBUG
+	int j, k, h;
+	char b[BDEVNAME_SIZE];
+	raid0_conf_t *conf = mddev->private;
+
+	sector_t zone_size;
+	sector_t zone_start = 0;
+	h = 0;
+
+	for (j = 0; j < conf->nr_strip_zones; j++) {
+		seq_printf(seq, "      z%d", j);
+		seq_printf(seq, "=[");
+		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+			seq_printf(seq, "%s/", bdevname(
+				conf->devlist[j*mddev->raid_disks + k]
+						->bdev, b));
+
+		zone_size  = conf->strip_zone[j].zone_end - zone_start;
+		seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n",
+			(unsigned long long)zone_start>>1,
+			(unsigned long long)conf->strip_zone[j].dev_start>>1,
+			(unsigned long long)zone_size>>1);
+		zone_start = conf->strip_zone[j].zone_end;
+	}
+#endif
+	seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
+	return;
+}
+
+static struct mdk_personality raid0_personality=
+{
+	.name		= "raid0",
+	.level		= 0,
+	.owner		= THIS_MODULE,
+	.make_request	= raid0_make_request,
+	.run		= raid0_run,
+	.stop		= raid0_stop,
+	.status		= raid0_status,
+	.size		= raid0_size,
+};
+
+static int __init raid0_init (void)
+{
+	return register_md_personality (&raid0_personality);
+}
+
+static void raid0_exit (void)
+{
+	unregister_md_personality (&raid0_personality);
+}
+
+module_init(raid0_init);
+module_exit(raid0_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("md-personality-2"); /* RAID0 */
+MODULE_ALIAS("md-raid0");
+MODULE_ALIAS("md-level-0");
--- a/kernel/drivers/md/raid0.h
+++ b/kernel/drivers/md/raid0.h
@@ -0,0 +1,20 @@
+#ifndef _RAID0_H
+#define _RAID0_H
+
+struct strip_zone
+{
+	sector_t zone_end;	/* Start of the next zone (in sectors) */
+	sector_t dev_start;	/* Zone offset in real dev (in sectors) */
+	int nb_dev;		/* # of devices attached to the zone */
+};
+
+struct raid0_private_data
+{
+	struct strip_zone *strip_zone;
+	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
+	int nr_strip_zones;
+};
+
+typedef struct raid0_private_data raid0_conf_t;
+
+#endif
--- a/kernel/drivers/md/raid1.c
+++ b/kernel/drivers/md/raid1.c
--- a/kernel/drivers/md/raid1.h
+++ b/kernel/drivers/md/raid1.h
@@ -0,0 +1,126 @@
+#ifndef _RAID1_H
+#define _RAID1_H
+
+typedef struct mirror_info mirror_info_t;
+
+struct mirror_info {
+	mdk_rdev_t	*rdev;
+	sector_t	head_position;
+};
+
+/*
+ * memory pools need a pointer to the mddev, so they can force an unplug
+ * when memory is tight, and a count of the number of drives that the
+ * pool was allocated for, so they know how much to allocate and free.
+ * mddev->raid_disks cannot be used, as it can change while a pool is active
+ * These two datums are stored in a kmalloced struct.
+ */
+
+struct pool_info {
+	mddev_t *mddev;
+	int	raid_disks;
+};
+
+
+typedef struct r1bio_s r1bio_t;
+
+struct r1_private_data_s {
+	mddev_t			*mddev;
+	mirror_info_t		*mirrors;
+	int			raid_disks;
+	int			last_used;
+	sector_t		next_seq_sect;
+	spinlock_t		device_lock;
+
+	struct list_head	retry_list;
+	/* queue pending writes and submit them on unplug */
+	struct bio_list		pending_bio_list;
+	/* queue of writes that have been unplugged */
+	struct bio_list		flushing_bio_list;
+
+	/* for use when syncing mirrors: */
+
+	spinlock_t		resync_lock;
+	int			nr_pending;
+	int			nr_waiting;
+	int			nr_queued;
+	int			barrier;
+	sector_t		next_resync;
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */
+
+	wait_queue_head_t	wait_barrier;
+
+	struct pool_info	*poolinfo;
+
+	struct page		*tmppage;
+
+	mempool_t *r1bio_pool;
+	mempool_t *r1buf_pool;
+};
+
+typedef struct r1_private_data_s conf_t;
+
+/*
+ * this is our 'private' RAID1 bio.
+ *
+ * it contains information about what kind of IO operations were started
+ * for this RAID1 operation, and about their status:
+ */
+
+struct r1bio_s {
+	atomic_t		remaining; /* 'have we finished' count,
+					    * used from IRQ handlers
+					    */
+	atomic_t		behind_remaining; /* number of write-behind ios remaining
+						 * in this BehindIO request
+						 */
+	sector_t		sector;
+	int			sectors;
+	unsigned long		state;
+	mddev_t			*mddev;
+	/*
+	 * original bio going to /dev/mdx
+	 */
+	struct bio		*master_bio;
+	/*
+	 * if the IO is in READ direction, then this is where we read
+	 */
+	int			read_disk;
+
+	struct list_head	retry_list;
+	struct bitmap_update	*bitmap_update;
+	/*
+	 * if the IO is in WRITE direction, then multiple bios are used.
+	 * We choose the number when they are allocated.
+	 */
+	struct bio		*bios[0];
+	/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
+};
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
+/* bits for r1bio.state */
+#define	R1BIO_Uptodate	0
+#define	R1BIO_IsSync	1
+#define	R1BIO_Degraded	2
+#define	R1BIO_BehindIO	3
+#define	R1BIO_Barrier	4
+#define R1BIO_BarrierRetry 5
+/* For write-behind requests, we call bi_end_io when
+ * the last non-write-behind device completes, providing
+ * any write was successful.  Otherwise we call when
+ * any write-behind write succeeds, otherwise we call
+ * with failure when last write completes (and all failed).
+ * Record that bi_end_io was called with this flag...
+ */
+#define	R1BIO_Returned 6
+
+#endif
--- a/kernel/drivers/md/raid10.c
+++ b/kernel/drivers/md/raid10.c
--- a/kernel/drivers/md/raid10.h
+++ b/kernel/drivers/md/raid10.h
@@ -0,0 +1,115 @@
+#ifndef _RAID10_H
+#define _RAID10_H
+
+typedef struct mirror_info mirror_info_t;
+
+struct mirror_info {
+	mdk_rdev_t	*rdev;
+	sector_t	head_position;
+};
+
+typedef struct r10bio_s r10bio_t;
+
+struct r10_private_data_s {
+	mddev_t			*mddev;
+	mirror_info_t		*mirrors;
+	int			raid_disks;
+	spinlock_t		device_lock;
+
+	/* geometry */
+	int			near_copies;  /* number of copies layed out raid0 style */
+	int 			far_copies;   /* number of copies layed out
+					       * at large strides across drives
+					       */
+	int			far_offset;   /* far_copies are offset by 1 stripe
+					       * instead of many
+					       */
+	int			copies;	      /* near_copies * far_copies.
+					       * must be <= raid_disks
+					       */
+	sector_t		stride;	      /* distance between far copies.
+					       * This is size / far_copies unless
+					       * far_offset, in which case it is
+					       * 1 stripe.
+					       */
+
+	int chunk_shift; /* shift from chunks to sectors */
+	sector_t chunk_mask;
+
+	struct list_head	retry_list;
+	/* queue pending writes and submit them on unplug */
+	struct bio_list		pending_bio_list;
+
+
+	spinlock_t		resync_lock;
+	int nr_pending;
+	int nr_waiting;
+	int nr_queued;
+	int barrier;
+	sector_t		next_resync;
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */
+
+	wait_queue_head_t	wait_barrier;
+
+	mempool_t *r10bio_pool;
+	mempool_t *r10buf_pool;
+	struct page		*tmppage;
+};
+
+typedef struct r10_private_data_s conf_t;
+
+/*
+ * this is our 'private' RAID10 bio.
+ *
+ * it contains information about what kind of IO operations were started
+ * for this RAID10 operation, and about their status:
+ */
+
+struct r10bio_s {
+	atomic_t		remaining; /* 'have we finished' count,
+					    * used from IRQ handlers
+					    */
+	sector_t		sector;	/* virtual sector number */
+	int			sectors;
+	unsigned long		state;
+	mddev_t			*mddev;
+	/*
+	 * original bio going to /dev/mdx
+	 */
+	struct bio		*master_bio;
+	/*
+	 * if the IO is in READ direction, then this is where we read
+	 */
+	int			read_slot;
+
+	struct list_head	retry_list;
+	/*
+	 * if the IO is in WRITE direction, then multiple bios are used,
+	 * one for each copy.
+	 * When resyncing we also use one for each copy.
+	 * When reconstructing, we use 2 bios, one for read, one for write.
+	 * We choose the number when they are allocated.
+	 */
+	struct {
+		struct bio		*bio;
+		sector_t addr;
+		int devnum;
+	} devs[0];
+};
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
+/* bits for r10bio.state */
+#define	R10BIO_Uptodate	0
+#define	R10BIO_IsSync	1
+#define	R10BIO_IsRecover 2
+#define	R10BIO_Degraded 3
+#endif
--- a/kernel/drivers/md/raid5.c
+++ b/kernel/drivers/md/raid5.c
--- a/kernel/drivers/md/raid5.h
+++ b/kernel/drivers/md/raid5.h
@@ -0,0 +1,500 @@
+#ifndef _RAID5_H
+#define _RAID5_H
+
+#include <linux/raid/xor.h>
+#include <linux/dmaengine.h>
+
+/*
+ *
+ * Each stripe contains one buffer per disc.  Each buffer can be in
+ * one of a number of states stored in "flags".  Changes between
+ * these states happen *almost* exclusively under a per-stripe
+ * spinlock.  Some very specific changes can happen in bi_end_io, and
+ * these are not protected by the spin lock.
+ *
+ * The flag bits that are used to represent these states are:
+ *   R5_UPTODATE and R5_LOCKED
+ *
+ * State Empty == !UPTODATE, !LOCK
+ *        We have no data, and there is no active request
+ * State Want == !UPTODATE, LOCK
+ *        A read request is being submitted for this block
+ * State Dirty == UPTODATE, LOCK
+ *        Some new data is in this buffer, and it is being written out
+ * State Clean == UPTODATE, !LOCK
+ *        We have valid data which is the same as on disc
+ *
+ * The possible state transitions are:
+ *
+ *  Empty -> Want   - on read or write to get old data for  parity calc
+ *  Empty -> Dirty  - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
+ *  Empty -> Clean  - on compute_block when computing a block for failed drive
+ *  Want  -> Empty  - on failed read
+ *  Want  -> Clean  - on successful completion of read request
+ *  Dirty -> Clean  - on successful completion of write request
+ *  Dirty -> Clean  - on failed write
+ *  Clean -> Dirty  - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
+ *
+ * The Want->Empty, Want->Clean, Dirty->Clean, transitions
+ * all happen in b_end_io at interrupt time.
+ * Each sets the Uptodate bit before releasing the Lock bit.
+ * This leaves one multi-stage transition:
+ *    Want->Dirty->Clean
+ * This is safe because thinking that a Clean buffer is actually dirty
+ * will at worst delay some action, and the stripe will be scheduled
+ * for attention after the transition is complete.
+ *
+ * There is one possibility that is not covered by these states.  That
+ * is if one drive has failed and there is a spare being rebuilt.  We
+ * can't distinguish between a clean block that has been generated
+ * from parity calculations, and a clean block that has been
+ * successfully written to the spare ( or to parity when resyncing).
+ * To distingush these states we have a stripe bit STRIPE_INSYNC that
+ * is set whenever a write is scheduled to the spare, or to the parity
+ * disc if there is no spare.  A sync request clears this bit, and
+ * when we find it set with no buffers locked, we know the sync is
+ * complete.
+ *
+ * Buffers for the md device that arrive via make_request are attached
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
+ * One list (bh_read) for read requests, one (bh_write) for write.
+ * There should never be more than one buffer on the two lists
+ * together, but we are not guaranteed of that so we allow for more.
+ *
+ * If a buffer is on the read list when the associated cache buffer is
+ * Uptodate, the data is copied into the read buffer and it's b_end_io
+ * routine is called.  This may happen in the end_request routine only
+ * if the buffer has just successfully been read.  end_request should
+ * remove the buffers from the list and then set the Uptodate bit on
+ * the buffer.  Other threads may do this only if they first check
+ * that the Uptodate bit is set.  Once they have checked that they may
+ * take buffers off the read queue.
+ *
+ * When a buffer on the write list is committed for write it is copied
+ * into the cache buffer, which is then marked dirty, and moved onto a
+ * third list, the written list (bh_written).  Once both the parity
+ * block and the cached buffer are successfully written, any buffer on
+ * a written list can be returned with b_end_io.
+ *
+ * The write list and read list both act as fifos.  The read list is
+ * protected by the device_lock.  The write and written lists are
+ * protected by the stripe lock.  The device_lock, which can be
+ * claimed while the stipe lock is held, is only for list
+ * manipulations and will only be held for a very short time.  It can
+ * be claimed from interrupts.
+ *
+ *
+ * Stripes in the stripe cache can be on one of two lists (or on
+ * neither).  The "inactive_list" contains stripes which are not
+ * currently being used for any request.  They can freely be reused
+ * for another stripe.  The "handle_list" contains stripes that need
+ * to be handled in some way.  Both of these are fifo queues.  Each
+ * stripe is also (potentially) linked to a hash bucket in the hash
+ * table so that it can be found by sector number.  Stripes that are
+ * not hashed must be on the inactive_list, and will normally be at
+ * the front.  All stripes start life this way.
+ *
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
+ * device_lock.
+ *  - stripes on the inactive_list never have their stripe_lock held.
+ *  - stripes have a reference counter. If count==0, they are on a list.
+ *  - If a stripe might need handling, STRIPE_HANDLE is set.
+ *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
+ *    handle_list else inactive_list
+ *
+ * This, combined with the fact that STRIPE_HANDLE is only ever
+ * cleared while a stripe has a non-zero count means that if the
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
+ * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
+ * the stripe is on inactive_list.
+ *
+ * The possible transitions are:
+ *  activate an unhashed/inactive stripe (get_active_stripe())
+ *     lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
+ *  activate a hashed, possibly active stripe (get_active_stripe())
+ *     lockdev check-hash if(!cnt++)unlink-stripe unlockdev
+ *  attach a request to an active stripe (add_stripe_bh())
+ *     lockdev attach-buffer unlockdev
+ *  handle a stripe (handle_stripe())
+ *     lockstripe clrSTRIPE_HANDLE ...
+ *		(lockdev check-buffers unlockdev) ..
+ *		change-state ..
+ *		record io/ops needed unlockstripe schedule io/ops
+ *  release an active stripe (release_stripe())
+ *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
+ *
+ * The refcount counts each thread that have activated the stripe,
+ * plus raid5d if it is handling it, plus one for each active request
+ * on a cached buffer, and plus one if the stripe is undergoing stripe
+ * operations.
+ *
+ * Stripe operations are performed outside the stripe lock,
+ * the stripe operations are:
+ * -copying data between the stripe cache and user application buffers
+ * -computing blocks to save a disk access, or to recover a missing block
+ * -updating the parity on a write operation (reconstruct write and
+ *  read-modify-write)
+ * -checking parity correctness
+ * -running i/o to disk
+ * These operations are carried out by raid5_run_ops which uses the async_tx
+ * api to (optionally) offload operations to dedicated hardware engines.
+ * When requesting an operation handle_stripe sets the pending bit for the
+ * operation and increments the count.  raid5_run_ops is then run whenever
+ * the count is non-zero.
+ * There are some critical dependencies between the operations that prevent some
+ * from being requested while another is in flight.
+ * 1/ Parity check operations destroy the in cache version of the parity block,
+ *    so we prevent parity dependent operations like writes and compute_blocks
+ *    from starting while a check is in progress.  Some dma engines can perform
+ *    the check without damaging the parity block, in these cases the parity
+ *    block is re-marked up to date (assuming the check was successful) and is
+ *    not re-read from disk.
+ * 2/ When a write operation is requested we immediately lock the affected
+ *    blocks, and mark them as not up to date.  This causes new read requests
+ *    to be held off, as well as parity checks and compute block operations.
+ * 3/ Once a compute block operation has been requested handle_stripe treats
+ *    that block as if it is up to date.  raid5_run_ops guaruntees that any
+ *    operation that is dependent on the compute block result is initiated after
+ *    the compute block completes.
+ */
+
+/*
+ * Operations state - intermediate states that are visible outside of sh->lock
+ * In general _idle indicates nothing is running, _run indicates a data
+ * processing operation is active, and _result means the data processing result
+ * is stable and can be acted upon.  For simple operations like biofill and
+ * compute that only have an _idle and _run state they are indicated with
+ * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
+ */
+/**
+ * enum check_states - handles syncing / repairing a stripe
+ * @check_state_idle - check operations are quiesced
+ * @check_state_run - check operation is running
+ * @check_state_result - set outside lock when check result is valid
+ * @check_state_compute_run - check failed and we are repairing
+ * @check_state_compute_result - set outside lock when compute result is valid
+ */
+enum check_states {
+	check_state_idle = 0,
+	check_state_run, /* xor parity check */
+	check_state_run_q, /* q-parity check */
+	check_state_run_pq, /* pq dual parity check */
+	check_state_check_result,
+	check_state_compute_run, /* parity repair */
+	check_state_compute_result,
+};
+
+/**
+ * enum reconstruct_states - handles writing or expanding a stripe
+ */
+enum reconstruct_states {
+	reconstruct_state_idle = 0,
+	reconstruct_state_prexor_drain_run,	/* prexor-write */
+	reconstruct_state_drain_run,		/* write */
+	reconstruct_state_run,			/* expand */
+	reconstruct_state_prexor_drain_result,
+	reconstruct_state_drain_result,
+	reconstruct_state_result,
+};
+
+struct stripe_head {
+	struct hlist_node	hash;
+	struct list_head	lru;	      /* inactive_list or handle_list */
+	struct raid5_private_data *raid_conf;
+	short			generation;	/* increments with every
+						 * reshape */
+	sector_t		sector;		/* sector of this row */
+	short			pd_idx;		/* parity disk index */
+	short			qd_idx;		/* 'Q' disk index for raid6 */
+	short			ddf_layout;/* use DDF ordering to calculate Q */
+	unsigned long		state;		/* state flags */
+	atomic_t		count;	      /* nr of active thread/requests */
+	spinlock_t		lock;
+	int			bm_seq;	/* sequence number for bitmap flushes */
+	int			disks;		/* disks in stripe */
+	enum check_states	check_state;
+	enum reconstruct_states reconstruct_state;
+	/**
+	 * struct stripe_operations
+	 * @target - STRIPE_OP_COMPUTE_BLK target
+	 * @target2 - 2nd compute target in the raid6 case
+	 * @zero_sum_result - P and Q verification flags
+	 * @request - async service request flags for raid_run_ops
+	 */
+	struct stripe_operations {
+		int 		     target, target2;
+		enum sum_check_flags zero_sum_result;
+		#ifdef CONFIG_MULTICORE_RAID456
+		unsigned long	     request;
+		wait_queue_head_t    wait_for_ops;
+		#endif
+	} ops;
+	struct r5dev {
+		struct bio	req;
+		struct bio_vec	vec;
+		struct page	*page;
+		struct bio	*toread, *read, *towrite, *written;
+		sector_t	sector;			/* sector of this page */
+		unsigned long	flags;
+	} dev[1]; /* allocated with extra space depending of RAID geometry */
+};
+
+/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
+ *     for handle_stripe.  It is only valid under spin_lock(sh->lock);
+ */
+struct stripe_head_state {
+	int syncing, expanding, expanded;
+	int locked, uptodate, to_read, to_write, failed, written;
+	int to_fill, compute, req_compute, non_overwrite;
+	int failed_num;
+	unsigned long ops_request;
+};
+
+/* r6_state - extra state data only relevant to r6 */
+struct r6_state {
+	int p_failed, q_failed, failed_num[2];
+};
+
+/* Flags */
+#define	R5_UPTODATE	0	/* page contains current data */
+#define	R5_LOCKED	1	/* IO has been submitted on "req" */
+#define	R5_OVERWRITE	2	/* towrite covers whole page */
+/* and some that are internal to handle_stripe */
+#define	R5_Insync	3	/* rdev && rdev->in_sync at start */
+#define	R5_Wantread	4	/* want to schedule a read */
+#define	R5_Wantwrite	5
+#define	R5_Overlap	7	/* There is a pending overlapping request on this block */
+#define	R5_ReadError	8	/* seen a read error here recently */
+#define	R5_ReWrite	9	/* have tried to over-write the readerror */
+
+#define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Wantcompute	11 /* compute_block in progress treat as
+				    * uptodate
+				    */
+#define	R5_Wantfill	12 /* dev->toread contains a bio that needs
+				    * filling
+				    */
+#define R5_Wantdrain	13 /* dev->towrite needs to be drained */
+/*
+ * Write method
+ */
+#define RECONSTRUCT_WRITE	1
+#define READ_MODIFY_WRITE	2
+/* not a write method, but a compute_parity mode */
+#define	CHECK_PARITY		3
+/* Additional compute_parity mode -- updates the parity w/o LOCKING */
+#define UPDATE_PARITY		4
+
+/*
+ * Stripe state
+ */
+#define STRIPE_HANDLE		2
+#define	STRIPE_SYNCING		3
+#define	STRIPE_INSYNC		4
+#define	STRIPE_PREREAD_ACTIVE	5
+#define	STRIPE_DELAYED		6
+#define	STRIPE_DEGRADED		7
+#define	STRIPE_BIT_DELAY	8
+#define	STRIPE_EXPANDING	9
+#define	STRIPE_EXPAND_SOURCE	10
+#define	STRIPE_EXPAND_READY	11
+#define	STRIPE_IO_STARTED	12 /* do not count towards 'bypass_count' */
+#define	STRIPE_FULL_WRITE	13 /* all blocks are set to be overwritten */
+#define	STRIPE_BIOFILL_RUN	14
+#define	STRIPE_COMPUTE_RUN	15
+#define	STRIPE_OPS_REQ_PENDING	16
+
+/*
+ * Operation request flags
+ */
+#define STRIPE_OP_BIOFILL	0
+#define STRIPE_OP_COMPUTE_BLK	1
+#define STRIPE_OP_PREXOR	2
+#define STRIPE_OP_BIODRAIN	3
+#define STRIPE_OP_RECONSTRUCT	4
+#define STRIPE_OP_CHECK	5
+
+/*
+ * Plugging:
+ *
+ * To improve write throughput, we need to delay the handling of some
+ * stripes until there has been a chance that several write requests
+ * for the one stripe have all been collected.
+ * In particular, any write request that would require pre-reading
+ * is put on a "delayed" queue until there are no stripes currently
+ * in a pre-read phase.  Further, if the "delayed" queue is empty when
+ * a stripe is put on it then we "plug" the queue and do not process it
+ * until an unplug call is made. (the unplug_io_fn() is called).
+ *
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
+ * it to the count of prereading stripes.
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
+ * clear the PREREAD_ACTIVE flag and decrement the count
+ * Whenever the 'handle' queue is empty and the device is not plugged, we
+ * move any strips from delayed to handle and clear the DELAYED flag and set
+ * PREREAD_ACTIVE.
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
+ * HANDLE gets cleared if stripe_handle leave nothing locked.
+ */
+
+
+struct disk_info {
+	mdk_rdev_t	*rdev;
+};
+
+struct raid5_private_data {
+	struct hlist_head	*stripe_hashtbl;
+	mddev_t			*mddev;
+	struct disk_info	*spare;
+	int			chunk_sectors;
+	int			level, algorithm;
+	int			max_degraded;
+	int			raid_disks;
+	int			max_nr_stripes;
+
+	/* reshape_progress is the leading edge of a 'reshape'
+	 * It has value MaxSector when no reshape is happening
+	 * If delta_disks < 0, it is the last sector we started work on,
+	 * else is it the next sector to work on.
+	 */
+	sector_t		reshape_progress;
+	/* reshape_safe is the trailing edge of a reshape.  We know that
+	 * before (or after) this address, all reshape has completed.
+	 */
+	sector_t		reshape_safe;
+	int			previous_raid_disks;
+	int			prev_chunk_sectors;
+	int			prev_algo;
+	short			generation; /* increments with every reshape */
+	unsigned long		reshape_checkpoint; /* Time we last updated
+						     * metadata */
+
+	struct list_head	handle_list; /* stripes needing handling */
+	struct list_head	hold_list; /* preread ready stripes */
+	struct list_head	delayed_list; /* stripes that have plugged requests */
+	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
+	struct bio		*retry_read_aligned; /* currently retrying aligned bios   */
+	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
+	atomic_t		preread_active_stripes; /* stripes with scheduled io */
+	atomic_t		active_aligned_reads;
+	atomic_t		pending_full_writes; /* full write backlog */
+	int			bypass_count; /* bypassed prereads */
+	int			bypass_threshold; /* preread nice */
+	struct list_head	*last_hold; /* detect hold_list promotions */
+
+	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
+	/* unfortunately we need two cache names as we temporarily have
+	 * two caches.
+	 */
+	int			active_name;
+	char			cache_name[2][20];
+	struct kmem_cache		*slab_cache; /* for allocating stripes */
+
+	int			seq_flush, seq_write;
+	int			quiesce;
+
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */
+	/* per cpu variables */
+	struct raid5_percpu {
+		struct page	*spare_page; /* Used when checking P/Q in raid6 */
+		void		*scribble;   /* space for constructing buffer
+					      * lists and performing address
+					      * conversions
+					      */
+	} *percpu;
+	size_t			scribble_len; /* size of scribble region must be
+					       * associated with conf to handle
+					       * cpu hotplug while reshaping
+					       */
+#ifdef CONFIG_HOTPLUG_CPU
+	struct notifier_block	cpu_notify;
+#endif
+
+	/*
+	 * Free stripes pool
+	 */
+	atomic_t		active_stripes;
+	struct list_head	inactive_list;
+	wait_queue_head_t	wait_for_stripe;
+	wait_queue_head_t	wait_for_overlap;
+	int			inactive_blocked;	/* release of inactive stripes blocked,
+							 * waiting for 25% to be free
+							 */
+	int			pool_size; /* number of disks in stripeheads in pool */
+	spinlock_t		device_lock;
+	struct disk_info	*disks;
+
+	/* When taking over an array from a different personality, we store
+	 * the new thread here until we fully activate the array.
+	 */
+	struct mdk_thread_s	*thread;
+};
+
+typedef struct raid5_private_data raid5_conf_t;
+
+/*
+ * Our supported algorithms
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC	0 /* Rotating Parity N with Data Restart */
+#define ALGORITHM_RIGHT_ASYMMETRIC	1 /* Rotating Parity 0 with Data Restart */
+#define ALGORITHM_LEFT_SYMMETRIC	2 /* Rotating Parity N with Data Continuation */
+#define ALGORITHM_RIGHT_SYMMETRIC	3 /* Rotating Parity 0 with Data Continuation */
+
+/* Define non-rotating (raid4) algorithms.  These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0		4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N		5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, or order of datablocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART	8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART	9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE	10 /*DDF PRL=6 RLQ=3 */
+
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6	16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6	17
+#define ALGORITHM_LEFT_SYMMETRIC_6	18
+#define ALGORITHM_RIGHT_SYMMETRIC_6	19
+#define ALGORITHM_PARITY_0_6		20
+#define ALGORITHM_PARITY_N_6		ALGORITHM_PARITY_N
+
+static inline int algorithm_valid_raid5(int layout)
+{
+	return (layout >= 0) &&
+		(layout <= 5);
+}
+static inline int algorithm_valid_raid6(int layout)
+{
+	return (layout >= 0 && layout <= 5)
+		||
+		(layout >= 8 && layout <= 10)
+		||
+		(layout >= 16 && layout <= 20);
+}
+
+static inline int algorithm_is_DDF(int layout)
+{
+	return layout >= 8 && layout <= 10;
+}
+#endif
--- a/kernel/drivers/md/raid6algos.c
+++ b/kernel/drivers/md/raid6algos.c
@@ -0,0 +1,171 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6algos.c
+ *
+ * Algorithm list and algorithm selection for RAID-6
+ */
+
+#include <linux/raid/pq.h>
+#ifndef __KERNEL__
+#include <sys/mman.h>
+#include <stdio.h>
+#else
+#if !RAID6_USE_EMPTY_ZERO_PAGE
+/* In .bss so it's zeroed */
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+EXPORT_SYMBOL(raid6_empty_zero_page);
+#endif
+#endif
+
+struct raid6_calls raid6_call;
+EXPORT_SYMBOL_GPL(raid6_call);
+
+/* Various routine sets */
+extern const struct raid6_calls raid6_intx1;
+extern const struct raid6_calls raid6_intx2;
+extern const struct raid6_calls raid6_intx4;
+extern const struct raid6_calls raid6_intx8;
+extern const struct raid6_calls raid6_intx16;
+extern const struct raid6_calls raid6_intx32;
+extern const struct raid6_calls raid6_mmxx1;
+extern const struct raid6_calls raid6_mmxx2;
+extern const struct raid6_calls raid6_sse1x1;
+extern const struct raid6_calls raid6_sse1x2;
+extern const struct raid6_calls raid6_sse2x1;
+extern const struct raid6_calls raid6_sse2x2;
+extern const struct raid6_calls raid6_sse2x4;
+extern const struct raid6_calls raid6_altivec1;
+extern const struct raid6_calls raid6_altivec2;
+extern const struct raid6_calls raid6_altivec4;
+extern const struct raid6_calls raid6_altivec8;
+
+const struct raid6_calls * const raid6_algos[] = {
+	&raid6_intx1,
+	&raid6_intx2,
+	&raid6_intx4,
+	&raid6_intx8,
+#if defined(__ia64__)
+	&raid6_intx16,
+	&raid6_intx32,
+#endif
+#if defined(__i386__) && !defined(__arch_um__)
+	&raid6_mmxx1,
+	&raid6_mmxx2,
+	&raid6_sse1x1,
+	&raid6_sse1x2,
+	&raid6_sse2x1,
+	&raid6_sse2x2,
+#endif
+#if defined(__x86_64__) && !defined(__arch_um__)
+	&raid6_sse2x1,
+	&raid6_sse2x2,
+	&raid6_sse2x4,
+#endif
+#ifdef CONFIG_ALTIVEC
+	&raid6_altivec1,
+	&raid6_altivec2,
+	&raid6_altivec4,
+	&raid6_altivec8,
+#endif
+	NULL
+};
+
+#ifdef __KERNEL__
+#define RAID6_TIME_JIFFIES_LG2	4
+#else
+/* Need more time to be stable in userspace */
+#define RAID6_TIME_JIFFIES_LG2	9
+#define time_before(x, y) ((x) < (y))
+#endif
+
+/* Try to pick the best algorithm */
+/* This code uses the gfmul table as convenient data set to abuse */
+
+int __init raid6_select_algo(void)
+{
+	const struct raid6_calls * const * algo;
+	const struct raid6_calls * best;
+	char *syndromes;
+	void *dptrs[(65536/PAGE_SIZE)+2];
+	int i, disks;
+	unsigned long perf, bestperf;
+	int bestprefer;
+	unsigned long j0, j1;
+
+	disks = (65536/PAGE_SIZE)+2;
+	for ( i = 0 ; i < disks-2 ; i++ ) {
+		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
+	}
+
+	/* Normal code - use a 2-page allocation to avoid D$ conflict */
+	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
+
+	if ( !syndromes ) {
+		printk("raid6: Yikes!  No memory available.\n");
+		return -ENOMEM;
+	}
+
+	dptrs[disks-2] = syndromes;
+	dptrs[disks-1] = syndromes + PAGE_SIZE;
+
+	bestperf = 0;  bestprefer = 0;  best = NULL;
+
+	for ( algo = raid6_algos ; *algo ; algo++ ) {
+		if ( !(*algo)->valid || (*algo)->valid() ) {
+			perf = 0;
+
+			preempt_disable();
+			j0 = jiffies;
+			while ( (j1 = jiffies) == j0 )
+				cpu_relax();
+			while (time_before(jiffies,
+					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
+				(*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs);
+				perf++;
+			}
+			preempt_enable();
+
+			if ( (*algo)->prefer > bestprefer ||
+			     ((*algo)->prefer == bestprefer &&
+			      perf > bestperf) ) {
+				best = *algo;
+				bestprefer = best->prefer;
+				bestperf = perf;
+			}
+			printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
+			       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
+		}
+	}
+
+	if (best) {
+		printk("raid6: using algorithm %s (%ld MB/s)\n",
+		       best->name,
+		       (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
+		raid6_call = *best;
+	} else
+		printk("raid6: Yikes!  No algorithm found!\n");
+
+	free_pages((unsigned long)syndromes, 1);
+
+	return best ? 0 : -EINVAL;
+}
+
+static void raid6_exit(void)
+{
+	do { } while (0);
+}
+
+subsys_initcall(raid6_select_algo);
+module_exit(raid6_exit);
+MODULE_LICENSE("GPL");
--- a/kernel/drivers/md/raid6altivec.uc
+++ b/kernel/drivers/md/raid6altivec.uc
@@ -0,0 +1,130 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6altivec$#.c
+ *
+ * $#-way unrolled portable integer math RAID-6 instruction set
+ *
+ * This file is postprocessed using unroll.awk
+ *
+ * <benh> hpa: in process,
+ * you can just "steal" the vec unit with enable_kernel_altivec() (but
+ * bracked this with preempt_disable/enable or in a lock)
+ */
+
+#include <linux/raid/pq.h>
+
+#ifdef CONFIG_ALTIVEC
+
+#include <altivec.h>
+#ifdef __KERNEL__
+# include <asm/system.h>
+# include <asm/cputable.h>
+#endif
+
+/*
+ * This is the C data type to use.  We use a vector of
+ * signed char so vec_cmpgt() will generate the right
+ * instruction.
+ */
+
+typedef vector signed char unative_t;
+
+#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
+#define NSIZE	sizeof(unative_t)
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
+{
+	return vec_add(v,v);
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline __attribute_const__ unative_t MASK(unative_t v)
+{
+	unative_t zv = NBYTES(0);
+
+	/* vec_cmpgt returns a vector bool char; thus the need for the cast */
+	return (unative_t)vec_cmpgt(zv, v);
+}
+
+
+/* This is noinline to make damned sure that gcc doesn't move any of the
+   Altivec code around the enable/disable code */
+static void noinline
+raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+	unative_t x1d = NBYTES(0x1d);
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+		wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
+		for ( z = z0-1 ; z >= 0 ; z-- ) {
+			wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			wp$$ = vec_xor(wp$$, wd$$);
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+			w2$$ = vec_and(w2$$, x1d);
+			w1$$ = vec_xor(w1$$, w2$$);
+			wq$$ = vec_xor(w1$$, wd$$);
+		}
+		*(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		*(unative_t *)&q[d+NSIZE*$$] = wq$$;
+	}
+}
+
+static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	preempt_disable();
+	enable_kernel_altivec();
+
+	raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs);
+
+	preempt_enable();
+}
+
+int raid6_have_altivec(void);
+#if $# == 1
+int raid6_have_altivec(void)
+{
+	/* This assumes either all CPUs have Altivec or none does */
+# ifdef __KERNEL__
+	return cpu_has_feature(CPU_FTR_ALTIVEC);
+# else
+	return 1;
+# endif
+}
+#endif
+
+const struct raid6_calls raid6_altivec$# = {
+	raid6_altivec$#_gen_syndrome,
+	raid6_have_altivec,
+	"altivecx$#",
+	0
+};
+
+#endif /* CONFIG_ALTIVEC */
--- a/kernel/drivers/md/raid6int.uc
+++ b/kernel/drivers/md/raid6int.uc
@@ -0,0 +1,117 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6int$#.c
+ *
+ * $#-way unrolled portable integer math RAID-6 instruction set
+ *
+ * This file is postprocessed using unroll.awk
+ */
+
+#include <linux/raid/pq.h>
+
+/*
+ * This is the C data type to use
+ */
+
+/* Change this from BITS_PER_LONG if there is something better... */
+#if BITS_PER_LONG == 64
+# define NBYTES(x) ((x) * 0x0101010101010101UL)
+# define NSIZE  8
+# define NSHIFT 3
+# define NSTRING "64"
+typedef u64 unative_t;
+#else
+# define NBYTES(x) ((x) * 0x01010101U)
+# define NSIZE  4
+# define NSHIFT 2
+# define NSTRING "32"
+typedef u32 unative_t;
+#endif
+
+
+
+/*
+ * IA-64 wants insane amounts of unrolling.  On other architectures that
+ * is just a waste of space.
+ */
+#if ($# <= 8) || defined(__ia64__)
+
+
+/*
+ * These sub-operations are separate inlines since they can sometimes be
+ * specially optimized using architecture-specific hacks.
+ */
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
+{
+	unative_t vv;
+
+	vv = (v << 1) & NBYTES(0xfe);
+	return vv;
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline __attribute_const__ unative_t MASK(unative_t v)
+{
+	unative_t vv;
+
+	vv = v & NBYTES(0x80);
+	vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */
+	return vv;
+}
+
+
+static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+		wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
+		for ( z = z0-1 ; z >= 0 ; z-- ) {
+			wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			wp$$ ^= wd$$;
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+			w2$$ &= NBYTES(0x1d);
+			w1$$ ^= w2$$;
+			wq$$ = w1$$ ^ wd$$;
+		}
+		*(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		*(unative_t *)&q[d+NSIZE*$$] = wq$$;
+	}
+}
+
+const struct raid6_calls raid6_intx$# = {
+	raid6_int$#_gen_syndrome,
+	NULL,		/* always valid */
+	"int" NSTRING "x$#",
+	0
+};
+
+#endif
--- a/kernel/drivers/md/raid6mmx.c
+++ b/kernel/drivers/md/raid6mmx.c
@@ -0,0 +1,142 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6mmx.c
+ *
+ * MMX implementation of RAID-6 syndrome functions
+ */
+
+#if defined(__i386__) && !defined(__arch_um__)
+
+#include <linux/raid/pq.h>
+#include "raid6x86.h"
+
+/* Shared with raid6sse1.c */
+const struct raid6_mmx_constants {
+	u64 x1d;
+} raid6_mmx_constants = {
+	0x1d1d1d1d1d1d1d1dULL,
+};
+
+static int raid6_have_mmx(void)
+{
+	/* Not really "boot_cpu" but "all_cpus" */
+	return boot_cpu_has(X86_FEATURE_MMX);
+}
+
+/*
+ * Plain MMX implementation
+ */
+static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
+	asm volatile("pxor %mm5,%mm5");	/* Zero temp */
+
+	for ( d = 0 ; d < bytes ; d += 8 ) {
+		asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
+		asm volatile("movq %mm2,%mm4");	/* Q[0] */
+		for ( z = z0-1 ; z >= 0 ; z-- ) {
+			asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d]));
+			asm volatile("pcmpgtb %mm4,%mm5");
+			asm volatile("paddb %mm4,%mm4");
+			asm volatile("pand %mm0,%mm5");
+			asm volatile("pxor %mm5,%mm4");
+			asm volatile("pxor %mm5,%mm5");
+			asm volatile("pxor %mm6,%mm2");
+			asm volatile("pxor %mm6,%mm4");
+		}
+		asm volatile("movq %%mm2,%0" : "=m" (p[d]));
+		asm volatile("pxor %mm2,%mm2");
+		asm volatile("movq %%mm4,%0" : "=m" (q[d]));
+		asm volatile("pxor %mm4,%mm4");
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_mmxx1 = {
+	raid6_mmx1_gen_syndrome,
+	raid6_have_mmx,
+	"mmxx1",
+	0
+};
+
+/*
+ * Unrolled-by-2 MMX implementation
+ */
+static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
+	asm volatile("pxor %mm5,%mm5");	/* Zero temp */
+	asm volatile("pxor %mm7,%mm7"); /* Zero temp */
+
+	for ( d = 0 ; d < bytes ; d += 16 ) {
+		asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
+		asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8]));
+		asm volatile("movq %mm2,%mm4"); /* Q[0] */
+		asm volatile("movq %mm3,%mm6"); /* Q[1] */
+		for ( z = z0-1 ; z >= 0 ; z-- ) {
+			asm volatile("pcmpgtb %mm4,%mm5");
+			asm volatile("pcmpgtb %mm6,%mm7");
+			asm volatile("paddb %mm4,%mm4");
+			asm volatile("paddb %mm6,%mm6");
+			asm volatile("pand %mm0,%mm5");
+			asm volatile("pand %mm0,%mm7");
+			asm volatile("pxor %mm5,%mm4");
+			asm volatile("pxor %mm7,%mm6");
+			asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d]));
+			asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
+			asm volatile("pxor %mm5,%mm2");
+			asm volatile("pxor %mm7,%mm3");
+			asm volatile("pxor %mm5,%mm4");
+			asm volatile("pxor %mm7,%mm6");
+			asm volatile("pxor %mm5,%mm5");
+			asm volatile("pxor %mm7,%mm7");
+		}
+		asm volatile("movq %%mm2,%0" : "=m" (p[d]));
+		asm volatile("movq %%mm3,%0" : "=m" (p[d+8]));
+		asm volatile("movq %%mm4,%0" : "=m" (q[d]));
+		asm volatile("movq %%mm6,%0" : "=m" (q[d+8]));
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_mmxx2 = {
+	raid6_mmx2_gen_syndrome,
+	raid6_have_mmx,
+	"mmxx2",
+	0
+};
+
+#endif
--- a/kernel/drivers/md/raid6recov.c
+++ b/kernel/drivers/md/raid6recov.c
@@ -0,0 +1,132 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6recov.c
+ *
+ * RAID-6 data recovery in dual failure mode.  In single failure mode,
+ * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct
+ * the syndrome.)
+ */
+
+#include <linux/raid/pq.h>
+
+/* Recover two failed data blocks. */
+void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+		       void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	u8 px, qx, db;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	   Use the dead data pages as temporary storage for
+	   delta p and delta q */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
+
+	/* Now do it... */
+	while ( bytes-- ) {
+		px    = *p ^ *dp;
+		qx    = qmul[*q ^ *dq];
+		*dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
+		*dp++ = db ^ px; /* Reconstructed A */
+		p++; q++;
+	}
+}
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
+
+/* Recover failure of one data block plus the P block */
+void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	   Use the dead data page as temporary storage for delta q */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	/* Now do it... */
+	while ( bytes-- ) {
+		*p++ ^= *dq = qmul[*q ^ *dq];
+		q++; dq++;
+	}
+}
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
+
+#ifndef __KERNEL__
+/* Testing only */
+
+/* Recover two failed blocks. */
+void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
+{
+	if ( faila > failb ) {
+		int tmp = faila;
+		faila = failb;
+		failb = tmp;
+	}
+
+	if ( failb == disks-1 ) {
+		if ( faila == disks-2 ) {
+			/* P+Q failure.  Just rebuild the syndrome. */
+			raid6_call.gen_syndrome(disks, bytes, ptrs);
+		} else {
+			/* data+Q failure.  Reconstruct data from P,
+			   then rebuild syndrome. */
+			/* NOT IMPLEMENTED - equivalent to RAID-5 */
+		}
+	} else {
+		if ( failb == disks-2 ) {
+			/* data+P failure. */
+			raid6_datap_recov(disks, bytes, faila, ptrs);
+		} else {
+			/* data+data failure. */
+			raid6_2data_recov(disks, bytes, faila, failb, ptrs);
+		}
+	}
+}
+
+#endif
--- a/kernel/drivers/md/raid6sse1.c
+++ b/kernel/drivers/md/raid6sse1.c
@@ -0,0 +1,162 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6sse1.c
+ *
+ * SSE-1/MMXEXT implementation of RAID-6 syndrome functions
+ *
+ * This is really an MMX implementation, but it requires SSE-1 or
+ * AMD MMXEXT for prefetch support and a few other features.  The
+ * support for nontemporal memory accesses is enough to make this
+ * worthwhile as a separate implementation.
+ */
+
+#if defined(__i386__) && !defined(__arch_um__)
+
+#include <linux/raid/pq.h>
+#include "raid6x86.h"
+
+/* Defined in raid6mmx.c */
+extern const struct raid6_mmx_constants {
+	u64 x1d;
+} raid6_mmx_constants;
+
+static int raid6_have_sse1_or_mmxext(void)
+{
+	/* Not really boot_cpu but "all_cpus" */
+	return boot_cpu_has(X86_FEATURE_MMX) &&
+		(boot_cpu_has(X86_FEATURE_XMM) ||
+		 boot_cpu_has(X86_FEATURE_MMXEXT));
+}
+
+/*
+ * Plain SSE1 implementation
+ */
+static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
+	asm volatile("pxor %mm5,%mm5");	/* Zero temp */
+
+	for ( d = 0 ; d < bytes ; d += 8 ) {
+		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+		asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
+		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
+		asm volatile("movq %mm2,%mm4");	/* Q[0] */
+		asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d]));
+		for ( z = z0-2 ; z >= 0 ; z-- ) {
+			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+			asm volatile("pcmpgtb %mm4,%mm5");
+			asm volatile("paddb %mm4,%mm4");
+			asm volatile("pand %mm0,%mm5");
+			asm volatile("pxor %mm5,%mm4");
+			asm volatile("pxor %mm5,%mm5");
+			asm volatile("pxor %mm6,%mm2");
+			asm volatile("pxor %mm6,%mm4");
+			asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d]));
+		}
+		asm volatile("pcmpgtb %mm4,%mm5");
+		asm volatile("paddb %mm4,%mm4");
+		asm volatile("pand %mm0,%mm5");
+		asm volatile("pxor %mm5,%mm4");
+		asm volatile("pxor %mm5,%mm5");
+		asm volatile("pxor %mm6,%mm2");
+		asm volatile("pxor %mm6,%mm4");
+
+		asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
+		asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse1x1 = {
+	raid6_sse11_gen_syndrome,
+	raid6_have_sse1_or_mmxext,
+	"sse1x1",
+	1			/* Has cache hints */
+};
+
+/*
+ * Unrolled-by-2 SSE1 implementation
+ */
+static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
+	asm volatile("pxor %mm5,%mm5");	/* Zero temp */
+	asm volatile("pxor %mm7,%mm7"); /* Zero temp */
+
+	/* We uniformly assume a single prefetch covers at least 16 bytes */
+	for ( d = 0 ; d < bytes ; d += 16 ) {
+		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+		asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
+		asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */
+		asm volatile("movq %mm2,%mm4");	/* Q[0] */
+		asm volatile("movq %mm3,%mm6"); /* Q[1] */
+		for ( z = z0-1 ; z >= 0 ; z-- ) {
+			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+			asm volatile("pcmpgtb %mm4,%mm5");
+			asm volatile("pcmpgtb %mm6,%mm7");
+			asm volatile("paddb %mm4,%mm4");
+			asm volatile("paddb %mm6,%mm6");
+			asm volatile("pand %mm0,%mm5");
+			asm volatile("pand %mm0,%mm7");
+			asm volatile("pxor %mm5,%mm4");
+			asm volatile("pxor %mm7,%mm6");
+			asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d]));
+			asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
+			asm volatile("pxor %mm5,%mm2");
+			asm volatile("pxor %mm7,%mm3");
+			asm volatile("pxor %mm5,%mm4");
+			asm volatile("pxor %mm7,%mm6");
+			asm volatile("pxor %mm5,%mm5");
+			asm volatile("pxor %mm7,%mm7");
+		}
+		asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
+		asm volatile("movntq %%mm3,%0" : "=m" (p[d+8]));
+		asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
+		asm volatile("movntq %%mm6,%0" : "=m" (q[d+8]));
+	}
+
+	asm volatile("sfence" : :: "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse1x2 = {
+	raid6_sse12_gen_syndrome,
+	raid6_have_sse1_or_mmxext,
+	"sse1x2",
+	1			/* Has cache hints */
+};
+
+#endif
--- a/kernel/drivers/md/raid6sse2.c
+++ b/kernel/drivers/md/raid6sse2.c
@@ -0,0 +1,262 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6sse2.c
+ *
+ * SSE-2 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+
+#include <linux/raid/pq.h>
+#include "raid6x86.h"
+
+static const struct raid6_sse_constants {
+	u64 x1d[2];
+} raid6_sse_constants  __attribute__((aligned(16))) = {
+	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
+};
+
+static int raid6_have_sse2(void)
+{
+	/* Not really boot_cpu but "all_cpus" */
+	return boot_cpu_has(X86_FEATURE_MMX) &&
+		boot_cpu_has(X86_FEATURE_FXSR) &&
+		boot_cpu_has(X86_FEATURE_XMM) &&
+		boot_cpu_has(X86_FEATURE_XMM2);
+}
+
+/*
+ * Plain SSE2 implementation
+ */
+static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
+	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
+
+	for ( d = 0 ; d < bytes ; d += 16 ) {
+		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
+		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
+		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
+		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
+		for ( z = z0-2 ; z >= 0 ; z-- ) {
+			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+			asm volatile("pcmpgtb %xmm4,%xmm5");
+			asm volatile("paddb %xmm4,%xmm4");
+			asm volatile("pand %xmm0,%xmm5");
+			asm volatile("pxor %xmm5,%xmm4");
+			asm volatile("pxor %xmm5,%xmm5");
+			asm volatile("pxor %xmm6,%xmm2");
+			asm volatile("pxor %xmm6,%xmm4");
+			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
+		}
+		asm volatile("pcmpgtb %xmm4,%xmm5");
+		asm volatile("paddb %xmm4,%xmm4");
+		asm volatile("pand %xmm0,%xmm5");
+		asm volatile("pxor %xmm5,%xmm4");
+		asm volatile("pxor %xmm5,%xmm5");
+		asm volatile("pxor %xmm6,%xmm2");
+		asm volatile("pxor %xmm6,%xmm4");
+
+		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
+		asm volatile("pxor %xmm2,%xmm2");
+		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
+		asm volatile("pxor %xmm4,%xmm4");
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse2x1 = {
+	raid6_sse21_gen_syndrome,
+	raid6_have_sse2,
+	"sse2x1",
+	1			/* Has cache hints */
+};
+
+/*
+ * Unrolled-by-2 SSE2 implementation
+ */
+static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
+	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
+	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
+
+	/* We uniformly assume a single prefetch covers at least 32 bytes */
+	for ( d = 0 ; d < bytes ; d += 32 ) {
+		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
+		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
+		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
+		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
+		for ( z = z0-1 ; z >= 0 ; z-- ) {
+			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+			asm volatile("pcmpgtb %xmm4,%xmm5");
+			asm volatile("pcmpgtb %xmm6,%xmm7");
+			asm volatile("paddb %xmm4,%xmm4");
+			asm volatile("paddb %xmm6,%xmm6");
+			asm volatile("pand %xmm0,%xmm5");
+			asm volatile("pand %xmm0,%xmm7");
+			asm volatile("pxor %xmm5,%xmm4");
+			asm volatile("pxor %xmm7,%xmm6");
+			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
+			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
+			asm volatile("pxor %xmm5,%xmm2");
+			asm volatile("pxor %xmm7,%xmm3");
+			asm volatile("pxor %xmm5,%xmm4");
+			asm volatile("pxor %xmm7,%xmm6");
+			asm volatile("pxor %xmm5,%xmm5");
+			asm volatile("pxor %xmm7,%xmm7");
+		}
+		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
+		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
+		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
+		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse2x2 = {
+	raid6_sse22_gen_syndrome,
+	raid6_have_sse2,
+	"sse2x2",
+	1			/* Has cache hints */
+};
+
+#endif
+
+#if defined(__x86_64__) && !defined(__arch_um__)
+
+/*
+ * Unrolled-by-4 SSE2 implementation
+ */
+static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
+	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
+	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
+	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */
+	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
+	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */
+	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */
+	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
+	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
+	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */
+	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
+	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */
+	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */
+
+	for ( d = 0 ; d < bytes ; d += 64 ) {
+		for ( z = z0 ; z >= 0 ; z-- ) {
+			/* The second prefetch seems to improve performance... */
+			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
+			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
+			asm volatile("pcmpgtb %xmm4,%xmm5");
+			asm volatile("pcmpgtb %xmm6,%xmm7");
+			asm volatile("pcmpgtb %xmm12,%xmm13");
+			asm volatile("pcmpgtb %xmm14,%xmm15");
+			asm volatile("paddb %xmm4,%xmm4");
+			asm volatile("paddb %xmm6,%xmm6");
+			asm volatile("paddb %xmm12,%xmm12");
+			asm volatile("paddb %xmm14,%xmm14");
+			asm volatile("pand %xmm0,%xmm5");
+			asm volatile("pand %xmm0,%xmm7");
+			asm volatile("pand %xmm0,%xmm13");
+			asm volatile("pand %xmm0,%xmm15");
+			asm volatile("pxor %xmm5,%xmm4");
+			asm volatile("pxor %xmm7,%xmm6");
+			asm volatile("pxor %xmm13,%xmm12");
+			asm volatile("pxor %xmm15,%xmm14");
+			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
+			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
+			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
+			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
+			asm volatile("pxor %xmm5,%xmm2");
+			asm volatile("pxor %xmm7,%xmm3");
+			asm volatile("pxor %xmm13,%xmm10");
+			asm volatile("pxor %xmm15,%xmm11");
+			asm volatile("pxor %xmm5,%xmm4");
+			asm volatile("pxor %xmm7,%xmm6");
+			asm volatile("pxor %xmm13,%xmm12");
+			asm volatile("pxor %xmm15,%xmm14");
+			asm volatile("pxor %xmm5,%xmm5");
+			asm volatile("pxor %xmm7,%xmm7");
+			asm volatile("pxor %xmm13,%xmm13");
+			asm volatile("pxor %xmm15,%xmm15");
+		}
+		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
+		asm volatile("pxor %xmm2,%xmm2");
+		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
+		asm volatile("pxor %xmm3,%xmm3");
+		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
+		asm volatile("pxor %xmm10,%xmm10");
+		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
+		asm volatile("pxor %xmm11,%xmm11");
+		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
+		asm volatile("pxor %xmm4,%xmm4");
+		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
+		asm volatile("pxor %xmm6,%xmm6");
+		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
+		asm volatile("pxor %xmm12,%xmm12");
+		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
+		asm volatile("pxor %xmm14,%xmm14");
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse2x4 = {
+	raid6_sse24_gen_syndrome,
+	raid6_have_sse2,
+	"sse2x4",
+	1			/* Has cache hints */
+};
+
+#endif
--- a/kernel/drivers/md/raid6test/Makefile
+++ b/kernel/drivers/md/raid6test/Makefile
@@ -0,0 +1,75 @@
+#
+# This is a simple Makefile to test some of the RAID-6 code
+# from userspace.
+#
+
+CC	 = gcc
+OPTFLAGS = -O2			# Adjust as desired
+CFLAGS	 = -I.. -I ../../../include -g $(OPTFLAGS)
+LD	 = ld
+AWK	 = awk
+AR	 = ar
+RANLIB	 = ranlib
+
+.c.o:
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+%.c: ../%.c
+	cp -f $< $@
+
+%.uc: ../%.uc
+	cp -f $< $@
+
+all:	raid6.a raid6test
+
+raid6.a: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \
+	 raid6int32.o \
+	 raid6mmx.o raid6sse1.o raid6sse2.o \
+	 raid6altivec1.o raid6altivec2.o raid6altivec4.o raid6altivec8.o \
+	 raid6recov.o raid6algos.o \
+	 raid6tables.o
+	 rm -f $@
+	 $(AR) cq $@ $^
+	 $(RANLIB) $@
+
+raid6test: test.c raid6.a
+	$(CC) $(CFLAGS) -o raid6test $^
+
+raid6altivec1.c: raid6altivec.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=1 < raid6altivec.uc > $@
+
+raid6altivec2.c: raid6altivec.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=2 < raid6altivec.uc > $@
+
+raid6altivec4.c: raid6altivec.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=4 < raid6altivec.uc > $@
+
+raid6altivec8.c: raid6altivec.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=8 < raid6altivec.uc > $@
+
+raid6int1.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=1 < raid6int.uc > $@
+
+raid6int2.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=2 < raid6int.uc > $@
+
+raid6int4.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=4 < raid6int.uc > $@
+
+raid6int8.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=8 < raid6int.uc > $@
+
+raid6int16.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=16 < raid6int.uc > $@
+
+raid6int32.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=32 < raid6int.uc > $@
+
+raid6tables.c: mktables
+	./mktables > raid6tables.c
+
+clean:
+	rm -f *.o *.a mktables mktables.c raid6int.uc raid6*.c raid6test
+
+spotless: clean
+	rm -f *~
--- a/kernel/drivers/md/raid6test/test.c
+++ b/kernel/drivers/md/raid6test/test.c
@@ -0,0 +1,124 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6test.c
+ *
+ * Test RAID-6 recovery with various algorithms
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <linux/raid/pq.h>
+
+#define NDISKS		16	/* Including P and Q */
+
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+struct raid6_calls raid6_call;
+
+char *dataptrs[NDISKS];
+char data[NDISKS][PAGE_SIZE];
+char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
+
+static void makedata(void)
+{
+	int i, j;
+
+	for (i = 0; i < NDISKS; i++) {
+		for (j = 0; j < PAGE_SIZE; j++)
+			data[i][j] = rand();
+
+		dataptrs[i] = data[i];
+	}
+}
+
+static char disk_type(int d)
+{
+	switch (d) {
+	case NDISKS-2:
+		return 'P';
+	case NDISKS-1:
+		return 'Q';
+	default:
+		return 'D';
+	}
+}
+
+static int test_disks(int i, int j)
+{
+	int erra, errb;
+
+	memset(recovi, 0xf0, PAGE_SIZE);
+	memset(recovj, 0xba, PAGE_SIZE);
+
+	dataptrs[i] = recovi;
+	dataptrs[j] = recovj;
+
+	raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs);
+
+	erra = memcmp(data[i], recovi, PAGE_SIZE);
+	errb = memcmp(data[j], recovj, PAGE_SIZE);
+
+	if (i < NDISKS-2 && j == NDISKS-1) {
+		/* We don't implement the DQ failure scenario, since it's
+		   equivalent to a RAID-5 failure (XOR, then recompute Q) */
+		erra = errb = 0;
+	} else {
+		printf("algo=%-8s  faila=%3d(%c)  failb=%3d(%c)  %s\n",
+		       raid6_call.name,
+		       i, disk_type(i),
+		       j, disk_type(j),
+		       (!erra && !errb) ? "OK" :
+		       !erra ? "ERRB" :
+		       !errb ? "ERRA" : "ERRAB");
+	}
+
+	dataptrs[i] = data[i];
+	dataptrs[j] = data[j];
+
+	return erra || errb;
+}
+
+int main(int argc, char *argv[])
+{
+	const struct raid6_calls *const *algo;
+	int i, j;
+	int err = 0;
+
+	makedata();
+
+	for (algo = raid6_algos; *algo; algo++) {
+		if (!(*algo)->valid || (*algo)->valid()) {
+			raid6_call = **algo;
+
+			/* Nuke syndromes */
+			memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+
+			/* Generate assumed good syndrome */
+			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
+						(void **)&dataptrs);
+
+			for (i = 0; i < NDISKS-1; i++)
+				for (j = i+1; j < NDISKS; j++)
+					err += test_disks(i, j);
+		}
+		printf("\n");
+	}
+
+	printf("\n");
+	/* Pick the best algorithm test */
+	raid6_select_algo();
+
+	if (err)
+		printf("\n*** ERRORS FOUND ***\n");
+
+	return err;
+}
--- a/kernel/drivers/md/raid6x86.h
+++ b/kernel/drivers/md/raid6x86.h
@@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6x86.h
+ *
+ * Definitions common to x86 and x86-64 RAID-6 code only
+ */
+
+#ifndef LINUX_RAID_RAID6X86_H
+#define LINUX_RAID_RAID6X86_H
+
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+
+#ifdef __KERNEL__ /* Real code */
+
+#include <asm/i387.h>
+
+#else /* Dummy code for user space testing */
+
+static inline void kernel_fpu_begin(void)
+{
+}
+
+static inline void kernel_fpu_end(void)
+{
+}
+
+#define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions
+					   * (fast save and restore) */
+#define X86_FEATURE_XMM		(0*32+25) /* Streaming SIMD Extensions */
+#define X86_FEATURE_XMM2	(0*32+26) /* Streaming SIMD Extensions-2 */
+#define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
+
+/* Should work well enough on modern CPUs for testing */
+static inline int boot_cpu_has(int flag)
+{
+	u32 eax = (flag >> 5) ? 0x80000001 : 1;
+	u32 edx;
+
+	asm volatile("cpuid"
+		     : "+a" (eax), "=d" (edx)
+		     : : "ecx", "ebx");
+
+	return (edx >> (flag & 31)) & 1;
+}
+
+#endif /* ndef __KERNEL__ */
+
+#endif
+#endif
--- a/kernel/drivers/md/unroll.awk
+++ b/kernel/drivers/md/unroll.awk
@@ -0,0 +1,20 @@
+
+# This filter requires one command line option of form -vN=n
+# where n must be a decimal number.
+#
+# Repeat each input line containing $$ n times, replacing $$ with 0...n-1.
+# Replace each $# with n, and each $* with a single $.
+
+BEGIN {
+	n = N + 0
+}
+{
+	if (/\$\$/) { rep = n } else { rep = 1 }
+	for (i = 0; i < rep; ++i) {
+		tmp = $0
+		gsub(/\$\$/, i, tmp)
+		gsub(/\$\#/, n, tmp)
+		gsub(/\$\*/, "$", tmp)
+		print tmp
+	}
+}