Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6:
  smc911x: Fix external PHY detection
  e1000: allow VLAN devices to use TSO and CSUM offload
  gre: Initialise rtnl_link tunnel parameters properly
  ipvs: Add proper dependencies on IP_VS, and fix description header line.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0d53949..74484e6 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -32,9 +32,9 @@
     you will need to merge your changes with the version from e2fsprogs
     1.41.x.
 
-  - Create a new filesystem using the ext4dev filesystem type:
+  - Create a new filesystem using the ext4 filesystem type:
 
-    	# mke2fs -t ext4dev /dev/hda1
+    	# mke2fs -t ext4 /dev/hda1
 
     Or configure an existing ext3 filesystem to support extents and set
     the test_fs flag to indicate that it's ok for an in-development
@@ -47,13 +47,13 @@
 
         # tune2fs -I 256 /dev/hda1
 
-    (Note: we currently do not have tools to convert an ext4dev
+    (Note: we currently do not have tools to convert an ext4
     filesystem back to ext3; so please do not do try this on production
     filesystems.)
 
   - Mounting:
 
-	# mount -t ext4dev /dev/hda1 /wherever
+	# mount -t ext4 /dev/hda1 /wherever
 
   - When comparing performance with other filesystems, remember that
     ext3/4 by default offers higher data integrity guarantees than most.
@@ -177,6 +177,11 @@
 			your disks are battery-backed in one way or another,
 			disabling barriers may safely improve performance.
 
+inode_readahead=n	This tuning parameter controls the maximum
+			number of inode table blocks that ext4's inode
+			table readahead algorithm will pre-read into
+			the buffer cache.  The default value is 32 blocks.
+
 orlov		(*)	This enables the new Orlov block allocator. It is
 			enabled by default.
 
@@ -252,6 +257,7 @@
 delalloc	(*)	Deferring block allocation until write-out time.
 nodelalloc		Disable delayed allocation. Blocks are allocation
 			when data is copied from user to page cache.
+
 Data Mode
 =========
 There are 3 different data modes:
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
new file mode 100644
index 0000000..1e3defc
--- /dev/null
+++ b/Documentation/filesystems/fiemap.txt
@@ -0,0 +1,228 @@
+============
+Fiemap Ioctl
+============
+
+The fiemap ioctl is an efficient method for userspace to get file
+extent mappings. Instead of block-by-block mapping (such as bmap), fiemap
+returns a list of extents.
+
+
+Request Basics
+--------------
+
+A fiemap request is encoded within struct fiemap:
+
+struct fiemap {
+	__u64	fm_start;	 /* logical offset (inclusive) at
+				  * which to start mapping (in) */
+	__u64	fm_length;	 /* logical length of mapping which
+				  * userspace cares about (in) */
+	__u32	fm_flags;	 /* FIEMAP_FLAG_* flags for request (in/out) */
+	__u32	fm_mapped_extents; /* number of extents that were
+				    * mapped (out) */
+	__u32	fm_extent_count; /* size of fm_extents array (in) */
+	__u32	fm_reserved;
+	struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+
+fm_start, and fm_length specify the logical range within the file
+which the process would like mappings for. Extents returned mirror
+those on disk - that is, the logical offset of the 1st returned extent
+may start before fm_start, and the range covered by the last returned
+extent may end after fm_length. All offsets and lengths are in bytes.
+
+Certain flags to modify the way in which mappings are looked up can be
+set in fm_flags. If the kernel doesn't understand some particular
+flags, it will return EBADR and the contents of fm_flags will contain
+the set of flags which caused the error. If the kernel is compatible
+with all flags passed, the contents of fm_flags will be unmodified.
+It is up to userspace to determine whether rejection of a particular
+flag is fatal to it's operation. This scheme is intended to allow the
+fiemap interface to grow in the future but without losing
+compatibility with old software.
+
+fm_extent_count specifies the number of elements in the fm_extents[] array
+that can be used to return extents.  If fm_extent_count is zero, then the
+fm_extents[] array is ignored (no extents will be returned), and the
+fm_mapped_extents count will hold the number of extents needed in
+fm_extents[] to hold the file's current mapping.  Note that there is
+nothing to prevent the file from changing between calls to FIEMAP.
+
+The following flags can be set in fm_flags:
+
+* FIEMAP_FLAG_SYNC
+If this flag is set, the kernel will sync the file before mapping extents.
+
+* FIEMAP_FLAG_XATTR
+If this flag is set, the extents returned will describe the inodes
+extended attribute lookup tree, instead of it's data tree.
+
+
+Extent Mapping
+--------------
+
+Extent information is returned within the embedded fm_extents array
+which userspace must allocate along with the fiemap structure. The
+number of elements in the fiemap_extents[] array should be passed via
+fm_extent_count. The number of extents mapped by kernel will be
+returned via fm_mapped_extents. If the number of fiemap_extents
+allocated is less than would be required to map the requested range,
+the maximum number of extents that can be mapped in the fm_extent[]
+array will be returned and fm_mapped_extents will be equal to
+fm_extent_count. In that case, the last extent in the array will not
+complete the requested range and will not have the FIEMAP_EXTENT_LAST
+flag set (see the next section on extent flags).
+
+Each extent is described by a single fiemap_extent structure as
+returned in fm_extents.
+
+struct fiemap_extent {
+	__u64	fe_logical;  /* logical offset in bytes for the start of
+			      * the extent */
+	__u64	fe_physical; /* physical offset in bytes for the start
+			      * of the extent */
+	__u64	fe_length;   /* length in bytes for the extent */
+	__u64	fe_reserved64[2];
+	__u32	fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+	__u32	fe_reserved[3];
+};
+
+All offsets and lengths are in bytes and mirror those on disk.  It is valid
+for an extents logical offset to start before the request or it's logical
+length to extend past the request.  Unless FIEMAP_EXTENT_NOT_ALIGNED is
+returned, fe_logical, fe_physical, and fe_length will be aligned to the
+block size of the file system.  With the exception of extents flagged as
+FIEMAP_EXTENT_MERGED, adjacent extents will not be merged.
+
+The fe_flags field contains flags which describe the extent returned.
+A special flag, FIEMAP_EXTENT_LAST is always set on the last extent in
+the file so that the process making fiemap calls can determine when no
+more extents are available, without having to call the ioctl again.
+
+Some flags are intentionally vague and will always be set in the
+presence of other more specific flags. This way a program looking for
+a general property does not have to know all existing and future flags
+which imply that property.
+
+For example, if FIEMAP_EXTENT_DATA_INLINE or FIEMAP_EXTENT_DATA_TAIL
+are set, FIEMAP_EXTENT_NOT_ALIGNED will also be set. A program looking
+for inline or tail-packed data can key on the specific flag. Software
+which simply cares not to try operating on non-aligned extents
+however, can just key on FIEMAP_EXTENT_NOT_ALIGNED, and not have to
+worry about all present and future flags which might imply unaligned
+data. Note that the opposite is not true - it would be valid for
+FIEMAP_EXTENT_NOT_ALIGNED to appear alone.
+
+* FIEMAP_EXTENT_LAST
+This is the last extent in the file. A mapping attempt past this
+extent will return nothing.
+
+* FIEMAP_EXTENT_UNKNOWN
+The location of this extent is currently unknown. This may indicate
+the data is stored on an inaccessible volume or that no storage has
+been allocated for the file yet.
+
+* FIEMAP_EXTENT_DELALLOC
+  - This will also set FIEMAP_EXTENT_UNKNOWN.
+Delayed allocation - while there is data for this extent, it's
+physical location has not been allocated yet.
+
+* FIEMAP_EXTENT_ENCODED
+This extent does not consist of plain filesystem blocks but is
+encoded (e.g. encrypted or compressed).  Reading the data in this
+extent via I/O to the block device will have undefined results.
+
+Note that it is *always* undefined to try to update the data
+in-place by writing to the indicated location without the
+assistance of the filesystem, or to access the data using the
+information returned by the FIEMAP interface while the filesystem
+is mounted.  In other words, user applications may only read the
+extent data via I/O to the block device while the filesystem is
+unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is
+clear; user applications must not try reading or writing to the
+filesystem via the block device under any other circumstances.
+
+* FIEMAP_EXTENT_DATA_ENCRYPTED
+  - This will also set FIEMAP_EXTENT_ENCODED
+The data in this extent has been encrypted by the file system.
+
+* FIEMAP_EXTENT_NOT_ALIGNED
+Extent offsets and length are not guaranteed to be block aligned.
+
+* FIEMAP_EXTENT_DATA_INLINE
+  This will also set FIEMAP_EXTENT_NOT_ALIGNED
+Data is located within a meta data block.
+
+* FIEMAP_EXTENT_DATA_TAIL
+  This will also set FIEMAP_EXTENT_NOT_ALIGNED
+Data is packed into a block with data from other files.
+
+* FIEMAP_EXTENT_UNWRITTEN
+Unwritten extent - the extent is allocated but it's data has not been
+initialized.  This indicates the extent's data will be all zero if read
+through the filesystem but the contents are undefined if read directly from
+the device.
+
+* FIEMAP_EXTENT_MERGED
+This will be set when a file does not support extents, i.e., it uses a block
+based addressing scheme.  Since returning an extent for each block back to
+userspace would be highly inefficient, the kernel will try to merge most
+adjacent blocks into 'extents'.
+
+
+VFS -> File System Implementation
+---------------------------------
+
+File systems wishing to support fiemap must implement a ->fiemap callback on
+their inode_operations structure. The fs ->fiemap call is responsible for
+defining it's set of supported fiemap flags, and calling a helper function on
+each discovered extent:
+
+struct inode_operations {
+       ...
+
+       int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
+                     u64 len);
+
+->fiemap is passed struct fiemap_extent_info which describes the
+fiemap request:
+
+struct fiemap_extent_info {
+	unsigned int fi_flags;		/* Flags as passed from user */
+	unsigned int fi_extents_mapped;	/* Number of mapped extents */
+	unsigned int fi_extents_max;	/* Size of fiemap_extent array */
+	struct fiemap_extent *fi_extents_start;	/* Start of fiemap_extent array */
+};
+
+It is intended that the file system should not need to access any of this
+structure directly.
+
+
+Flag checking should be done at the beginning of the ->fiemap callback via the
+fiemap_check_flags() helper:
+
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
+
+The struct fieinfo should be passed in as recieved from ioctl_fiemap(). The
+set of fiemap flags which the fs understands should be passed via fs_flags. If
+fiemap_check_flags finds invalid user flags, it will place the bad values in
+fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from
+fiemap_check_flags(), it should immediately exit, returning that error back to
+ioctl_fiemap().
+
+
+For each extent in the request range, the file system should call
+the helper function, fiemap_fill_next_extent():
+
+int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
+			    u64 phys, u64 len, u32 flags, u32 dev);
+
+fiemap_fill_next_extent() will use the passed values to populate the
+next free extent in the fm_extents array. 'General' extent flags will
+automatically be set from specific flags on behalf of the calling file
+system so that the userspace API is not broken.
+
+fiemap_fill_next_extent() returns 0 on success, and 1 when the
+user-supplied fm_extents array is full. If an error is encountered
+while copying the extent to user memory, -EFAULT will be returned.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index f566ad9..d831d24 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -923,45 +923,44 @@
 The   "procs_blocked" line gives  the  number of  processes currently blocked,
 waiting for I/O to complete.
 
+
 1.9 Ext4 file system parameters
 ------------------------------
-Ext4 file system have one directory per partition under /proc/fs/ext4/
-# ls /proc/fs/ext4/hdc/
-group_prealloc  max_to_scan  mb_groups  mb_history  min_to_scan  order2_req
-stats  stream_req
 
-mb_groups:
-This file gives the details of multiblock allocator buddy cache of free blocks
+Information about mounted ext4 file systems can be found in
+/proc/fs/ext4.  Each mounted filesystem will have a directory in
+/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
+/proc/fs/ext4/dm-0).   The files in each per-device directory are shown
+in Table 1-10, below.
 
-mb_history:
-Multiblock allocation history.
+Table 1-10: Files in /proc/fs/ext4/<devname>
+..............................................................................
+ File            Content                                        
+ mb_groups       details of multiblock allocator buddy cache of free blocks
+ mb_history      multiblock allocation history
+ stats           controls whether the multiblock allocator should start
+                 collecting statistics, which are shown during the unmount
+ group_prealloc  the multiblock allocator will round up allocation
+                 requests to a multiple of this tuning parameter if the
+                 stripe size is not set in the ext4 superblock
+ max_to_scan     The maximum number of extents the multiblock allocator
+                 will search to find the best extent
+ min_to_scan     The minimum number of extents the multiblock allocator
+                 will search to find the best extent
+ order2_req      Tuning parameter which controls the minimum size for 
+                 requests (as a power of 2) where the buddy cache is
+                 used
+ stream_req      Files which have fewer blocks than this tunable
+                 parameter will have their blocks allocated out of a
+                 block group specific preallocation pool, so that small
+                 files are packed closely together.  Each large file
+                 will have its blocks allocated out of its own unique
+                 preallocation pool.
+inode_readahead  Tuning parameter which controls the maximum number of
+                 inode table blocks that ext4's inode table readahead
+                 algorithm will pre-read into the buffer cache
+..............................................................................
 
-stats:
-This file indicate whether the multiblock allocator should start collecting
-statistics. The statistics are shown during unmount
-
-group_prealloc:
-The multiblock allocator normalize the block allocation request to
-group_prealloc filesystem blocks if we don't have strip value set.
-The stripe value can be specified at mount time or during mke2fs.
-
-max_to_scan:
-How long multiblock allocator can look for a best extent (in found extents)
-
-min_to_scan:
-How long multiblock allocator  must look for a best extent
-
-order2_req:
-Multiblock allocator use  2^N search using buddies only for requests greater
-than or equal to order2_req. The request size is specfied in file system
-blocks. A value of 2 indicate only if the requests are greater than or equal
-to 4 blocks.
-
-stream_req:
-Files smaller than stream_req are served by the stream allocator, whose
-purpose is to pack requests as close each to other as possible to
-produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16
-filesystem block size will use group based preallocation.
 
 ------------------------------------------------------------------------------
 Summary
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2ca9c8f..25efbaf 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1428,6 +1428,12 @@
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
 
+	nox2apic	[X86-64,APIC] Do not enable x2APIC mode.
+
+	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
+			default x2apic cluster mode on platforms
+			supporting x2apic.
+
 	noltlbs		[PPC] Do not use large page/tlb entries for kernel
 			lowmem mapping on PPC40x.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 68781ed..587f418 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1659,9 +1659,10 @@
 S:	Maintained
 
 EXT4 FILE SYSTEM
-P:	Stephen Tweedie, Andrew Morton
-M:	sct@redhat.com, akpm@linux-foundation.org, adilger@sun.com
+P:	Theodore Ts'o
+M:	tytso@mit.edu, adilger@sun.com
 L:	linux-ext4@vger.kernel.org
+W:	http://ext4.wiki.kernel.org
 S:	Maintained
 
 F71805F HARDWARE MONITORING DRIVER
diff --git a/arch/um/sys-x86_64/syscall_table.c b/arch/um/sys-x86_64/syscall_table.c
index c128eb8..32f5fbe 100644
--- a/arch/um/sys-x86_64/syscall_table.c
+++ b/arch/um/sys-x86_64/syscall_table.c
@@ -41,12 +41,12 @@
 #define stub_rt_sigreturn sys_rt_sigreturn
 
 #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_64_UNISTD_H_
+#undef ASM_X86__UNISTD_64_H
 #include <asm-x86/unistd_64.h>
 
 #undef __SYSCALL
 #define __SYSCALL(nr, sym) [ nr ] = sym,
-#undef _ASM_X86_64_UNISTD_H_
+#undef ASM_X86__UNISTD_64_H
 
 typedef void (*sys_call_ptr_t)(void);
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0d7cdbb..44d4f21 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1689,6 +1689,14 @@
 	 workaround will setup a 1:1 mapping for the first
 	 16M to make floppy (an ISA device) work.
 
+config INTR_REMAP
+	bool "Support for Interrupt Remapping (EXPERIMENTAL)"
+	depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
+	help
+	 Supports Interrupt remapping for IO-APIC and MSI devices.
+	 To use x2apic mode in the CPU's which support x2APIC enhancements or
+	 to support platforms with CPU's having > 8 bit APIC ID, say Y.
+
 source "drivers/pci/pcie/Kconfig"
 
 source "drivers/pci/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 60a8576..f8843c3 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -419,6 +419,60 @@
 	def_bool y
 	depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
 
+menuconfig PROCESSOR_SELECT
+	default y
+	bool "Supported processor vendors" if EMBEDDED
+	help
+	  This lets you choose what x86 vendor support code your kernel
+	  will include.
+
+config CPU_SUP_INTEL
+	default y
+	bool "Support Intel processors" if PROCESSOR_SELECT
+	help
+	  This enables extended support for Intel processors
+
+config CPU_SUP_CYRIX_32
+	default y
+	bool "Support Cyrix processors" if PROCESSOR_SELECT
+	depends on !64BIT
+	help
+	  This enables extended support for Cyrix processors
+
+config CPU_SUP_AMD
+	default y
+	bool "Support AMD processors" if PROCESSOR_SELECT
+	help
+	  This enables extended support for AMD processors
+
+config CPU_SUP_CENTAUR_32
+	default y
+	bool "Support Centaur processors" if PROCESSOR_SELECT
+	depends on !64BIT
+	help
+	  This enables extended support for Centaur processors
+
+config CPU_SUP_CENTAUR_64
+	default y
+	bool "Support Centaur processors" if PROCESSOR_SELECT
+	depends on 64BIT
+	help
+	  This enables extended support for Centaur processors
+
+config CPU_SUP_TRANSMETA_32
+	default y
+	bool "Support Transmeta processors" if PROCESSOR_SELECT
+	depends on !64BIT
+	help
+	  This enables extended support for Transmeta processors
+
+config CPU_SUP_UMC_32
+	default y
+	bool "Support UMC processors" if PROCESSOR_SELECT
+	depends on !64BIT
+	help
+	  This enables extended support for UMC processors
+
 config X86_DS
 	bool "Debug Store support"
 	default y
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 75298fe..6ec6bb6 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -59,17 +59,18 @@
 			u32 e = err_flags[i];
 
 			for (j = 0; j < 32; j++) {
-				int n = (i << 5)+j;
-				if (*msg_strs < n) {
+				if (msg_strs[0] < i ||
+				    (msg_strs[0] == i && msg_strs[1] < j)) {
 					/* Skip to the next string */
-					do {
-						msg_strs++;
-					} while (*msg_strs);
-					msg_strs++;
+					msg_strs += 2;
+					while (*msg_strs++)
+						;
 				}
 				if (e & 1) {
-					if (*msg_strs == n && msg_strs[1])
-						printf("%s ", msg_strs+1);
+					if (msg_strs[0] == i &&
+					    msg_strs[1] == j &&
+					    msg_strs[2])
+						printf("%s ", msg_strs+2);
 					else
 						printf("%d:%d ", i, j);
 				}
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index bbe7695..8ef60f2 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -15,33 +15,33 @@
 
 #include <stdio.h>
 
-#include "../kernel/cpu/feature_names.c"
-
-#if NCAPFLAGS > 8
-# error "Need to adjust the boot code handling of CPUID strings"
-#endif
+#include "../kernel/cpu/capflags.c"
 
 int main(void)
 {
-	int i;
+	int i, j;
 	const char *str;
 
 	printf("static const char x86_cap_strs[] = \n");
 
-	for (i = 0; i < NCAPINTS*32; i++) {
-		str = x86_cap_flags[i];
+	for (i = 0; i < NCAPINTS; i++) {
+		for (j = 0; j < 32; j++) {
+			str = x86_cap_flags[i*32+j];
 
-		if (i == NCAPINTS*32-1) {
-			/* The last entry must be unconditional; this
-			   also consumes the compiler-added null character */
-			if (!str)
-				str = "";
-			printf("\t\"\\x%02x\"\"%s\"\n", i, str);
-		} else if (str) {
-			printf("#if REQUIRED_MASK%d & (1 << %d)\n"
-			       "\t\"\\x%02x\"\"%s\\0\"\n"
-			       "#endif\n",
-			       i >> 5, i & 31, i, str);
+			if (i == NCAPINTS-1 && j == 31) {
+				/* The last entry must be unconditional; this
+				   also consumes the compiler-added null
+				   character */
+				if (!str)
+					str = "";
+				printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
+				       i, j, str);
+			} else if (str) {
+				printf("#if REQUIRED_MASK%d & (1 << %d)\n"
+				       "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
+				       "#endif\n",
+				       i, j, i, j, str);
+			}
 		}
 	}
 	printf("\t;\n");
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f1a2ac7..8d64c1b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -179,9 +179,10 @@
 	u32 pretcode;
 	int sig;
 	struct sigcontext_ia32 sc;
-	struct _fpstate_ia32 fpstate;
+	struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
 	unsigned int extramask[_COMPAT_NSIG_WORDS-1];
 	char retcode[8];
+	/* fp state follows here */
 };
 
 struct rt_sigframe
@@ -192,8 +193,8 @@
 	u32 puc;
 	compat_siginfo_t info;
 	struct ucontext_ia32 uc;
-	struct _fpstate_ia32 fpstate;
 	char retcode[8];
+	/* fp state follows here */
 };
 
 #define COPY(x)		{ 		\
@@ -215,7 +216,7 @@
 				   unsigned int *peax)
 {
 	unsigned int tmpflags, gs, oldgs, err = 0;
-	struct _fpstate_ia32 __user *buf;
+	void __user *buf;
 	u32 tmp;
 
 	/* Always make any pending restarted system calls return -EINTR */
@@ -259,26 +260,12 @@
 
 	err |= __get_user(tmp, &sc->fpstate);
 	buf = compat_ptr(tmp);
-	if (buf) {
-		if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-			goto badframe;
-		err |= restore_i387_ia32(buf);
-	} else {
-		struct task_struct *me = current;
-
-		if (used_math()) {
-			clear_fpu(me);
-			clear_used_math();
-		}
-	}
+	err |= restore_i387_xstate_ia32(buf);
 
 	err |= __get_user(tmp, &sc->ax);
 	*peax = tmp;
 
 	return err;
-
-badframe:
-	return 1;
 }
 
 asmlinkage long sys32_sigreturn(struct pt_regs *regs)
@@ -350,7 +337,7 @@
  */
 
 static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
-				 struct _fpstate_ia32 __user *fpstate,
+				 void __user *fpstate,
 				 struct pt_regs *regs, unsigned int mask)
 {
 	int tmp, err = 0;
@@ -380,7 +367,7 @@
 	err |= __put_user((u32)regs->flags, &sc->flags);
 	err |= __put_user((u32)regs->sp, &sc->sp_at_signal);
 
-	tmp = save_i387_ia32(fpstate);
+	tmp = save_i387_xstate_ia32(fpstate);
 	if (tmp < 0)
 		err = -EFAULT;
 	else {
@@ -401,7 +388,8 @@
  * Determine which stack to use..
  */
 static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
-				 size_t frame_size)
+				 size_t frame_size,
+				 void **fpstate)
 {
 	unsigned long sp;
 
@@ -420,6 +408,11 @@
 		 ka->sa.sa_restorer)
 		sp = (unsigned long) ka->sa.sa_restorer;
 
+	if (used_math()) {
+		sp = sp - sig_xstate_ia32_size;
+		*fpstate = (struct _fpstate_ia32 *) sp;
+	}
+
 	sp -= frame_size;
 	/* Align the stack pointer according to the i386 ABI,
 	 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
@@ -433,6 +426,7 @@
 	struct sigframe __user *frame;
 	void __user *restorer;
 	int err = 0;
+	void __user *fpstate = NULL;
 
 	/* copy_to_user optimizes that into a single 8 byte store */
 	static const struct {
@@ -447,7 +441,7 @@
 		0,
 	};
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
@@ -456,8 +450,7 @@
 	if (err)
 		goto give_sigsegv;
 
-	err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs,
-					set->sig[0]);
+	err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]);
 	if (err)
 		goto give_sigsegv;
 
@@ -521,6 +514,7 @@
 	struct rt_sigframe __user *frame;
 	void __user *restorer;
 	int err = 0;
+	void __user *fpstate = NULL;
 
 	/* __copy_to_user optimizes that into a single 8 byte store */
 	static const struct {
@@ -536,7 +530,7 @@
 		0,
 	};
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
@@ -549,13 +543,16 @@
 		goto give_sigsegv;
 
 	/* Create the ucontext.  */
-	err |= __put_user(0, &frame->uc.uc_flags);
+	if (cpu_has_xsave)
+		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+	else
+		err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(0, &frame->uc.uc_link);
 	err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
 	err |= __put_user(sas_ss_flags(regs->sp),
 			  &frame->uc.uc_stack.ss_flags);
 	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
-	err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+	err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
 				     regs, set->sig[0]);
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 	if (err)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3db651f..c9be69f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -38,7 +38,7 @@
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
-obj-y				+= i387.o
+obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
 obj-y				+= ds.o
 obj-$(CONFIG_X86_32)		+= tls.o
@@ -69,6 +69,7 @@
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
+obj-$(CONFIG_X86_ES7000)	+= es7000_32.o
 obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit_32.o
 obj-y				+= vsmp_64.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
@@ -104,6 +105,8 @@
 ifeq ($(CONFIG_X86_64),y)
         obj-y				+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
 	obj-y				+= bios_uv.o
+        obj-y				+= genx2apic_cluster.o
+        obj-y				+= genx2apic_phys.o
         obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
         obj-$(CONFIG_AUDIT)		+= audit_64.o
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7d40ef7..c2ac1b4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -252,10 +252,8 @@
 		return;
 	}
 
-#ifdef CONFIG_X86_32
 	if (boot_cpu_physical_apicid != -1U)
 		ver = apic_version[boot_cpu_physical_apicid];
-#endif
 
 	generic_processor_info(id, ver);
 }
@@ -774,11 +772,9 @@
 
 	set_fixmap_nocache(FIX_APIC_BASE, address);
 	if (boot_cpu_physical_apicid == -1U) {
-		boot_cpu_physical_apicid  = GET_APIC_ID(read_apic_id());
-#ifdef CONFIG_X86_32
+		boot_cpu_physical_apicid  = read_apic_id();
 		apic_version[boot_cpu_physical_apicid] =
 			 GET_APIC_VERSION(apic_read(APIC_LVR));
-#endif
 	}
 }
 
@@ -1350,7 +1346,9 @@
 				acpi_ioapic = 1;
 
 				smp_found_config = 1;
+#ifdef CONFIG_X86_32
 				setup_apic_routing();
+#endif
 			}
 		}
 		if (error == -EINVAL) {
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index f88bd0d..a91c57c 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -60,10 +60,8 @@
 static int force_enable_local_apic;
 int disable_apic;
 
-/* Local APIC timer verification ok */
-static int local_apic_timer_verify_ok;
 /* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int local_apic_timer_disabled;
+static int disable_apic_timer __cpuinitdata;
 /* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -130,7 +128,11 @@
  */
 static inline int lapic_is_integrated(void)
 {
+#ifdef CONFIG_X86_64
+	return 1;
+#else
 	return APIC_INTEGRATED(lapic_get_version());
+#endif
 }
 
 /*
@@ -145,13 +147,18 @@
 	return lapic_get_version() >= 0x14;
 }
 
-void apic_wait_icr_idle(void)
+/*
+ * Paravirt kernels also might be using these below ops. So we still
+ * use generic apic_read()/apic_write(), which might be pointing to different
+ * ops in PARAVIRT case.
+ */
+void xapic_wait_icr_idle(void)
 {
 	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
 		cpu_relax();
 }
 
-u32 safe_apic_wait_icr_idle(void)
+u32 safe_xapic_wait_icr_idle(void)
 {
 	u32 send_status;
 	int timeout;
@@ -167,16 +174,48 @@
 	return send_status;
 }
 
+void xapic_icr_write(u32 low, u32 id)
+{
+	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+	apic_write(APIC_ICR, low);
+}
+
+u64 xapic_icr_read(void)
+{
+	u32 icr1, icr2;
+
+	icr2 = apic_read(APIC_ICR2);
+	icr1 = apic_read(APIC_ICR);
+
+	return icr1 | ((u64)icr2 << 32);
+}
+
+static struct apic_ops xapic_ops = {
+	.read = native_apic_mem_read,
+	.write = native_apic_mem_write,
+	.icr_read = xapic_icr_read,
+	.icr_write = xapic_icr_write,
+	.wait_icr_idle = xapic_wait_icr_idle,
+	.safe_wait_icr_idle = safe_xapic_wait_icr_idle,
+};
+
+struct apic_ops __read_mostly *apic_ops = &xapic_ops;
+EXPORT_SYMBOL_GPL(apic_ops);
+
 /**
  * enable_NMI_through_LVT0 - enable NMI through local vector table 0
  */
 void __cpuinit enable_NMI_through_LVT0(void)
 {
-	unsigned int v = APIC_DM_NMI;
+	unsigned int v;
 
-	/* Level triggered for 82489DX */
+	/* unmask and set to NMI */
+	v = APIC_DM_NMI;
+
+	/* Level triggered for 82489DX (32bit mode) */
 	if (!lapic_is_integrated())
 		v |= APIC_LVT_LEVEL_TRIGGER;
+
 	apic_write(APIC_LVT0, v);
 }
 
@@ -193,9 +232,13 @@
  */
 int lapic_get_maxlvt(void)
 {
-	unsigned int v = apic_read(APIC_LVR);
+	unsigned int v;
 
-	/* 82489DXs do not report # of LVT entries. */
+	v = apic_read(APIC_LVR);
+	/*
+	 * - we always have APIC integrated on 64bit mode
+	 * - 82489DXs do not report # of LVT entries
+	 */
 	return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
 }
 
@@ -203,8 +246,12 @@
  * Local APIC timer
  */
 
-/* Clock divisor is set to 16 */
+/* Clock divisor */
+#ifdef CONFG_X86_64
+#define APIC_DIVISOR 1
+#else
 #define APIC_DIVISOR 16
+#endif
 
 /*
  * This function sets up the local APIC timer, with a timeout of
@@ -212,6 +259,9 @@
  * this function twice on the boot CPU, once with a bogus timeout
  * value, second time for real. The other (noncalibrating) CPUs
  * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
  */
 static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 {
@@ -233,14 +283,44 @@
 	 */
 	tmp_value = apic_read(APIC_TDCR);
 	apic_write(APIC_TDCR,
-		   (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
-		   APIC_TDR_DIV_16);
+		(tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+		APIC_TDR_DIV_16);
 
 	if (!oneshot)
 		apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
 }
 
 /*
+ * Setup extended LVT, AMD specific (K8, family 10h)
+ *
+ * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
+ * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ */
+
+#define APIC_EILVT_LVTOFF_MCE 0
+#define APIC_EILVT_LVTOFF_IBS 1
+
+static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+{
+	unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
+
+	apic_write(reg, v);
+}
+
+u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+{
+	setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
+	return APIC_EILVT_LVTOFF_MCE;
+}
+
+u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+{
+	setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
+	return APIC_EILVT_LVTOFF_IBS;
+}
+
+/*
  * Program the next event, relative to now
  */
 static int lapic_next_event(unsigned long delta,
@@ -259,8 +339,8 @@
 	unsigned long flags;
 	unsigned int v;
 
-	/* Lapic used for broadcast ? */
-	if (!local_apic_timer_verify_ok)
+	/* Lapic used as dummy for broadcast ? */
+	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
 		return;
 
 	local_irq_save(flags);
@@ -473,7 +553,7 @@
 		return -1;
 	}
 
-	local_apic_timer_verify_ok = 1;
+	levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
 
 	/* We trust the pm timer based calibration */
 	if (!pm_referenced) {
@@ -507,11 +587,11 @@
 		if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
 			apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
 		else
-			local_apic_timer_verify_ok = 0;
+			levt->features |= CLOCK_EVT_FEAT_DUMMY;
 	} else
 		local_irq_enable();
 
-	if (!local_apic_timer_verify_ok) {
+	if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
 		printk(KERN_WARNING
 		       "APIC timer disabled due to verification failure.\n");
 			return -1;
@@ -533,7 +613,8 @@
 	 * timer as a dummy clock event source on SMP systems, so the
 	 * broadcast mechanism is used. On UP systems simply ignore it.
 	 */
-	if (local_apic_timer_disabled) {
+	if (disable_apic_timer) {
+		printk(KERN_INFO "Disabling APIC timer\n");
 		/* No broadcast on UP ! */
 		if (num_possible_cpus() > 1) {
 			lapic_clockevent.mult = 1;
@@ -602,7 +683,11 @@
 	/*
 	 * the NMI deadlock-detector uses this.
 	 */
+#ifdef CONFIG_X86_64
+	add_pda(apic_timer_irqs, 1);
+#else
 	per_cpu(irq_stat, cpu).apic_timer_irqs++;
+#endif
 
 	evt->event_handler(evt);
 }
@@ -642,35 +727,6 @@
 }
 
 /*
- * Setup extended LVT, AMD specific (K8, family 10h)
- *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
- */
-
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
-{
-	unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
-	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
-	apic_write(reg, v);
-}
-
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
-	setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
-	return APIC_EILVT_LVTOFF_MCE;
-}
-
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
-	setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
-	return APIC_EILVT_LVTOFF_IBS;
-}
-
-/*
  * Local APIC start and shutdown
  */
 
@@ -715,7 +771,7 @@
 	}
 
 	/* lets not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -732,10 +788,6 @@
 	if (maxlvt >= 4)
 		apic_write(APIC_LVTPC, APIC_LVT_MASKED);
 
-#ifdef CONFIG_X86_MCE_P4THERMAL
-	if (maxlvt >= 5)
-		apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
-#endif
 	/* Integrated APIC (!82489DX) ? */
 	if (lapic_is_integrated()) {
 		if (maxlvt > 3)
@@ -750,7 +802,7 @@
  */
 void disable_local_APIC(void)
 {
-	unsigned long value;
+	unsigned int value;
 
 	clear_local_APIC();
 
@@ -762,6 +814,7 @@
 	value &= ~APIC_SPIV_APIC_ENABLED;
 	apic_write(APIC_SPIV, value);
 
+#ifdef CONFIG_X86_32
 	/*
 	 * When LAPIC was disabled by the BIOS and enabled by the kernel,
 	 * restore the disabled state.
@@ -773,6 +826,7 @@
 		l &= ~MSR_IA32_APICBASE_ENABLE;
 		wrmsr(MSR_IA32_APICBASE, l, h);
 	}
+#endif
 }
 
 /*
@@ -789,11 +843,15 @@
 		return;
 
 	local_irq_save(flags);
-	clear_local_APIC();
 
-	if (enabled_via_apicbase)
+#ifdef CONFIG_X86_32
+	if (!enabled_via_apicbase)
+		clear_local_APIC();
+	else
+#endif
 		disable_local_APIC();
 
+
 	local_irq_restore(flags);
 }
 
@@ -838,6 +896,12 @@
 	 */
 	reg0 = apic_read(APIC_ID);
 	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+	apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+	reg1 = apic_read(APIC_ID);
+	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
+	apic_write(APIC_ID, reg0);
+	if (reg1 != (reg0 ^ APIC_ID_MASK))
+		return 0;
 
 	/*
 	 * The next two are just to see if we have sane values.
@@ -863,14 +927,15 @@
 	 */
 	if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
 		return;
+
 	/*
 	 * Wait for idle.
 	 */
 	apic_wait_icr_idle();
 
 	apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-	apic_write(APIC_ICR,
-		   APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
+	apic_write(APIC_ICR, APIC_DEST_ALLINC |
+			APIC_INT_LEVELTRIG | APIC_DM_INIT);
 }
 
 /*
@@ -878,7 +943,7 @@
  */
 void __init init_bsp_APIC(void)
 {
-	unsigned long value;
+	unsigned int value;
 
 	/*
 	 * Don't do the setup now if we have a SMP BIOS as the
@@ -899,11 +964,13 @@
 	value &= ~APIC_VECTOR_MASK;
 	value |= APIC_SPIV_APIC_ENABLED;
 
+#ifdef CONFIG_X86_32
 	/* This bit is reserved on P4/Xeon and should be cleared */
 	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
 	    (boot_cpu_data.x86 == 15))
 		value &= ~APIC_SPIV_FOCUS_DISABLED;
 	else
+#endif
 		value |= APIC_SPIV_FOCUS_DISABLED;
 	value |= SPURIOUS_APIC_VECTOR;
 	apic_write(APIC_SPIV, value);
@@ -922,6 +989,16 @@
 {
 	unsigned long oldvalue, value, maxlvt;
 	if (lapic_is_integrated() && !esr_disable) {
+		if (esr_disable) {
+			/*
+			 * Something untraceable is creating bad interrupts on
+			 * secondary quads ... for the moment, just leave the
+			 * ESR disabled - we can't do anything useful with the
+			 * errors anyway - mbligh
+			 */
+			printk(KERN_INFO "Leaving ESR disabled.\n");
+			return;
+		}
 		/* !82489DX */
 		maxlvt = lapic_get_maxlvt();
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
@@ -942,16 +1019,7 @@
 				"vector: 0x%08lx  after: 0x%08lx\n",
 				oldvalue, value);
 	} else {
-		if (esr_disable)
-			/*
-			 * Something untraceable is creating bad interrupts on
-			 * secondary quads ... for the moment, just leave the
-			 * ESR disabled - we can't do anything useful with the
-			 * errors anyway - mbligh
-			 */
-			printk(KERN_INFO "Leaving ESR disabled.\n");
-		else
-			printk(KERN_INFO "No ESR for 82489DX.\n");
+		printk(KERN_INFO "No ESR for 82489DX.\n");
 	}
 }
 
@@ -1089,13 +1157,17 @@
 
 void __cpuinit end_local_APIC_setup(void)
 {
-	unsigned long value;
-
 	lapic_setup_esr();
-	/* Disable the local apic timer */
-	value = apic_read(APIC_LVTT);
-	value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-	apic_write(APIC_LVTT, value);
+
+#ifdef CONFIG_X86_32
+	{
+		unsigned int value;
+		/* Disable the local apic timer */
+		value = apic_read(APIC_LVTT);
+		value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+		apic_write(APIC_LVTT, value);
+	}
+#endif
 
 	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
@@ -1205,7 +1277,7 @@
 	 * default configuration (or the MP table is broken).
 	 */
 	if (boot_cpu_physical_apicid == -1U)
-		boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+		boot_cpu_physical_apicid = read_apic_id();
 
 }
 
@@ -1242,7 +1314,7 @@
 	 * might be zero if read from MP tables. Get it from LAPIC.
 	 */
 #ifdef CONFIG_CRASH_DUMP
-	boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+	boot_cpu_physical_apicid = read_apic_id();
 #endif
 	physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 
@@ -1321,59 +1393,12 @@
 	irq_exit();
 }
 
-#ifdef CONFIG_SMP
-void __init smp_intr_init(void)
-{
-	/*
-	 * IRQ0 must be given a fixed assignment and initialized,
-	 * because it's used before the IO-APIC is set up.
-	 */
-	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
-
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPI for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* IPI for single call function */
-	set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-				call_function_single_interrupt);
-}
-#endif
-
-/*
- * Initialize APIC interrupts
- */
-void __init apic_intr_init(void)
-{
-#ifdef CONFIG_SMP
-	smp_intr_init();
-#endif
-	/* self generated IPI for local APIC timer */
-	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-	/* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-}
-
 /**
  * connect_bsp_APIC - attach the APIC to the interrupt system
  */
 void __init connect_bsp_APIC(void)
 {
+#ifdef CONFIG_X86_32
 	if (pic_mode) {
 		/*
 		 * Do not trust the local APIC being empty at bootup.
@@ -1388,6 +1413,7 @@
 		outb(0x70, 0x22);
 		outb(0x01, 0x23);
 	}
+#endif
 	enable_apic_mode();
 }
 
@@ -1400,6 +1426,9 @@
  */
 void disconnect_bsp_APIC(int virt_wire_setup)
 {
+	unsigned int value;
+
+#ifdef CONFIG_X86_32
 	if (pic_mode) {
 		/*
 		 * Put the board back into PIC mode (has an effect only on
@@ -1411,54 +1440,53 @@
 				"entering PIC mode.\n");
 		outb(0x70, 0x22);
 		outb(0x00, 0x23);
-	} else {
-		/* Go back to Virtual Wire compatibility mode */
-		unsigned long value;
+		return;
+	}
+#endif
 
-		/* For the spurious interrupt use vector F, and enable it */
-		value = apic_read(APIC_SPIV);
-		value &= ~APIC_VECTOR_MASK;
-		value |= APIC_SPIV_APIC_ENABLED;
-		value |= 0xf;
-		apic_write(APIC_SPIV, value);
+	/* Go back to Virtual Wire compatibility mode */
 
-		if (!virt_wire_setup) {
-			/*
-			 * For LVT0 make it edge triggered, active high,
-			 * external and enabled
-			 */
-			value = apic_read(APIC_LVT0);
-			value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-				APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-				APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-			value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-			value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-			apic_write(APIC_LVT0, value);
-		} else {
-			/* Disable LVT0 */
-			apic_write(APIC_LVT0, APIC_LVT_MASKED);
-		}
+	/* For the spurious interrupt use vector F, and enable it */
+	value = apic_read(APIC_SPIV);
+	value &= ~APIC_VECTOR_MASK;
+	value |= APIC_SPIV_APIC_ENABLED;
+	value |= 0xf;
+	apic_write(APIC_SPIV, value);
 
+	if (!virt_wire_setup) {
 		/*
-		 * For LVT1 make it edge triggered, active high, nmi and
-		 * enabled
+		 * For LVT0 make it edge triggered, active high,
+		 * external and enabled
 		 */
-		value = apic_read(APIC_LVT1);
-		value &= ~(
-			APIC_MODE_MASK | APIC_SEND_PENDING |
+		value = apic_read(APIC_LVT0);
+		value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
 			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
 			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
 		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-		apic_write(APIC_LVT1, value);
+		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+		apic_write(APIC_LVT0, value);
+	} else {
+		/* Disable LVT0 */
+		apic_write(APIC_LVT0, APIC_LVT_MASKED);
 	}
+
+	/*
+	 * For LVT1 make it edge triggered, active high,
+	 * nmi and enabled
+	 */
+	value = apic_read(APIC_LVT1);
+	value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+	value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+	value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+	apic_write(APIC_LVT1, value);
 }
 
 void __cpuinit generic_processor_info(int apicid, int version)
 {
 	int cpu;
 	cpumask_t tmp_map;
-	physid_mask_t phys_cpu;
 
 	/*
 	 * Validate version
@@ -1471,9 +1499,6 @@
 	}
 	apic_version[apicid] = version;
 
-	phys_cpu = apicid_to_cpu_present(apicid);
-	physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
-
 	if (num_processors >= NR_CPUS) {
 		printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
 			"  Processor ignored.\n", NR_CPUS);
@@ -1484,17 +1509,19 @@
 	cpus_complement(tmp_map, cpu_present_map);
 	cpu = first_cpu(tmp_map);
 
-	if (apicid == boot_cpu_physical_apicid)
+	physid_set(apicid, phys_cpu_present_map);
+	if (apicid == boot_cpu_physical_apicid) {
 		/*
 		 * x86_bios_cpu_apicid is required to have processors listed
 		 * in same order as logical cpu numbers. Hence the first
 		 * entry is BSP, and so on.
 		 */
 		cpu = 0;
-
+	}
 	if (apicid > max_physical_apicid)
 		max_physical_apicid = apicid;
 
+#ifdef CONFIG_X86_32
 	/*
 	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
 	 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1514,7 +1541,9 @@
 			def_to_bigsmp = 1;
 		}
 	}
-#ifdef CONFIG_SMP
+#endif
+
+#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
 	/* are we being called early in kernel startup? */
 	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
 		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1527,6 +1556,7 @@
 		per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 	}
 #endif
+
 	cpu_set(cpu, cpu_possible_map);
 	cpu_set(cpu, cpu_present_map);
 }
@@ -1537,6 +1567,11 @@
 #ifdef CONFIG_PM
 
 static struct {
+	/*
+	 * 'active' is true if the local APIC was enabled by us and
+	 * not the BIOS; this signifies that we are also responsible
+	 * for disabling it before entering apm/acpi suspend
+	 */
 	int active;
 	/* r/w apic fields */
 	unsigned int apic_id;
@@ -1577,7 +1612,7 @@
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
 	if (maxlvt >= 5)
 		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
 #endif
@@ -1601,16 +1636,23 @@
 
 	local_irq_save(flags);
 
-	/*
-	 * Make sure the APICBASE points to the right address
-	 *
-	 * FIXME! This will be wrong if we ever support suspend on
-	 * SMP! We'll need to do this as part of the CPU restore!
-	 */
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	l &= ~MSR_IA32_APICBASE_BASE;
-	l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-	wrmsr(MSR_IA32_APICBASE, l, h);
+#ifdef CONFIG_X86_64
+	if (x2apic)
+		enable_x2apic();
+	else
+#endif
+	{
+		/*
+		 * Make sure the APICBASE points to the right address
+		 *
+		 * FIXME! This will be wrong if we ever support suspend on
+		 * SMP! We'll need to do this as part of the CPU restore!
+		 */
+		rdmsr(MSR_IA32_APICBASE, l, h);
+		l &= ~MSR_IA32_APICBASE_BASE;
+		l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+		wrmsr(MSR_IA32_APICBASE, l, h);
+	}
 
 	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
 	apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1620,7 +1662,7 @@
 	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
 	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
 	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
 	if (maxlvt >= 5)
 		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
 #endif
@@ -1634,7 +1676,9 @@
 	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
+
 	local_irq_restore(flags);
+
 	return 0;
 }
 
@@ -1690,20 +1734,20 @@
 }
 early_param("lapic", parse_lapic);
 
-static int __init parse_nolapic(char *arg)
+static int __init setup_disableapic(char *arg)
 {
 	disable_apic = 1;
 	setup_clear_cpu_cap(X86_FEATURE_APIC);
 	return 0;
 }
-early_param("nolapic", parse_nolapic);
+early_param("disableapic", setup_disableapic);
 
-static int __init parse_disable_lapic_timer(char *arg)
+/* same as disableapic, for compatibility */
+static int __init setup_nolapic(char *arg)
 {
-	local_apic_timer_disabled = 1;
-	return 0;
+	return setup_disableapic(arg);
 }
-early_param("nolapic_timer", parse_disable_lapic_timer);
+early_param("nolapic", setup_nolapic);
 
 static int __init parse_lapic_timer_c2_ok(char *arg)
 {
@@ -1712,15 +1756,40 @@
 }
 early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
 
+static int __init parse_disable_apic_timer(char *arg)
+{
+	disable_apic_timer = 1;
+	return 0;
+}
+early_param("noapictimer", parse_disable_apic_timer);
+
+static int __init parse_nolapic_timer(char *arg)
+{
+	disable_apic_timer = 1;
+	return 0;
+}
+early_param("nolapic_timer", parse_nolapic_timer);
+
 static int __init apic_set_verbosity(char *arg)
 {
-	if (!arg)
+	if (!arg)  {
+#ifdef CONFIG_X86_64
+		skip_ioapic_setup = 0;
+		ioapic_force = 1;
+		return 0;
+#endif
 		return -EINVAL;
+	}
 
-	if (strcmp(arg, "debug") == 0)
+	if (strcmp("debug", arg) == 0)
 		apic_verbosity = APIC_DEBUG;
-	else if (strcmp(arg, "verbose") == 0)
+	else if (strcmp("verbose", arg) == 0)
 		apic_verbosity = APIC_VERBOSE;
+	else {
+		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+			" use apic=verbose or apic=debug\n", arg);
+		return -EINVAL;
+	}
 
 	return 0;
 }
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 446c062..53898b6 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -27,6 +27,7 @@
 #include <linux/clockchips.h>
 #include <linux/acpi_pmtmr.h>
 #include <linux/module.h>
+#include <linux/dmar.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -39,13 +40,20 @@
 #include <asm/proto.h>
 #include <asm/timex.h>
 #include <asm/apic.h>
+#include <asm/i8259.h>
 
 #include <mach_ipi.h>
 #include <mach_apic.h>
 
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
 static int disable_apic_timer __cpuinitdata;
 static int apic_calibrate_pmtmr __initdata;
 int disable_apic;
+int disable_x2apic;
+int x2apic;
+
+/* x2apic enabled before OS handover */
+int x2apic_preenabled;
 
 /* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
@@ -73,6 +81,9 @@
 static void lapic_timer_broadcast(cpumask_t mask);
 static void apic_pm_activate(void);
 
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
 static struct clock_event_device lapic_clockevent = {
 	.name		= "lapic",
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
@@ -99,11 +110,15 @@
 }
 
 /*
- * Check, if the APIC is integrated or a seperate chip
+ * Check, if the APIC is integrated or a separate chip
  */
 static inline int lapic_is_integrated(void)
 {
+#ifdef CONFIG_X86_64
 	return 1;
+#else
+	return APIC_INTEGRATED(lapic_get_version());
+#endif
 }
 
 /*
@@ -118,13 +133,18 @@
 	return lapic_get_version() >= 0x14;
 }
 
-void apic_wait_icr_idle(void)
+/*
+ * Paravirt kernels also might be using these below ops. So we still
+ * use generic apic_read()/apic_write(), which might be pointing to different
+ * ops in PARAVIRT case.
+ */
+void xapic_wait_icr_idle(void)
 {
 	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
 		cpu_relax();
 }
 
-u32 safe_apic_wait_icr_idle(void)
+u32 safe_xapic_wait_icr_idle(void)
 {
 	u32 send_status;
 	int timeout;
@@ -140,6 +160,68 @@
 	return send_status;
 }
 
+void xapic_icr_write(u32 low, u32 id)
+{
+	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+	apic_write(APIC_ICR, low);
+}
+
+u64 xapic_icr_read(void)
+{
+	u32 icr1, icr2;
+
+	icr2 = apic_read(APIC_ICR2);
+	icr1 = apic_read(APIC_ICR);
+
+	return icr1 | ((u64)icr2 << 32);
+}
+
+static struct apic_ops xapic_ops = {
+	.read = native_apic_mem_read,
+	.write = native_apic_mem_write,
+	.icr_read = xapic_icr_read,
+	.icr_write = xapic_icr_write,
+	.wait_icr_idle = xapic_wait_icr_idle,
+	.safe_wait_icr_idle = safe_xapic_wait_icr_idle,
+};
+
+struct apic_ops __read_mostly *apic_ops = &xapic_ops;
+EXPORT_SYMBOL_GPL(apic_ops);
+
+static void x2apic_wait_icr_idle(void)
+{
+	/* no need to wait for icr idle in x2apic */
+	return;
+}
+
+static u32 safe_x2apic_wait_icr_idle(void)
+{
+	/* no need to wait for icr idle in x2apic */
+	return 0;
+}
+
+void x2apic_icr_write(u32 low, u32 id)
+{
+	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
+}
+
+u64 x2apic_icr_read(void)
+{
+	unsigned long val;
+
+	rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
+	return val;
+}
+
+static struct apic_ops x2apic_ops = {
+	.read = native_apic_msr_read,
+	.write = native_apic_msr_write,
+	.icr_read = x2apic_icr_read,
+	.icr_write = x2apic_icr_write,
+	.wait_icr_idle = x2apic_wait_icr_idle,
+	.safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
+};
+
 /**
  * enable_NMI_through_LVT0 - enable NMI through local vector table 0
  */
@@ -149,6 +231,11 @@
 
 	/* unmask and set to NMI */
 	v = APIC_DM_NMI;
+
+	/* Level triggered for 82489DX (32bit mode) */
+	if (!lapic_is_integrated())
+		v |= APIC_LVT_LEVEL_TRIGGER;
+
 	apic_write(APIC_LVT0, v);
 }
 
@@ -157,14 +244,28 @@
  */
 int lapic_get_maxlvt(void)
 {
-	unsigned int v, maxlvt;
+	unsigned int v;
 
 	v = apic_read(APIC_LVR);
-	maxlvt = GET_APIC_MAXLVT(v);
-	return maxlvt;
+	/*
+	 * - we always have APIC integrated on 64bit mode
+	 * - 82489DXs do not report # of LVT entries
+	 */
+	return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
 }
 
 /*
+ * Local APIC timer
+ */
+
+/* Clock divisor */
+#ifdef CONFG_X86_64
+#define APIC_DIVISOR 1
+#else
+#define APIC_DIVISOR 16
+#endif
+
+/*
  * This function sets up the local APIC timer, with a timeout of
  * 'clocks' APIC bus clock. During calibration we actually call
  * this function twice on the boot CPU, once with a bogus timeout
@@ -174,7 +275,6 @@
  * We do reads before writes even if unnecessary, to get around the
  * P5 APIC double write bug.
  */
-
 static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 {
 	unsigned int lvtt_value, tmp_value;
@@ -182,6 +282,9 @@
 	lvtt_value = LOCAL_TIMER_VECTOR;
 	if (!oneshot)
 		lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+	if (!lapic_is_integrated())
+		lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+
 	if (!irqen)
 		lvtt_value |= APIC_LVT_MASKED;
 
@@ -191,12 +294,12 @@
 	 * Divide PICLK by 16
 	 */
 	tmp_value = apic_read(APIC_TDCR);
-	apic_write(APIC_TDCR, (tmp_value
-				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-				| APIC_TDR_DIV_16);
+	apic_write(APIC_TDCR,
+		(tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+		APIC_TDR_DIV_16);
 
 	if (!oneshot)
-		apic_write(APIC_TMICT, clocks);
+		apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
 }
 
 /*
@@ -366,7 +469,7 @@
 	lapic_clockevent.min_delta_ns =
 		clockevent_delta2ns(0xF, &lapic_clockevent);
 
-	calibration_result = result / HZ;
+	calibration_result = (result * APIC_DIVISOR) / HZ;
 
 	/*
 	 * Do a sanity check on the APIC calibration result
@@ -388,10 +491,10 @@
 void __init setup_boot_APIC_clock(void)
 {
 	/*
-	 * The local apic timer can be disabled via the kernel commandline.
-	 * Register the lapic timer as a dummy clock event source on SMP
-	 * systems, so the broadcast mechanism is used. On UP systems simply
-	 * ignore it.
+	 * The local apic timer can be disabled via the kernel
+	 * commandline or from the CPU detection code. Register the lapic
+	 * timer as a dummy clock event source on SMP systems, so the
+	 * broadcast mechanism is used. On UP systems simply ignore it.
 	 */
 	if (disable_apic_timer) {
 		printk(KERN_INFO "Disabling APIC timer\n");
@@ -403,7 +506,9 @@
 		return;
 	}
 
-	printk(KERN_INFO "Using local APIC timer interrupts.\n");
+	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+		    "calibrating APIC timer ...\n");
+
 	if (calibrate_APIC_clock()) {
 		/* No broadcast on UP ! */
 		if (num_possible_cpus() > 1)
@@ -422,6 +527,7 @@
 		printk(KERN_WARNING "APIC timer registered as dummy,"
 			" due to nmi_watchdog=%d!\n", nmi_watchdog);
 
+	/* Setup the lapic or request the broadcast */
 	setup_APIC_timer();
 }
 
@@ -460,7 +566,11 @@
 	/*
 	 * the NMI deadlock-detector uses this.
 	 */
+#ifdef CONFIG_X86_64
 	add_pda(apic_timer_irqs, 1);
+#else
+	per_cpu(irq_stat, cpu).apic_timer_irqs++;
+#endif
 
 	evt->event_handler(evt);
 }
@@ -491,6 +601,7 @@
 	irq_enter();
 	local_apic_timer_interrupt();
 	irq_exit();
+
 	set_irq_regs(old_regs);
 }
 
@@ -544,6 +655,13 @@
 		apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
 	}
 
+	/* lets not touch this if we didn't frob it */
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
+	if (maxlvt >= 5) {
+		v = apic_read(APIC_LVTTHMR);
+		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+	}
+#endif
 	/*
 	 * Clean APIC state for other OSs:
 	 */
@@ -554,8 +672,14 @@
 		apic_write(APIC_LVTERR, APIC_LVT_MASKED);
 	if (maxlvt >= 4)
 		apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-	apic_write(APIC_ESR, 0);
-	apic_read(APIC_ESR);
+
+	/* Integrated APIC (!82489DX) ? */
+	if (lapic_is_integrated()) {
+		if (maxlvt > 3)
+			/* Clear ESR due to Pentium errata 3AP and 11AP */
+			apic_write(APIC_ESR, 0);
+		apic_read(APIC_ESR);
+	}
 }
 
 /**
@@ -574,8 +698,28 @@
 	value = apic_read(APIC_SPIV);
 	value &= ~APIC_SPIV_APIC_ENABLED;
 	apic_write(APIC_SPIV, value);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * When LAPIC was disabled by the BIOS and enabled by the kernel,
+	 * restore the disabled state.
+	 */
+	if (enabled_via_apicbase) {
+		unsigned int l, h;
+
+		rdmsr(MSR_IA32_APICBASE, l, h);
+		l &= ~MSR_IA32_APICBASE_ENABLE;
+		wrmsr(MSR_IA32_APICBASE, l, h);
+	}
+#endif
 }
 
+/*
+ * If Linux enabled the LAPIC against the BIOS default disable it down before
+ * re-entering the BIOS on shutdown.  Otherwise the BIOS may get confused and
+ * not power-off.  Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ */
 void lapic_shutdown(void)
 {
 	unsigned long flags;
@@ -585,7 +729,13 @@
 
 	local_irq_save(flags);
 
-	disable_local_APIC();
+#ifdef CONFIG_X86_32
+	if (!enabled_via_apicbase)
+		clear_local_APIC();
+	else
+#endif
+		disable_local_APIC();
+
 
 	local_irq_restore(flags);
 }
@@ -629,10 +779,10 @@
 	/*
 	 * The ID register is read/write in a real APIC.
 	 */
-	reg0 = read_apic_id();
+	reg0 = apic_read(APIC_ID);
 	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
 	apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
-	reg1 = read_apic_id();
+	reg1 = apic_read(APIC_ID);
 	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
 	apic_write(APIC_ID, reg0);
 	if (reg1 != (reg0 ^ APIC_ID_MASK))
@@ -656,8 +806,11 @@
  */
 void __init sync_Arb_IDs(void)
 {
-	/* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
-	if (modern_apic())
+	/*
+	 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
+	 * needed on AMD.
+	 */
+	if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
 		return;
 
 	/*
@@ -666,8 +819,8 @@
 	apic_wait_icr_idle();
 
 	apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-	apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
-				| APIC_DM_INIT);
+	apic_write(APIC_ICR, APIC_DEST_ALLINC |
+			APIC_INT_LEVELTRIG | APIC_DM_INIT);
 }
 
 /*
@@ -684,8 +837,6 @@
 	if (smp_found_config || !cpu_has_apic)
 		return;
 
-	value = apic_read(APIC_LVR);
-
 	/*
 	 * Do not trust the local APIC being empty at bootup.
 	 */
@@ -697,7 +848,15 @@
 	value = apic_read(APIC_SPIV);
 	value &= ~APIC_VECTOR_MASK;
 	value |= APIC_SPIV_APIC_ENABLED;
-	value |= APIC_SPIV_FOCUS_DISABLED;
+
+#ifdef CONFIG_X86_32
+	/* This bit is reserved on P4/Xeon and should be cleared */
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    (boot_cpu_data.x86 == 15))
+		value &= ~APIC_SPIV_FOCUS_DISABLED;
+	else
+#endif
+		value |= APIC_SPIV_FOCUS_DISABLED;
 	value |= SPURIOUS_APIC_VECTOR;
 	apic_write(APIC_SPIV, value);
 
@@ -706,9 +865,50 @@
 	 */
 	apic_write(APIC_LVT0, APIC_DM_EXTINT);
 	value = APIC_DM_NMI;
+	if (!lapic_is_integrated())		/* 82489DX */
+		value |= APIC_LVT_LEVEL_TRIGGER;
 	apic_write(APIC_LVT1, value);
 }
 
+static void __cpuinit lapic_setup_esr(void)
+{
+	unsigned long oldvalue, value, maxlvt;
+	if (lapic_is_integrated() && !esr_disable) {
+		if (esr_disable) {
+			/*
+			 * Something untraceable is creating bad interrupts on
+			 * secondary quads ... for the moment, just leave the
+			 * ESR disabled - we can't do anything useful with the
+			 * errors anyway - mbligh
+			 */
+			printk(KERN_INFO "Leaving ESR disabled.\n");
+			return;
+		}
+		/* !82489DX */
+		maxlvt = lapic_get_maxlvt();
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
+			apic_write(APIC_ESR, 0);
+		oldvalue = apic_read(APIC_ESR);
+
+		/* enables sending errors */
+		value = ERROR_APIC_VECTOR;
+		apic_write(APIC_LVTERR, value);
+		/*
+		 * spec says clear errors after enabling vector.
+		 */
+		if (maxlvt > 3)
+			apic_write(APIC_ESR, 0);
+		value = apic_read(APIC_ESR);
+		if (value != oldvalue)
+			apic_printk(APIC_VERBOSE, "ESR value before enabling "
+				"vector: 0x%08lx  after: 0x%08lx\n",
+				oldvalue, value);
+	} else {
+		printk(KERN_INFO "No ESR for 82489DX.\n");
+	}
+}
+
+
 /**
  * setup_local_APIC - setup the local APIC
  */
@@ -814,25 +1014,143 @@
 	preempt_enable();
 }
 
-static void __cpuinit lapic_setup_esr(void)
-{
-	unsigned maxlvt = lapic_get_maxlvt();
-
-	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
-	/*
-	 * spec says clear errors after enabling vector.
-	 */
-	if (maxlvt > 3)
-		apic_write(APIC_ESR, 0);
-}
-
 void __cpuinit end_local_APIC_setup(void)
 {
 	lapic_setup_esr();
+
+#ifdef CONFIG_X86_32
+	{
+		unsigned int value;
+		/* Disable the local apic timer */
+		value = apic_read(APIC_LVTT);
+		value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+		apic_write(APIC_LVTT, value);
+	}
+#endif
+
 	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 }
 
+void check_x2apic(void)
+{
+	int msr, msr2;
+
+	rdmsr(MSR_IA32_APICBASE, msr, msr2);
+
+	if (msr & X2APIC_ENABLE) {
+		printk("x2apic enabled by BIOS, switching to x2apic ops\n");
+		x2apic_preenabled = x2apic = 1;
+		apic_ops = &x2apic_ops;
+	}
+}
+
+void enable_x2apic(void)
+{
+	int msr, msr2;
+
+	rdmsr(MSR_IA32_APICBASE, msr, msr2);
+	if (!(msr & X2APIC_ENABLE)) {
+		printk("Enabling x2apic\n");
+		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+	}
+}
+
+void enable_IR_x2apic(void)
+{
+#ifdef CONFIG_INTR_REMAP
+	int ret;
+	unsigned long flags;
+
+	if (!cpu_has_x2apic)
+		return;
+
+	if (!x2apic_preenabled && disable_x2apic) {
+		printk(KERN_INFO
+		       "Skipped enabling x2apic and Interrupt-remapping "
+		       "because of nox2apic\n");
+		return;
+	}
+
+	if (x2apic_preenabled && disable_x2apic)
+		panic("Bios already enabled x2apic, can't enforce nox2apic");
+
+	if (!x2apic_preenabled && skip_ioapic_setup) {
+		printk(KERN_INFO
+		       "Skipped enabling x2apic and Interrupt-remapping "
+		       "because of skipping io-apic setup\n");
+		return;
+	}
+
+	ret = dmar_table_init();
+	if (ret) {
+		printk(KERN_INFO
+		       "dmar_table_init() failed with %d:\n", ret);
+
+		if (x2apic_preenabled)
+			panic("x2apic enabled by bios. But IR enabling failed");
+		else
+			printk(KERN_INFO
+			       "Not enabling x2apic,Intr-remapping\n");
+		return;
+	}
+
+	local_irq_save(flags);
+	mask_8259A();
+	save_mask_IO_APIC_setup();
+
+	ret = enable_intr_remapping(1);
+
+	if (ret && x2apic_preenabled) {
+		local_irq_restore(flags);
+		panic("x2apic enabled by bios. But IR enabling failed");
+	}
+
+	if (ret)
+		goto end;
+
+	if (!x2apic) {
+		x2apic = 1;
+		apic_ops = &x2apic_ops;
+		enable_x2apic();
+	}
+end:
+	if (ret)
+		/*
+		 * IR enabling failed
+		 */
+		restore_IO_APIC_setup();
+	else
+		reinit_intr_remapped_IO_APIC(x2apic_preenabled);
+
+	unmask_8259A();
+	local_irq_restore(flags);
+
+	if (!ret) {
+		if (!x2apic_preenabled)
+			printk(KERN_INFO
+			       "Enabled x2apic and interrupt-remapping\n");
+		else
+			printk(KERN_INFO
+			       "Enabled Interrupt-remapping\n");
+	} else
+		printk(KERN_ERR
+		       "Failed to enable Interrupt-remapping and x2apic\n");
+#else
+	if (!cpu_has_x2apic)
+		return;
+
+	if (x2apic_preenabled)
+		panic("x2apic enabled prior OS handover,"
+		      " enable CONFIG_INTR_REMAP");
+
+	printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
+	       " and x2apic\n");
+#endif
+
+	return;
+}
+
 /*
  * Detect and enable local APICs on non-SMP boards.
  * Original code written by Keir Fraser.
@@ -872,7 +1190,7 @@
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
 	 */
-	boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+	boot_cpu_physical_apicid = read_apic_id();
 }
 
 /**
@@ -880,6 +1198,11 @@
  */
 void __init init_apic_mappings(void)
 {
+	if (x2apic) {
+		boot_cpu_physical_apicid = read_apic_id();
+		return;
+	}
+
 	/*
 	 * If no local APIC can be found then set up a fake all
 	 * zeroes page to simulate the local APIC and another
@@ -899,13 +1222,15 @@
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
 	 */
-	boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+	boot_cpu_physical_apicid = read_apic_id();
 }
 
 /*
  * This initializes the IO-APIC and APIC hardware if this is
  * a UP kernel.
  */
+int apic_version[MAX_APICS];
+
 int __init APIC_init_uniprocessor(void)
 {
 	if (disable_apic) {
@@ -918,6 +1243,9 @@
 		return -1;
 	}
 
+	enable_IR_x2apic();
+	setup_apic_routing();
+
 	verify_local_APIC();
 
 	connect_bsp_APIC();
@@ -1004,17 +1332,57 @@
 }
 
 /**
- *  * connect_bsp_APIC - attach the APIC to the interrupt system
- *   */
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
 void __init connect_bsp_APIC(void)
 {
+#ifdef CONFIG_X86_32
+	if (pic_mode) {
+		/*
+		 * Do not trust the local APIC being empty at bootup.
+		 */
+		clear_local_APIC();
+		/*
+		 * PIC mode, enable APIC mode in the IMCR, i.e.  connect BSP's
+		 * local APIC to INT and NMI lines.
+		 */
+		apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+				"enabling APIC mode.\n");
+		outb(0x70, 0x22);
+		outb(0x01, 0x23);
+	}
+#endif
 	enable_apic_mode();
 }
 
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup:	indicates, whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
 void disconnect_bsp_APIC(int virt_wire_setup)
 {
+	unsigned int value;
+
+#ifdef CONFIG_X86_32
+	if (pic_mode) {
+		/*
+		 * Put the board back into PIC mode (has an effect only on
+		 * certain older boards).  Note that APIC interrupts, including
+		 * IPIs, won't work beyond this point!  The only exception are
+		 * INIT IPIs.
+		 */
+		apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+				"entering PIC mode.\n");
+		outb(0x70, 0x22);
+		outb(0x00, 0x23);
+		return;
+	}
+#endif
+
 	/* Go back to Virtual Wire compatibility mode */
-	unsigned long value;
 
 	/* For the spurious interrupt use vector F, and enable it */
 	value = apic_read(APIC_SPIV);
@@ -1040,7 +1408,10 @@
 		apic_write(APIC_LVT0, APIC_LVT_MASKED);
 	}
 
-	/* For LVT1 make it edge triggered, active high, nmi and enabled */
+	/*
+	 * For LVT1 make it edge triggered, active high,
+	 * nmi and enabled
+	 */
 	value = apic_read(APIC_LVT1);
 	value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
 			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
@@ -1055,9 +1426,20 @@
 	int cpu;
 	cpumask_t tmp_map;
 
+	/*
+	 * Validate version
+	 */
+	if (version == 0x0) {
+		printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
+				"fixing up to 0x10. (tell your hw vendor)\n",
+				version);
+		version = 0x10;
+	}
+	apic_version[apicid] = version;
+
 	if (num_processors >= NR_CPUS) {
 		printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
-		       " Processor ignored.\n", NR_CPUS);
+			"  Processor ignored.\n", NR_CPUS);
 		return;
 	}
 
@@ -1077,6 +1459,29 @@
 	if (apicid > max_physical_apicid)
 		max_physical_apicid = apicid;
 
+#ifdef CONFIG_X86_32
+	/*
+	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
+	 * but we need to work other dependencies like SMP_SUSPEND etc
+	 * before this can be done without some confusion.
+	 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
+	 *       - Ashok Raj <ashok.raj@intel.com>
+	 */
+	if (max_physical_apicid >= 8) {
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_INTEL:
+			if (!APIC_XAPIC(version)) {
+				def_to_bigsmp = 0;
+				break;
+			}
+			/* If P4 and above fall through */
+		case X86_VENDOR_AMD:
+			def_to_bigsmp = 1;
+		}
+	}
+#endif
+
+#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
 	/* are we being called early in kernel startup? */
 	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
 		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1088,20 +1493,28 @@
 		per_cpu(x86_cpu_to_apicid, cpu) = apicid;
 		per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 	}
+#endif
 
 	cpu_set(cpu, cpu_possible_map);
 	cpu_set(cpu, cpu_present_map);
 }
 
+int hard_smp_processor_id(void)
+{
+	return read_apic_id();
+}
+
 /*
  * Power management
  */
 #ifdef CONFIG_PM
 
 static struct {
-	/* 'active' is true if the local APIC was enabled by us and
-	   not the BIOS; this signifies that we are also responsible
-	   for disabling it before entering apm/acpi suspend */
+	/*
+	 * 'active' is true if the local APIC was enabled by us and
+	 * not the BIOS; this signifies that we are also responsible
+	 * for disabling it before entering apm/acpi suspend
+	 */
 	int active;
 	/* r/w apic fields */
 	unsigned int apic_id;
@@ -1129,7 +1542,7 @@
 
 	maxlvt = lapic_get_maxlvt();
 
-	apic_pm_state.apic_id = read_apic_id();
+	apic_pm_state.apic_id = apic_read(APIC_ID);
 	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
 	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
 	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
@@ -1142,10 +1555,11 @@
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_INTEL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
 	if (maxlvt >= 5)
 		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
 #endif
+
 	local_irq_save(flags);
 	disable_local_APIC();
 	local_irq_restore(flags);
@@ -1164,10 +1578,25 @@
 	maxlvt = lapic_get_maxlvt();
 
 	local_irq_save(flags);
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	l &= ~MSR_IA32_APICBASE_BASE;
-	l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-	wrmsr(MSR_IA32_APICBASE, l, h);
+
+#ifdef CONFIG_X86_64
+	if (x2apic)
+		enable_x2apic();
+	else
+#endif
+	{
+		/*
+		 * Make sure the APICBASE points to the right address
+		 *
+		 * FIXME! This will be wrong if we ever support suspend on
+		 * SMP! We'll need to do this as part of the CPU restore!
+		 */
+		rdmsr(MSR_IA32_APICBASE, l, h);
+		l &= ~MSR_IA32_APICBASE_BASE;
+		l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+		wrmsr(MSR_IA32_APICBASE, l, h);
+	}
+
 	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
 	apic_write(APIC_ID, apic_pm_state.apic_id);
 	apic_write(APIC_DFR, apic_pm_state.apic_dfr);
@@ -1176,7 +1605,7 @@
 	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
 	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
 	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_INTEL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
 	if (maxlvt >= 5)
 		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
 #endif
@@ -1190,10 +1619,17 @@
 	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
+
 	local_irq_restore(flags);
+
 	return 0;
 }
 
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
 static struct sysdev_class lapic_sysclass = {
 	.name		= "lapic",
 	.resume		= lapic_resume,
@@ -1307,31 +1743,19 @@
 	return (clusters > 2);
 }
 
+static __init int setup_nox2apic(char *str)
+{
+	disable_x2apic = 1;
+	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC);
+	return 0;
+}
+early_param("nox2apic", setup_nox2apic);
+
+
 /*
  * APIC command line parameters
  */
-static int __init apic_set_verbosity(char *str)
-{
-	if (str == NULL)  {
-		skip_ioapic_setup = 0;
-		ioapic_force = 1;
-		return 0;
-	}
-	if (strcmp("debug", str) == 0)
-		apic_verbosity = APIC_DEBUG;
-	else if (strcmp("verbose", str) == 0)
-		apic_verbosity = APIC_VERBOSE;
-	else {
-		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
-				" use apic=verbose or apic=debug\n", str);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-early_param("apic", apic_set_verbosity);
-
-static __init int setup_disableapic(char *str)
+static int __init setup_disableapic(char *arg)
 {
 	disable_apic = 1;
 	setup_clear_cpu_cap(X86_FEATURE_APIC);
@@ -1340,9 +1764,9 @@
 early_param("disableapic", setup_disableapic);
 
 /* same as disableapic, for compatibility */
-static __init int setup_nolapic(char *str)
+static int __init setup_nolapic(char *arg)
 {
-	return setup_disableapic(str);
+	return setup_disableapic(arg);
 }
 early_param("nolapic", setup_nolapic);
 
@@ -1353,14 +1777,19 @@
 }
 early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
 
-static __init int setup_noapictimer(char *str)
+static int __init parse_disable_apic_timer(char *arg)
 {
-	if (str[0] != ' ' && str[0] != 0)
-		return 0;
 	disable_apic_timer = 1;
-	return 1;
+	return 0;
 }
-__setup("noapictimer", setup_noapictimer);
+early_param("noapictimer", parse_disable_apic_timer);
+
+static int __init parse_nolapic_timer(char *arg)
+{
+	disable_apic_timer = 1;
+	return 0;
+}
+early_param("nolapic_timer", parse_nolapic_timer);
 
 static __init int setup_apicpmtimer(char *s)
 {
@@ -1370,6 +1799,31 @@
 }
 __setup("apicpmtimer", setup_apicpmtimer);
 
+static int __init apic_set_verbosity(char *arg)
+{
+	if (!arg)  {
+#ifdef CONFIG_X86_64
+		skip_ioapic_setup = 0;
+		ioapic_force = 1;
+		return 0;
+#endif
+		return -EINVAL;
+	}
+
+	if (strcmp("debug", arg) == 0)
+		apic_verbosity = APIC_DEBUG;
+	else if (strcmp("verbose", arg) == 0)
+		apic_verbosity = APIC_VERBOSE;
+	else {
+		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+			" use apic=verbose or apic=debug\n", arg);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+early_param("apic", apic_set_verbosity);
+
 static int __init lapic_insert_resource(void)
 {
 	if (!apic_phys)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee76eaa..7f0b45a 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,22 +3,30 @@
 #
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
-obj-y			+= proc.o feature_names.o
+obj-y			+= proc.o capflags.o powerflags.o common.o
 
-obj-$(CONFIG_X86_32)	+= common.o bugs.o
-obj-$(CONFIG_X86_64)	+= common_64.o bugs_64.o
-obj-$(CONFIG_X86_32)	+= amd.o
-obj-$(CONFIG_X86_64)	+= amd_64.o
-obj-$(CONFIG_X86_32)	+= cyrix.o
-obj-$(CONFIG_X86_32)	+= centaur.o
-obj-$(CONFIG_X86_64)	+= centaur_64.o
-obj-$(CONFIG_X86_32)	+= transmeta.o
-obj-$(CONFIG_X86_32)	+= intel.o
-obj-$(CONFIG_X86_64)	+= intel_64.o
-obj-$(CONFIG_X86_32)	+= umc.o
+obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
+obj-$(CONFIG_X86_64)	+= bugs_64.o
+
+obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
+obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
+obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
+obj-$(CONFIG_CPU_SUP_CENTAUR_32)	+= centaur.o
+obj-$(CONFIG_CPU_SUP_CENTAUR_64)	+= centaur_64.o
+obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
+obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
 obj-$(CONFIG_X86_MCE)	+= mcheck/
 obj-$(CONFIG_MTRR)	+= mtrr/
 obj-$(CONFIG_CPU_FREQ)	+= cpufreq/
 
 obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+
+quiet_cmd_mkcapflags = MKCAP   $@
+      cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
+
+cpufeature = $(src)/../../../../include/asm-x86/cpufeature.h
+
+targets += capflags.c
+$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
+	$(call if_changed,mkcapflags)
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index a6ef672..0d9c993 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
 #include <asm/pat.h>
 #include <asm/processor.h>
 
+#include <mach_apic.h>
+
 struct cpuid_bit {
 	u16 feature;
 	u8 reg;
@@ -48,6 +50,92 @@
 	}
 }
 
+/* leaf 0xb SMT level */
+#define SMT_LEVEL	0
+
+/* leaf 0xb sub-leaf types */
+#define INVALID_TYPE	0
+#define SMT_TYPE	1
+#define CORE_TYPE	2
+
+#define LEAFB_SUBTYPE(ecx)		(((ecx) >> 8) & 0xff)
+#define BITS_SHIFT_NEXT_LEVEL(eax)	((eax) & 0x1f)
+#define LEVEL_MAX_SIBLINGS(ebx)		((ebx) & 0xffff)
+
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned int eax, ebx, ecx, edx, sub_index;
+	unsigned int ht_mask_width, core_plus_mask_width;
+	unsigned int core_select_mask, core_level_siblings;
+
+	if (c->cpuid_level < 0xb)
+		return;
+
+	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+
+	/*
+	 * check if the cpuid leaf 0xb is actually implemented.
+	 */
+	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
+		return;
+
+	set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+
+	/*
+	 * initial apic id, which also represents 32-bit extended x2apic id.
+	 */
+	c->initial_apicid = edx;
+
+	/*
+	 * Populate HT related information from sub-leaf level 0.
+	 */
+	core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+	core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+
+	sub_index = 1;
+	do {
+		cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+
+		/*
+		 * Check for the Core type in the implemented sub leaves.
+		 */
+		if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+			core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+			core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+			break;
+		}
+
+		sub_index++;
+	} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+
+	core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
+
+#ifdef CONFIG_X86_32
+	c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
+						 & core_select_mask;
+	c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+#else
+	c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
+	c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
+#endif
+	c->x86_max_cores = (core_level_siblings / smp_num_siblings);
+
+
+	printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+	       c->phys_proc_id);
+	if (c->x86_max_cores > 1)
+		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+		       c->cpu_core_id);
+	return;
+#endif
+}
+
 #ifdef CONFIG_X86_PAT
 void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 18514ed..32e7352 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
 #include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/mm.h>
+
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 
+#ifdef CONFIG_X86_64
+# include <asm/numa_64.h>
+# include <asm/mmconfig.h>
+# include <asm/cacheflush.h>
+#endif
+
 #include <mach_apic.h>
+
 #include "cpu.h"
 
+#ifdef CONFIG_X86_32
 /*
  *	B step AMD K6 before B 9730xxxx have hardware bugs that can cause
  *	misexecution of code under Linux. Owners of such processors should
@@ -24,26 +33,273 @@
 extern void vide(void);
 __asm__(".align 4\nvide: ret");
 
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
 {
-	if (cpuid_eax(0x80000000) >= 0x80000007) {
-		c->x86_power = cpuid_edx(0x80000007);
-		if (c->x86_power & (1<<8))
-			set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+/*
+ * General Systems BIOSen alias the cpu frequency registers
+ * of the Elan at 0x000df000. Unfortuantly, one of the Linux
+ * drivers subsequently pokes it, and changes the CPU speed.
+ * Workaround : Remove the unneeded alias.
+ */
+#define CBAR		(0xfffc) /* Configuration Base Address  (32-bit) */
+#define CBAR_ENB	(0x80000000)
+#define CBAR_KEY	(0X000000CB)
+	if (c->x86_model == 9 || c->x86_model == 10) {
+		if (inl (CBAR) & CBAR_ENB)
+			outl (0 | CBAR_KEY, CBAR);
+	}
+}
+
+
+static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
+{
+	u32 l, h;
+	int mbytes = num_physpages >> (20-PAGE_SHIFT);
+
+	if (c->x86_model < 6) {
+		/* Based on AMD doc 20734R - June 2000 */
+		if (c->x86_model == 0) {
+			clear_cpu_cap(c, X86_FEATURE_APIC);
+			set_cpu_cap(c, X86_FEATURE_PGE);
+		}
+		return;
 	}
 
+	if (c->x86_model == 6 && c->x86_mask == 1) {
+		const int K6_BUG_LOOP = 1000000;
+		int n;
+		void (*f_vide)(void);
+		unsigned long d, d2;
+
+		printk(KERN_INFO "AMD K6 stepping B detected - ");
+
+		/*
+		 * It looks like AMD fixed the 2.6.2 bug and improved indirect
+		 * calls at the same time.
+		 */
+
+		n = K6_BUG_LOOP;
+		f_vide = vide;
+		rdtscl(d);
+		while (n--)
+			f_vide();
+		rdtscl(d2);
+		d = d2-d;
+
+		if (d > 20*K6_BUG_LOOP)
+			printk("system stability may be impaired when more than 32 MB are used.\n");
+		else
+			printk("probably OK (after B9730xxxx).\n");
+		printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
+	}
+
+	/* K6 with old style WHCR */
+	if (c->x86_model < 8 ||
+	   (c->x86_model == 8 && c->x86_mask < 8)) {
+		/* We can only write allocate on the low 508Mb */
+		if (mbytes > 508)
+			mbytes = 508;
+
+		rdmsr(MSR_K6_WHCR, l, h);
+		if ((l&0x0000FFFF) == 0) {
+			unsigned long flags;
+			l = (1<<0)|((mbytes/4)<<1);
+			local_irq_save(flags);
+			wbinvd();
+			wrmsr(MSR_K6_WHCR, l, h);
+			local_irq_restore(flags);
+			printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+				mbytes);
+		}
+		return;
+	}
+
+	if ((c->x86_model == 8 && c->x86_mask > 7) ||
+	     c->x86_model == 9 || c->x86_model == 13) {
+		/* The more serious chips .. */
+
+		if (mbytes > 4092)
+			mbytes = 4092;
+
+		rdmsr(MSR_K6_WHCR, l, h);
+		if ((l&0xFFFF0000) == 0) {
+			unsigned long flags;
+			l = ((mbytes>>2)<<22)|(1<<16);
+			local_irq_save(flags);
+			wbinvd();
+			wrmsr(MSR_K6_WHCR, l, h);
+			local_irq_restore(flags);
+			printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+				mbytes);
+		}
+
+		return;
+	}
+
+	if (c->x86_model == 10) {
+		/* AMD Geode LX is model 10 */
+		/* placeholder for any needed mods */
+		return;
+	}
+}
+
+static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
+{
+	u32 l, h;
+
+	/*
+	 * Bit 15 of Athlon specific MSR 15, needs to be 0
+	 * to enable SSE on Palomino/Morgan/Barton CPU's.
+	 * If the BIOS didn't enable it already, enable it here.
+	 */
+	if (c->x86_model >= 6 && c->x86_model <= 10) {
+		if (!cpu_has(c, X86_FEATURE_XMM)) {
+			printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+			rdmsr(MSR_K7_HWCR, l, h);
+			l &= ~0x00008000;
+			wrmsr(MSR_K7_HWCR, l, h);
+			set_cpu_cap(c, X86_FEATURE_XMM);
+		}
+	}
+
+	/*
+	 * It's been determined by AMD that Athlons since model 8 stepping 1
+	 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+	 * As per AMD technical note 27212 0.2
+	 */
+	if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+		rdmsr(MSR_K7_CLK_CTL, l, h);
+		if ((l & 0xfff00000) != 0x20000000) {
+			printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
+				((l & 0x000fffff)|0x20000000));
+			wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+		}
+	}
+
+	set_cpu_cap(c, X86_FEATURE_K7);
+}
+#endif
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+static int __cpuinit nearby_node(int apicid)
+{
+	int i, node;
+
+	for (i = apicid - 1; i >= 0; i--) {
+		node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+		node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_HT
+	unsigned bits;
+
+	bits = c->x86_coreid_bits;
+
+	/* Low order bits define the core id (index of core in socket) */
+	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+	/* Convert the initial APIC ID into the socket ID */
+	c->phys_proc_id = c->initial_apicid >> bits;
+#endif
+}
+
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+{
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+	int cpu = smp_processor_id();
+	int node;
+	unsigned apicid = hard_smp_processor_id();
+
+	node = c->phys_proc_id;
+	if (apicid_to_node[apicid] != NUMA_NO_NODE)
+		node = apicid_to_node[apicid];
+	if (!node_online(node)) {
+		/* Two possibilities here:
+		   - The CPU is missing memory and no node was created.
+		   In that case try picking one from a nearby CPU
+		   - The APIC IDs differ from the HyperTransport node IDs
+		   which the K8 northbridge parsing fills in.
+		   Assume they are all increased by a constant offset,
+		   but in the same order as the HT nodeids.
+		   If that doesn't result in a usable node fall back to the
+		   path for the previous case.  */
+
+		int ht_nodeid = c->initial_apicid;
+
+		if (ht_nodeid >= 0 &&
+		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+			node = apicid_to_node[ht_nodeid];
+		/* Pick a nearby node */
+		if (!node_online(node))
+			node = nearby_node(apicid);
+	}
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_HT
+	unsigned bits, ecx;
+
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level < 0x80000008)
+		return;
+
+	ecx = cpuid_ecx(0x80000008);
+
+	c->x86_max_cores = (ecx & 0xff) + 1;
+
+	/* CPU telling us the core id bits shift? */
+	bits = (ecx >> 12) & 0xF;
+
+	/* Otherwise recompute */
+	if (bits == 0) {
+		while ((1 << bits) < c->x86_max_cores)
+			bits++;
+	}
+
+	c->x86_coreid_bits = bits;
+#endif
+}
+
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+	early_init_amd_mc(c);
+
+	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+	if (c->x86_power & (1<<8))
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#else
 	/*  Set MTRR capability flag if appropriate */
-	if (c->x86_model == 13 || c->x86_model == 9 ||
-	   (c->x86_model == 8 && c->x86_mask >= 8))
-		set_cpu_cap(c, X86_FEATURE_K6_MTRR);
+	if (c->x86 == 5)
+		if (c->x86_model == 13 || c->x86_model == 9 ||
+		    (c->x86_model == 8 && c->x86_mask >= 8))
+			set_cpu_cap(c, X86_FEATURE_K6_MTRR);
+#endif
 }
 
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
-	u32 l, h;
-	int mbytes = num_physpages >> (20-PAGE_SHIFT);
-	int r;
-
 #ifdef CONFIG_SMP
 	unsigned long long value;
 
@@ -54,7 +310,7 @@
 	 * Errata 63 for SH-B3 steppings
 	 * Errata 122 for all steppings (F+ have it disabled by default)
 	 */
-	if (c->x86 == 15) {
+	if (c->x86 == 0xf) {
 		rdmsrl(MSR_K7_HWCR, value);
 		value |= 1 << 6;
 		wrmsrl(MSR_K7_HWCR, value);
@@ -64,209 +320,119 @@
 	early_init_amd(c);
 
 	/*
-	 *	FIXME: We should handle the K5 here. Set up the write
-	 *	range and also turn on MSR 83 bits 4 and 31 (write alloc,
-	 *	no bus pipeline)
-	 */
-
-	/*
 	 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
 	 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
 	 */
 	clear_cpu_cap(c, 0*32+31);
 
-	r = get_model_name(c);
+#ifdef CONFIG_X86_64
+	/* On C+ stepping K8 rep microcode works well for copy/memset */
+	if (c->x86 == 0xf) {
+		u32 level;
+
+		level = cpuid_eax(1);
+		if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+			set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	}
+	if (c->x86 == 0x10 || c->x86 == 0x11)
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+#else
+
+	/*
+	 *	FIXME: We should handle the K5 here. Set up the write
+	 *	range and also turn on MSR 83 bits 4 and 31 (write alloc,
+	 *	no bus pipeline)
+	 */
 
 	switch (c->x86) {
 	case 4:
-		/*
-		 * General Systems BIOSen alias the cpu frequency registers
-		 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
-		 * drivers subsequently pokes it, and changes the CPU speed.
-		 * Workaround : Remove the unneeded alias.
-		 */
-#define CBAR		(0xfffc) /* Configuration Base Address  (32-bit) */
-#define CBAR_ENB	(0x80000000)
-#define CBAR_KEY	(0X000000CB)
-			if (c->x86_model == 9 || c->x86_model == 10) {
-				if (inl (CBAR) & CBAR_ENB)
-					outl (0 | CBAR_KEY, CBAR);
-			}
-			break;
+		init_amd_k5(c);
+		break;
 	case 5:
-			if (c->x86_model < 6) {
-				/* Based on AMD doc 20734R - June 2000 */
-				if (c->x86_model == 0) {
-					clear_cpu_cap(c, X86_FEATURE_APIC);
-					set_cpu_cap(c, X86_FEATURE_PGE);
-				}
-				break;
-			}
-
-			if (c->x86_model == 6 && c->x86_mask == 1) {
-				const int K6_BUG_LOOP = 1000000;
-				int n;
-				void (*f_vide)(void);
-				unsigned long d, d2;
-
-				printk(KERN_INFO "AMD K6 stepping B detected - ");
-
-				/*
-				 * It looks like AMD fixed the 2.6.2 bug and improved indirect
-				 * calls at the same time.
-				 */
-
-				n = K6_BUG_LOOP;
-				f_vide = vide;
-				rdtscl(d);
-				while (n--)
-					f_vide();
-				rdtscl(d2);
-				d = d2-d;
-
-				if (d > 20*K6_BUG_LOOP)
-					printk("system stability may be impaired when more than 32 MB are used.\n");
-				else
-					printk("probably OK (after B9730xxxx).\n");
-				printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
-			}
-
-			/* K6 with old style WHCR */
-			if (c->x86_model < 8 ||
-			   (c->x86_model == 8 && c->x86_mask < 8)) {
-				/* We can only write allocate on the low 508Mb */
-				if (mbytes > 508)
-					mbytes = 508;
-
-				rdmsr(MSR_K6_WHCR, l, h);
-				if ((l&0x0000FFFF) == 0) {
-					unsigned long flags;
-					l = (1<<0)|((mbytes/4)<<1);
-					local_irq_save(flags);
-					wbinvd();
-					wrmsr(MSR_K6_WHCR, l, h);
-					local_irq_restore(flags);
-					printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
-						mbytes);
-				}
-				break;
-			}
-
-			if ((c->x86_model == 8 && c->x86_mask > 7) ||
-			     c->x86_model == 9 || c->x86_model == 13) {
-				/* The more serious chips .. */
-
-				if (mbytes > 4092)
-					mbytes = 4092;
-
-				rdmsr(MSR_K6_WHCR, l, h);
-				if ((l&0xFFFF0000) == 0) {
-					unsigned long flags;
-					l = ((mbytes>>2)<<22)|(1<<16);
-					local_irq_save(flags);
-					wbinvd();
-					wrmsr(MSR_K6_WHCR, l, h);
-					local_irq_restore(flags);
-					printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
-						mbytes);
-				}
-
-				break;
-			}
-
-			if (c->x86_model == 10) {
-				/* AMD Geode LX is model 10 */
-				/* placeholder for any needed mods */
-				break;
-			}
-			break;
+		init_amd_k6(c);
+		break;
 	case 6: /* An Athlon/Duron */
-
-			/*
-			 * Bit 15 of Athlon specific MSR 15, needs to be 0
-			 * to enable SSE on Palomino/Morgan/Barton CPU's.
-			 * If the BIOS didn't enable it already, enable it here.
-			 */
-			if (c->x86_model >= 6 && c->x86_model <= 10) {
-				if (!cpu_has(c, X86_FEATURE_XMM)) {
-					printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
-					rdmsr(MSR_K7_HWCR, l, h);
-					l &= ~0x00008000;
-					wrmsr(MSR_K7_HWCR, l, h);
-					set_cpu_cap(c, X86_FEATURE_XMM);
-				}
-			}
-
-			/*
-			 * It's been determined by AMD that Athlons since model 8 stepping 1
-			 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
-			 * As per AMD technical note 27212 0.2
-			 */
-			if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
-				rdmsr(MSR_K7_CLK_CTL, l, h);
-				if ((l & 0xfff00000) != 0x20000000) {
-					printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
-						((l & 0x000fffff)|0x20000000));
-					wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
-				}
-			}
-			break;
-	}
-
-	switch (c->x86) {
-	case 15:
-	/* Use K8 tuning for Fam10h and Fam11h */
-	case 0x10:
-	case 0x11:
-		set_cpu_cap(c, X86_FEATURE_K8);
+		init_amd_k7(c);
 		break;
-	case 6:
-		set_cpu_cap(c, X86_FEATURE_K7);
-		break;
-	}
-	if (c->x86 >= 6)
-		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
-	display_cacheinfo(c);
-
-	if (cpuid_eax(0x80000000) >= 0x80000008)
-		c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
-
-#ifdef CONFIG_X86_HT
-	/*
-	 * On a AMD multi core setup the lower bits of the APIC id
-	 * distinguish the cores.
-	 */
-	if (c->x86_max_cores > 1) {
-		int cpu = smp_processor_id();
-		unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
-
-		if (bits == 0) {
-			while ((1 << bits) < c->x86_max_cores)
-				bits++;
-		}
-		c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
-		c->phys_proc_id >>= bits;
-		printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
-		       cpu, c->x86_max_cores, c->cpu_core_id);
-	}
-#endif
-
-	if (cpuid_eax(0x80000000) >= 0x80000006) {
-		if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000))
-			num_cache_leaves = 4;
-		else
-			num_cache_leaves = 3;
 	}
 
 	/* K6s reports MCEs but don't actually have all the MSRs */
 	if (c->x86 < 6)
 		clear_cpu_cap(c, X86_FEATURE_MCE);
+#endif
 
-	if (cpu_has_xmm2)
+	/* Enable workaround for FXSAVE leak */
+	if (c->x86 >= 6)
+		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+
+	if (!c->x86_model_id[0]) {
+		switch (c->x86) {
+		case 0xf:
+			/* Should distinguish Models here, but this is only
+			   a fallback anyways. */
+			strcpy(c->x86_model_id, "Hammer");
+			break;
+		}
+	}
+
+	display_cacheinfo(c);
+
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level >= 0x80000008) {
+		amd_detect_cmp(c);
+		srat_detect_node(c);
+	}
+
+#ifdef CONFIG_X86_32
+	detect_ht(c);
+#endif
+
+	if (c->extended_cpuid_level >= 0x80000006) {
+		if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
+			num_cache_leaves = 4;
+		else
+			num_cache_leaves = 3;
+	}
+
+	if (c->x86 >= 0xf && c->x86 <= 0x11)
+		set_cpu_cap(c, X86_FEATURE_K8);
+
+	if (cpu_has_xmm2) {
+		/* MFENCE stops RDTSC speculation */
 		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+	}
+
+#ifdef CONFIG_X86_64
+	if (c->x86 == 0x10) {
+		/* do this for boot cpu */
+		if (c == &boot_cpu_data)
+			check_enable_amd_mmconf_dmi();
+
+		fam10h_check_enable_mmcfg();
+	}
+
+	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+		unsigned long long tseg;
+
+		/*
+		 * Split up direct mapping around the TSEG SMM area.
+		 * Don't do it for gbpages because there seems very little
+		 * benefit in doing so.
+		 */
+		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+		    printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+		    if ((tseg>>PMD_SHIFT) <
+				(max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
+			((tseg>>PMD_SHIFT) <
+				(max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
+			 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+			set_memory_4k((unsigned long)__va(tseg), 1);
+		}
+	}
+#endif
 }
 
+#ifdef CONFIG_X86_32
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 {
 	/* AMD errata T13 (order #21922) */
@@ -279,10 +445,12 @@
 	}
 	return size;
 }
+#endif
 
 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
 	.c_vendor	= "AMD",
 	.c_ident	= { "AuthenticAMD" },
+#ifdef CONFIG_X86_32
 	.c_models = {
 		{ .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
 		  {
@@ -295,9 +463,11 @@
 		  }
 		},
 	},
+	.c_size_cache	= amd_size_cache,
+#endif
 	.c_early_init   = early_init_amd,
 	.c_init		= init_amd,
-	.c_size_cache	= amd_size_cache,
+	.c_x86_vendor	= X86_VENDOR_AMD,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
+cpu_dev_register(amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index d1692b2..0000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,224 +0,0 @@
-#include <linux/init.h>
-#include <linux/mm.h>
-
-#include <asm/numa_64.h>
-#include <asm/mmconfig.h>
-#include <asm/cacheflush.h>
-
-#include <mach_apic.h>
-
-#include "cpu.h"
-
-int force_mwait __cpuinitdata;
-
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
-	int i, node;
-
-	for (i = apicid - 1; i >= 0; i--) {
-		node = apicid_to_node[i];
-		if (node != NUMA_NO_NODE && node_online(node))
-			return node;
-	}
-	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
-		node = apicid_to_node[i];
-		if (node != NUMA_NO_NODE && node_online(node))
-			return node;
-	}
-	return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits;
-#ifdef CONFIG_NUMA
-	int cpu = smp_processor_id();
-	int node = 0;
-	unsigned apicid = hard_smp_processor_id();
-#endif
-	bits = c->x86_coreid_bits;
-
-	/* Low order bits define the core id (index of core in socket) */
-	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
-	/* Convert the initial APIC ID into the socket ID */
-	c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
-	node = c->phys_proc_id;
-	if (apicid_to_node[apicid] != NUMA_NO_NODE)
-		node = apicid_to_node[apicid];
-	if (!node_online(node)) {
-		/* Two possibilities here:
-		   - The CPU is missing memory and no node was created.
-		   In that case try picking one from a nearby CPU
-		   - The APIC IDs differ from the HyperTransport node IDs
-		   which the K8 northbridge parsing fills in.
-		   Assume they are all increased by a constant offset,
-		   but in the same order as the HT nodeids.
-		   If that doesn't result in a usable node fall back to the
-		   path for the previous case.  */
-
-		int ht_nodeid = c->initial_apicid;
-
-		if (ht_nodeid >= 0 &&
-		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-			node = apicid_to_node[ht_nodeid];
-		/* Pick a nearby node */
-		if (!node_online(node))
-			node = nearby_node(apicid);
-	}
-	numa_set_node(cpu, node);
-
-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits, ecx;
-
-	/* Multi core CPU? */
-	if (c->extended_cpuid_level < 0x80000008)
-		return;
-
-	ecx = cpuid_ecx(0x80000008);
-
-	c->x86_max_cores = (ecx & 0xff) + 1;
-
-	/* CPU telling us the core id bits shift? */
-	bits = (ecx >> 12) & 0xF;
-
-	/* Otherwise recompute */
-	if (bits == 0) {
-		while ((1 << bits) < c->x86_max_cores)
-			bits++;
-	}
-
-	c->x86_coreid_bits = bits;
-
-#endif
-}
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
-	early_init_amd_mc(c);
-
-	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
-	if (c->x86_power & (1<<8))
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
-	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
-	unsigned level;
-
-#ifdef CONFIG_SMP
-	unsigned long value;
-
-	/*
-	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
-	 * bit 6 of msr C001_0015
-	 *
-	 * Errata 63 for SH-B3 steppings
-	 * Errata 122 for all steppings (F+ have it disabled by default)
-	 */
-	if (c->x86 == 0xf) {
-		rdmsrl(MSR_K8_HWCR, value);
-		value |= 1 << 6;
-		wrmsrl(MSR_K8_HWCR, value);
-	}
-#endif
-
-	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
-	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-	clear_cpu_cap(c, 0*32+31);
-
-	/* On C+ stepping K8 rep microcode works well for copy/memset */
-	if (c->x86 == 0xf) {
-		level = cpuid_eax(1);
-		if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
-			set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	}
-	if (c->x86 == 0x10 || c->x86 == 0x11)
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
-	/* Enable workaround for FXSAVE leak */
-	if (c->x86 >= 6)
-		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
-	level = get_model_name(c);
-	if (!level) {
-		switch (c->x86) {
-		case 0xf:
-			/* Should distinguish Models here, but this is only
-			   a fallback anyways. */
-			strcpy(c->x86_model_id, "Hammer");
-			break;
-		}
-	}
-	display_cacheinfo(c);
-
-	/* Multi core CPU? */
-	if (c->extended_cpuid_level >= 0x80000008)
-		amd_detect_cmp(c);
-
-	if (c->extended_cpuid_level >= 0x80000006 &&
-		(cpuid_edx(0x80000006) & 0xf000))
-		num_cache_leaves = 4;
-	else
-		num_cache_leaves = 3;
-
-	if (c->x86 >= 0xf && c->x86 <= 0x11)
-		set_cpu_cap(c, X86_FEATURE_K8);
-
-	/* MFENCE stops RDTSC speculation */
-	set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
-	if (c->x86 == 0x10) {
-		/* do this for boot cpu */
-		if (c == &boot_cpu_data)
-			check_enable_amd_mmconf_dmi();
-
-		fam10h_check_enable_mmcfg();
-	}
-
-	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
-		unsigned long long tseg;
-
-		/*
-		 * Split up direct mapping around the TSEG SMM area.
-		 * Don't do it for gbpages because there seems very little
-		 * benefit in doing so.
-		 */
-		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
-		    printk(KERN_DEBUG "tseg: %010llx\n", tseg);
-		    if ((tseg>>PMD_SHIFT) <
-				(max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
-			((tseg>>PMD_SHIFT) <
-				(max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
-			 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
-			set_memory_4k((unsigned long)__va(tseg), 1);
-		}
-	}
-}
-
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
-	.c_vendor	= "AMD",
-	.c_ident	= { "AuthenticAMD" },
-	.c_early_init   = early_init_amd,
-	.c_init		= init_amd,
-};
-
-cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index a0534c0..89bfdd9 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@
 	if (c->x86_model >= 6 && c->x86_model < 9)
 		set_cpu_cap(c, X86_FEATURE_3DNOW);
 
-	get_model_name(c);
 	display_cacheinfo(c);
 }
 
@@ -475,6 +474,7 @@
 	.c_early_init	= early_init_centaur,
 	.c_init		= init_centaur,
 	.c_size_cache	= centaur_size_cache,
+	.c_x86_vendor	= X86_VENDOR_CENTAUR,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c4..a1625f5 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@
 
 static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
+	early_init_centaur(c);
+
 	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
 		c->x86_cache_alignment = c->x86_clflush_size * 2;
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 	}
 	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@
 	.c_ident	= { "CentaurHauls" },
 	.c_early_init	= early_init_centaur,
 	.c_init		= init_centaur,
+	.c_x86_vendor	= X86_VENDOR_CENTAUR,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+cpu_dev_register(centaur_cpu_dev);
 
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 0000000..2056ccf
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
+/*
+ * cmpxchg*() fallbacks for CPU not supporting these instructions
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#ifndef CONFIG_X86_CMPXCHG
+unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
+{
+	u8 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u8 *)ptr;
+	if (prev == old)
+		*(u8 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u8);
+
+unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
+{
+	u16 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u16 *)ptr;
+	if (prev == old)
+		*(u16 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u16);
+
+unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
+{
+	u32 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u32 *)ptr;
+	if (prev == old)
+		*(u32 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u32);
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG64
+unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
+{
+	u64 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u64 *)ptr;
+	if (prev == old)
+		*(u64 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_486_u64);
+#endif
+
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e456bd..7581b62 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,28 +1,62 @@
 #include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <linux/topology.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
-#include <linux/module.h>
 #include <linux/percpu.h>
-#include <linux/bootmem.h>
-#include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
 #include <asm/io.h>
+#include <asm/linkage.h>
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/pat.h>
 #include <asm/asm.h>
+#include <asm/numa.h>
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/mpspec.h>
 #include <asm/apic.h>
 #include <mach_apic.h>
+#include <asm/genapic.h>
 #endif
 
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+
 #include "cpu.h"
 
+static struct cpu_dev *this_cpu __cpuinitdata;
+
+#ifdef CONFIG_X86_64
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types  kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+/* The TLS descriptors are currently at a different place compared to i386.
+   Hopefully nobody expects them at a fixed place (Wine?) */
 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+} };
+#else
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
 	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
 	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -56,34 +90,13 @@
 	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
 	[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
 } };
+#endif
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
+#ifdef CONFIG_X86_32
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_serial_nr __cpuinitdata = 1;
 
-struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-	/* Not much we can do here... */
-	/* Check if at least it has cpuid */
-	if (c->cpuid_level == -1) {
-		/* No cpuid. It must be an ancient CPU */
-		if (c->x86 == 4)
-			strcpy(c->x86_model_id, "486");
-		else if (c->x86 == 3)
-			strcpy(c->x86_model_id, "386");
-	}
-}
-
-static struct cpu_dev __cpuinitdata default_cpu = {
-	.c_init	= default_init,
-	.c_vendor = "Unknown",
-};
-static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
-
 static int __init cachesize_setup(char *str)
 {
 	get_option(&str, &cachesize_override);
@@ -91,71 +104,84 @@
 }
 __setup("cachesize=", cachesize_setup);
 
-int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+static int __init x86_fxsr_setup(char *s)
 {
-	unsigned int *v;
-	char *p, *q;
-
-	if (cpuid_eax(0x80000000) < 0x80000004)
-		return 0;
-
-	v = (unsigned int *) c->x86_model_id;
-	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
-	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
-	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
-	c->x86_model_id[48] = 0;
-
-	/* Intel chips right-justify this string for some dumb reason;
-	   undo that brain damage */
-	p = q = &c->x86_model_id[0];
-	while (*p == ' ')
-	     p++;
-	if (p != q) {
-	     while (*p)
-		  *q++ = *p++;
-	     while (q <= &c->x86_model_id[48])
-		  *q++ = '\0';	/* Zero-pad the rest */
-	}
-
+	setup_clear_cpu_cap(X86_FEATURE_FXSR);
+	setup_clear_cpu_cap(X86_FEATURE_XMM);
 	return 1;
 }
+__setup("nofxsr", x86_fxsr_setup);
 
-
-void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+static int __init x86_sep_setup(char *s)
 {
-	unsigned int n, dummy, ecx, edx, l2size;
-
-	n = cpuid_eax(0x80000000);
-
-	if (n >= 0x80000005) {
-		cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
-		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
-			edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
-		c->x86_cache_size = (ecx>>24)+(edx>>24);
-	}
-
-	if (n < 0x80000006)	/* Some chips just has a large L1. */
-		return;
-
-	ecx = cpuid_ecx(0x80000006);
-	l2size = ecx >> 16;
-
-	/* do processor-specific cache resizing */
-	if (this_cpu->c_size_cache)
-		l2size = this_cpu->c_size_cache(c, l2size);
-
-	/* Allow user to override all this if necessary. */
-	if (cachesize_override != -1)
-		l2size = cachesize_override;
-
-	if (l2size == 0)
-		return;		/* Again, no L2 cache is possible */
-
-	c->x86_cache_size = l2size;
-
-	printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
-	       l2size, ecx & 0xFF);
+	setup_clear_cpu_cap(X86_FEATURE_SEP);
+	return 1;
 }
+__setup("nosep", x86_sep_setup);
+
+/* Standard macro to see if a specific flag is changeable */
+static inline int flag_is_changeable_p(u32 flag)
+{
+	u32 f1, f2;
+
+	asm("pushfl\n\t"
+	    "pushfl\n\t"
+	    "popl %0\n\t"
+	    "movl %0,%1\n\t"
+	    "xorl %2,%0\n\t"
+	    "pushl %0\n\t"
+	    "popfl\n\t"
+	    "pushfl\n\t"
+	    "popl %0\n\t"
+	    "popfl\n\t"
+	    : "=&r" (f1), "=&r" (f2)
+	    : "ir" (flag));
+
+	return ((f1^f2) & flag) != 0;
+}
+
+/* Probe for the CPUID instruction */
+static int __cpuinit have_cpuid_p(void)
+{
+	return flag_is_changeable_p(X86_EFLAGS_ID);
+}
+
+static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+	if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
+		/* Disable processor serial number */
+		unsigned long lo, hi;
+		rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+		lo |= 0x200000;
+		wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+		printk(KERN_NOTICE "CPU serial number disabled.\n");
+		clear_cpu_cap(c, X86_FEATURE_PN);
+
+		/* Disabling the serial number may affect the cpuid level */
+		c->cpuid_level = cpuid_eax(0);
+	}
+}
+
+static int __init x86_serial_nr_setup(char *s)
+{
+	disable_x86_serial_nr = 0;
+	return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+#else
+static inline int flag_is_changeable_p(u32 flag)
+{
+	return 1;
+}
+/* Probe for the CPUID instruction */
+static inline int have_cpuid_p(void)
+{
+	return 1;
+}
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+}
+#endif
 
 /*
  * Naming convention should be: <Name> [(<Codename>)]
@@ -185,81 +211,210 @@
 	return NULL;		/* Not found */
 }
 
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
 
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+	struct desc_ptr gdt_descr;
+
+	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
+#ifdef CONFIG_X86_32
+	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
+#endif
+}
+
+static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+	display_cacheinfo(c);
+#else
+	/* Not much we can do here... */
+	/* Check if at least it has cpuid */
+	if (c->cpuid_level == -1) {
+		/* No cpuid. It must be an ancient CPU */
+		if (c->x86 == 4)
+			strcpy(c->x86_model_id, "486");
+		else if (c->x86 == 3)
+			strcpy(c->x86_model_id, "386");
+	}
+#endif
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+	.c_init	= default_init,
+	.c_vendor = "Unknown",
+	.c_x86_vendor = X86_VENDOR_UNKNOWN,
+};
+
+static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+	unsigned int *v;
+	char *p, *q;
+
+	if (c->extended_cpuid_level < 0x80000004)
+		return;
+
+	v = (unsigned int *) c->x86_model_id;
+	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+	c->x86_model_id[48] = 0;
+
+	/* Intel chips right-justify this string for some dumb reason;
+	   undo that brain damage */
+	p = q = &c->x86_model_id[0];
+	while (*p == ' ')
+	     p++;
+	if (p != q) {
+	     while (*p)
+		  *q++ = *p++;
+	     while (q <= &c->x86_model_id[48])
+		  *q++ = '\0';	/* Zero-pad the rest */
+	}
+}
+
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+{
+	unsigned int n, dummy, ebx, ecx, edx, l2size;
+
+	n = c->extended_cpuid_level;
+
+	if (n >= 0x80000005) {
+		cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
+				edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+		c->x86_cache_size = (ecx>>24) + (edx>>24);
+#ifdef CONFIG_X86_64
+		/* On K8 L1 TLB is inclusive, so don't count it */
+		c->x86_tlbsize = 0;
+#endif
+	}
+
+	if (n < 0x80000006)	/* Some chips just has a large L1. */
+		return;
+
+	cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+	l2size = ecx >> 16;
+
+#ifdef CONFIG_X86_64
+	c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+#else
+	/* do processor-specific cache resizing */
+	if (this_cpu->c_size_cache)
+		l2size = this_cpu->c_size_cache(c, l2size);
+
+	/* Allow user to override all this if necessary. */
+	if (cachesize_override != -1)
+		l2size = cachesize_override;
+
+	if (l2size == 0)
+		return;		/* Again, no L2 cache is possible */
+#endif
+
+	c->x86_cache_size = l2size;
+
+	printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+			l2size, ecx & 0xFF);
+}
+
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_HT
+	u32 eax, ebx, ecx, edx;
+	int index_msb, core_bits;
+
+	if (!cpu_has(c, X86_FEATURE_HT))
+		return;
+
+	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+		goto out;
+
+	if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
+		return;
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+
+	smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+	if (smp_num_siblings == 1) {
+		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
+	} else if (smp_num_siblings > 1) {
+
+		if (smp_num_siblings > NR_CPUS) {
+			printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
+					smp_num_siblings);
+			smp_num_siblings = 1;
+			return;
+		}
+
+		index_msb = get_count_order(smp_num_siblings);
+#ifdef CONFIG_X86_64
+		c->phys_proc_id = phys_pkg_id(index_msb);
+#else
+		c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
+#endif
+
+		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+		index_msb = get_count_order(smp_num_siblings);
+
+		core_bits = get_count_order(c->x86_max_cores);
+
+#ifdef CONFIG_X86_64
+		c->cpu_core_id = phys_pkg_id(index_msb) &
+					       ((1 << core_bits) - 1);
+#else
+		c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
+					       ((1 << core_bits) - 1);
+#endif
+	}
+
+out:
+	if ((c->x86_max_cores * smp_num_siblings) > 1) {
+		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+		       c->phys_proc_id);
+		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+		       c->cpu_core_id);
+	}
+#endif
+}
+
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
 	int i;
 	static int printed;
 
 	for (i = 0; i < X86_VENDOR_NUM; i++) {
-		if (cpu_devs[i]) {
-			if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
-			    (cpu_devs[i]->c_ident[1] &&
-			     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
-				c->x86_vendor = i;
-				if (!early)
-					this_cpu = cpu_devs[i];
-				return;
-			}
+		if (!cpu_devs[i])
+			break;
+
+		if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+		    (cpu_devs[i]->c_ident[1] &&
+		     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+			this_cpu = cpu_devs[i];
+			c->x86_vendor = this_cpu->c_x86_vendor;
+			return;
 		}
 	}
+
 	if (!printed) {
 		printed++;
-		printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+		printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
 		printk(KERN_ERR "CPU: Your system may be unstable.\n");
 	}
+
 	c->x86_vendor = X86_VENDOR_UNKNOWN;
 	this_cpu = &default_cpu;
 }
 
-
-static int __init x86_fxsr_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FXSR);
-	setup_clear_cpu_cap(X86_FEATURE_XMM);
-	return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
-
-static int __init x86_sep_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_SEP);
-	return 1;
-}
-__setup("nosep", x86_sep_setup);
-
-
-/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
-{
-	u32 f1, f2;
-
-	asm("pushfl\n\t"
-	    "pushfl\n\t"
-	    "popl %0\n\t"
-	    "movl %0,%1\n\t"
-	    "xorl %2,%0\n\t"
-	    "pushl %0\n\t"
-	    "popfl\n\t"
-	    "pushfl\n\t"
-	    "popl %0\n\t"
-	    "popfl\n\t"
-	    : "=&r" (f1), "=&r" (f2)
-	    : "ir" (flag));
-
-	return ((f1^f2) & flag) != 0;
-}
-
-
-/* Probe for the CPUID instruction */
-static int __cpuinit have_cpuid_p(void)
-{
-	return flag_is_changeable_p(X86_EFLAGS_ID);
-}
-
-void __init cpu_detect(struct cpuinfo_x86 *c)
+void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
 {
 	/* Get vendor name */
 	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -268,48 +423,85 @@
 	      (unsigned int *)&c->x86_vendor_id[4]);
 
 	c->x86 = 4;
+	/* Intel-defined flags: level 0x00000001 */
 	if (c->cpuid_level >= 0x00000001) {
 		u32 junk, tfms, cap0, misc;
 		cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
-		c->x86 = (tfms >> 8) & 15;
-		c->x86_model = (tfms >> 4) & 15;
+		c->x86 = (tfms >> 8) & 0xf;
+		c->x86_model = (tfms >> 4) & 0xf;
+		c->x86_mask = tfms & 0xf;
 		if (c->x86 == 0xf)
 			c->x86 += (tfms >> 20) & 0xff;
 		if (c->x86 >= 0x6)
-			c->x86_model += ((tfms >> 16) & 0xF) << 4;
-		c->x86_mask = tfms & 15;
+			c->x86_model += ((tfms >> 16) & 0xf) << 4;
 		if (cap0 & (1<<19)) {
-			c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+			c->x86_cache_alignment = c->x86_clflush_size;
 		}
 	}
 }
-static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
+
+static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 {
 	u32 tfms, xlvl;
-	unsigned int ebx;
+	u32 ebx;
 
-	memset(&c->x86_capability, 0, sizeof c->x86_capability);
-	if (have_cpuid_p()) {
-		/* Intel-defined flags: level 0x00000001 */
-		if (c->cpuid_level >= 0x00000001) {
-			u32 capability, excap;
-			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
-			c->x86_capability[0] = capability;
-			c->x86_capability[4] = excap;
-		}
-
-		/* AMD-defined flags: level 0x80000001 */
-		xlvl = cpuid_eax(0x80000000);
-		if ((xlvl & 0xffff0000) == 0x80000000) {
-			if (xlvl >= 0x80000001) {
-				c->x86_capability[1] = cpuid_edx(0x80000001);
-				c->x86_capability[6] = cpuid_ecx(0x80000001);
-			}
-		}
-
+	/* Intel-defined flags: level 0x00000001 */
+	if (c->cpuid_level >= 0x00000001) {
+		u32 capability, excap;
+		cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+		c->x86_capability[0] = capability;
+		c->x86_capability[4] = excap;
 	}
 
+	/* AMD-defined flags: level 0x80000001 */
+	xlvl = cpuid_eax(0x80000000);
+	c->extended_cpuid_level = xlvl;
+	if ((xlvl & 0xffff0000) == 0x80000000) {
+		if (xlvl >= 0x80000001) {
+			c->x86_capability[1] = cpuid_edx(0x80000001);
+			c->x86_capability[6] = cpuid_ecx(0x80000001);
+		}
+	}
+
+#ifdef CONFIG_X86_64
+	if (c->extended_cpuid_level >= 0x80000008) {
+		u32 eax = cpuid_eax(0x80000008);
+
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+#endif
+
+	if (c->extended_cpuid_level >= 0x80000007)
+		c->x86_power = cpuid_edx(0x80000007);
+
+}
+
+static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+	int i;
+
+	/*
+	 * First of all, decide if this is a 486 or higher
+	 * It's a 486 if we can modify the AC flag
+	 */
+	if (flag_is_changeable_p(X86_EFLAGS_AC))
+		c->x86 = 4;
+	else
+		c->x86 = 3;
+
+	for (i = 0; i < X86_VENDOR_NUM; i++)
+		if (cpu_devs[i] && cpu_devs[i]->c_identify) {
+			c->x86_vendor_id[0] = 0;
+			cpu_devs[i]->c_identify(c);
+			if (c->x86_vendor_id[0]) {
+				get_cpu_vendor(c);
+				break;
+			}
+		}
+#endif
 }
 
 /*
@@ -321,25 +513,61 @@
  * WARNING: this function is only called on the BP.  Don't add code here
  * that is supposed to run on all CPUs.
  */
-static void __init early_cpu_detect(void)
+static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 {
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	c->x86_cache_alignment = 32;
+#ifdef CONFIG_X86_64
+	c->x86_clflush_size = 64;
+#else
 	c->x86_clflush_size = 32;
+#endif
+	c->x86_cache_alignment = c->x86_clflush_size;
 
+	memset(&c->x86_capability, 0, sizeof c->x86_capability);
+	c->extended_cpuid_level = 0;
+
+	if (!have_cpuid_p())
+		identify_cpu_without_cpuid(c);
+
+	/* cyrix could have cpuid enabled via c_identify()*/
 	if (!have_cpuid_p())
 		return;
 
 	cpu_detect(c);
 
-	get_cpu_vendor(c, 1);
+	get_cpu_vendor(c);
 
-	early_get_cap(c);
+	get_cpu_cap(c);
 
-	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
-	    cpu_devs[c->x86_vendor]->c_early_init)
-		cpu_devs[c->x86_vendor]->c_early_init(c);
+	if (this_cpu->c_early_init)
+		this_cpu->c_early_init(c);
+
+	validate_pat_support(c);
+}
+
+void __init early_cpu_init(void)
+{
+	struct cpu_dev **cdev;
+	int count = 0;
+
+	printk("KERNEL supported cpus:\n");
+	for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
+		struct cpu_dev *cpudev = *cdev;
+		unsigned int j;
+
+		if (count >= X86_VENDOR_NUM)
+			break;
+		cpu_devs[count] = cpudev;
+		count++;
+
+		for (j = 0; j < 2; j++) {
+			if (!cpudev->c_ident[j])
+				continue;
+			printk("  %s %s\n", cpudev->c_vendor,
+				cpudev->c_ident[j]);
+		}
+	}
+
+	early_identify_cpu(&boot_cpu_data);
 }
 
 /*
@@ -357,87 +585,42 @@
 
 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
 {
-	u32 tfms, xlvl;
-	unsigned int ebx;
+	c->extended_cpuid_level = 0;
 
-	if (have_cpuid_p()) {
-		/* Get vendor name */
-		cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
-		      (unsigned int *)&c->x86_vendor_id[0],
-		      (unsigned int *)&c->x86_vendor_id[8],
-		      (unsigned int *)&c->x86_vendor_id[4]);
+	if (!have_cpuid_p())
+		identify_cpu_without_cpuid(c);
 
-		get_cpu_vendor(c, 0);
-		/* Initialize the standard set of capabilities */
-		/* Note that the vendor-specific code below might override */
-		/* Intel-defined flags: level 0x00000001 */
-		if (c->cpuid_level >= 0x00000001) {
-			u32 capability, excap;
-			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
-			c->x86_capability[0] = capability;
-			c->x86_capability[4] = excap;
-			c->x86 = (tfms >> 8) & 15;
-			c->x86_model = (tfms >> 4) & 15;
-			if (c->x86 == 0xf)
-				c->x86 += (tfms >> 20) & 0xff;
-			if (c->x86 >= 0x6)
-				c->x86_model += ((tfms >> 16) & 0xF) << 4;
-			c->x86_mask = tfms & 15;
-			c->initial_apicid = (ebx >> 24) & 0xFF;
-#ifdef CONFIG_X86_HT
-			c->apicid = phys_pkg_id(c->initial_apicid, 0);
-			c->phys_proc_id = c->initial_apicid;
-#else
-			c->apicid = c->initial_apicid;
+	/* cyrix could have cpuid enabled via c_identify()*/
+	if (!have_cpuid_p())
+		return;
+
+	cpu_detect(c);
+
+	get_cpu_vendor(c);
+
+	get_cpu_cap(c);
+
+	if (c->cpuid_level >= 0x00000001) {
+		c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_HT
+		c->apicid = phys_pkg_id(c->initial_apicid, 0);
+# else
+		c->apicid = c->initial_apicid;
+# endif
 #endif
-			if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
-				c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
-		} else {
-			/* Have CPUID level 0 only - unheard of */
-			c->x86 = 4;
-		}
 
-		/* AMD-defined flags: level 0x80000001 */
-		xlvl = cpuid_eax(0x80000000);
-		if ((xlvl & 0xffff0000) == 0x80000000) {
-			if (xlvl >= 0x80000001) {
-				c->x86_capability[1] = cpuid_edx(0x80000001);
-				c->x86_capability[6] = cpuid_ecx(0x80000001);
-			}
-			if (xlvl >= 0x80000004)
-				get_model_name(c); /* Default name */
-		}
-
-		init_scattered_cpuid_features(c);
-		detect_nopl(c);
+#ifdef CONFIG_X86_HT
+		c->phys_proc_id = c->initial_apicid;
+#endif
 	}
+
+	get_model_name(c); /* Default name */
+
+	init_scattered_cpuid_features(c);
+	detect_nopl(c);
 }
 
-static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
-{
-	if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
-		/* Disable processor serial number */
-		unsigned long lo, hi;
-		rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-		lo |= 0x200000;
-		wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-		printk(KERN_NOTICE "CPU serial number disabled.\n");
-		clear_cpu_cap(c, X86_FEATURE_PN);
-
-		/* Disabling the serial number may affect the cpuid level */
-		c->cpuid_level = cpuid_eax(0);
-	}
-}
-
-static int __init x86_serial_nr_setup(char *s)
-{
-	disable_x86_serial_nr = 0;
-	return 1;
-}
-__setup("serialnumber", x86_serial_nr_setup);
-
-
-
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
@@ -448,30 +631,29 @@
 	c->loops_per_jiffy = loops_per_jiffy;
 	c->x86_cache_size = -1;
 	c->x86_vendor = X86_VENDOR_UNKNOWN;
-	c->cpuid_level = -1;	/* CPUID not detected */
 	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
 	c->x86_vendor_id[0] = '\0'; /* Unset */
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_max_cores = 1;
+	c->x86_coreid_bits = 0;
+#ifdef CONFIG_X86_64
+	c->x86_clflush_size = 64;
+#else
+	c->cpuid_level = -1;	/* CPUID not detected */
 	c->x86_clflush_size = 32;
+#endif
+	c->x86_cache_alignment = c->x86_clflush_size;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
-	if (!have_cpuid_p()) {
-		/*
-		 * First of all, decide if this is a 486 or higher
-		 * It's a 486 if we can modify the AC flag
-		 */
-		if (flag_is_changeable_p(X86_EFLAGS_AC))
-			c->x86 = 4;
-		else
-			c->x86 = 3;
-	}
-
 	generic_identify(c);
 
 	if (this_cpu->c_identify)
 		this_cpu->c_identify(c);
 
+#ifdef CONFIG_X86_64
+	c->apicid = phys_pkg_id(0);
+#endif
+
 	/*
 	 * Vendor-specific initialization.  In this section we
 	 * canonicalize the feature flags, meaning if there are
@@ -505,6 +687,10 @@
 				c->x86, c->x86_model);
 	}
 
+#ifdef CONFIG_X86_64
+	detect_ht(c);
+#endif
+
 	/*
 	 * On SMP, boot_cpu_data holds the common feature set between
 	 * all CPUs; so make sure that we indicate which features are
@@ -513,7 +699,7 @@
 	 */
 	if (c != &boot_cpu_data) {
 		/* AND the already accumulated flags with these */
-		for (i = 0 ; i < NCAPINTS ; i++)
+		for (i = 0; i < NCAPINTS; i++)
 			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 	}
 
@@ -521,72 +707,79 @@
 	for (i = 0; i < NCAPINTS; i++)
 		c->x86_capability[i] &= ~cleared_cpu_caps[i];
 
+#ifdef CONFIG_X86_MCE
 	/* Init Machine Check Exception if available. */
 	mcheck_init(c);
+#endif
 
 	select_idle_routine(c);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+	numa_add_cpu(smp_processor_id());
+#endif
 }
 
 void __init identify_boot_cpu(void)
 {
 	identify_cpu(&boot_cpu_data);
+#ifdef CONFIG_X86_32
 	sysenter_setup();
 	enable_sep_cpu();
+#endif
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
 {
 	BUG_ON(c == &boot_cpu_data);
 	identify_cpu(c);
+#ifdef CONFIG_X86_32
 	enable_sep_cpu();
+#endif
 	mtrr_ap_init();
 }
 
-#ifdef CONFIG_X86_HT
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+struct msr_range {
+	unsigned min;
+	unsigned max;
+};
+
+static struct msr_range msr_range_array[] __cpuinitdata = {
+	{ 0x00000000, 0x00000418},
+	{ 0xc0000000, 0xc000040b},
+	{ 0xc0010000, 0xc0010142},
+	{ 0xc0011000, 0xc001103b},
+};
+
+static void __cpuinit print_cpu_msr(void)
 {
-	u32 	eax, ebx, ecx, edx;
-	int 	index_msb, core_bits;
+	unsigned index;
+	u64 val;
+	int i;
+	unsigned index_min, index_max;
 
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-
-	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
-		return;
-
-	smp_num_siblings = (ebx & 0xff0000) >> 16;
-
-	if (smp_num_siblings == 1) {
-		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
-
-		if (smp_num_siblings > NR_CPUS) {
-			printk(KERN_WARNING "CPU: Unsupported number of the "
-					"siblings %d", smp_num_siblings);
-			smp_num_siblings = 1;
-			return;
+	for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
+		index_min = msr_range_array[i].min;
+		index_max = msr_range_array[i].max;
+		for (index = index_min; index < index_max; index++) {
+			if (rdmsrl_amd_safe(index, &val))
+				continue;
+			printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
 		}
-
-		index_msb = get_count_order(smp_num_siblings);
-		c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
-
-		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-		       c->phys_proc_id);
-
-		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
-		index_msb = get_count_order(smp_num_siblings) ;
-
-		core_bits = get_count_order(c->x86_max_cores);
-
-		c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
-					       ((1 << core_bits) - 1);
-
-		if (c->x86_max_cores > 1)
-			printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-			       c->cpu_core_id);
 	}
 }
-#endif
+
+static int show_msr __cpuinitdata;
+static __init int setup_show_msr(char *arg)
+{
+	int num;
+
+	get_option(&arg, &num);
+
+	if (num > 0)
+		show_msr = num;
+	return 1;
+}
+__setup("show_msr=", setup_show_msr);
 
 static __init int setup_noclflush(char *arg)
 {
@@ -605,17 +798,25 @@
 		vendor = c->x86_vendor_id;
 
 	if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
-		printk("%s ", vendor);
+		printk(KERN_CONT "%s ", vendor);
 
-	if (!c->x86_model_id[0])
-		printk("%d86", c->x86);
+	if (c->x86_model_id[0])
+		printk(KERN_CONT "%s", c->x86_model_id);
 	else
-		printk("%s", c->x86_model_id);
+		printk(KERN_CONT "%d86", c->x86);
 
 	if (c->x86_mask || c->cpuid_level >= 0)
-		printk(" stepping %02x\n", c->x86_mask);
+		printk(KERN_CONT " stepping %02x\n", c->x86_mask);
 	else
-		printk("\n");
+		printk(KERN_CONT "\n");
+
+#ifdef CONFIG_SMP
+	if (c->cpu_index < show_msr)
+		print_cpu_msr();
+#else
+	if (show_msr)
+		print_cpu_msr();
+#endif
 }
 
 static __init int setup_disablecpuid(char *arg)
@@ -631,19 +832,89 @@
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
-void __init early_cpu_init(void)
+#ifdef CONFIG_X86_64
+struct x8664_pda **_cpu_pda __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+
+void __cpuinit pda_init(int cpu)
 {
-	struct cpu_vendor_dev *cvdev;
+	struct x8664_pda *pda = cpu_pda(cpu);
 
-	for (cvdev = __x86cpuvendor_start ;
-	     cvdev < __x86cpuvendor_end   ;
-	     cvdev++)
-		cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+	/* Setup up data that may be needed in __get_free_pages early */
+	loadsegment(fs, 0);
+	loadsegment(gs, 0);
+	/* Memory clobbers used to order PDA accessed */
+	mb();
+	wrmsrl(MSR_GS_BASE, pda);
+	mb();
 
-	early_cpu_detect();
-	validate_pat_support(&boot_cpu_data);
+	pda->cpunumber = cpu;
+	pda->irqcount = -1;
+	pda->kernelstack = (unsigned long)stack_thread_info() -
+				 PDA_STACKOFFSET + THREAD_SIZE;
+	pda->active_mm = &init_mm;
+	pda->mmu_state = 0;
+
+	if (cpu == 0) {
+		/* others are initialized in smpboot.c */
+		pda->pcurrent = &init_task;
+		pda->irqstackptr = boot_cpu_stack;
+		pda->irqstackptr += IRQSTACKSIZE - 64;
+	} else {
+		if (!pda->irqstackptr) {
+			pda->irqstackptr = (char *)
+				__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+			if (!pda->irqstackptr)
+				panic("cannot allocate irqstack for cpu %d",
+				      cpu);
+			pda->irqstackptr += IRQSTACKSIZE - 64;
+		}
+
+		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+			pda->nodenumber = cpu_to_node(cpu);
+	}
 }
 
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+			   DEBUG_STKSZ] __page_aligned_bss;
+
+extern asmlinkage void ignore_sysret(void);
+
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+	/*
+	 * LSTAR and STAR live in a bit strange symbiosis.
+	 * They both write to the same internal register. STAR allows to
+	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+	 */
+	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
+	wrmsrl(MSR_LSTAR, system_call);
+	wrmsrl(MSR_CSTAR, ignore_sysret);
+
+#ifdef CONFIG_IA32_EMULATION
+	syscall32_cpu_init();
+#endif
+
+	/* Flags to clear on syscall */
+	wrmsrl(MSR_SYSCALL_MASK,
+	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+}
+
+unsigned long kernel_eflags;
+
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
+#else
+
 /* Make sure %fs is initialized properly in idle threads */
 struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
 {
@@ -651,25 +922,136 @@
 	regs->fs = __KERNEL_PERCPU;
 	return regs;
 }
-
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
-{
-	struct desc_ptr gdt_descr;
-
-	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
-	gdt_descr.size = GDT_SIZE - 1;
-	load_gdt(&gdt_descr);
-	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
-}
+#endif
 
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
  * and IDT. We reload them nevertheless, this function acts as a
  * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init for 64 bit
  */
+#ifdef CONFIG_X86_64
+void __cpuinit cpu_init(void)
+{
+	int cpu = stack_smp_processor_id();
+	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+	unsigned long v;
+	char *estacks = NULL;
+	struct task_struct *me;
+	int i;
+
+	/* CPU 0 is initialised in head64.c */
+	if (cpu != 0)
+		pda_init(cpu);
+	else
+		estacks = boot_exception_stacks;
+
+	me = current;
+
+	if (cpu_test_and_set(cpu, cpu_initialized))
+		panic("CPU#%d already initialized!\n", cpu);
+
+	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+	/*
+	 * Initialize the per-CPU GDT with the boot GDT,
+	 * and set up the GDT descriptor:
+	 */
+
+	switch_to_new_gdt();
+	load_idt((const struct desc_ptr *)&idt_descr);
+
+	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+	syscall_init();
+
+	wrmsrl(MSR_FS_BASE, 0);
+	wrmsrl(MSR_KERNEL_GS_BASE, 0);
+	barrier();
+
+	check_efer();
+	if (cpu != 0 && x2apic)
+		enable_x2apic();
+
+	/*
+	 * set up and load the per-CPU TSS
+	 */
+	if (!orig_ist->ist[0]) {
+		static const unsigned int order[N_EXCEPTION_STACKS] = {
+		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+		  [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+		};
+		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+			if (cpu) {
+				estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+				if (!estacks)
+					panic("Cannot allocate exception "
+					      "stack %ld %d\n", v, cpu);
+			}
+			estacks += PAGE_SIZE << order[v];
+			orig_ist->ist[v] = t->x86_tss.ist[v] =
+					(unsigned long)estacks;
+		}
+	}
+
+	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	/*
+	 * <= is required because the CPU will access up to
+	 * 8 bits beyond the end of the IO permission bitmap.
+	 */
+	for (i = 0; i <= IO_BITMAP_LONGS; i++)
+		t->io_bitmap[i] = ~0UL;
+
+	atomic_inc(&init_mm.mm_count);
+	me->active_mm = &init_mm;
+	if (me->mm)
+		BUG();
+	enter_lazy_tlb(&init_mm, me);
+
+	load_sp0(t, &current->thread);
+	set_tss_desc(cpu, t);
+	load_TR_desc();
+	load_LDT(&init_mm.context);
+
+#ifdef CONFIG_KGDB
+	/*
+	 * If the kgdb is connected no debug regs should be altered.  This
+	 * is only applicable when KGDB and a KGDB I/O module are built
+	 * into the kernel and you are using early debugging with
+	 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+	 */
+	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+		arch_kgdb_ops.correct_hw_break();
+	else {
+#endif
+	/*
+	 * Clear all 6 debug registers:
+	 */
+
+	set_debugreg(0UL, 0);
+	set_debugreg(0UL, 1);
+	set_debugreg(0UL, 2);
+	set_debugreg(0UL, 3);
+	set_debugreg(0UL, 6);
+	set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+	/* If the kgdb is connected no debug regs should be altered. */
+	}
+#endif
+
+	fpu_init();
+
+	raw_local_save_flags(kernel_eflags);
+
+	if (is_uv_system())
+		uv_cpu_init();
+}
+
+#else
+
 void __cpuinit cpu_init(void)
 {
 	int cpu = smp_processor_id();
@@ -723,9 +1105,20 @@
 	/*
 	 * Force FPU initialization:
 	 */
-	current_thread_info()->status = 0;
+	if (cpu_has_xsave)
+		current_thread_info()->status = TS_XSAVE;
+	else
+		current_thread_info()->status = 0;
 	clear_used_math();
 	mxcsr_feature_mask_init();
+
+	/*
+	 * Boot processor to setup the FP and extended state context info.
+	 */
+	if (!smp_processor_id())
+		init_thread_xstate();
+
+	xsave_init();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -739,3 +1132,5 @@
 	per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
 }
 #endif
+
+#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index 305b465..0000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,763 +0,0 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
-#include <linux/delay.h>
-#include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
-#include <asm/mmu_context.h>
-#include <asm/mtrr.h>
-#include <asm/mce.h>
-#include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <mach_apic.h>
-#endif
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/genapic.h>
-
-#include "cpu.h"
-
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types  kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
-	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
-{
-	struct desc_ptr gdt_descr;
-
-	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
-	gdt_descr.size = GDT_SIZE - 1;
-	load_gdt(&gdt_descr);
-}
-
-struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-	display_cacheinfo(c);
-}
-
-static struct cpu_dev __cpuinitdata default_cpu = {
-	.c_init	= default_init,
-	.c_vendor = "Unknown",
-};
-static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
-
-int __cpuinit get_model_name(struct cpuinfo_x86 *c)
-{
-	unsigned int *v;
-
-	if (c->extended_cpuid_level < 0x80000004)
-		return 0;
-
-	v = (unsigned int *) c->x86_model_id;
-	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
-	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
-	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
-	c->x86_model_id[48] = 0;
-	return 1;
-}
-
-
-void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
-{
-	unsigned int n, dummy, ebx, ecx, edx;
-
-	n = c->extended_cpuid_level;
-
-	if (n >= 0x80000005) {
-		cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
-		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
-		       "D cache %dK (%d bytes/line)\n",
-		       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
-		c->x86_cache_size = (ecx>>24) + (edx>>24);
-		/* On K8 L1 TLB is inclusive, so don't count it */
-		c->x86_tlbsize = 0;
-	}
-
-	if (n >= 0x80000006) {
-		cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
-		ecx = cpuid_ecx(0x80000006);
-		c->x86_cache_size = ecx >> 16;
-		c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
-
-		printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
-		c->x86_cache_size, ecx & 0xFF);
-	}
-}
-
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	u32 eax, ebx, ecx, edx;
-	int index_msb, core_bits;
-
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-
-
-	if (!cpu_has(c, X86_FEATURE_HT))
-		return;
-	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
-		goto out;
-
-	smp_num_siblings = (ebx & 0xff0000) >> 16;
-
-	if (smp_num_siblings == 1) {
-		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
-
-		if (smp_num_siblings > NR_CPUS) {
-			printk(KERN_WARNING "CPU: Unsupported number of "
-			       "siblings %d", smp_num_siblings);
-			smp_num_siblings = 1;
-			return;
-		}
-
-		index_msb = get_count_order(smp_num_siblings);
-		c->phys_proc_id = phys_pkg_id(index_msb);
-
-		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
-		index_msb = get_count_order(smp_num_siblings);
-
-		core_bits = get_count_order(c->x86_max_cores);
-
-		c->cpu_core_id = phys_pkg_id(index_msb) &
-					       ((1 << core_bits) - 1);
-	}
-out:
-	if ((c->x86_max_cores * smp_num_siblings) > 1) {
-		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-		       c->phys_proc_id);
-		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-		       c->cpu_core_id);
-	}
-
-#endif
-}
-
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
-{
-	char *v = c->x86_vendor_id;
-	int i;
-	static int printed;
-
-	for (i = 0; i < X86_VENDOR_NUM; i++) {
-		if (cpu_devs[i]) {
-			if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
-			    (cpu_devs[i]->c_ident[1] &&
-			    !strcmp(v, cpu_devs[i]->c_ident[1]))) {
-				c->x86_vendor = i;
-				this_cpu = cpu_devs[i];
-				return;
-			}
-		}
-	}
-	if (!printed) {
-		printed++;
-		printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
-		printk(KERN_ERR "CPU: Your system may be unstable.\n");
-	}
-	c->x86_vendor = X86_VENDOR_UNKNOWN;
-}
-
-static void __init early_cpu_support_print(void)
-{
-	int i,j;
-	struct cpu_dev *cpu_devx;
-
-	printk("KERNEL supported cpus:\n");
-	for (i = 0; i < X86_VENDOR_NUM; i++) {
-		cpu_devx = cpu_devs[i];
-		if (!cpu_devx)
-			continue;
-		for (j = 0; j < 2; j++) {
-			if (!cpu_devx->c_ident[j])
-				continue;
-			printk("  %s %s\n", cpu_devx->c_vendor,
-				cpu_devx->c_ident[j]);
-		}
-	}
-}
-
-/*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6, unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect.  Hence, probe for it based on first
- * principles.
- *
- * Note: no 64-bit chip is known to lack these, but put the code here
- * for consistency with 32 bits, and to make it utterly trivial to
- * diagnose the problem should it ever surface.
- */
-static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
-{
-	const u32 nopl_signature = 0x888c53b1; /* Random number */
-	u32 has_nopl = nopl_signature;
-
-	clear_cpu_cap(c, X86_FEATURE_NOPL);
-	if (c->x86 >= 6) {
-		asm volatile("\n"
-			     "1:      .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
-			     "2:\n"
-			     "        .section .fixup,\"ax\"\n"
-			     "3:      xor %0,%0\n"
-			     "        jmp 2b\n"
-			     "        .previous\n"
-			     _ASM_EXTABLE(1b,3b)
-			     : "+a" (has_nopl));
-
-		if (has_nopl == nopl_signature)
-			set_cpu_cap(c, X86_FEATURE_NOPL);
-	}
-}
-
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
-
-void __init early_cpu_init(void)
-{
-        struct cpu_vendor_dev *cvdev;
-
-        for (cvdev = __x86cpuvendor_start ;
-             cvdev < __x86cpuvendor_end   ;
-             cvdev++)
-                cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
-	early_cpu_support_print();
-	early_identify_cpu(&boot_cpu_data);
-}
-
-/* Do some early cpuid on the boot CPU to get some parameter that are
-   needed before check_bugs. Everything advanced is in identify_cpu
-   below. */
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
-{
-	u32 tfms, xlvl;
-
-	c->loops_per_jiffy = loops_per_jiffy;
-	c->x86_cache_size = -1;
-	c->x86_vendor = X86_VENDOR_UNKNOWN;
-	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
-	c->x86_vendor_id[0] = '\0'; /* Unset */
-	c->x86_model_id[0] = '\0';  /* Unset */
-	c->x86_clflush_size = 64;
-	c->x86_cache_alignment = c->x86_clflush_size;
-	c->x86_max_cores = 1;
-	c->x86_coreid_bits = 0;
-	c->extended_cpuid_level = 0;
-	memset(&c->x86_capability, 0, sizeof c->x86_capability);
-
-	/* Get vendor name */
-	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
-	      (unsigned int *)&c->x86_vendor_id[0],
-	      (unsigned int *)&c->x86_vendor_id[8],
-	      (unsigned int *)&c->x86_vendor_id[4]);
-
-	get_cpu_vendor(c);
-
-	/* Initialize the standard set of capabilities */
-	/* Note that the vendor-specific code below might override */
-
-	/* Intel-defined flags: level 0x00000001 */
-	if (c->cpuid_level >= 0x00000001) {
-		__u32 misc;
-		cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
-		      &c->x86_capability[0]);
-		c->x86 = (tfms >> 8) & 0xf;
-		c->x86_model = (tfms >> 4) & 0xf;
-		c->x86_mask = tfms & 0xf;
-		if (c->x86 == 0xf)
-			c->x86 += (tfms >> 20) & 0xff;
-		if (c->x86 >= 0x6)
-			c->x86_model += ((tfms >> 16) & 0xF) << 4;
-		if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
-			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
-	} else {
-		/* Have CPUID level 0 only - unheard of */
-		c->x86 = 4;
-	}
-
-	c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
-#ifdef CONFIG_SMP
-	c->phys_proc_id = c->initial_apicid;
-#endif
-	/* AMD-defined flags: level 0x80000001 */
-	xlvl = cpuid_eax(0x80000000);
-	c->extended_cpuid_level = xlvl;
-	if ((xlvl & 0xffff0000) == 0x80000000) {
-		if (xlvl >= 0x80000001) {
-			c->x86_capability[1] = cpuid_edx(0x80000001);
-			c->x86_capability[6] = cpuid_ecx(0x80000001);
-		}
-		if (xlvl >= 0x80000004)
-			get_model_name(c); /* Default name */
-	}
-
-	/* Transmeta-defined flags: level 0x80860001 */
-	xlvl = cpuid_eax(0x80860000);
-	if ((xlvl & 0xffff0000) == 0x80860000) {
-		/* Don't set x86_cpuid_level here for now to not confuse. */
-		if (xlvl >= 0x80860001)
-			c->x86_capability[2] = cpuid_edx(0x80860001);
-	}
-
-	if (c->extended_cpuid_level >= 0x80000007)
-		c->x86_power = cpuid_edx(0x80000007);
-
-	if (c->extended_cpuid_level >= 0x80000008) {
-		u32 eax = cpuid_eax(0x80000008);
-
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-	}
-
-	detect_nopl(c);
-
-	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
-	    cpu_devs[c->x86_vendor]->c_early_init)
-		cpu_devs[c->x86_vendor]->c_early_init(c);
-
-	validate_pat_support(c);
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
-	int i;
-
-	early_identify_cpu(c);
-
-	init_scattered_cpuid_features(c);
-
-	c->apicid = phys_pkg_id(0);
-
-	/*
-	 * Vendor-specific initialization.  In this section we
-	 * canonicalize the feature flags, meaning if there are
-	 * features a certain CPU supports which CPUID doesn't
-	 * tell us, CPUID claiming incorrect flags, or other bugs,
-	 * we handle them here.
-	 *
-	 * At the end of this section, c->x86_capability better
-	 * indicate the features this CPU genuinely supports!
-	 */
-	if (this_cpu->c_init)
-		this_cpu->c_init(c);
-
-	detect_ht(c);
-
-	/*
-	 * On SMP, boot_cpu_data holds the common feature set between
-	 * all CPUs; so make sure that we indicate which features are
-	 * common between the CPUs.  The first time this routine gets
-	 * executed, c == &boot_cpu_data.
-	 */
-	if (c != &boot_cpu_data) {
-		/* AND the already accumulated flags with these */
-		for (i = 0; i < NCAPINTS; i++)
-			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
-	}
-
-	/* Clear all flags overriden by options */
-	for (i = 0; i < NCAPINTS; i++)
-		c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
-#ifdef CONFIG_X86_MCE
-	mcheck_init(c);
-#endif
-	select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
-	numa_add_cpu(smp_processor_id());
-#endif
-
-}
-
-void __cpuinit identify_boot_cpu(void)
-{
-	identify_cpu(&boot_cpu_data);
-}
-
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
-{
-	BUG_ON(c == &boot_cpu_data);
-	identify_cpu(c);
-	mtrr_ap_init();
-}
-
-static __init int setup_noclflush(char *arg)
-{
-	setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
-	return 1;
-}
-__setup("noclflush", setup_noclflush);
-
-struct msr_range {
-	unsigned min;
-	unsigned max;
-};
-
-static struct msr_range msr_range_array[] __cpuinitdata = {
-	{ 0x00000000, 0x00000418},
-	{ 0xc0000000, 0xc000040b},
-	{ 0xc0010000, 0xc0010142},
-	{ 0xc0011000, 0xc001103b},
-};
-
-static void __cpuinit print_cpu_msr(void)
-{
-	unsigned index;
-	u64 val;
-	int i;
-	unsigned index_min, index_max;
-
-	for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
-		index_min = msr_range_array[i].min;
-		index_max = msr_range_array[i].max;
-		for (index = index_min; index < index_max; index++) {
-			if (rdmsrl_amd_safe(index, &val))
-				continue;
-			printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
-		}
-	}
-}
-
-static int show_msr __cpuinitdata;
-static __init int setup_show_msr(char *arg)
-{
-	int num;
-
-	get_option(&arg, &num);
-
-	if (num > 0)
-		show_msr = num;
-	return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
-{
-	if (c->x86_model_id[0])
-		printk(KERN_CONT "%s", c->x86_model_id);
-
-	if (c->x86_mask || c->cpuid_level >= 0)
-		printk(KERN_CONT " stepping %02x\n", c->x86_mask);
-	else
-		printk(KERN_CONT "\n");
-
-#ifdef CONFIG_SMP
-	if (c->cpu_index < show_msr)
-		print_cpu_msr();
-#else
-	if (show_msr)
-		print_cpu_msr();
-#endif
-}
-
-static __init int setup_disablecpuid(char *arg)
-{
-	int bit;
-	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
-		setup_clear_cpu_cap(bit);
-	else
-		return 0;
-	return 1;
-}
-__setup("clearcpuid=", setup_disablecpuid);
-
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-
-char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
-
-unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
-
-static int do_not_nx __cpuinitdata;
-
-/* noexec=on|off
-Control non executable mappings for 64bit processes.
-
-on	Enable(default)
-off	Disable
-*/
-static int __init nonx_setup(char *str)
-{
-	if (!str)
-		return -EINVAL;
-	if (!strncmp(str, "on", 2)) {
-		__supported_pte_mask |= _PAGE_NX;
-		do_not_nx = 0;
-	} else if (!strncmp(str, "off", 3)) {
-		do_not_nx = 1;
-		__supported_pte_mask &= ~_PAGE_NX;
-	}
-	return 0;
-}
-early_param("noexec", nonx_setup);
-
-int force_personality32;
-
-/* noexec32=on|off
-Control non executable heap for 32bit processes.
-To control the stack too use noexec=off
-
-on	PROT_READ does not imply PROT_EXEC for 32bit processes (default)
-off	PROT_READ implies PROT_EXEC
-*/
-static int __init nonx32_setup(char *str)
-{
-	if (!strcmp(str, "on"))
-		force_personality32 &= ~READ_IMPLIES_EXEC;
-	else if (!strcmp(str, "off"))
-		force_personality32 |= READ_IMPLIES_EXEC;
-	return 1;
-}
-__setup("noexec32=", nonx32_setup);
-
-void pda_init(int cpu)
-{
-	struct x8664_pda *pda = cpu_pda(cpu);
-
-	/* Setup up data that may be needed in __get_free_pages early */
-	loadsegment(fs, 0);
-	loadsegment(gs, 0);
-	/* Memory clobbers used to order PDA accessed */
-	mb();
-	wrmsrl(MSR_GS_BASE, pda);
-	mb();
-
-	pda->cpunumber = cpu;
-	pda->irqcount = -1;
-	pda->kernelstack = (unsigned long)stack_thread_info() -
-				 PDA_STACKOFFSET + THREAD_SIZE;
-	pda->active_mm = &init_mm;
-	pda->mmu_state = 0;
-
-	if (cpu == 0) {
-		/* others are initialized in smpboot.c */
-		pda->pcurrent = &init_task;
-		pda->irqstackptr = boot_cpu_stack;
-		pda->irqstackptr += IRQSTACKSIZE - 64;
-	} else {
-		if (!pda->irqstackptr) {
-			pda->irqstackptr = (char *)
-				__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-			if (!pda->irqstackptr)
-				panic("cannot allocate irqstack for cpu %d",
-				      cpu);
-			pda->irqstackptr += IRQSTACKSIZE - 64;
-		}
-
-		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
-			pda->nodenumber = cpu_to_node(cpu);
-	}
-}
-
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-			   DEBUG_STKSZ] __page_aligned_bss;
-
-extern asmlinkage void ignore_sysret(void);
-
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
-{
-	/*
-	 * LSTAR and STAR live in a bit strange symbiosis.
-	 * They both write to the same internal register. STAR allows to
-	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
-	 */
-	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
-	wrmsrl(MSR_LSTAR, system_call);
-	wrmsrl(MSR_CSTAR, ignore_sysret);
-
-#ifdef CONFIG_IA32_EMULATION
-	syscall32_cpu_init();
-#endif
-
-	/* Flags to clear on syscall */
-	wrmsrl(MSR_SYSCALL_MASK,
-	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
-}
-
-void __cpuinit check_efer(void)
-{
-	unsigned long efer;
-
-	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || do_not_nx)
-		__supported_pte_mask &= ~_PAGE_NX;
-}
-
-unsigned long kernel_eflags;
-
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init.
- */
-void __cpuinit cpu_init(void)
-{
-	int cpu = stack_smp_processor_id();
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
-	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-	unsigned long v;
-	char *estacks = NULL;
-	struct task_struct *me;
-	int i;
-
-	/* CPU 0 is initialised in head64.c */
-	if (cpu != 0)
-		pda_init(cpu);
-	else
-		estacks = boot_exception_stacks;
-
-	me = current;
-
-	if (cpu_test_and_set(cpu, cpu_initialized))
-		panic("CPU#%d already initialized!\n", cpu);
-
-	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
-
-	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
-	/*
-	 * Initialize the per-CPU GDT with the boot GDT,
-	 * and set up the GDT descriptor:
-	 */
-
-	switch_to_new_gdt();
-	load_idt((const struct desc_ptr *)&idt_descr);
-
-	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
-	syscall_init();
-
-	wrmsrl(MSR_FS_BASE, 0);
-	wrmsrl(MSR_KERNEL_GS_BASE, 0);
-	barrier();
-
-	check_efer();
-
-	/*
-	 * set up and load the per-CPU TSS
-	 */
-	if (!orig_ist->ist[0]) {
-		static const unsigned int order[N_EXCEPTION_STACKS] = {
-		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-		  [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
-		};
-		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-			if (cpu) {
-				estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-				if (!estacks)
-					panic("Cannot allocate exception "
-					      "stack %ld %d\n", v, cpu);
-			}
-			estacks += PAGE_SIZE << order[v];
-			orig_ist->ist[v] = t->x86_tss.ist[v] =
-					(unsigned long)estacks;
-		}
-	}
-
-	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
-	/*
-	 * <= is required because the CPU will access up to
-	 * 8 bits beyond the end of the IO permission bitmap.
-	 */
-	for (i = 0; i <= IO_BITMAP_LONGS; i++)
-		t->io_bitmap[i] = ~0UL;
-
-	atomic_inc(&init_mm.mm_count);
-	me->active_mm = &init_mm;
-	if (me->mm)
-		BUG();
-	enter_lazy_tlb(&init_mm, me);
-
-	load_sp0(t, &current->thread);
-	set_tss_desc(cpu, t);
-	load_TR_desc();
-	load_LDT(&init_mm.context);
-
-#ifdef CONFIG_KGDB
-	/*
-	 * If the kgdb is connected no debug regs should be altered.  This
-	 * is only applicable when KGDB and a KGDB I/O module are built
-	 * into the kernel and you are using early debugging with
-	 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
-	 */
-	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
-		arch_kgdb_ops.correct_hw_break();
-	else {
-#endif
-	/*
-	 * Clear all 6 debug registers:
-	 */
-
-	set_debugreg(0UL, 0);
-	set_debugreg(0UL, 1);
-	set_debugreg(0UL, 2);
-	set_debugreg(0UL, 3);
-	set_debugreg(0UL, 6);
-	set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
-	/* If the kgdb is connected no debug regs should be altered. */
-	}
-#endif
-
-	fpu_init();
-
-	raw_local_save_flags(kernel_eflags);
-
-	if (is_uv_system())
-		uv_cpu_init();
-}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8..de4094a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@
 	void		(*c_init)(struct cpuinfo_x86 * c);
 	void		(*c_identify)(struct cpuinfo_x86 * c);
 	unsigned int	(*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
+	int	c_x86_vendor;
 };
 
-extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
+#define cpu_dev_register(cpu_devX) \
+	static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+	__attribute__((__section__(".x86_cpu_dev.init"))) = \
+	&cpu_devX;
 
-struct cpu_vendor_dev {
-	int vendor;
-	struct cpu_dev *cpu_dev;
-};
+extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
 
-#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
-	static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
-	__attribute__((__section__(".x86cpuvendor.init"))) = \
-	{ cpu_vendor_id, cpu_dev }
-
-extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
-
-extern int get_model_name(struct cpuinfo_x86 *c);
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
 #endif
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 898a5a2..ffd0f5e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -121,7 +121,7 @@
 	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
 
 	/* Load/Store Serialize to mem access disable (=reorder it) */
-	setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
+	setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
 	/* set load/store serialize from 1GB to 4GB */
 	ccr3 |= 0xe0;
 	setCx86(CX86_CCR3, ccr3);
@@ -132,11 +132,11 @@
 	printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
 
 	/* CCR2 bit 2: unlock NW bit */
-	setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
+	setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
 	/* set 'Not Write-through' */
 	write_cr0(read_cr0() | X86_CR0_NW);
 	/* CCR2 bit 2: lock NW bit and set WT1 */
-	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
+	setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
 }
 
 /*
@@ -150,14 +150,14 @@
 	local_irq_save(flags);
 
 	/* Suspend on halt power saving and enable #SUSP pin */
-	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+	setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
 
 	ccr3 = getCx86(CX86_CCR3);
 	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);	/* enable MAPEN */
 
 
 	/* FPU fast, DTE cache, Mem bypass */
-	setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
+	setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
 	setCx86(CX86_CCR3, ccr3);			/* disable MAPEN */
 
 	set_cx86_memwb();
@@ -291,7 +291,7 @@
 		/* GXm supports extended cpuid levels 'ala' AMD */
 		if (c->cpuid_level == 2) {
 			/* Enable cxMMX extensions (GX1 Datasheet 54) */
-			setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
+			setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
 
 			/*
 			 * GXm : 0x30 ... 0x5f GXm  datasheet 51
@@ -301,7 +301,6 @@
 			 */
 			if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
 				geode_configure();
-			get_model_name(c);  /* get CPU marketing name */
 			return;
 		} else { /* MediaGX */
 			Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -314,7 +313,7 @@
 		if (dir1 > 7) {
 			dir0_msn++;  /* M II */
 			/* Enable MMX extensions (App note 108) */
-			setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
+			setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
 		} else {
 			c->coma_bug = 1;      /* 6x86MX, it has the bug. */
 		}
@@ -429,7 +428,7 @@
 			local_irq_save(flags);
 			ccr3 = getCx86(CX86_CCR3);
 			setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);       /* enable MAPEN  */
-			setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);  /* enable cpuid  */
+			setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);  /* enable cpuid  */
 			setCx86(CX86_CCR3, ccr3);                       /* disable MAPEN */
 			local_irq_restore(flags);
 		}
@@ -442,14 +441,16 @@
 	.c_early_init	= early_init_cyrix,
 	.c_init		= init_cyrix,
 	.c_identify	= cyrix_identify,
+	.c_x86_vendor	= X86_VENDOR_CYRIX,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev);
+cpu_dev_register(cyrix_cpu_dev);
 
 static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
 	.c_vendor	= "NSC",
 	.c_ident	= { "Geode by NSC" },
 	.c_init		= init_nsc,
+	.c_x86_vendor	= X86_VENDOR_NSC,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev);
+cpu_dev_register(nsc_cpu_dev);
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index c901779..0000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Strings for the various x86 capability flags.
- *
- * This file must not contain any executable code.
- */
-
-#include <asm/cpufeature.h>
-
-/*
- * These flag bits must match the definitions in <asm/cpufeature.h>.
- * NULL means this bit is undefined or reserved; either way it doesn't
- * have meaning as far as Linux is concerned.  Note that it's important
- * to realize there is a difference between this table and CPUID -- if
- * applications want to get the raw CPUID data, they should access
- * /dev/cpu/<cpu_nr>/cpuid instead.
- */
-const char * const x86_cap_flags[NCAPINTS*32] = {
-	/* Intel-defined */
-	"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-	"cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
-	"pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
-	"fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
-
-	/* AMD-defined */
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
-	NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
-	"3dnowext", "3dnow",
-
-	/* Transmeta-defined */
-	"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* Other (Linux-defined) */
-	"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
-	NULL, NULL, NULL, NULL,
-	"constant_tsc", "up", NULL, "arch_perfmon",
-	"pebs", "bts", NULL, NULL,
-	"rep_good", NULL, NULL, NULL,
-	"nopl", NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* Intel-defined (#2) */
-	"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-	"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-	NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* VIA/Cyrix/Centaur-defined */
-	NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
-	"ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* AMD-defined (#2) */
-	"lahf_lm", "cmp_legacy", "svm", "extapic",
-	"cr8_legacy", "abm", "sse4a", "misalignsse",
-	"3dnowprefetch", "osvw", "ibs", "sse5",
-	"skinit", "wdt", NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* Auxiliary (Linux-defined) */
-	"ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-};
-
-const char *const x86_power_flags[32] = {
-	"ts",	/* temperature sensor */
-	"fid",  /* frequency id control */
-	"vid",  /* voltage id control */
-	"ttp",  /* thermal trip */
-	"tm",
-	"stc",
-	"100mhzsteps",
-	"hwpstate",
-	"",	/* tsc invariant mapped to constant_tsc */
-		/* nothing */
-};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f113ef4..99468db 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
 #include <asm/ds.h>
 #include <asm/bugs.h>
 
+#ifdef CONFIG_X86_64
+#include <asm/topology.h>
+#include <asm/numa_64.h>
+#endif
+
 #include "cpu.h"
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
 #include <mach_apic.h>
 #endif
 
-#ifdef CONFIG_X86_INTEL_USERCOPY
-/*
- * Alignment at which movsl is preferred for bulk memory copies.
- */
-struct movsl_mask movsl_mask __read_mostly;
-#endif
-
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
-	/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
-	if (c->x86 == 15 && c->x86_cache_alignment == 64)
-		c->x86_cache_alignment = 128;
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 		(c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#else
+	/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
+	if (c->x86 == 15 && c->x86_cache_alignment == 64)
+		c->x86_cache_alignment = 128;
+#endif
 }
 
+#ifdef CONFIG_X86_32
 /*
  *	Early probe support logic for ppro memory erratum #50
  *
@@ -59,45 +63,6 @@
 	return 0;
 }
 
-
-/*
- * P4 Xeon errata 037 workaround.
- * Hardware prefetcher may cause stale data to be loaded into the cache.
- */
-static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
-{
-	unsigned long lo, hi;
-
-	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
-		rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
-		if ((lo & (1<<9)) == 0) {
-			printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
-			printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
-			lo |= (1<<9);	/* Disable hw prefetching */
-			wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
-		}
-	}
-}
-
-
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	if (c->cpuid_level < 4)
-		return 1;
-
-	/* Intel has a non-standard dependency on %ecx for this CPUID level. */
-	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
-	if (eax & 0x1f)
-		return ((eax >> 26) + 1);
-	else
-		return 1;
-}
-
 #ifdef CONFIG_X86_F00F_BUG
 static void __cpuinit trap_init_f00f_bug(void)
 {
@@ -112,12 +77,9 @@
 }
 #endif
 
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 {
-	unsigned int l2 = 0;
-	char *p = NULL;
-
-	early_init_intel(c);
+	unsigned long lo, hi;
 
 #ifdef CONFIG_X86_F00F_BUG
 	/*
@@ -138,6 +100,148 @@
 	}
 #endif
 
+	/*
+	 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
+	 * model 3 mask 3
+	 */
+	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+		clear_cpu_cap(c, X86_FEATURE_SEP);
+
+	/*
+	 * P4 Xeon errata 037 workaround.
+	 * Hardware prefetcher may cause stale data to be loaded into the cache.
+	 */
+	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
+		rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+		if ((lo & (1<<9)) == 0) {
+			printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
+			printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
+			lo |= (1<<9);	/* Disable hw prefetching */
+			wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
+		}
+	}
+
+	/*
+	 * See if we have a good local APIC by checking for buggy Pentia,
+	 * i.e. all B steppings and the C2 stepping of P54C when using their
+	 * integrated APIC (see 11AP erratum in "Pentium Processor
+	 * Specification Update").
+	 */
+	if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+	    (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+		set_cpu_cap(c, X86_FEATURE_11AP);
+
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	/*
+	 * Set up the preferred alignment for movsl bulk memory moves
+	 */
+	switch (c->x86) {
+	case 4:		/* 486: untested */
+		break;
+	case 5:		/* Old Pentia: untested */
+		break;
+	case 6:		/* PII/PIII only like movsl with 8-byte alignment */
+		movsl_mask.mask = 7;
+		break;
+	case 15:	/* P4 is OK down to 8-byte alignment */
+		movsl_mask.mask = 7;
+		break;
+	}
+#endif
+
+#ifdef CONFIG_X86_NUMAQ
+	numaq_tsc_disable();
+#endif
+}
+#else
+static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+static void __cpuinit srat_detect_node(void)
+{
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+	unsigned node;
+	int cpu = smp_processor_id();
+	int apicid = hard_smp_processor_id();
+
+	/* Don't do the funky fallback heuristics the AMD version employs
+	   for now. */
+	node = apicid_to_node[apicid];
+	if (node == NUMA_NO_NODE || !node_online(node))
+		node = first_node(node_online_map);
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+/*
+ * find out the number of processor cores on the die
+ */
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	if (c->cpuid_level < 4)
+		return 1;
+
+	/* Intel has a non-standard dependency on %ecx for this CPUID level. */
+	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
+	if (eax & 0x1f)
+		return ((eax >> 26) + 1);
+	else
+		return 1;
+}
+
+static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+	/* Intel VMX MSR indicated features */
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW	0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI		0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS	0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC	0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT		0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID		0x00000020
+
+	u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+
+	clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+	clear_cpu_cap(c, X86_FEATURE_VNMI);
+	clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+	clear_cpu_cap(c, X86_FEATURE_EPT);
+	clear_cpu_cap(c, X86_FEATURE_VPID);
+
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+	msr_ctl = vmx_msr_high | vmx_msr_low;
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+		set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+		set_cpu_cap(c, X86_FEATURE_VNMI);
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+		      vmx_msr_low, vmx_msr_high);
+		msr_ctl2 = vmx_msr_high | vmx_msr_low;
+		if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+		    (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+			set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+			set_cpu_cap(c, X86_FEATURE_EPT);
+		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+			set_cpu_cap(c, X86_FEATURE_VPID);
+	}
+}
+
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+{
+	unsigned int l2 = 0;
+
+	early_init_intel(c);
+
+	intel_workarounds(c);
+
 	l2 = init_intel_cacheinfo(c);
 	if (c->cpuid_level > 9) {
 		unsigned eax = cpuid_eax(10);
@@ -146,16 +250,32 @@
 			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 	}
 
-	/* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
-	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
-		clear_cpu_cap(c, X86_FEATURE_SEP);
+	if (cpu_has_xmm2)
+		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+	if (cpu_has_ds) {
+		unsigned int l1;
+		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<11)))
+			set_cpu_cap(c, X86_FEATURE_BTS);
+		if (!(l1 & (1<<12)))
+			set_cpu_cap(c, X86_FEATURE_PEBS);
+		ds_init_intel(c);
+	}
 
+#ifdef CONFIG_X86_64
+	if (c->x86 == 15)
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+	if (c->x86 == 6)
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+#else
 	/*
 	 * Names for the Pentium II/Celeron processors
 	 * detectable only by also checking the cache size.
 	 * Dixon is NOT a Celeron.
 	 */
 	if (c->x86 == 6) {
+		char *p = NULL;
+
 		switch (c->x86_model) {
 		case 5:
 			if (c->x86_mask == 0) {
@@ -178,71 +298,41 @@
 				p = "Celeron (Coppermine)";
 			break;
 		}
+
+		if (p)
+			strcpy(c->x86_model_id, p);
 	}
 
-	if (p)
-		strcpy(c->x86_model_id, p);
-
-	c->x86_max_cores = num_cpu_cores(c);
-
-	detect_ht(c);
-
-	/* Work around errata */
-	Intel_errata_workarounds(c);
-
-#ifdef CONFIG_X86_INTEL_USERCOPY
-	/*
-	 * Set up the preferred alignment for movsl bulk memory moves
-	 */
-	switch (c->x86) {
-	case 4:		/* 486: untested */
-		break;
-	case 5:		/* Old Pentia: untested */
-		break;
-	case 6:		/* PII/PIII only like movsl with 8-byte alignment */
-		movsl_mask.mask = 7;
-		break;
-	case 15:	/* P4 is OK down to 8-byte alignment */
-		movsl_mask.mask = 7;
-		break;
-	}
-#endif
-
-	if (cpu_has_xmm2)
-		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-	if (c->x86 == 15) {
+	if (c->x86 == 15)
 		set_cpu_cap(c, X86_FEATURE_P4);
-	}
 	if (c->x86 == 6)
 		set_cpu_cap(c, X86_FEATURE_P3);
-	if (cpu_has_ds) {
-		unsigned int l1;
-		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
-		if (!(l1 & (1<<11)))
-			set_cpu_cap(c, X86_FEATURE_BTS);
-		if (!(l1 & (1<<12)))
-			set_cpu_cap(c, X86_FEATURE_PEBS);
-		ds_init_intel(c);
-	}
 
 	if (cpu_has_bts)
 		ptrace_bts_init_intel(c);
 
-	/*
-	 * See if we have a good local APIC by checking for buggy Pentia,
-	 * i.e. all B steppings and the C2 stepping of P54C when using their
-	 * integrated APIC (see 11AP erratum in "Pentium Processor
-	 * Specification Update").
-	 */
-	if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
-	    (c->x86_mask < 0x6 || c->x86_mask == 0xb))
-		set_cpu_cap(c, X86_FEATURE_11AP);
-
-#ifdef CONFIG_X86_NUMAQ
-	numaq_tsc_disable();
 #endif
+
+	detect_extended_topology(c);
+	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+		/*
+		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+		 * detection.
+		 */
+		c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+		detect_ht(c);
+#endif
+	}
+
+	/* Work around errata */
+	srat_detect_node();
+
+	if (cpu_has(c, X86_FEATURE_VMX))
+		detect_vmx_virtcap(c);
 }
 
+#ifdef CONFIG_X86_32
 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 {
 	/*
@@ -255,10 +345,12 @@
 		size = 256;
 	return size;
 }
+#endif
 
 static struct cpu_dev intel_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Intel",
 	.c_ident	= { "GenuineIntel" },
+#ifdef CONFIG_X86_32
 	.c_models = {
 		{ .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
 		  {
@@ -308,76 +400,12 @@
 		  }
 		},
 	},
+	.c_size_cache	= intel_size_cache,
+#endif
 	.c_early_init   = early_init_intel,
 	.c_init		= init_intel,
-	.c_size_cache	= intel_size_cache,
+	.c_x86_vendor	= X86_VENDOR_INTEL,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
-	u8 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u8 *)ptr;
-	if (prev == old)
-		*(u8 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
-	u16 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u16 *)ptr;
-	if (prev == old)
-		*(u16 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
-	u32 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u32 *)ptr;
-	if (prev == old)
-		*(u32 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
-	u64 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u64 *)ptr;
-	if (prev == old)
-		*(u64 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
-/* arch_initcall(intel_cpu_init); */
+cpu_dev_register(intel_cpu_dev);
 
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58..0000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/topology.h>
-#include <asm/numa_64.h>
-
-#include "cpu.h"
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
-	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
-	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
-	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
-}
-
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
-	unsigned int eax, t;
-
-	if (c->cpuid_level < 4)
-		return 1;
-
-	cpuid_count(4, 0, &eax, &t, &t, &t);
-
-	if (eax & 0x1f)
-		return ((eax >> 26) + 1);
-	else
-		return 1;
-}
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
-	unsigned node;
-	int cpu = smp_processor_id();
-	int apicid = hard_smp_processor_id();
-
-	/* Don't do the funky fallback heuristics the AMD version employs
-	   for now. */
-	node = apicid_to_node[apicid];
-	if (node == NUMA_NO_NODE || !node_online(node))
-		node = first_node(node_online_map);
-	numa_set_node(cpu, node);
-
-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
-	init_intel_cacheinfo(c);
-	if (c->cpuid_level > 9) {
-		unsigned eax = cpuid_eax(10);
-		/* Check for version and the number of counters */
-		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
-			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
-	}
-
-	if (cpu_has_ds) {
-		unsigned int l1, l2;
-		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
-		if (!(l1 & (1<<11)))
-			set_cpu_cap(c, X86_FEATURE_BTS);
-		if (!(l1 & (1<<12)))
-			set_cpu_cap(c, X86_FEATURE_PEBS);
-	}
-
-
-	if (cpu_has_bts)
-		ds_init_intel(c);
-
-	if (c->x86 == 15)
-		c->x86_cache_alignment = c->x86_clflush_size * 2;
-	if (c->x86 == 6)
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-	c->x86_max_cores = intel_num_cpu_cores(c);
-
-	srat_detect_node();
-}
-
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
-	.c_vendor	= "Intel",
-	.c_ident	= { "GenuineIntel" },
-	.c_early_init   = early_init_intel,
-	.c_init		= init_intel,
-};
-cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b0a10b..3f46afb 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
 /*
- *      Routines to indentify caches on Intel CPU.
+ *	Routines to indentify caches on Intel CPU.
  *
- *      Changes:
- *      Venkatesh Pallipadi	: Adding cache identification through cpuid(4)
+ *	Changes:
+ *	Venkatesh Pallipadi	: Adding cache identification through cpuid(4)
  *		Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
  *	Andi Kleen / Andreas Herrmann	: CPUID4 emulation on AMD.
  */
@@ -13,6 +13,7 @@
 #include <linux/compiler.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/pci.h>
 
 #include <asm/processor.h>
 #include <asm/smp.h>
@@ -130,9 +131,18 @@
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
+	unsigned long can_disable;
 	cpumask_t shared_cpu_map;	/* future?: only cpus/node is needed */
 };
 
+#ifdef CONFIG_PCI
+static struct pci_device_id k8_nb_id[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+	{}
+};
+#endif
+
 unsigned short			num_cache_leaves;
 
 /* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -182,9 +192,10 @@
 static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
 static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
 
-static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
-		       union _cpuid4_leaf_ebx *ebx,
-		       union _cpuid4_leaf_ecx *ecx)
+static void __cpuinit
+amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
+		     union _cpuid4_leaf_ebx *ebx,
+		     union _cpuid4_leaf_ecx *ecx)
 {
 	unsigned dummy;
 	unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -251,27 +262,40 @@
 		(ebx->split.ways_of_associativity + 1) - 1;
 }
 
-static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+static void __cpuinit
+amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
+{
+	if (index < 3)
+		return;
+	this_leaf->can_disable = 1;
+}
+
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
 {
 	union _cpuid4_leaf_eax 	eax;
 	union _cpuid4_leaf_ebx 	ebx;
 	union _cpuid4_leaf_ecx 	ecx;
 	unsigned		edx;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-	else
-		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full,  &edx);
+		if (boot_cpu_data.x86 >= 0x10)
+			amd_check_l3_disable(index, this_leaf);
+	} else {
+		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
+	}
+
 	if (eax.split.type == CACHE_TYPE_NULL)
 		return -EIO; /* better error ? */
 
 	this_leaf->eax = eax;
 	this_leaf->ebx = ebx;
 	this_leaf->ecx = ecx;
-	this_leaf->size = (ecx.split.number_of_sets + 1) *
-		(ebx.split.coherency_line_size + 1) *
-		(ebx.split.physical_line_partition + 1) *
-		(ebx.split.ways_of_associativity + 1);
+	this_leaf->size = (ecx.split.number_of_sets          + 1) *
+			  (ebx.split.coherency_line_size     + 1) *
+			  (ebx.split.physical_line_partition + 1) *
+			  (ebx.split.ways_of_associativity   + 1);
 	return 0;
 }
 
@@ -453,7 +477,7 @@
 
 /* pointer to _cpuid4_info array (for each cache leaf) */
 static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
-#define CPUID4_INFO_IDX(x, y)    (&((per_cpu(cpuid4_info, x))[y]))
+#define CPUID4_INFO_IDX(x, y)	(&((per_cpu(cpuid4_info, x))[y]))
 
 #ifdef CONFIG_SMP
 static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -490,7 +514,7 @@
 
 	this_leaf = CPUID4_INFO_IDX(cpu, index);
 	for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
-		sibling_leaf = CPUID4_INFO_IDX(sibling, index);	
+		sibling_leaf = CPUID4_INFO_IDX(sibling, index);
 		cpu_clear(cpu, sibling_leaf->shared_cpu_map);
 	}
 }
@@ -572,7 +596,7 @@
 
 /* pointer to array of kobjects for cpuX/cache/indexY */
 static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
-#define INDEX_KOBJECT_PTR(x, y)    (&((per_cpu(index_kobject, x))[y]))
+#define INDEX_KOBJECT_PTR(x, y)		(&((per_cpu(index_kobject, x))[y]))
 
 #define show_one_plus(file_name, object, val)				\
 static ssize_t show_##file_name						\
@@ -637,6 +661,99 @@
 	}
 }
 
+#define to_object(k)	container_of(k, struct _index_kobject, kobj)
+#define to_attr(a)	container_of(a, struct _cache_attr, attr)
+
+#ifdef CONFIG_PCI
+static struct pci_dev *get_k8_northbridge(int node)
+{
+	struct pci_dev *dev = NULL;
+	int i;
+
+	for (i = 0; i <= node; i++) {
+		do {
+			dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+			if (!dev)
+				break;
+		} while (!pci_match_id(&k8_nb_id[0], dev));
+		if (!dev)
+			break;
+	}
+	return dev;
+}
+#else
+static struct pci_dev *get_k8_northbridge(int node)
+{
+	return NULL;
+}
+#endif
+
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+{
+	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+	struct pci_dev *dev = NULL;
+	ssize_t ret = 0;
+	int i;
+
+	if (!this_leaf->can_disable)
+		return sprintf(buf, "Feature not enabled\n");
+
+	dev = get_k8_northbridge(node);
+	if (!dev) {
+		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < 2; i++) {
+		unsigned int reg;
+
+		pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+
+		ret += sprintf(buf, "%sEntry: %d\n", buf, i);
+		ret += sprintf(buf, "%sReads:  %s\tNew Entries: %s\n",  
+			buf,
+			reg & 0x80000000 ? "Disabled" : "Allowed",
+			reg & 0x40000000 ? "Disabled" : "Allowed");
+		ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
+			buf, (reg & 0x30000) >> 16, reg & 0xfff);
+	}
+	return ret;
+}
+
+static ssize_t
+store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
+		    size_t count)
+{
+	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+	struct pci_dev *dev = NULL;
+	unsigned int ret, index, val;
+
+	if (!this_leaf->can_disable)
+		return 0;
+
+	if (strlen(buf) > 15)
+		return -EINVAL;
+
+	ret = sscanf(buf, "%x %x", &index, &val);
+	if (ret != 2)
+		return -EINVAL;
+	if (index > 1)
+		return -EINVAL;
+
+	val |= 0xc0000000;
+	dev = get_k8_northbridge(node);
+	if (!dev) {
+		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
+		return -EINVAL;
+	}
+
+	pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
+	wbinvd();
+	pci_write_config_dword(dev, 0x1BC + index * 4, val);
+
+	return 1;
+}
+
 struct _cache_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -657,6 +774,8 @@
 define_one_ro(shared_cpu_map);
 define_one_ro(shared_cpu_list);
 
+static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+
 static struct attribute * default_attrs[] = {
 	&type.attr,
 	&level.attr,
@@ -667,12 +786,10 @@
 	&size.attr,
 	&shared_cpu_map.attr,
 	&shared_cpu_list.attr,
+	&cache_disable.attr,
 	NULL
 };
 
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
 static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
 {
 	struct _cache_attr *fattr = to_attr(attr);
@@ -682,14 +799,22 @@
 	ret = fattr->show ?
 		fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
 			buf) :
-	       	0;
+		0;
 	return ret;
 }
 
 static ssize_t store(struct kobject * kobj, struct attribute * attr,
 		     const char * buf, size_t count)
 {
-	return 0;
+	struct _cache_attr *fattr = to_attr(attr);
+	struct _index_kobject *this_leaf = to_object(kobj);
+	ssize_t ret;
+
+	ret = fattr->store ?
+		fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
+			buf, count) :
+		0;
+	return ret;
 }
 
 static struct sysfs_ops sysfs_ops = {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 726a5fc..4b031a4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -860,7 +860,7 @@
 	return err;
 }
 
-static void mce_remove_device(unsigned int cpu)
+static __cpuinit void mce_remove_device(unsigned int cpu)
 {
 	int i;
 
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 0000000..dfea390
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+#
+# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
+#
+
+($in, $out) = @ARGV;
+
+open(IN, "< $in\0")   or die "$0: cannot open: $in: $!\n";
+open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
+
+print OUT "#include <asm/cpufeature.h>\n\n";
+print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
+
+while (defined($line = <IN>)) {
+	if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
+		$macro = $1;
+		$feature = $2;
+		$tail = $3;
+		if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
+			$feature = $1;
+		}
+
+		if ($feature ne '') {
+			printf OUT "\t%-32s = \"%s\",\n",
+				"[$macro]", "\L$feature";
+		}
+	}
+}
+print OUT "};\n";
+
+close(IN);
+close(OUT);
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 0000000..5abbea2
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
+/*
+ * Strings for the various x86 power flags
+ *
+ * This file must not contain any executable code.
+ */
+
+#include <asm/cpufeature.h>
+
+const char *const x86_power_flags[32] = {
+	"ts",	/* temperature sensor */
+	"fid",  /* frequency id control */
+	"vid",  /* voltage id control */
+	"ttp",  /* thermal trip */
+	"tm",
+	"stc",
+	"100mhzsteps",
+	"hwpstate",
+	"",	/* tsc invariant mapped to constant_tsc */
+		/* nothing */
+};
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c..52b3fef 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
 #include <asm/msr.h>
 #include "cpu.h"
 
+static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
+{
+	u32 xlvl;
+
+	/* Transmeta-defined flags: level 0x80860001 */
+	xlvl = cpuid_eax(0x80860000);
+	if ((xlvl & 0xffff0000) == 0x80860000) {
+		if (xlvl >= 0x80860001)
+			c->x86_capability[2] = cpuid_edx(0x80860001);
+	}
+}
+
 static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 {
 	unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@
 	unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
 	char cpu_info[65];
 
-	get_model_name(c);	/* Same as AMD/Cyrix */
+	early_init_transmeta(c);
+
 	display_cacheinfo(c);
 
 	/* Print CMS and CPU revision */
@@ -85,23 +98,12 @@
 #endif
 }
 
-static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
-{
-	u32 xlvl;
-
-	/* Transmeta-defined flags: level 0x80860001 */
-	xlvl = cpuid_eax(0x80860000);
-	if ((xlvl & 0xffff0000) == 0x80860000) {
-		if (xlvl >= 0x80860001)
-			c->x86_capability[2] = cpuid_edx(0x80860001);
-	}
-}
-
 static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Transmeta",
 	.c_ident	= { "GenuineTMx86", "TransmetaCPU" },
+	.c_early_init	= early_init_transmeta,
 	.c_init		= init_transmeta,
-	.c_identify	= transmeta_identify,
+	.c_x86_vendor	= X86_VENDOR_TRANSMETA,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev);
+cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc909..e777f79 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@
 		  }
 		},
 	},
+	.c_x86_vendor	= X86_VENDOR_UMC,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev);
+cpu_dev_register(umc_cpu_dev);
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 66e48aa..78e642f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@
 		case E820_NVS:
 			printk(KERN_CONT "(ACPI NVS)\n");
 			break;
+		case E820_UNUSABLE:
+			printk("(unusable)\n");
+			break;
 		default:
 			printk(KERN_CONT "type %u\n", e820.map[i].type);
 			break;
@@ -1260,6 +1263,7 @@
 	case E820_RAM:	return "System RAM";
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
+	case E820_UNUSABLE:	return "Unusable memory";
 	default:	return "reserved";
 	}
 }
@@ -1267,6 +1271,7 @@
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
+static struct resource __initdata *e820_res;
 void __init e820_reserve_resources(void)
 {
 	int i;
@@ -1274,6 +1279,7 @@
 	u64 end;
 
 	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+	e820_res = res;
 	for (i = 0; i < e820.nr_map; i++) {
 		end = e820.map[i].addr + e820.map[i].size - 1;
 #ifndef CONFIG_RESOURCES_64BIT
@@ -1287,7 +1293,14 @@
 		res->end = end;
 
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		insert_resource(&iomem_resource, res);
+
+		/*
+		 * don't register the region that could be conflicted with
+		 * pci device BAR resource and insert them later in
+		 * pcibios_resource_survey()
+		 */
+		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
+			insert_resource(&iomem_resource, res);
 		res++;
 	}
 
@@ -1299,6 +1312,19 @@
 	}
 }
 
+void __init e820_reserve_resources_late(void)
+{
+	int i;
+	struct resource *res;
+
+	res = e820_res;
+	for (i = 0; i < e820.nr_map; i++) {
+		if (!res->parent && res->end)
+			reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
+		res++;
+	}
+}
+
 char *__init default_machine_specific_memory_setup(void)
 {
 	char *who = "BIOS-e820";
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c
similarity index 77%
rename from arch/x86/mach-es7000/es7000plat.c
rename to arch/x86/kernel/es7000_32.c
index 50189af..849e5cd 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -39,10 +39,93 @@
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include <asm/apicdef.h>
-#include "es7000.h"
 #include <mach_mpparse.h>
 
 /*
+ * ES7000 chipsets
+ */
+
+#define NON_UNISYS		0
+#define ES7000_CLASSIC		1
+#define ES7000_ZORRO		2
+
+
+#define	MIP_REG			1
+#define	MIP_PSAI_REG		4
+
+#define	MIP_BUSY		1
+#define	MIP_SPIN		0xf0000
+#define	MIP_VALID		0x0100000000000000ULL
+#define	MIP_PORT(VALUE)	((VALUE >> 32) & 0xffff)
+
+#define	MIP_RD_LO(VALUE)	(VALUE & 0xffffffff)
+
+struct mip_reg_info {
+	unsigned long long mip_info;
+	unsigned long long delivery_info;
+	unsigned long long host_reg;
+	unsigned long long mip_reg;
+};
+
+struct part_info {
+	unsigned char type;
+	unsigned char length;
+	unsigned char part_id;
+	unsigned char apic_mode;
+	unsigned long snum;
+	char ptype[16];
+	char sname[64];
+	char pname[64];
+};
+
+struct psai {
+	unsigned long long entry_type;
+	unsigned long long addr;
+	unsigned long long bep_addr;
+};
+
+struct es7000_mem_info {
+	unsigned char type;
+	unsigned char length;
+	unsigned char resv[6];
+	unsigned long long  start;
+	unsigned long long  size;
+};
+
+struct es7000_oem_table {
+	unsigned long long hdr;
+	struct mip_reg_info mip;
+	struct part_info pif;
+	struct es7000_mem_info shm;
+	struct psai psai;
+};
+
+#ifdef CONFIG_ACPI
+
+struct oem_table {
+	struct acpi_table_header Header;
+	u32 OEMTableAddr;
+	u32 OEMTableSize;
+};
+
+extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
+#endif
+
+struct mip_reg {
+	unsigned long long off_0;
+	unsigned long long off_8;
+	unsigned long long off_10;
+	unsigned long long off_18;
+	unsigned long long off_20;
+	unsigned long long off_28;
+	unsigned long long off_30;
+	unsigned long long off_38;
+};
+
+#define	MIP_SW_APIC		0x1020b
+#define	MIP_FUNC(VALUE)		(VALUE & 0xff)
+
+/*
  * ES7000 Globals
  */
 
@@ -72,7 +155,7 @@
 			base += nr_ioapic_registers[i];
 	}
 
-	if (!ioapic && (gsi < 16)) 
+	if (!ioapic && (gsi < 16))
 		gsi += base;
 	return gsi;
 }
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index eaff0bb..6c9bfc9 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,87 +16,63 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/hardirq.h>
+#include <linux/dmar.h>
 
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
 
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-
-DEFINE_PER_CPU(int, x2apic_extra_bits);
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+extern struct genapic apic_x2xpic_uv_x;
+extern struct genapic apic_x2apic_phys;
+extern struct genapic apic_x2apic_cluster;
 
 struct genapic __read_mostly *genapic = &apic_flat;
 
-static enum uv_system_type uv_system_type;
+static struct genapic *apic_probe[] __initdata = {
+	&apic_x2apic_uv_x,
+	&apic_x2apic_phys,
+	&apic_x2apic_cluster,
+	&apic_physflat,
+	NULL,
+};
 
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
 void __init setup_apic_routing(void)
 {
-	if (uv_system_type == UV_NON_UNIQUE_APIC)
-		genapic = &apic_x2apic_uv_x;
-	else
-#ifdef CONFIG_ACPI
-	/*
-	 * Quirk: some x86_64 machines can only use physical APIC mode
-	 * regardless of how many processors are present (x86_64 ES7000
-	 * is an example).
-	 */
-	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
-			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
-		genapic = &apic_physflat;
-	else
-#endif
+	if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
+		if (!intr_remapping_enabled)
+			genapic = &apic_flat;
+	}
 
-	if (max_physical_apicid < 8)
-		genapic = &apic_flat;
-	else
-		genapic = &apic_physflat;
-
-	printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+	if (genapic == &apic_flat) {
+		if (max_physical_apicid >= 8)
+			genapic = &apic_physflat;
+		printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+	}
 }
 
 /* Same for both flat and physical. */
 
-void send_IPI_self(int vector)
+void apic_send_IPI_self(int vector)
 {
 	__send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
 }
 
 int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	if (!strcmp(oem_id, "SGI")) {
-		if (!strcmp(oem_table_id, "UVL"))
-			uv_system_type = UV_LEGACY_APIC;
-		else if (!strcmp(oem_table_id, "UVX"))
-			uv_system_type = UV_X2APIC;
-		else if (!strcmp(oem_table_id, "UVH"))
-			uv_system_type = UV_NON_UNIQUE_APIC;
+	int i;
+
+	for (i = 0; apic_probe[i]; ++i) {
+		if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
+			genapic = apic_probe[i];
+			printk(KERN_INFO "Setting APIC routing to %s.\n",
+				genapic->name);
+			return 1;
+		}
 	}
 	return 0;
 }
-
-unsigned int read_apic_id(void)
-{
-	unsigned int id;
-
-	WARN_ON(preemptible() && num_online_cpus() > 1);
-	id = apic_read(APIC_ID);
-	if (uv_system_type >= UV_X2APIC)
-		id  |= __get_cpu_var(x2apic_extra_bits);
-	return id;
-}
-
-enum uv_system_type get_uv_system_type(void)
-{
-	return uv_system_type;
-}
-
-int is_uv_system(void)
-{
-	return uv_system_type != UV_NONE;
-}
-EXPORT_SYMBOL_GPL(is_uv_system);
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 786548a..9eca5ba 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
 #include <linux/kernel.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
+#include <linux/hardirq.h>
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <mach_apicdef.h>
+
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+
+static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return 1;
+}
 
 static cpumask_t flat_target_cpus(void)
 {
@@ -95,9 +106,33 @@
 		__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
 }
 
+static unsigned int get_apic_id(unsigned long x)
+{
+	unsigned int id;
+
+	id = (((x)>>24) & 0xFFu);
+	return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+	unsigned long x;
+
+	x = ((id & 0xFFu)<<24);
+	return x;
+}
+
+static unsigned int read_xapic_id(void)
+{
+	unsigned int id;
+
+	id = get_apic_id(apic_read(APIC_ID));
+	return id;
+}
+
 static int flat_apic_id_registered(void)
 {
-	return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map);
+	return physid_isset(read_xapic_id(), phys_cpu_present_map);
 }
 
 static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@
 
 struct genapic apic_flat =  {
 	.name = "flat",
+	.acpi_madt_oem_check = flat_acpi_madt_oem_check,
 	.int_delivery_mode = dest_LowestPrio,
 	.int_dest_mode = (APIC_DEST_LOGICAL != 0),
 	.target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@
 	.send_IPI_all = flat_send_IPI_all,
 	.send_IPI_allbutself = flat_send_IPI_allbutself,
 	.send_IPI_mask = flat_send_IPI_mask,
+	.send_IPI_self = apic_send_IPI_self,
 	.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
 	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFu<<24),
 };
 
 /*
@@ -130,6 +170,21 @@
  * We cannot use logical delivery in this case because the mask
  * overflows, so use physical mode.
  */
+static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+#ifdef CONFIG_ACPI
+	/*
+	 * Quirk: some x86_64 machines can only use physical APIC mode
+	 * regardless of how many processors are present (x86_64 ES7000
+	 * is an example).
+	 */
+	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+		return 1;
+#endif
+
+	return 0;
+}
 
 static cpumask_t physflat_target_cpus(void)
 {
@@ -176,6 +231,7 @@
 
 struct genapic apic_physflat =  {
 	.name = "physical flat",
+	.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
 	.int_delivery_mode = dest_Fixed,
 	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
 	.target_cpus = physflat_target_cpus,
@@ -185,6 +241,10 @@
 	.send_IPI_all = physflat_send_IPI_all,
 	.send_IPI_allbutself = physflat_send_IPI_allbutself,
 	.send_IPI_mask = physflat_send_IPI_mask,
+	.send_IPI_self = apic_send_IPI_self,
 	.cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
 	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFu<<24),
 };
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 0000000..e4bf2cc
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+
+DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+
+static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (cpu_has_x2apic)
+		return 1;
+
+	return 0;
+}
+
+/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
+
+static cpumask_t x2apic_target_cpus(void)
+{
+	return cpumask_of_cpu(0);
+}
+
+/*
+ * for now each logical cpu is in its own vector allocation domain.
+ */
+static cpumask_t x2apic_vector_allocation_domain(int cpu)
+{
+	cpumask_t domain = CPU_MASK_NONE;
+	cpu_set(cpu, domain);
+	return domain;
+}
+
+static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
+				   unsigned int dest)
+{
+	unsigned long cfg;
+
+	cfg = __prepare_ICR(0, vector, dest);
+
+	/*
+	 * send the IPI.
+	 */
+	x2apic_icr_write(cfg, apicid);
+}
+
+/*
+ * for now, we send the IPI's one by one in the cpumask.
+ * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
+ * at once. We have 16 cpu's in a cluster. This will minimize IPI register
+ * writes.
+ */
+static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+{
+	unsigned long flags;
+	unsigned long query_cpu;
+
+	local_irq_save(flags);
+	for_each_cpu_mask(query_cpu, mask) {
+		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+				       vector, APIC_DEST_LOGICAL);
+	}
+	local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+	cpumask_t mask = cpu_online_map;
+
+	cpu_clear(smp_processor_id(), mask);
+
+	if (!cpus_empty(mask))
+		x2apic_send_IPI_mask(mask, vector);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+	x2apic_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int x2apic_apic_id_registered(void)
+{
+	return 1;
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+	int cpu;
+
+	/*
+	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
+	 * May as well be the first.
+	 */
+	cpu = first_cpu(cpumask);
+	if ((unsigned)cpu < NR_CPUS)
+		return per_cpu(x86_cpu_to_logical_apicid, cpu);
+	else
+		return BAD_APICID;
+}
+
+static unsigned int get_apic_id(unsigned long x)
+{
+	unsigned int id;
+
+	id = x;
+	return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+	unsigned long x;
+
+	x = id;
+	return x;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+	return current_cpu_data.initial_apicid >> index_msb;
+}
+
+static void x2apic_send_IPI_self(int vector)
+{
+	apic_write(APIC_SELF_IPI, vector);
+}
+
+static void init_x2apic_ldr(void)
+{
+	int cpu = smp_processor_id();
+
+	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+	return;
+}
+
+struct genapic apic_x2apic_cluster = {
+	.name = "cluster x2apic",
+	.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+	.int_delivery_mode = dest_LowestPrio,
+	.int_dest_mode = (APIC_DEST_LOGICAL != 0),
+	.target_cpus = x2apic_target_cpus,
+	.vector_allocation_domain = x2apic_vector_allocation_domain,
+	.apic_id_registered = x2apic_apic_id_registered,
+	.init_apic_ldr = init_x2apic_ldr,
+	.send_IPI_all = x2apic_send_IPI_all,
+	.send_IPI_allbutself = x2apic_send_IPI_allbutself,
+	.send_IPI_mask = x2apic_send_IPI_mask,
+	.send_IPI_self = x2apic_send_IPI_self,
+	.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFFFFFFFu),
+};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 0000000..8f1343d
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+
+static int x2apic_phys;
+
+static int set_x2apic_phys_mode(char *arg)
+{
+	x2apic_phys = 1;
+	return 0;
+}
+early_param("x2apic_phys", set_x2apic_phys_mode);
+
+static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (cpu_has_x2apic && x2apic_phys)
+		return 1;
+
+	return 0;
+}
+
+/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
+
+static cpumask_t x2apic_target_cpus(void)
+{
+	return cpumask_of_cpu(0);
+}
+
+static cpumask_t x2apic_vector_allocation_domain(int cpu)
+{
+	cpumask_t domain = CPU_MASK_NONE;
+	cpu_set(cpu, domain);
+	return domain;
+}
+
+static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
+				   unsigned int dest)
+{
+	unsigned long cfg;
+
+	cfg = __prepare_ICR(0, vector, dest);
+
+	/*
+	 * send the IPI.
+	 */
+	x2apic_icr_write(cfg, apicid);
+}
+
+static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+{
+	unsigned long flags;
+	unsigned long query_cpu;
+
+	local_irq_save(flags);
+	for_each_cpu_mask(query_cpu, mask) {
+		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
+				       vector, APIC_DEST_PHYSICAL);
+	}
+	local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+	cpumask_t mask = cpu_online_map;
+
+	cpu_clear(smp_processor_id(), mask);
+
+	if (!cpus_empty(mask))
+		x2apic_send_IPI_mask(mask, vector);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+	x2apic_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int x2apic_apic_id_registered(void)
+{
+	return 1;
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+	int cpu;
+
+	/*
+	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
+	 * May as well be the first.
+	 */
+	cpu = first_cpu(cpumask);
+	if ((unsigned)cpu < NR_CPUS)
+		return per_cpu(x86_cpu_to_apicid, cpu);
+	else
+		return BAD_APICID;
+}
+
+static unsigned int get_apic_id(unsigned long x)
+{
+	unsigned int id;
+
+	id = x;
+	return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+	unsigned long x;
+
+	x = id;
+	return x;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+	return current_cpu_data.initial_apicid >> index_msb;
+}
+
+void x2apic_send_IPI_self(int vector)
+{
+	apic_write(APIC_SELF_IPI, vector);
+}
+
+void init_x2apic_ldr(void)
+{
+	return;
+}
+
+struct genapic apic_x2apic_phys = {
+	.name = "physical x2apic",
+	.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+	.int_delivery_mode = dest_Fixed,
+	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+	.target_cpus = x2apic_target_cpus,
+	.vector_allocation_domain = x2apic_vector_allocation_domain,
+	.apic_id_registered = x2apic_apic_id_registered,
+	.init_apic_ldr = init_x2apic_ldr,
+	.send_IPI_all = x2apic_send_IPI_all,
+	.send_IPI_allbutself = x2apic_send_IPI_allbutself,
+	.send_IPI_mask = x2apic_send_IPI_mask,
+	.send_IPI_self = x2apic_send_IPI_self,
+	.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFFFFFFFu),
+};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index bfa837c..ae2ffc8 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -12,12 +12,12 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
+#include <linux/hardirq.h>
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
@@ -26,6 +26,36 @@
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/bios.h>
 
+DEFINE_PER_CPU(int, x2apic_extra_bits);
+
+static enum uv_system_type uv_system_type;
+
+static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (!strcmp(oem_id, "SGI")) {
+		if (!strcmp(oem_table_id, "UVL"))
+			uv_system_type = UV_LEGACY_APIC;
+		else if (!strcmp(oem_table_id, "UVX"))
+			uv_system_type = UV_X2APIC;
+		else if (!strcmp(oem_table_id, "UVH")) {
+			uv_system_type = UV_NON_UNIQUE_APIC;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+enum uv_system_type get_uv_system_type(void)
+{
+	return uv_system_type;
+}
+
+int is_uv_system(void)
+{
+	return uv_system_type != UV_NONE;
+}
+EXPORT_SYMBOL_GPL(is_uv_system);
+
 DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
 
@@ -123,6 +153,10 @@
 	return 1;
 }
 
+static void uv_init_apic_ldr(void)
+{
+}
+
 static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
 {
 	int cpu;
@@ -138,9 +172,34 @@
 		return BAD_APICID;
 }
 
+static unsigned int get_apic_id(unsigned long x)
+{
+	unsigned int id;
+
+	WARN_ON(preemptible() && num_online_cpus() > 1);
+	id = x | __get_cpu_var(x2apic_extra_bits);
+
+	return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+	unsigned long x;
+
+	/* maskout x2apic_extra_bits ? */
+	x = id;
+	return x;
+}
+
+static unsigned int uv_read_apic_id(void)
+{
+
+	return get_apic_id(apic_read(APIC_ID));
+}
+
 static unsigned int phys_pkg_id(int index_msb)
 {
-	return GET_APIC_ID(read_apic_id()) >> index_msb;
+	return uv_read_apic_id() >> index_msb;
 }
 
 #ifdef ZZZ		/* Needs x2apic patch */
@@ -152,17 +211,22 @@
 
 struct genapic apic_x2apic_uv_x = {
 	.name = "UV large system",
+	.acpi_madt_oem_check = uv_acpi_madt_oem_check,
 	.int_delivery_mode = dest_Fixed,
 	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
 	.target_cpus = uv_target_cpus,
 	.vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */
 	.apic_id_registered = uv_apic_id_registered,
+	.init_apic_ldr = uv_init_apic_ldr,
 	.send_IPI_all = uv_send_IPI_all,
 	.send_IPI_allbutself = uv_send_IPI_allbutself,
 	.send_IPI_mask = uv_send_IPI_mask,
 	/* ZZZ.send_IPI_self = uv_send_IPI_self, */
 	.cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
 	.phys_pkg_id = phys_pkg_id,	/* Fixme ZZZ */
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFFFFFFFu),
 };
 
 static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -401,3 +465,5 @@
 	if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
 		set_x2apic_extra_bits(uv_hub_info->pnode);
 }
+
+
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8..45723f1 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
 # include <asm/sigcontext32.h>
 # include <asm/user32.h>
 #else
-# define save_i387_ia32		save_i387
-# define restore_i387_ia32	restore_i387
+# define save_i387_xstate_ia32		save_i387_xstate
+# define restore_i387_xstate_ia32	restore_i387_xstate
 # define _fpstate_ia32		_fpstate
+# define _xstate_ia32		_xstate
+# define sig_xstate_ia32_size   sig_xstate_size
+# define fx_sw_reserved_ia32	fx_sw_reserved
 # define user_i387_ia32_struct	user_i387_struct
 # define user32_fxsr_struct	user_fxsr_struct
 #endif
@@ -36,6 +39,7 @@
 
 static unsigned int		mxcsr_feature_mask __read_mostly = 0xffffffffu;
 unsigned int xstate_size;
+unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
 static struct i387_fxsave_struct fx_scratch __cpuinitdata;
 
 void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@
 		return;
 	}
 
+	if (cpu_has_xsave) {
+		xsave_cntxt_init();
+		return;
+	}
+
 	if (cpu_has_fxsr)
 		xstate_size = sizeof(struct i387_fxsave_struct);
 #ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@
 
 	write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
 
+	/*
+	 * Boot processor to setup the FP and extended state context info.
+	 */
+	if (!smp_processor_id())
+		init_thread_xstate();
+	xsave_init();
+
 	mxcsr_feature_mask_init();
 	/* clean state in init */
-	current_thread_info()->status = 0;
+	if (cpu_has_xsave)
+		current_thread_info()->status = TS_XSAVE;
+	else
+		current_thread_info()->status = 0;
 	clear_used_math();
 }
 #endif	/* CONFIG_X86_64 */
@@ -195,6 +214,13 @@
 	 */
 	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
 
+	/*
+	 * update the header bits in the xsave header, indicating the
+	 * presence of FP and SSE state.
+	 */
+	if (cpu_has_xsave)
+		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+
 	return ret;
 }
 
@@ -395,6 +421,12 @@
 	if (!ret)
 		convert_to_fxsr(target, &env);
 
+	/*
+	 * update the header bit in the xsave header, indicating the
+	 * presence of FP.
+	 */
+	if (cpu_has_xsave)
+		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
 	return ret;
 }
 
@@ -407,7 +439,6 @@
 	struct task_struct *tsk = current;
 	struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
 
-	unlazy_fpu(tsk);
 	fp->status = fp->swd;
 	if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
 		return -1;
@@ -421,8 +452,6 @@
 	struct user_i387_ia32_struct env;
 	int err = 0;
 
-	unlazy_fpu(tsk);
-
 	convert_from_fxsr(&env, tsk);
 	if (__copy_to_user(buf, &env, sizeof(env)))
 		return -1;
@@ -432,16 +461,40 @@
 	if (err)
 		return -1;
 
-	if (__copy_to_user(&buf->_fxsr_env[0], fx,
-			   sizeof(struct i387_fxsave_struct)))
+	if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
 		return -1;
 	return 1;
 }
 
-int save_i387_ia32(struct _fpstate_ia32 __user *buf)
+static int save_i387_xsave(void __user *buf)
 {
+	struct _fpstate_ia32 __user *fx = buf;
+	int err = 0;
+
+	if (save_i387_fxsave(fx) < 0)
+		return -1;
+
+	err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
+			     sizeof(struct _fpx_sw_bytes));
+	err |= __put_user(FP_XSTATE_MAGIC2,
+			  (__u32 __user *) (buf + sig_xstate_ia32_size
+					    - FP_XSTATE_MAGIC2_SIZE));
+	if (err)
+		return -1;
+
+	return 1;
+}
+
+int save_i387_xstate_ia32(void __user *buf)
+{
+	struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
+	struct task_struct *tsk = current;
+
 	if (!used_math())
 		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
+		return -EACCES;
 	/*
 	 * This will cause a "finit" to be triggered by the next
 	 * attempted FPU operation by the 'current' process.
@@ -451,13 +504,17 @@
 	if (!HAVE_HWFP) {
 		return fpregs_soft_get(current, NULL,
 				       0, sizeof(struct user_i387_ia32_struct),
-				       NULL, buf) ? -1 : 1;
+				       NULL, fp) ? -1 : 1;
 	}
 
+	unlazy_fpu(tsk);
+
+	if (cpu_has_xsave)
+		return save_i387_xsave(fp);
 	if (cpu_has_fxsr)
-		return save_i387_fxsave(buf);
+		return save_i387_fxsave(fp);
 	else
-		return save_i387_fsave(buf);
+		return save_i387_fsave(fp);
 }
 
 static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +525,15 @@
 				sizeof(struct i387_fsave_struct));
 }
 
-static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
+static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
+			       unsigned int size)
 {
 	struct task_struct *tsk = current;
 	struct user_i387_ia32_struct env;
 	int err;
 
 	err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
-			       sizeof(struct i387_fxsave_struct));
+			       size);
 	/* mxcsr reserved bits must be masked to zero for security reasons */
 	tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
 	if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +543,69 @@
 	return 0;
 }
 
-int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
+static int restore_i387_xsave(void __user *buf)
+{
+	struct _fpx_sw_bytes fx_sw_user;
+	struct _fpstate_ia32 __user *fx_user =
+			((struct _fpstate_ia32 __user *) buf);
+	struct i387_fxsave_struct __user *fx =
+		(struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
+	struct xsave_hdr_struct *xsave_hdr =
+				&current->thread.xstate->xsave.xsave_hdr;
+	u64 mask;
+	int err;
+
+	if (check_for_xstate(fx, buf, &fx_sw_user))
+		goto fx_only;
+
+	mask = fx_sw_user.xstate_bv;
+
+	err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
+
+	xsave_hdr->xstate_bv &= pcntxt_mask;
+	/*
+	 * These bits must be zero.
+	 */
+	xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+
+	/*
+	 * Init the state that is not present in the memory layout
+	 * and enabled by the OS.
+	 */
+	mask = ~(pcntxt_mask & ~mask);
+	xsave_hdr->xstate_bv &= mask;
+
+	return err;
+fx_only:
+	/*
+	 * Couldn't find the extended state information in the memory
+	 * layout. Restore the FP/SSE and init the other extended state
+	 * enabled by the OS.
+	 */
+	xsave_hdr->xstate_bv = XSTATE_FPSSE;
+	return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
+}
+
+int restore_i387_xstate_ia32(void __user *buf)
 {
 	int err;
 	struct task_struct *tsk = current;
+	struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
 
 	if (HAVE_HWFP)
 		clear_fpu(tsk);
 
+	if (!buf) {
+		if (used_math()) {
+			clear_fpu(tsk);
+			clear_used_math();
+		}
+
+		return 0;
+	} else
+		if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
+			return -EACCES;
+
 	if (!used_math()) {
 		err = init_fpu(tsk);
 		if (err)
@@ -500,14 +613,17 @@
 	}
 
 	if (HAVE_HWFP) {
-		if (cpu_has_fxsr)
-			err = restore_i387_fxsave(buf);
+		if (cpu_has_xsave)
+			err = restore_i387_xsave(buf);
+		else if (cpu_has_fxsr)
+			err = restore_i387_fxsave(fp, sizeof(struct
+							   i387_fxsave_struct));
 		else
-			err = restore_i387_fsave(buf);
+			err = restore_i387_fsave(fp);
 	} else {
 		err = fpregs_soft_set(current, NULL,
 				      0, sizeof(struct user_i387_ia32_struct),
-				      NULL, buf) != 0;
+				      NULL, fp) != 0;
 	}
 	set_used_math();
 
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index dc92b49..4b8a53d 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -282,6 +282,30 @@
 
 device_initcall(i8259A_init_sysfs);
 
+void mask_8259A(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
+	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void unmask_8259A(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
 void init_8259A(int auto_eoi)
 {
 	unsigned long flags;
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 09cddb5..e710289 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -46,10 +46,13 @@
 #include <asm/nmi.h>
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
+#include <asm/setup.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
 
+#define __apicdebuginit(type) static type __init
+
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 atomic_t irq_mis_count;
 
@@ -1341,7 +1344,8 @@
 	ioapic_write_entry(apic, pin, entry);
 }
 
-void __init print_IO_APIC(void)
+
+__apicdebuginit(void) print_IO_APIC(void)
 {
 	int apic, i;
 	union IO_APIC_reg_00 reg_00;
@@ -1456,9 +1460,7 @@
 	return;
 }
 
-#if 0
-
-static void print_APIC_bitfield(int base)
+__apicdebuginit(void) print_APIC_bitfield(int base)
 {
 	unsigned int v;
 	int i, j;
@@ -1479,9 +1481,10 @@
 	}
 }
 
-void /*__init*/ print_local_APIC(void *dummy)
+__apicdebuginit(void) print_local_APIC(void *dummy)
 {
 	unsigned int v, ver, maxlvt;
+	u64 icr;
 
 	if (apic_verbosity == APIC_QUIET)
 		return;
@@ -1490,7 +1493,7 @@
 		smp_processor_id(), hard_smp_processor_id());
 	v = apic_read(APIC_ID);
 	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v,
-			GET_APIC_ID(read_apic_id()));
+			GET_APIC_ID(v));
 	v = apic_read(APIC_LVR);
 	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
 	ver = GET_APIC_VERSION(v);
@@ -1532,10 +1535,9 @@
 		printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
 	}
 
-	v = apic_read(APIC_ICR);
-	printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
-	v = apic_read(APIC_ICR2);
-	printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+	icr = apic_icr_read();
+	printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
+	printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
 
 	v = apic_read(APIC_LVTT);
 	printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1563,12 +1565,12 @@
 	printk("\n");
 }
 
-void print_all_local_APICs(void)
+__apicdebuginit(void) print_all_local_APICs(void)
 {
 	on_each_cpu(print_local_APIC, NULL, 1);
 }
 
-void /*__init*/ print_PIC(void)
+__apicdebuginit(void) print_PIC(void)
 {
 	unsigned int v;
 	unsigned long flags;
@@ -1600,7 +1602,17 @@
 	printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
 }
 
-#endif  /*  0  */
+__apicdebuginit(int) print_all_ICs(void)
+{
+	print_PIC();
+	print_all_local_APICs();
+	print_IO_APIC();
+
+	return 0;
+}
+
+fs_initcall(print_all_ICs);
+
 
 static void __init enable_IO_APIC(void)
 {
@@ -1698,8 +1710,7 @@
 		entry.dest_mode       = 0; /* Physical */
 		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
 		entry.vector          = 0;
-		entry.dest.physical.physical_dest =
-					GET_APIC_ID(read_apic_id());
+		entry.dest.physical.physical_dest = read_apic_id();
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1725,10 +1736,8 @@
 	unsigned char old_id;
 	unsigned long flags;
 
-#ifdef CONFIG_X86_NUMAQ
-	if (found_numaq)
+	if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
 		return;
-#endif
 
 	/*
 	 * Don't check I/O APIC IDs for xAPIC systems.  They have
@@ -2329,8 +2338,6 @@
 	setup_IO_APIC_irqs();
 	init_IO_APIC_traps();
 	check_timer();
-	if (!acpi_ioapic)
-		print_IO_APIC();
 }
 
 /*
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 61a83b7..a1bec29 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -37,6 +37,7 @@
 #include <acpi/acpi_bus.h>
 #endif
 #include <linux/bootmem.h>
+#include <linux/dmar.h>
 
 #include <asm/idle.h>
 #include <asm/io.h>
@@ -49,10 +50,13 @@
 #include <asm/nmi.h>
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
+#include <asm/irq_remapping.h>
 
 #include <mach_ipi.h>
 #include <mach_apic.h>
 
+#define __apicdebuginit(type) static type __init
+
 struct irq_cfg {
 	cpumask_t domain;
 	cpumask_t old_domain;
@@ -87,8 +91,6 @@
 
 char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
 
-#define __apicdebuginit  __init
-
 int sis_apic_bug; /* not actually supported, dummy for compile */
 
 static int no_timer_check;
@@ -108,6 +110,9 @@
  */
 int nr_ioapic_registers[MAX_IO_APICS];
 
+/* I/O APIC RTE contents at the OS boot up */
+struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
+
 /* I/O APIC entries */
 struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
@@ -303,7 +308,12 @@
 		pin = entry->pin;
 		if (pin == -1)
 			break;
-		io_apic_write(apic, 0x11 + pin*2, dest);
+		/*
+		 * With interrupt-remapping, destination information comes
+		 * from interrupt-remapping table entry.
+		 */
+		if (!irq_remapped(irq))
+			io_apic_write(apic, 0x11 + pin*2, dest);
 		reg = io_apic_read(apic, 0x10 + pin*2);
 		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
 		reg |= vector;
@@ -440,6 +450,69 @@
 			clear_IO_APIC_pin(apic, pin);
 }
 
+/*
+ * Saves and masks all the unmasked IO-APIC RTE's
+ */
+int save_mask_IO_APIC_setup(void)
+{
+	union IO_APIC_reg_01 reg_01;
+	unsigned long flags;
+	int apic, pin;
+
+	/*
+	 * The number of IO-APIC IRQ registers (== #pins):
+	 */
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		spin_lock_irqsave(&ioapic_lock, flags);
+		reg_01.raw = io_apic_read(apic, 1);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+	}
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		early_ioapic_entries[apic] =
+			kzalloc(sizeof(struct IO_APIC_route_entry) *
+				nr_ioapic_registers[apic], GFP_KERNEL);
+		if (!early_ioapic_entries[apic])
+			return -ENOMEM;
+	}
+
+	for (apic = 0; apic < nr_ioapics; apic++)
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+			struct IO_APIC_route_entry entry;
+
+			entry = early_ioapic_entries[apic][pin] =
+				ioapic_read_entry(apic, pin);
+			if (!entry.mask) {
+				entry.mask = 1;
+				ioapic_write_entry(apic, pin, entry);
+			}
+		}
+	return 0;
+}
+
+void restore_IO_APIC_setup(void)
+{
+	int apic, pin;
+
+	for (apic = 0; apic < nr_ioapics; apic++)
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+			ioapic_write_entry(apic, pin,
+					   early_ioapic_entries[apic][pin]);
+}
+
+void reinit_intr_remapped_IO_APIC(int intr_remapping)
+{
+	/*
+	 * for now plain restore of previous settings.
+	 * TBD: In the case of OS enabling interrupt-remapping,
+	 * IO-APIC RTE's need to be setup to point to interrupt-remapping
+	 * table entries. for now, do a plain restore, and wait for
+	 * the setup_IO_APIC_irqs() to do proper initialization.
+	 */
+	restore_IO_APIC_setup();
+}
+
 int skip_ioapic_setup;
 int ioapic_force;
 
@@ -839,18 +912,98 @@
 }
 
 static struct irq_chip ioapic_chip;
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip ir_ioapic_chip;
+#endif
 
 static void ioapic_register_intr(int irq, unsigned long trigger)
 {
-	if (trigger) {
+	if (trigger)
 		irq_desc[irq].status |= IRQ_LEVEL;
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					      handle_fasteoi_irq, "fasteoi");
-	} else {
+	else
 		irq_desc[irq].status &= ~IRQ_LEVEL;
+
+#ifdef CONFIG_INTR_REMAP
+	if (irq_remapped(irq)) {
+		irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
+		if (trigger)
+			set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
+						      handle_fasteoi_irq,
+						     "fasteoi");
+		else
+			set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
+						      handle_edge_irq, "edge");
+		return;
+	}
+#endif
+	if (trigger)
+		set_irq_chip_and_handler_name(irq, &ioapic_chip,
+					      handle_fasteoi_irq,
+					      "fasteoi");
+	else
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					      handle_edge_irq, "edge");
+}
+
+static int setup_ioapic_entry(int apic, int irq,
+			      struct IO_APIC_route_entry *entry,
+			      unsigned int destination, int trigger,
+			      int polarity, int vector)
+{
+	/*
+	 * add it to the IO-APIC irq-routing table:
+	 */
+	memset(entry,0,sizeof(*entry));
+
+#ifdef CONFIG_INTR_REMAP
+	if (intr_remapping_enabled) {
+		struct intel_iommu *iommu = map_ioapic_to_ir(apic);
+		struct irte irte;
+		struct IR_IO_APIC_route_entry *ir_entry =
+			(struct IR_IO_APIC_route_entry *) entry;
+		int index;
+
+		if (!iommu)
+			panic("No mapping iommu for ioapic %d\n", apic);
+
+		index = alloc_irte(iommu, irq, 1);
+		if (index < 0)
+			panic("Failed to allocate IRTE for ioapic %d\n", apic);
+
+		memset(&irte, 0, sizeof(irte));
+
+		irte.present = 1;
+		irte.dst_mode = INT_DEST_MODE;
+		irte.trigger_mode = trigger;
+		irte.dlvry_mode = INT_DELIVERY_MODE;
+		irte.vector = vector;
+		irte.dest_id = IRTE_DEST(destination);
+
+		modify_irte(irq, &irte);
+
+		ir_entry->index2 = (index >> 15) & 0x1;
+		ir_entry->zero = 0;
+		ir_entry->format = 1;
+		ir_entry->index = (index & 0x7fff);
+	} else
+#endif
+	{
+		entry->delivery_mode = INT_DELIVERY_MODE;
+		entry->dest_mode = INT_DEST_MODE;
+		entry->dest = destination;
 	}
+
+	entry->mask = 0;				/* enable IRQ */
+	entry->trigger = trigger;
+	entry->polarity = polarity;
+	entry->vector = vector;
+
+	/* Mask level triggered irqs.
+	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+	 */
+	if (trigger)
+		entry->mask = 1;
+	return 0;
 }
 
 static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
@@ -875,24 +1028,15 @@
 		    apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
 		    irq, trigger, polarity);
 
-	/*
-	 * add it to the IO-APIC irq-routing table:
-	 */
-	memset(&entry,0,sizeof(entry));
 
-	entry.delivery_mode = INT_DELIVERY_MODE;
-	entry.dest_mode = INT_DEST_MODE;
-	entry.dest = cpu_mask_to_apicid(mask);
-	entry.mask = 0;				/* enable IRQ */
-	entry.trigger = trigger;
-	entry.polarity = polarity;
-	entry.vector = cfg->vector;
-
-	/* Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (trigger)
-		entry.mask = 1;
+	if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
+			       cpu_mask_to_apicid(mask), trigger, polarity,
+			       cfg->vector)) {
+		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
+		       mp_ioapics[apic].mp_apicid, pin);
+		__clear_irq_vector(irq);
+		return;
+	}
 
 	ioapic_register_intr(irq, trigger);
 	if (irq < 16)
@@ -944,6 +1088,9 @@
 {
 	struct IO_APIC_route_entry entry;
 
+	if (intr_remapping_enabled)
+		return;
+
 	memset(&entry, 0, sizeof(entry));
 
 	/*
@@ -970,7 +1117,8 @@
 	ioapic_write_entry(apic, pin, entry);
 }
 
-void __apicdebuginit print_IO_APIC(void)
+
+__apicdebuginit(void) print_IO_APIC(void)
 {
 	int apic, i;
 	union IO_APIC_reg_00 reg_00;
@@ -1064,9 +1212,7 @@
 	return;
 }
 
-#if 0
-
-static __apicdebuginit void print_APIC_bitfield (int base)
+__apicdebuginit(void) print_APIC_bitfield(int base)
 {
 	unsigned int v;
 	int i, j;
@@ -1087,9 +1233,10 @@
 	}
 }
 
-void __apicdebuginit print_local_APIC(void * dummy)
+__apicdebuginit(void) print_local_APIC(void *dummy)
 {
 	unsigned int v, ver, maxlvt;
+	unsigned long icr;
 
 	if (apic_verbosity == APIC_QUIET)
 		return;
@@ -1097,7 +1244,7 @@
 	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
 		smp_processor_id(), hard_smp_processor_id());
 	v = apic_read(APIC_ID);
-	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
+	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, read_apic_id());
 	v = apic_read(APIC_LVR);
 	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
 	ver = GET_APIC_VERSION(v);
@@ -1133,10 +1280,9 @@
 	v = apic_read(APIC_ESR);
 	printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
 
-	v = apic_read(APIC_ICR);
-	printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
-	v = apic_read(APIC_ICR2);
-	printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+	icr = apic_icr_read();
+	printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
+	printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
 
 	v = apic_read(APIC_LVTT);
 	printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1164,12 +1310,12 @@
 	printk("\n");
 }
 
-void print_all_local_APICs (void)
+__apicdebuginit(void) print_all_local_APICs(void)
 {
 	on_each_cpu(print_local_APIC, NULL, 1);
 }
 
-void __apicdebuginit print_PIC(void)
+__apicdebuginit(void) print_PIC(void)
 {
 	unsigned int v;
 	unsigned long flags;
@@ -1201,7 +1347,17 @@
 	printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
 }
 
-#endif  /*  0  */
+__apicdebuginit(int) print_all_ICs(void)
+{
+	print_PIC();
+	print_all_local_APICs();
+	print_IO_APIC();
+
+	return 0;
+}
+
+fs_initcall(print_all_ICs);
+
 
 void __init enable_IO_APIC(void)
 {
@@ -1291,7 +1447,7 @@
 		entry.dest_mode       = 0; /* Physical */
 		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
 		entry.vector          = 0;
-		entry.dest          = GET_APIC_ID(read_apic_id());
+		entry.dest            = read_apic_id();
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1397,6 +1553,147 @@
  */
 
 #ifdef CONFIG_SMP
+
+#ifdef CONFIG_INTR_REMAP
+static void ir_irq_migration(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For edge triggered, irq migration is a simple atomic update(of vector
+ * and cpu destination) of IRTE and flush the hardware cache.
+ *
+ * For level triggered, we need to modify the io-apic RTE aswell with the update
+ * vector information, along with modifying IRTE with vector and destination.
+ * So irq migration for level triggered is little  bit more complex compared to
+ * edge triggered migration. But the good news is, we use the same algorithm
+ * for level triggered migration as we have today, only difference being,
+ * we now initiate the irq migration from process context instead of the
+ * interrupt context.
+ *
+ * In future, when we do a directed EOI (combined with cpu EOI broadcast
+ * suppression) to the IO-APIC, level triggered irq migration will also be
+ * as simple as edge triggered migration and we can do the irq migration
+ * with a simple atomic update to IO-APIC RTE.
+ */
+static void migrate_ioapic_irq(int irq, cpumask_t mask)
+{
+	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_desc *desc = irq_desc + irq;
+	cpumask_t tmp, cleanup_mask;
+	struct irte irte;
+	int modify_ioapic_rte = desc->status & IRQ_LEVEL;
+	unsigned int dest;
+	unsigned long flags;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		return;
+
+	if (get_irte(irq, &irte))
+		return;
+
+	if (assign_irq_vector(irq, mask))
+		return;
+
+	cpus_and(tmp, cfg->domain, mask);
+	dest = cpu_mask_to_apicid(tmp);
+
+	if (modify_ioapic_rte) {
+		spin_lock_irqsave(&ioapic_lock, flags);
+		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+	}
+
+	irte.vector = cfg->vector;
+	irte.dest_id = IRTE_DEST(dest);
+
+	/*
+	 * Modified the IRTE and flushes the Interrupt entry cache.
+	 */
+	modify_irte(irq, &irte);
+
+	if (cfg->move_in_progress) {
+		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+		cfg->move_in_progress = 0;
+	}
+
+	irq_desc[irq].affinity = mask;
+}
+
+static int migrate_irq_remapped_level(int irq)
+{
+	int ret = -1;
+
+	mask_IO_APIC_irq(irq);
+
+	if (io_apic_level_ack_pending(irq)) {
+		/*
+	 	 * Interrupt in progress. Migrating irq now will change the
+		 * vector information in the IO-APIC RTE and that will confuse
+		 * the EOI broadcast performed by cpu.
+		 * So, delay the irq migration to the next instance.
+		 */
+		schedule_delayed_work(&ir_migration_work, 1);
+		goto unmask;
+	}
+
+	/* everthing is clear. we have right of way */
+	migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
+
+	ret = 0;
+	irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
+	cpus_clear(irq_desc[irq].pending_mask);
+
+unmask:
+	unmask_IO_APIC_irq(irq);
+	return ret;
+}
+
+static void ir_irq_migration(struct work_struct *work)
+{
+	int irq;
+
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		struct irq_desc *desc = irq_desc + irq;
+		if (desc->status & IRQ_MOVE_PENDING) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&desc->lock, flags);
+			if (!desc->chip->set_affinity ||
+			    !(desc->status & IRQ_MOVE_PENDING)) {
+				desc->status &= ~IRQ_MOVE_PENDING;
+				spin_unlock_irqrestore(&desc->lock, flags);
+				continue;
+			}
+
+			desc->chip->set_affinity(irq,
+					         irq_desc[irq].pending_mask);
+			spin_unlock_irqrestore(&desc->lock, flags);
+		}
+	}
+}
+
+/*
+ * Migrates the IRQ destination in the process context.
+ */
+static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	if (irq_desc[irq].status & IRQ_LEVEL) {
+		irq_desc[irq].status |= IRQ_MOVE_PENDING;
+		irq_desc[irq].pending_mask = mask;
+		migrate_irq_remapped_level(irq);
+		return;
+	}
+
+	migrate_ioapic_irq(irq, mask);
+}
+#endif
+
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
@@ -1453,6 +1750,17 @@
 #else
 static inline void irq_complete_move(unsigned int irq) {}
 #endif
+#ifdef CONFIG_INTR_REMAP
+static void ack_x2apic_level(unsigned int irq)
+{
+	ack_x2APIC_irq();
+}
+
+static void ack_x2apic_edge(unsigned int irq)
+{
+	ack_x2APIC_irq();
+}
+#endif
 
 static void ack_apic_edge(unsigned int irq)
 {
@@ -1527,6 +1835,21 @@
 	.retrigger	= ioapic_retrigger_irq,
 };
 
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip ir_ioapic_chip __read_mostly = {
+	.name 		= "IR-IO-APIC",
+	.startup 	= startup_ioapic_irq,
+	.mask	 	= mask_IO_APIC_irq,
+	.unmask	 	= unmask_IO_APIC_irq,
+	.ack 		= ack_x2apic_edge,
+	.eoi 		= ack_x2apic_level,
+#ifdef CONFIG_SMP
+	.set_affinity 	= set_ir_ioapic_affinity_irq,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+#endif
+
 static inline void init_IO_APIC_traps(void)
 {
 	int irq;
@@ -1712,6 +2035,8 @@
 	 * 8259A.
 	 */
 	if (pin1 == -1) {
+		if (intr_remapping_enabled)
+			panic("BIOS bug: timer not connected to IO-APIC");
 		pin1 = pin2;
 		apic1 = apic2;
 		no_pin1 = 1;
@@ -1738,6 +2063,8 @@
 				clear_IO_APIC_pin(0, pin1);
 			goto out;
 		}
+		if (intr_remapping_enabled)
+			panic("timer doesn't work through Interrupt-remapped IO-APIC");
 		clear_IO_APIC_pin(apic1, pin1);
 		if (!no_pin1)
 			apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -1854,8 +2181,6 @@
 	setup_IO_APIC_irqs();
 	init_IO_APIC_traps();
 	check_timer();
-	if (!acpi_ioapic)
-		print_IO_APIC();
 }
 
 struct sysfs_ioapic_data {
@@ -1977,6 +2302,9 @@
 
 	dynamic_irq_cleanup(irq);
 
+#ifdef CONFIG_INTR_REMAP
+	free_irte(irq);
+#endif
 	spin_lock_irqsave(&vector_lock, flags);
 	__clear_irq_vector(irq);
 	spin_unlock_irqrestore(&vector_lock, flags);
@@ -1995,10 +2323,41 @@
 
 	tmp = TARGET_CPUS;
 	err = assign_irq_vector(irq, tmp);
-	if (!err) {
-		cpus_and(tmp, cfg->domain, tmp);
-		dest = cpu_mask_to_apicid(tmp);
+	if (err)
+		return err;
 
+	cpus_and(tmp, cfg->domain, tmp);
+	dest = cpu_mask_to_apicid(tmp);
+
+#ifdef CONFIG_INTR_REMAP
+	if (irq_remapped(irq)) {
+		struct irte irte;
+		int ir_index;
+		u16 sub_handle;
+
+		ir_index = map_irq_to_irte_handle(irq, &sub_handle);
+		BUG_ON(ir_index == -1);
+
+		memset (&irte, 0, sizeof(irte));
+
+		irte.present = 1;
+		irte.dst_mode = INT_DEST_MODE;
+		irte.trigger_mode = 0; /* edge */
+		irte.dlvry_mode = INT_DELIVERY_MODE;
+		irte.vector = cfg->vector;
+		irte.dest_id = IRTE_DEST(dest);
+
+		modify_irte(irq, &irte);
+
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->data = sub_handle;
+		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+				  MSI_ADDR_IR_SHV |
+				  MSI_ADDR_IR_INDEX1(ir_index) |
+				  MSI_ADDR_IR_INDEX2(ir_index);
+	} else
+#endif
+	{
 		msg->address_hi = MSI_ADDR_BASE_HI;
 		msg->address_lo =
 			MSI_ADDR_BASE_LO |
@@ -2049,6 +2408,55 @@
 	write_msi_msg(irq, &msg);
 	irq_desc[irq].affinity = mask;
 }
+
+#ifdef CONFIG_INTR_REMAP
+/*
+ * Migrate the MSI irq to another cpumask. This migration is
+ * done in the process context using interrupt-remapping hardware.
+ */
+static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_cfg *cfg = irq_cfg + irq;
+	unsigned int dest;
+	cpumask_t tmp, cleanup_mask;
+	struct irte irte;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		return;
+
+	if (get_irte(irq, &irte))
+		return;
+
+	if (assign_irq_vector(irq, mask))
+		return;
+
+	cpus_and(tmp, cfg->domain, mask);
+	dest = cpu_mask_to_apicid(tmp);
+
+	irte.vector = cfg->vector;
+	irte.dest_id = IRTE_DEST(dest);
+
+	/*
+	 * atomically update the IRTE with the new destination and vector.
+	 */
+	modify_irte(irq, &irte);
+
+	/*
+	 * After this point, all the interrupts will start arriving
+	 * at the new destination. So, time to cleanup the previous
+	 * vector allocation.
+	 */
+	if (cfg->move_in_progress) {
+		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+		cfg->move_in_progress = 0;
+	}
+
+	irq_desc[irq].affinity = mask;
+}
+#endif
 #endif /* CONFIG_SMP */
 
 /*
@@ -2066,28 +2474,159 @@
 	.retrigger	= ioapic_retrigger_irq,
 };
 
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip msi_ir_chip = {
+	.name		= "IR-PCI-MSI",
+	.unmask		= unmask_msi_irq,
+	.mask		= mask_msi_irq,
+	.ack		= ack_x2apic_edge,
+#ifdef CONFIG_SMP
+	.set_affinity	= ir_set_msi_irq_affinity,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+
+/*
+ * Map the PCI dev to the corresponding remapping hardware unit
+ * and allocate 'nvec' consecutive interrupt-remapping table entries
+ * in it.
+ */
+static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 {
+	struct intel_iommu *iommu;
+	int index;
+
+	iommu = map_dev_to_ir(dev);
+	if (!iommu) {
+		printk(KERN_ERR
+		       "Unable to map PCI %s to iommu\n", pci_name(dev));
+		return -ENOENT;
+	}
+
+	index = alloc_irte(iommu, irq, nvec);
+	if (index < 0) {
+		printk(KERN_ERR
+		       "Unable to allocate %d IRTE for PCI %s\n", nvec,
+		        pci_name(dev));
+		return -ENOSPC;
+	}
+	return index;
+}
+#endif
+
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+{
+	int ret;
 	struct msi_msg msg;
-	int irq, ret;
-	irq = create_irq();
-	if (irq < 0)
-		return irq;
 
 	ret = msi_compose_msg(dev, irq, &msg);
-	if (ret < 0) {
-		destroy_irq(irq);
+	if (ret < 0)
 		return ret;
-	}
 
 	set_irq_msi(irq, desc);
 	write_msi_msg(irq, &msg);
 
-	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+#ifdef CONFIG_INTR_REMAP
+	if (irq_remapped(irq)) {
+		struct irq_desc *desc = irq_desc + irq;
+		/*
+		 * irq migration in process context
+		 */
+		desc->status |= IRQ_MOVE_PCNTXT;
+		set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
+	} else
+#endif
+		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
 	return 0;
 }
 
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+	int irq, ret;
+
+	irq = create_irq();
+	if (irq < 0)
+		return irq;
+
+#ifdef CONFIG_INTR_REMAP
+	if (!intr_remapping_enabled)
+		goto no_ir;
+
+	ret = msi_alloc_irte(dev, irq, 1);
+	if (ret < 0)
+		goto error;
+no_ir:
+#endif
+	ret = setup_msi_irq(dev, desc, irq);
+	if (ret < 0) {
+		destroy_irq(irq);
+		return ret;
+	}
+	return 0;
+
+#ifdef CONFIG_INTR_REMAP
+error:
+	destroy_irq(irq);
+	return ret;
+#endif
+}
+
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	int irq, ret, sub_handle;
+	struct msi_desc *desc;
+#ifdef CONFIG_INTR_REMAP
+	struct intel_iommu *iommu = 0;
+	int index = 0;
+#endif
+
+	sub_handle = 0;
+	list_for_each_entry(desc, &dev->msi_list, list) {
+		irq = create_irq();
+		if (irq < 0)
+			return irq;
+#ifdef CONFIG_INTR_REMAP
+		if (!intr_remapping_enabled)
+			goto no_ir;
+
+		if (!sub_handle) {
+			/*
+			 * allocate the consecutive block of IRTE's
+			 * for 'nvec'
+			 */
+			index = msi_alloc_irte(dev, irq, nvec);
+			if (index < 0) {
+				ret = index;
+				goto error;
+			}
+		} else {
+			iommu = map_dev_to_ir(dev);
+			if (!iommu) {
+				ret = -ENOENT;
+				goto error;
+			}
+			/*
+			 * setup the mapping between the irq and the IRTE
+			 * base index, the sub_handle pointing to the
+			 * appropriate interrupt remap table entry.
+			 */
+			set_irte_irq(irq, iommu, index, sub_handle);
+		}
+no_ir:
+#endif
+		ret = setup_msi_irq(dev, desc, irq);
+		if (ret < 0)
+			goto error;
+		sub_handle++;
+	}
+	return 0;
+
+error:
+	destroy_irq(irq);
+	return ret;
+}
+
 void arch_teardown_msi_irq(unsigned int irq)
 {
 	destroy_irq(irq);
@@ -2333,6 +2872,10 @@
 				setup_IO_APIC_irq(ioapic, pin, irq,
 						  irq_trigger(irq_entry),
 						  irq_polarity(irq_entry));
+#ifdef CONFIG_INTR_REMAP
+			else if (intr_remapping_enabled)
+				set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
+#endif
 			else
 				set_ioapic_affinity_irq(irq, TARGET_CPUS);
 		}
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d669142..9200a1e 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -74,6 +74,15 @@
 	}
 }
 
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+static struct irqaction irq2 = {
+	.handler = no_action,
+	.mask = CPU_MASK_NONE,
+	.name = "cascade",
+};
+
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
@@ -98,6 +107,46 @@
 			set_intr_gate(vector, interrupt[i]);
 	}
 
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+	/*
+	 * IRQ0 must be given a fixed assignment and initialized,
+	 * because it's used before the IO-APIC is set up.
+	 */
+	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPI for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+	/* IPI for generic function call */
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	/* IPI for single call function */
+	set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/* self generated IPI for local APIC timer */
+	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+	/* IPI vectors for APIC spurious and error interrupts */
+	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+#endif
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+	/* thermal monitor LVT interrupt */
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+
+	if (!acpi_ioapic)
+		setup_irq(2, &irq2);
+
 	/* setup after call gates are initialised (usually add in
 	 * the architecture specific gates)
 	 */
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b3fb430..f98f4e1 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -397,7 +397,9 @@
        generic_bigsmp_probe();
 #endif
 
+#ifdef CONFIG_X86_32
 	setup_apic_routing();
+#endif
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
 	return num_processors;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index eecc8c1..4caff39 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -229,6 +229,12 @@
 	}
 }
 
+static int __init numaq_setup_ioapic_ids(void)
+{
+	/* so can skip it */
+	return 1;
+}
+
 static struct x86_quirks numaq_x86_quirks __initdata = {
 	.arch_pre_time_init	= numaq_pre_time_init,
 	.arch_time_init		= NULL,
@@ -243,6 +249,7 @@
 	.mpc_oem_bus_info	= mpc_oem_bus_info,
 	.mpc_oem_pci_bus	= mpc_oem_pci_bus,
 	.smp_read_mpc_oem	= smp_read_mpc_oem,
+	.setup_ioapic_ids	= numaq_setup_ioapic_ids,
 };
 
 void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e2f4376..6b0bb73 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -374,8 +374,6 @@
 
 struct pv_apic_ops pv_apic_ops = {
 #ifdef CONFIG_X86_LOCAL_APIC
-	.apic_write = native_apic_write,
-	.apic_read = native_apic_read,
 	.setup_boot_clock = setup_boot_APIC_clock,
 	.setup_secondary_clock = setup_secondary_APIC_clock,
 	.startup_ipi_hook = paravirt_nop,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ec7a2ba..c622772 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@
 EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
-static int force_mwait __cpuinitdata;
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 141efab..46c98ef 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -758,6 +758,8 @@
 #else
 	num_physpages = max_pfn;
 
+ 	if (cpu_has_x2apic)
+ 		check_x2apic();
 
 	/* How many end-of-memory variables you have, grandma! */
 	/* need this before calling reserve_initrd */
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 8b4956e..cc673aa 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@
 	char __user *pretcode;
 	int sig;
 	struct sigcontext sc;
-	struct _fpstate fpstate;
+	/*
+	 * fpstate is unused. fpstate is moved/allocated after
+	 * retcode[] below. This movement allows to have the FP state and the
+	 * future state extensions (xsave) stay together.
+	 * And at the same time retaining the unused fpstate, prevents changing
+	 * the offset of extramask[] in the sigframe and thus prevent any
+	 * legacy application accessing/modifying it.
+	 */
+	struct _fpstate fpstate_unused;
 	unsigned long extramask[_NSIG_WORDS-1];
 	char retcode[8];
+	/* fp state follows here */
 };
 
 struct rt_sigframe {
@@ -15,14 +24,15 @@
 	void __user *puc;
 	struct siginfo info;
 	struct ucontext uc;
-	struct _fpstate fpstate;
 	char retcode[8];
+	/* fp state follows here */
 };
 #else
 struct rt_sigframe {
 	char __user *pretcode;
 	struct ucontext uc;
 	struct siginfo info;
+	/* fp state follows here */
 };
 
 int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 2a2435d..b21070e 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -161,28 +161,14 @@
 	}
 
 	{
-		struct _fpstate __user *buf;
+		void __user *buf;
 
 		err |= __get_user(buf, &sc->fpstate);
-		if (buf) {
-			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-				goto badframe;
-			err |= restore_i387(buf);
-		} else {
-			struct task_struct *me = current;
-
-			if (used_math()) {
-				clear_fpu(me);
-				clear_used_math();
-			}
-		}
+		err |= restore_i387_xstate(buf);
 	}
 
 	err |= __get_user(*pax, &sc->ax);
 	return err;
-
-badframe:
-	return 1;
 }
 
 asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
@@ -264,7 +250,7 @@
  * Set up a signal frame.
  */
 static int
-setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
+setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 		 struct pt_regs *regs, unsigned long mask)
 {
 	int tmp, err = 0;
@@ -291,7 +277,7 @@
 	err |= __put_user(regs->sp, &sc->sp_at_signal);
 	err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
 
-	tmp = save_i387(fpstate);
+	tmp = save_i387_xstate(fpstate);
 	if (tmp < 0)
 		err = 1;
 	else
@@ -308,7 +294,8 @@
  * Determine which stack to use..
  */
 static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+	     void **fpstate)
 {
 	unsigned long sp;
 
@@ -334,6 +321,11 @@
 			sp = (unsigned long) ka->sa.sa_restorer;
 	}
 
+	if (used_math()) {
+		sp = sp - sig_xstate_size;
+		*fpstate = (struct _fpstate *) sp;
+	}
+
 	sp -= frame_size;
 	/*
 	 * Align the stack pointer according to the i386 ABI,
@@ -352,8 +344,9 @@
 	void __user *restorer;
 	int err = 0;
 	int usig;
+	void __user *fpstate = NULL;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
@@ -368,7 +361,7 @@
 	if (err)
 		goto give_sigsegv;
 
-	err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
+	err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]);
 	if (err)
 		goto give_sigsegv;
 
@@ -429,8 +422,9 @@
 	void __user *restorer;
 	int err = 0;
 	int usig;
+	void __user *fpstate = NULL;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
@@ -449,13 +443,16 @@
 		goto give_sigsegv;
 
 	/* Create the ucontext.  */
-	err |= __put_user(0, &frame->uc.uc_flags);
+	if (cpu_has_xsave)
+		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+	else
+		err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(0, &frame->uc.uc_link);
 	err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
 	err |= __put_user(sas_ss_flags(regs->sp),
 			  &frame->uc.uc_stack.ss_flags);
 	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
-	err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+	err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
 				regs, set->sig[0]);
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 	if (err)
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 694aa88..823a55b 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -53,69 +53,6 @@
 }
 
 /*
- * Signal frame handlers.
- */
-
-static inline int save_i387(struct _fpstate __user *buf)
-{
-	struct task_struct *tsk = current;
-	int err = 0;
-
-	BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
-			sizeof(tsk->thread.xstate->fxsave));
-
-	if ((unsigned long)buf % 16)
-		printk("save_i387: bad fpstate %p\n", buf);
-
-	if (!used_math())
-		return 0;
-	clear_used_math(); /* trigger finit */
-	if (task_thread_info(tsk)->status & TS_USEDFPU) {
-		err = save_i387_checking((struct i387_fxsave_struct __user *)
-					 buf);
-		if (err)
-			return err;
-		task_thread_info(tsk)->status &= ~TS_USEDFPU;
-		stts();
-	} else {
-		if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
-				   sizeof(struct i387_fxsave_struct)))
-			return -1;
-	}
-	return 1;
-}
-
-/*
- * This restores directly out of user space. Exceptions are handled.
- */
-static inline int restore_i387(struct _fpstate __user *buf)
-{
-	struct task_struct *tsk = current;
-	int err;
-
-	if (!used_math()) {
-		err = init_fpu(tsk);
-		if (err)
-			return err;
-	}
-
-	if (!(task_thread_info(current)->status & TS_USEDFPU)) {
-		clts();
-		task_thread_info(current)->status |= TS_USEDFPU;
-	}
-	err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
-	if (unlikely(err)) {
-		/*
-		 * Encountered an error while doing the restore from the
-		 * user buffer, clear the fpu state.
-		 */
-		clear_fpu(tsk);
-		clear_used_math();
-	}
-	return err;
-}
-
-/*
  * Do a signal return; undo the signal stack.
  */
 static int
@@ -159,25 +96,11 @@
 	{
 		struct _fpstate __user *buf;
 		err |= __get_user(buf, &sc->fpstate);
-
-		if (buf) {
-			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-				goto badframe;
-			err |= restore_i387(buf);
-		} else {
-			struct task_struct *me = current;
-			if (used_math()) {
-				clear_fpu(me);
-				clear_used_math();
-			}
-		}
+		err |= restore_i387_xstate(buf);
 	}
 
 	err |= __get_user(*pax, &sc->ax);
 	return err;
-
-badframe:
-	return 1;
 }
 
 asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
@@ -269,26 +192,23 @@
 			sp = current->sas_ss_sp + current->sas_ss_size;
 	}
 
-	return (void __user *)round_down(sp - size, 16);
+	return (void __user *)round_down(sp - size, 64);
 }
 
 static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 			   sigset_t *set, struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
-	struct _fpstate __user *fp = NULL;
+	void __user *fp = NULL;
 	int err = 0;
 	struct task_struct *me = current;
 
 	if (used_math()) {
-		fp = get_stack(ka, regs, sizeof(struct _fpstate));
+		fp = get_stack(ka, regs, sig_xstate_size);
 		frame = (void __user *)round_down(
 			(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
 
-		if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
-			goto give_sigsegv;
-
-		if (save_i387(fp) < 0)
+		if (save_i387_xstate(fp) < 0)
 			err |= -1;
 	} else
 		frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
@@ -303,7 +223,10 @@
 	}
 
 	/* Create the ucontext.  */
-	err |= __put_user(0, &frame->uc.uc_flags);
+	if (cpu_has_xsave)
+		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+	else
+		err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(0, &frame->uc.uc_link);
 	err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
 	err |= __put_user(sas_ss_flags(regs->sp),
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4e7ccb0..9056f7e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -123,7 +123,6 @@
 
 static atomic_t init_deasserted;
 
-static int boot_cpu_logical_apicid;
 
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
@@ -165,6 +164,8 @@
 #endif
 
 #ifdef CONFIG_X86_32
+static int boot_cpu_logical_apicid;
+
 u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
 					{ [0 ... NR_CPUS-1] = BAD_APICID };
 
@@ -210,7 +211,7 @@
 	/*
 	 * (This works even if the APIC is not enabled.)
 	 */
-	phys_id = GET_APIC_ID(read_apic_id());
+	phys_id = read_apic_id();
 	cpuid = smp_processor_id();
 	if (cpu_isset(cpuid, cpu_callin_map)) {
 		panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
@@ -551,8 +552,7 @@
 			printk(KERN_CONT
 			       "a previous APIC delivery may have failed\n");
 
-		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-		apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
+		apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
 
 		timeout = 0;
 		do {
@@ -584,11 +584,9 @@
 	int maxlvt;
 
 	/* Target chip */
-	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
-
 	/* Boot on the stack */
 	/* Kick the second */
-	apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+	apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
 
 	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
@@ -641,13 +639,11 @@
 	/*
 	 * Turn INIT on target chip
 	 */
-	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
 	/*
 	 * Send IPI
 	 */
-	apic_write(APIC_ICR,
-		   APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
+	apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
+		       phys_apicid);
 
 	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
@@ -657,10 +653,8 @@
 	pr_debug("Deasserting INIT.\n");
 
 	/* Target chip */
-	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
 	/* Send IPI */
-	apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+	apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
 
 	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
@@ -703,11 +697,10 @@
 		 */
 
 		/* Target chip */
-		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
 		/* Boot on the stack */
 		/* Kick the second */
-		apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
+		apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
+			       phys_apicid);
 
 		/*
 		 * Give the other CPU some time to accept the IPI.
@@ -1176,10 +1169,17 @@
 	 * Setup boot CPU information
 	 */
 	smp_store_cpu_info(0); /* Final full version of the data */
+#ifdef CONFIG_X86_32
 	boot_cpu_logical_apicid = logical_smp_processor_id();
+#endif
 	current_thread_info()->cpu = 0;  /* needed? */
 	set_cpu_sibling_map(0);
 
+#ifdef CONFIG_X86_64
+	enable_IR_x2apic();
+	setup_apic_routing();
+#endif
+
 	if (smp_sanity_check(max_cpus) < 0) {
 		printk(KERN_INFO "SMP disabled\n");
 		disable_smp();
@@ -1187,9 +1187,9 @@
 	}
 
 	preempt_disable();
-	if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) {
+	if (read_apic_id() != boot_cpu_physical_apicid) {
 		panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
-		     GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid);
+		     read_apic_id(), boot_cpu_physical_apicid);
 		/* Or can we switch back to PIC here? */
 	}
 	preempt_enable();
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index d67ce5f..7b98785 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,7 +30,7 @@
 #include <linux/init.h>
 #include <asm/io.h>
 #include <asm/bios_ebda.h>
-#include <asm/mach-summit/mach_mpparse.h>
+#include <asm/summit/mpparse.h>
 
 static struct rio_table_hdr *rio_table_hdr __initdata;
 static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 03df8e4..da5a596 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -1228,7 +1228,6 @@
 
 	set_bit(SYSCALL_VECTOR, used_vectors);
 
-	init_thread_xstate();
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 7a31f10..2887a78 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -1138,7 +1138,7 @@
 	/*
 	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
 	 */
-	if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
+	if (unlikely(restore_fpu_checking(me))) {
 		stts();
 		force_sig(SIGSEGV, me);
 		return;
@@ -1179,10 +1179,6 @@
 	set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
 #endif
 	/*
-	 * initialize the per thread extended state:
-	 */
-	init_thread_xstate();
-	/*
 	 * Should be a barrier for any external CPU state:
 	 */
 	cpu_init();
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 8c9ad02..8b6c393 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -905,8 +905,8 @@
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	para_fill(pv_apic_ops.apic_read, APICRead);
-	para_fill(pv_apic_ops.apic_write, APICWrite);
+       para_fill(apic_ops->read, APICRead);
+       para_fill(apic_ops->write, APICWrite);
 #endif
 
 	/*
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index af5bdad..a9b8560 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@
 	*(.con_initcall.init)
   	__con_initcall_end = .;
   }
-  .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
-	__x86cpuvendor_start = .;
-	*(.x86cpuvendor.init)
-	__x86cpuvendor_end = .;
+  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+	__x86_cpu_dev_start = .;
+	*(.x86_cpu_dev.init)
+	__x86_cpu_dev_end = .;
   }
   SECURITY_INIT
   . = ALIGN(4);
@@ -180,6 +180,7 @@
   . = ALIGN(PAGE_SIZE);
   .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
 	__per_cpu_start = .;
+	*(.data.percpu.page_aligned)
 	*(.data.percpu)
 	*(.data.percpu.shared_aligned)
 	__per_cpu_end = .;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a..201e81a 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,13 +168,12 @@
 	*(.con_initcall.init)
   }
   __con_initcall_end = .;
-  . = ALIGN(16);
-  __x86cpuvendor_start = .;
-  .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
-	*(.x86cpuvendor.init)
+  __x86_cpu_dev_start = .;
+  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+	*(.x86_cpu_dev.init)
   }
-  __x86cpuvendor_end = .;
   SECURITY_INIT
+  __x86_cpu_dev_end = .;
 
   . = ALIGN(8);
   .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 0000000..07713d6
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,316 @@
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/bootmem.h>
+#include <linux/compat.h>
+#include <asm/i387.h>
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/sigcontext32.h>
+#endif
+#include <asm/xcr.h>
+
+/*
+ * Supported feature mask by the CPU and the kernel.
+ */
+u64 pcntxt_mask;
+
+struct _fpx_sw_bytes fx_sw_reserved;
+#ifdef CONFIG_IA32_EMULATION
+struct _fpx_sw_bytes fx_sw_reserved_ia32;
+#endif
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ */
+int check_for_xstate(struct i387_fxsave_struct __user *buf,
+		     void __user *fpstate,
+		     struct _fpx_sw_bytes *fx_sw_user)
+{
+	int min_xstate_size = sizeof(struct i387_fxsave_struct) +
+			      sizeof(struct xsave_hdr_struct);
+	unsigned int magic2;
+	int err;
+
+	err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
+			       sizeof(struct _fpx_sw_bytes));
+
+	if (err)
+		return err;
+
+	/*
+	 * First Magic check failed.
+	 */
+	if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
+		return -1;
+
+	/*
+	 * Check for error scenarios.
+	 */
+	if (fx_sw_user->xstate_size < min_xstate_size ||
+	    fx_sw_user->xstate_size > xstate_size ||
+	    fx_sw_user->xstate_size > fx_sw_user->extended_size)
+		return -1;
+
+	err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
+					    fx_sw_user->extended_size -
+					    FP_XSTATE_MAGIC2_SIZE));
+	/*
+	 * Check for the presence of second magic word at the end of memory
+	 * layout. This detects the case where the user just copied the legacy
+	 * fpstate layout with out copying the extended state information
+	 * in the memory layout.
+	 */
+	if (err || magic2 != FP_XSTATE_MAGIC2)
+		return -1;
+
+	return 0;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Signal frame handlers.
+ */
+
+int save_i387_xstate(void __user *buf)
+{
+	struct task_struct *tsk = current;
+	int err = 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
+		return -EACCES;
+
+	BUG_ON(sig_xstate_size < xstate_size);
+
+	if ((unsigned long)buf % 64)
+		printk("save_i387_xstate: bad fpstate %p\n", buf);
+
+	if (!used_math())
+		return 0;
+	clear_used_math(); /* trigger finit */
+	if (task_thread_info(tsk)->status & TS_USEDFPU) {
+		/*
+	 	 * Start with clearing the user buffer. This will present a
+	 	 * clean context for the bytes not touched by the fxsave/xsave.
+		 */
+		__clear_user(buf, sig_xstate_size);
+
+		if (task_thread_info(tsk)->status & TS_XSAVE)
+			err = xsave_user(buf);
+		else
+			err = fxsave_user(buf);
+
+		if (err)
+			return err;
+		task_thread_info(tsk)->status &= ~TS_USEDFPU;
+		stts();
+	} else {
+		if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+				   xstate_size))
+			return -1;
+	}
+
+	if (task_thread_info(tsk)->status & TS_XSAVE) {
+		struct _fpstate __user *fx = buf;
+
+		err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
+				     sizeof(struct _fpx_sw_bytes));
+
+		err |= __put_user(FP_XSTATE_MAGIC2,
+				  (__u32 __user *) (buf + sig_xstate_size
+						    - FP_XSTATE_MAGIC2_SIZE));
+	}
+
+	return 1;
+}
+
+/*
+ * Restore the extended state if present. Otherwise, restore the FP/SSE
+ * state.
+ */
+int restore_user_xstate(void __user *buf)
+{
+	struct _fpx_sw_bytes fx_sw_user;
+	u64 mask;
+	int err;
+
+	if (((unsigned long)buf % 64) ||
+	     check_for_xstate(buf, buf, &fx_sw_user))
+		goto fx_only;
+
+	mask = fx_sw_user.xstate_bv;
+
+	/*
+	 * restore the state passed by the user.
+	 */
+	err = xrestore_user(buf, mask);
+	if (err)
+		return err;
+
+	/*
+	 * init the state skipped by the user.
+	 */
+	mask = pcntxt_mask & ~mask;
+
+	xrstor_state(init_xstate_buf, mask);
+
+	return 0;
+
+fx_only:
+	/*
+	 * couldn't find the extended state information in the
+	 * memory layout. Restore just the FP/SSE and init all
+	 * the other extended state.
+	 */
+	xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
+	return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
+}
+
+/*
+ * This restores directly out of user space. Exceptions are handled.
+ */
+int restore_i387_xstate(void __user *buf)
+{
+	struct task_struct *tsk = current;
+	int err = 0;
+
+	if (!buf) {
+		if (used_math())
+			goto clear;
+		return 0;
+	} else
+		if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
+			return -EACCES;
+
+	if (!used_math()) {
+		err = init_fpu(tsk);
+		if (err)
+			return err;
+	}
+
+	if (!(task_thread_info(current)->status & TS_USEDFPU)) {
+		clts();
+		task_thread_info(current)->status |= TS_USEDFPU;
+	}
+	if (task_thread_info(tsk)->status & TS_XSAVE)
+		err = restore_user_xstate(buf);
+	else
+		err = fxrstor_checking((__force struct i387_fxsave_struct *)
+				       buf);
+	if (unlikely(err)) {
+		/*
+		 * Encountered an error while doing the restore from the
+		 * user buffer, clear the fpu state.
+		 */
+clear:
+		clear_fpu(tsk);
+		clear_used_math();
+	}
+	return err;
+}
+#endif
+
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed by the fpstate pointer in the sigcontext.
+ * This will be saved when ever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+void prepare_fx_sw_frame(void)
+{
+	int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
+			     FP_XSTATE_MAGIC2_SIZE;
+
+	sig_xstate_size = sizeof(struct _fpstate) + size_extended;
+
+#ifdef CONFIG_IA32_EMULATION
+	sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
+#endif
+
+	memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
+
+	fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
+	fx_sw_reserved.extended_size = sig_xstate_size;
+	fx_sw_reserved.xstate_bv = pcntxt_mask;
+	fx_sw_reserved.xstate_size = xstate_size;
+#ifdef CONFIG_IA32_EMULATION
+	memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
+	       sizeof(struct _fpx_sw_bytes));
+	fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
+#endif
+}
+
+/*
+ * Represents init state for the supported extended state.
+ */
+struct xsave_struct *init_xstate_buf;
+
+#ifdef CONFIG_X86_64
+unsigned int sig_xstate_size = sizeof(struct _fpstate);
+#endif
+
+/*
+ * Enable the extended processor state save/restore feature
+ */
+void __cpuinit xsave_init(void)
+{
+	if (!cpu_has_xsave)
+		return;
+
+	set_in_cr4(X86_CR4_OSXSAVE);
+
+	/*
+	 * Enable all the features that the HW is capable of
+	 * and the Linux kernel is aware of.
+	 */
+	xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+}
+
+/*
+ * setup the xstate image representing the init state
+ */
+void setup_xstate_init(void)
+{
+	init_xstate_buf = alloc_bootmem(xstate_size);
+	init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ */
+void __init xsave_cntxt_init(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+	pcntxt_mask = eax + ((u64)edx << 32);
+
+	if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+		printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
+		       pcntxt_mask);
+		BUG();
+	}
+
+	/*
+	 * for now OS knows only about FP/SSE
+	 */
+	pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
+	xsave_init();
+
+	/*
+	 * Recompute the context size for enabled features
+	 */
+	cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+	xstate_size = ebx;
+
+	prepare_fx_sw_frame();
+
+	setup_xstate_init();
+
+	printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
+	       "cntxt size 0x%x\n",
+	       pcntxt_mask, xstate_size);
+}
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 23e8373..17e2599 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,21 +331,6 @@
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define MSR_IA32_VMX_BASIC                      0x480
-#define MSR_IA32_VMX_PINBASED_CTLS              0x481
-#define MSR_IA32_VMX_PROCBASED_CTLS             0x482
-#define MSR_IA32_VMX_EXIT_CTLS                  0x483
-#define MSR_IA32_VMX_ENTRY_CTLS                 0x484
-#define MSR_IA32_VMX_MISC                       0x485
-#define MSR_IA32_VMX_CR0_FIXED0                 0x486
-#define MSR_IA32_VMX_CR0_FIXED1                 0x487
-#define MSR_IA32_VMX_CR4_FIXED0                 0x488
-#define MSR_IA32_VMX_CR4_FIXED1                 0x489
-#define MSR_IA32_VMX_VMCS_ENUM                  0x48a
-#define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
-#define MSR_IA32_VMX_EPT_VPID_CAP               0x48c
-
-#define MSR_IA32_FEATURE_CONTROL                0x3a
 #define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
 
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d9249a8..65f0b8a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,6 +55,7 @@
 #include <linux/lguest_launcher.h>
 #include <linux/virtio_console.h>
 #include <linux/pm.h>
+#include <asm/apic.h>
 #include <asm/lguest.h>
 #include <asm/paravirt.h>
 #include <asm/param.h>
@@ -783,14 +784,44 @@
  * code qualifies for Advanced.  It will also never interrupt anything.  It
  * does, however, allow us to get through the Linux boot code. */
 #ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(unsigned long reg, u32 v)
+static void lguest_apic_write(u32 reg, u32 v)
 {
 }
 
-static u32 lguest_apic_read(unsigned long reg)
+static u32 lguest_apic_read(u32 reg)
 {
 	return 0;
 }
+
+static u64 lguest_apic_icr_read(void)
+{
+	return 0;
+}
+
+static void lguest_apic_icr_write(u32 low, u32 id)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
+
+static void lguest_apic_wait_icr_idle(void)
+{
+	return;
+}
+
+static u32 lguest_apic_safe_wait_icr_idle(void)
+{
+	return 0;
+}
+
+static struct apic_ops lguest_basic_apic_ops = {
+	.read = lguest_apic_read,
+	.write = lguest_apic_write,
+	.icr_read = lguest_apic_icr_read,
+	.icr_write = lguest_apic_icr_write,
+	.wait_icr_idle = lguest_apic_wait_icr_idle,
+	.safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
+};
 #endif
 
 /* STOP!  Until an interrupt comes in. */
@@ -990,8 +1021,7 @@
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* apic read/write intercepts */
-	pv_apic_ops.apic_write = lguest_apic_write;
-	pv_apic_ops.apic_read = lguest_apic_read;
+	apic_ops = &lguest_basic_apic_ops;
 #endif
 
 	/* time operations */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index aa3fa41..55e11aa 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,9 +17,6 @@
         lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
 else
         obj-y += io_64.o iomap_copy_64.o
-
-        CFLAGS_csum-partial_64.o := -funroll-loops
-
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += thunk_64.o clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 24e6094..9e68075 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -14,6 +14,13 @@
 #include <asm/uaccess.h>
 #include <asm/mmx.h>
 
+#ifdef CONFIG_X86_INTEL_USERCOPY
+/*
+ * Alignment at which movsl is preferred for bulk memory copies.
+ */
+struct movsl_mask movsl_mask __read_mostly;
+#endif
+
 static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
 {
 #ifdef CONFIG_X86_INTEL_USERCOPY
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 3f2cf11..37b9ae4 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -38,15 +38,6 @@
 	init_ISA_irqs();
 }
 
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
-	.handler = no_action,
-	.mask = CPU_MASK_NONE,
-	.name = "cascade",
-};
-
 /**
  * intr_init_hook - post gate setup interrupt initialisation
  *
@@ -62,12 +53,6 @@
 		if (x86_quirks->arch_intr_init())
 			return;
 	}
-#ifdef CONFIG_X86_LOCAL_APIC
-	apic_intr_init();
-#endif
-
-	if (!acpi_ioapic)
-		setup_irq(2, &irq2);
 }
 
 /**
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
deleted file mode 100644
index 3ef8b43..0000000
--- a/arch/x86/mach-es7000/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-obj-$(CONFIG_X86_ES7000)	:= es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
deleted file mode 100644
index c8d5aa1..0000000
--- a/arch/x86/mach-es7000/es7000.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Written by: Garry Forsgren, Unisys Corporation
- *             Natalie Protasevich, Unisys Corporation
- * This file contains the code to configure and interface 
- * with Unisys ES7000 series hardware system manager.
- *
- * Copyright (c) 2003 Unisys Corporation.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Unisys Corporation, Township Line & Union Meeting 
- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
- *
- * http://www.unisys.com
- */
-
-/*
- * ES7000 chipsets
- */
-
-#define NON_UNISYS		0
-#define ES7000_CLASSIC		1
-#define ES7000_ZORRO		2
-
-
-#define	MIP_REG			1
-#define	MIP_PSAI_REG		4
-
-#define	MIP_BUSY		1
-#define	MIP_SPIN		0xf0000
-#define	MIP_VALID		0x0100000000000000ULL
-#define	MIP_PORT(VALUE)	((VALUE >> 32) & 0xffff)
-
-#define	MIP_RD_LO(VALUE)	(VALUE & 0xffffffff)   
-
-struct mip_reg_info {
-	unsigned long long mip_info;
-	unsigned long long delivery_info;
-	unsigned long long host_reg;
-	unsigned long long mip_reg;
-};
-
-struct part_info {
-	unsigned char type;   
-	unsigned char length;
-	unsigned char part_id;
-	unsigned char apic_mode;
-	unsigned long snum;    
-	char ptype[16];
-	char sname[64];
-	char pname[64];
-};
-
-struct psai {
-	unsigned long long entry_type;
-	unsigned long long addr;
-	unsigned long long bep_addr;
-};
-
-struct es7000_mem_info {
-	unsigned char type;   
-	unsigned char length;
-	unsigned char resv[6];
-	unsigned long long  start; 
-	unsigned long long  size; 
-};
-
-struct es7000_oem_table {
-	unsigned long long hdr;
-	struct mip_reg_info mip;
-	struct part_info pif;
-	struct es7000_mem_info shm;
-	struct psai psai;
-};
-
-#ifdef CONFIG_ACPI
-
-struct oem_table {
-	struct acpi_table_header Header;
-	u32 OEMTableAddr;
-	u32 OEMTableSize;
-};
-
-extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
-#endif
-
-struct mip_reg {
-	unsigned long long off_0;
-	unsigned long long off_8;
-	unsigned long long off_10;
-	unsigned long long off_18;
-	unsigned long long off_20;
-	unsigned long long off_28;
-	unsigned long long off_30;
-	unsigned long long off_38;
-};
-
-#define	MIP_SW_APIC		0x1020b
-#define	MIP_FUNC(VALUE) 	(VALUE & 0xff)
-
-extern int parse_unisys_oem (char *oemptr);
-extern void setup_unisys(void);
-extern int es7000_start_cpu(int cpu, unsigned long eip);
-extern void es7000_sw_apic(void);
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 0dbd780..6730f4e 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -9,4 +9,3 @@
 obj-$(CONFIG_X86_SUMMIT)	+= summit.o
 obj-$(CONFIG_X86_BIGSMP)	+= bigsmp.o
 obj-$(CONFIG_X86_ES7000)	+= es7000.o
-obj-$(CONFIG_X86_ES7000)	+= ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 59d7717..df37fc9 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -5,18 +5,17 @@
 #define APIC_DEFINITION 1
 #include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <asm/smp.h>
 #include <asm/mpspec.h>
 #include <asm/genapic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <linux/kernel.h>
-#include <linux/smp.h>
 #include <linux/init.h>
 #include <linux/dmi.h>
-#include <asm/mach-bigsmp/mach_apic.h>
-#include <asm/mach-bigsmp/mach_apicdef.h>
-#include <asm/mach-bigsmp/mach_ipi.h>
+#include <asm/bigsmp/apicdef.h>
+#include <linux/smp.h>
+#include <asm/bigsmp/apic.h>
+#include <asm/bigsmp/ipi.h>
 #include <asm/mach-default/mach_mpparse.h>
 
 static int dmi_bigsmp; /* can be set by dmi scanners */
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 4742626..520cca0 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -4,20 +4,19 @@
 #define APIC_DEFINITION 1
 #include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <asm/smp.h>
 #include <asm/mpspec.h>
 #include <asm/genapic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/smp.h>
 #include <linux/init.h>
-#include <asm/mach-es7000/mach_apicdef.h>
-#include <asm/mach-es7000/mach_apic.h>
-#include <asm/mach-es7000/mach_ipi.h>
-#include <asm/mach-es7000/mach_mpparse.h>
-#include <asm/mach-es7000/mach_wakecpu.h>
+#include <asm/es7000/apicdef.h>
+#include <linux/smp.h>
+#include <asm/es7000/apic.h>
+#include <asm/es7000/ipi.h>
+#include <asm/es7000/mpparse.h>
+#include <asm/es7000/wakecpu.h>
 
 static int probe_es7000(void)
 {
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8091e68..8cf5839 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -4,7 +4,6 @@
 #define APIC_DEFINITION 1
 #include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <linux/smp.h>
 #include <asm/mpspec.h>
 #include <asm/genapic.h>
 #include <asm/fixmap.h>
@@ -12,11 +11,12 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/init.h>
-#include <asm/mach-numaq/mach_apic.h>
-#include <asm/mach-numaq/mach_apicdef.h>
-#include <asm/mach-numaq/mach_ipi.h>
-#include <asm/mach-numaq/mach_mpparse.h>
-#include <asm/mach-numaq/mach_wakecpu.h>
+#include <asm/numaq/apicdef.h>
+#include <linux/smp.h>
+#include <asm/numaq/apic.h>
+#include <asm/numaq/ipi.h>
+#include <asm/numaq/mpparse.h>
+#include <asm/numaq/wakecpu.h>
 #include <asm/numaq.h>
 
 static int mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index a97ea0f..6ad6b67 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -4,19 +4,18 @@
 #define APIC_DEFINITION 1
 #include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <asm/smp.h>
 #include <asm/mpspec.h>
 #include <asm/genapic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/smp.h>
 #include <linux/init.h>
-#include <asm/mach-summit/mach_apic.h>
-#include <asm/mach-summit/mach_apicdef.h>
-#include <asm/mach-summit/mach_ipi.h>
-#include <asm/mach-summit/mach_mpparse.h>
+#include <asm/summit/apicdef.h>
+#include <linux/smp.h>
+#include <asm/summit/apic.h>
+#include <asm/summit/ipi.h>
+#include <asm/summit/mpparse.h>
 
 static int probe_summit(void)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fb30486..83e13f2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -88,6 +88,62 @@
 
 int after_bootmem;
 
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/*
+ * noexec=on|off
+ * Control non-executable mappings for 64-bit processes.
+ *
+ * on	Enable (default)
+ * off	Disable
+ */
+static int __init nonx_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (!strncmp(str, "on", 2)) {
+		__supported_pte_mask |= _PAGE_NX;
+		do_not_nx = 0;
+	} else if (!strncmp(str, "off", 3)) {
+		do_not_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+	}
+	return 0;
+}
+early_param("noexec", nonx_setup);
+
+void __cpuinit check_efer(void)
+{
+	unsigned long efer;
+
+	rdmsrl(MSR_EFER, efer);
+	if (!(efer & EFER_NX) || do_not_nx)
+		__supported_pte_mask &= ~_PAGE_NX;
+}
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ * To control the stack too use noexec=off
+ *
+ * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off	PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+	if (!strcmp(str, "on"))
+		force_personality32 &= ~READ_IMPLIES_EXEC;
+	else if (!strcmp(str, "off"))
+		force_personality32 |= READ_IMPLIES_EXEC;
+	return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 19af069..1d88d2b 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -250,10 +250,5 @@
 			acpi_pci_irq_enable(dev);
 	}
 
-#ifdef CONFIG_X86_IO_APIC
-	if (acpi_ioapic)
-		print_IO_APIC();
-#endif
-
 	return 0;
 }
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 8791fc5..844df0c 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -33,6 +33,7 @@
 #include <linux/bootmem.h>
 
 #include <asm/pat.h>
+#include <asm/e820.h>
 
 #include "pci.h"
 
@@ -227,6 +228,8 @@
 	pcibios_allocate_bus_resources(&pci_root_buses);
 	pcibios_allocate_resources(0);
 	pcibios_allocate_resources(1);
+
+	e820_reserve_resources_late();
 }
 
 /**
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index d963576..654a223 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -209,7 +209,7 @@
 	return name != NULL;
 }
 
-static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
+static void __init pci_mmcfg_insert_resources(void)
 {
 #define PCI_MMCFG_RESOURCE_NAME_LEN 19
 	int i;
@@ -233,7 +233,7 @@
 			 cfg->pci_segment);
 		res->start = cfg->address;
 		res->end = res->start + (num_buses << 20) - 1;
-		res->flags = IORESOURCE_MEM | resource_flags;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		insert_resource(&iomem_resource, res);
 		names += PCI_MMCFG_RESOURCE_NAME_LEN;
 	}
@@ -434,11 +434,9 @@
 	    (pci_mmcfg_config[0].address == 0))
 		return;
 
-	if (pci_mmcfg_arch_init()) {
-		if (known_bridge)
-			pci_mmcfg_insert_resources(IORESOURCE_BUSY);
+	if (pci_mmcfg_arch_init())
 		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
-	} else {
+	else {
 		/*
 		 * Signal not to attempt to insert mmcfg resources because
 		 * the architecture mmcfg setup could not initialize.
@@ -475,7 +473,7 @@
 	 * marked so it won't cause request errors when __request_region is
 	 * called.
 	 */
-	pci_mmcfg_insert_resources(0);
+	pci_mmcfg_insert_resources();
 
 	return 0;
 }
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index d3e083d..274d060 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -11,6 +11,7 @@
 #include <linux/suspend.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/xcr.h>
 
 static struct saved_context saved_context;
 
@@ -126,6 +127,12 @@
 	if (boot_cpu_has(X86_FEATURE_SEP))
 		enable_sep_cpu();
 
+	/*
+	 * restore XCR0 for xsave capable cpu's.
+	 */
+	if (cpu_has_xsave)
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+
 	fix_processor_context();
 	do_fpu_end();
 	mtrr_ap_init();
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index 66bdfb5..e3b6cf7 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -14,6 +14,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
+#include <asm/xcr.h>
 
 static void fix_processor_context(void);
 
@@ -122,6 +123,12 @@
 	wrmsrl(MSR_GS_BASE, ctxt->gs_base);
 	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
 
+	/*
+	 * restore XCR0 for xsave capable cpu's.
+	 */
+	if (cpu_has_xsave)
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+
 	fix_processor_context();
 
 	do_fpu_end();
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7dcd321..a27d562 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -36,6 +36,7 @@
 #include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
+#include <asm/apic.h>
 #include <asm/page.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -580,16 +581,47 @@
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
-static u32 xen_apic_read(unsigned long reg)
+static u32 xen_apic_read(u32 reg)
 {
 	return 0;
 }
 
-static void xen_apic_write(unsigned long reg, u32 val)
+static void xen_apic_write(u32 reg, u32 val)
 {
 	/* Warn to see if there's any stray references */
 	WARN_ON(1);
 }
+
+static u64 xen_apic_icr_read(void)
+{
+	return 0;
+}
+
+static void xen_apic_icr_write(u32 low, u32 id)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
+
+static void xen_apic_wait_icr_idle(void)
+{
+        return;
+}
+
+static u32 xen_safe_apic_wait_icr_idle(void)
+{
+        return 0;
+}
+
+static struct apic_ops xen_basic_apic_ops = {
+	.read = xen_apic_read,
+	.write = xen_apic_write,
+	.icr_read = xen_apic_icr_read,
+	.icr_write = xen_apic_icr_write,
+	.wait_icr_idle = xen_apic_wait_icr_idle,
+	.safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
+};
+
 #endif
 
 static void xen_flush_tlb(void)
@@ -1273,8 +1305,6 @@
 
 static const struct pv_apic_ops xen_apic_ops __initdata = {
 #ifdef CONFIG_X86_LOCAL_APIC
-	.apic_write = xen_apic_write,
-	.apic_read = xen_apic_read,
 	.setup_boot_clock = paravirt_nop,
 	.setup_secondary_clock = paravirt_nop,
 	.startup_ipi_hook = paravirt_nop,
@@ -1677,6 +1707,13 @@
 	pv_apic_ops = xen_apic_ops;
 	pv_mmu_ops = xen_mmu_ops;
 
+#ifdef CONFIG_X86_LOCAL_APIC
+	/*
+	 * set up the basic apic ops.
+	 */
+	apic_ops = &xen_basic_apic_ops;
+#endif
+
 	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
 		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
 		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c
index 8dd3336..3c578ef 100644
--- a/drivers/acpi/glue.c
+++ b/drivers/acpi/glue.c
@@ -369,7 +369,6 @@
 		DBG("RTC unavailable?\n");
 	return 0;
 }
-/* do this between RTC subsys_initcall() and rtc_cmos driver_initcall() */
-fs_initcall(acpi_rtc_init);
+module_init(acpi_rtc_init);
 
 #endif
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 59fe051..5d312dc 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -503,7 +503,7 @@
 	scsi_cmd[0] = ATA_16;
 
 	scsi_cmd[4] = args[2];
-	if (args[0] == WIN_SMART) { /* hack -- ide driver does this too... */
+	if (args[0] == ATA_CMD_SMART) { /* hack -- ide driver does this too */
 		scsi_cmd[6]  = args[3];
 		scsi_cmd[8]  = args[1];
 		scsi_cmd[10] = 0x4f;
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 682243b..482c0c4 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -39,6 +39,7 @@
 #include <linux/ioport.h>
 #include <linux/init.h>
 #include <linux/blkpg.h>
+#include <linux/ata.h>
 #include <linux/hdreg.h>
 
 #define REALLY_SLOW_IO
@@ -370,7 +371,7 @@
 		struct hd_i_struct *disk = &hd_info[i];
 		disk->special_op = disk->recalibrate = 1;
 		hd_out(disk, disk->sect, disk->sect, disk->head-1,
-			disk->cyl, WIN_SPECIFY, &reset_hd);
+			disk->cyl, ATA_CMD_INIT_DEV_PARAMS, &reset_hd);
 		if (reset)
 			goto repeat;
 	} else
@@ -558,7 +559,7 @@
 {
 	if (disk->recalibrate) {
 		disk->recalibrate = 0;
-		hd_out(disk, disk->sect, 0, 0, 0, WIN_RESTORE, &recal_intr);
+		hd_out(disk, disk->sect, 0, 0, 0, ATA_CMD_RESTORE, &recal_intr);
 		return reset;
 	}
 	if (disk->head > 16) {
@@ -631,13 +632,13 @@
 	if (blk_fs_request(req)) {
 		switch (rq_data_dir(req)) {
 		case READ:
-			hd_out(disk, nsect, sec, head, cyl, WIN_READ,
+			hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
 				&read_intr);
 			if (reset)
 				goto repeat;
 			break;
 		case WRITE:
-			hd_out(disk, nsect, sec, head, cyl, WIN_WRITE,
+			hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
 				&write_intr);
 			if (reset)
 				goto repeat;
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 052879a..b50b5da 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -131,29 +131,6 @@
 
 	  If unsure, say Y.
 
-config IDEDISK_MULTI_MODE
-	bool "Use multiple sector mode for Programmed Input/Output by default"
-	help
-	  This setting is irrelevant for most IDE disks, with direct memory
-	  access, to which multiple sector mode does not apply. Multiple sector
-	  mode is a feature of most modern IDE hard drives, permitting the
-	  transfer of multiple sectors per Programmed Input/Output interrupt,
-	  rather than the usual one sector per interrupt. When this feature is
-	  enabled, it can reduce operating system overhead for disk Programmed
-	  Input/Output. On some systems, it also can increase the data
-	  throughput of Programmed Input/Output. Some drives, however, seemed
-	  to run slower with multiple sector mode enabled. Some drives claimed
-	  to support multiple sector mode, but lost data at some settings.
-	  Under rare circumstances, such failures could result in massive
-	  filesystem corruption.
-
-	  If you get the following error, try to say Y here:
-
-	  hda: set_multmode: status=0x51 { DriveReady SeekComplete Error }
-	  hda: set_multmode: error=0x04 { DriveStatusError }
-
-	  If in doubt, say N.
-
 config BLK_DEV_IDECS
 	tristate "PCMCIA IDE support"
 	depends on PCMCIA
diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
index 64e0ecd..308b8a1 100644
--- a/drivers/ide/Makefile
+++ b/drivers/ide/Makefile
@@ -4,8 +4,8 @@
 
 EXTRA_CFLAGS				+= -Idrivers/ide
 
-ide-core-y += ide.o ide-io.o ide-iops.o ide-lib.o ide-probe.o ide-taskfile.o \
-	      ide-pio-blacklist.o
+ide-core-y += ide.o ide-ioctls.o ide-io.o ide-iops.o ide-lib.o ide-probe.o \
+	      ide-taskfile.o ide-pio-blacklist.o
 
 # core IDE code
 ide-core-$(CONFIG_IDE_TIMINGS)		+= ide-timings.o
@@ -37,11 +37,12 @@
 obj-$(CONFIG_BLK_DEV_IDEPNP)		+= ide-pnp.o
 
 ide-cd_mod-y += ide-cd.o ide-cd_ioctl.o ide-cd_verbose.o
+ide-floppy_mod-y += ide-floppy.o ide-floppy_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_IDEDISK)		+= ide-disk.o
 obj-$(CONFIG_BLK_DEV_IDECD)		+= ide-cd_mod.o
+obj-$(CONFIG_BLK_DEV_IDEFLOPPY)		+= ide-floppy_mod.o
 obj-$(CONFIG_BLK_DEV_IDETAPE)		+= ide-tape.o
-obj-$(CONFIG_BLK_DEV_IDEFLOPPY)		+= ide-floppy.o
 
 ifeq ($(CONFIG_BLK_DEV_IDECS), y)
 	ide-cs-core-y += legacy/ide-cs.o
diff --git a/drivers/ide/arm/icside.c b/drivers/ide/arm/icside.c
index df4af40..70f5b16 100644
--- a/drivers/ide/arm/icside.c
+++ b/drivers/ide/arm/icside.c
@@ -10,7 +10,6 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/errno.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/dma-mapping.h>
 #include <linux/device.h>
@@ -265,8 +264,8 @@
 	 * If we're going to be doing MW_DMA_1 or MW_DMA_2, we should
 	 * take care to note the values in the ID...
 	 */
-	if (use_dma_info && drive->id->eide_dma_time > cycle_time)
-		cycle_time = drive->id->eide_dma_time;
+	if (use_dma_info && drive->id[ATA_ID_EIDE_DMA_TIME] > cycle_time)
+		cycle_time = drive->id[ATA_ID_EIDE_DMA_TIME];
 
 	drive->drive_data = cycle_time;
 
diff --git a/drivers/ide/arm/palm_bk3710.c b/drivers/ide/arm/palm_bk3710.c
index 4fd91dc..122ed3c 100644
--- a/drivers/ide/arm/palm_bk3710.c
+++ b/drivers/ide/arm/palm_bk3710.c
@@ -27,7 +27,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/ioport.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/delay.h>
 #include <linux/init.h>
@@ -180,7 +179,7 @@
 	val32 |= (t2i << (dev ? 8 : 0));
 	writel(val32, base + BK3710_DATRCVR);
 
-	if (mate && mate->present) {
+	if (mate) {
 		u8 mode2 = ide_get_best_pio_mode(mate, 255, 4);
 
 		if (mode2 < mode)
@@ -213,7 +212,8 @@
 		palm_bk3710_setudmamode(base, is_slave,
 					xferspeed - XFER_UDMA_0);
 	} else {
-		palm_bk3710_setdmamode(base, is_slave, drive->id->eide_dma_min,
+		palm_bk3710_setdmamode(base, is_slave,
+				       drive->id[ATA_ID_EIDE_DMA_MIN],
 				       xferspeed);
 	}
 }
@@ -229,7 +229,7 @@
 	 * Obtain the drive PIO data for tuning the Palm Chip registers
 	 */
 	cycle_time = ide_pio_cycle_time(drive, pio);
-	mate = ide_get_paired_drive(drive);
+	mate = ide_get_pair_dev(drive);
 	palm_bk3710_setpiomode(base, mate, is_slave, cycle_time, pio);
 }
 
diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c
index 6f70462..2427c38 100644
--- a/drivers/ide/ide-acpi.c
+++ b/drivers/ide/ide-acpi.c
@@ -584,7 +584,7 @@
  * This function executes the _STM ACPI method for the target channel.
  *
  * _STM requires Identify Drive data, which has to passed as an argument.
- * Unfortunately hd_driveid is a mangled version which we can't readily
+ * Unfortunately drive->id is a mangled version which we can't readily
  * use; hence we'll get the information afresh.
  */
 void ide_acpi_push_timing(ide_hwif_t *hwif)
@@ -614,10 +614,10 @@
 	in_params[0].buffer.length = sizeof(struct GTM_buffer);
 	in_params[0].buffer.pointer = (u8 *)&hwif->acpidata->gtm;
 	in_params[1].type = ACPI_TYPE_BUFFER;
-	in_params[1].buffer.length = sizeof(struct hd_driveid);
+	in_params[1].buffer.length = sizeof(ATA_ID_WORDS * 2);
 	in_params[1].buffer.pointer = (u8 *)&master->idbuff;
 	in_params[2].type = ACPI_TYPE_BUFFER;
-	in_params[2].buffer.length = sizeof(struct hd_driveid);
+	in_params[2].buffer.length = sizeof(ATA_ID_WORDS * 2);
 	in_params[2].buffer.pointer = (u8 *)&slave->idbuff;
 	/* Output buffer: _STM has no output */
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index adf04f9..608c5ba 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -14,12 +14,201 @@
 #define debug_log(fmt, args...) do {} while (0)
 #endif
 
+/*
+ * Check whether we can support a device,
+ * based on the ATAPI IDENTIFY command results.
+ */
+int ide_check_atapi_device(ide_drive_t *drive, const char *s)
+{
+	u16 *id = drive->id;
+	u8 gcw[2], protocol, device_type, removable, drq_type, packet_size;
+
+	*((u16 *)&gcw) = id[ATA_ID_CONFIG];
+
+	protocol    = (gcw[1] & 0xC0) >> 6;
+	device_type =  gcw[1] & 0x1F;
+	removable   = (gcw[0] & 0x80) >> 7;
+	drq_type    = (gcw[0] & 0x60) >> 5;
+	packet_size =  gcw[0] & 0x03;
+
+#ifdef CONFIG_PPC
+	/* kludge for Apple PowerBook internal zip */
+	if (drive->media == ide_floppy && device_type == 5 &&
+	    !strstr((char *)&id[ATA_ID_PROD], "CD-ROM") &&
+	    strstr((char *)&id[ATA_ID_PROD], "ZIP"))
+		device_type = 0;
+#endif
+
+	if (protocol != 2)
+		printk(KERN_ERR "%s: %s: protocol (0x%02x) is not ATAPI\n",
+			s, drive->name, protocol);
+	else if ((drive->media == ide_floppy && device_type != 0) ||
+		 (drive->media == ide_tape && device_type != 1))
+		printk(KERN_ERR "%s: %s: invalid device type (0x%02x)\n",
+			s, drive->name, device_type);
+	else if (removable == 0)
+		printk(KERN_ERR "%s: %s: the removable flag is not set\n",
+			s, drive->name);
+	else if (drive->media == ide_floppy && drq_type == 3)
+		printk(KERN_ERR "%s: %s: sorry, DRQ type (0x%02x) not "
+			"supported\n", s, drive->name, drq_type);
+	else if (packet_size != 0)
+		printk(KERN_ERR "%s: %s: packet size (0x%02x) is not 12 "
+			"bytes\n", s, drive->name, packet_size);
+	else
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ide_check_atapi_device);
+
+/* PIO data transfer routine using the scatter gather table. */
+int ide_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
+		    unsigned int bcount, int write)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
+	xfer_func_t *xf = write ? tp_ops->output_data : tp_ops->input_data;
+	struct scatterlist *sg = pc->sg;
+	char *buf;
+	int count, done = 0;
+
+	while (bcount) {
+		count = min(sg->length - pc->b_count, bcount);
+
+		if (PageHighMem(sg_page(sg))) {
+			unsigned long flags;
+
+			local_irq_save(flags);
+			buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
+			xf(drive, NULL, buf + pc->b_count, count);
+			kunmap_atomic(buf - sg->offset, KM_IRQ0);
+			local_irq_restore(flags);
+		} else {
+			buf = sg_virt(sg);
+			xf(drive, NULL, buf + pc->b_count, count);
+		}
+
+		bcount -= count;
+		pc->b_count += count;
+		done += count;
+
+		if (pc->b_count == sg->length) {
+			if (!--pc->sg_cnt)
+				break;
+			pc->sg = sg = sg_next(sg);
+			pc->b_count = 0;
+		}
+	}
+
+	if (bcount) {
+		printk(KERN_ERR "%s: %d leftover bytes, %s\n", drive->name,
+			bcount, write ? "padding with zeros"
+				      : "discarding data");
+		ide_pad_transfer(drive, write, bcount);
+	}
+
+	return done;
+}
+EXPORT_SYMBOL_GPL(ide_io_buffers);
+
+void ide_init_pc(struct ide_atapi_pc *pc)
+{
+	memset(pc, 0, sizeof(*pc));
+	pc->buf = pc->pc_buf;
+	pc->buf_size = IDE_PC_BUFFER_SIZE;
+}
+EXPORT_SYMBOL_GPL(ide_init_pc);
+
+/*
+ * Generate a new packet command request in front of the request queue, before
+ * the current request, so that it will be processed immediately, on the next
+ * pass through the driver.
+ */
+void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
+		       struct ide_atapi_pc *pc, struct request *rq)
+{
+	blk_rq_init(NULL, rq);
+	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_flags |= REQ_PREEMPT;
+	rq->buffer = (char *)pc;
+	rq->rq_disk = disk;
+	memcpy(rq->cmd, pc->c, 12);
+	if (drive->media == ide_tape)
+		rq->cmd[13] = REQ_IDETAPE_PC1;
+	ide_do_drive_cmd(drive, rq);
+}
+EXPORT_SYMBOL_GPL(ide_queue_pc_head);
+
+/*
+ * Add a special packet command request to the tail of the request queue,
+ * and wait for it to be serviced.
+ */
+int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
+		      struct ide_atapi_pc *pc)
+{
+	struct request *rq;
+	int error;
+
+	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->buffer = (char *)pc;
+	memcpy(rq->cmd, pc->c, 12);
+	if (drive->media == ide_tape)
+		rq->cmd[13] = REQ_IDETAPE_PC1;
+	error = blk_execute_rq(drive->queue, disk, rq, 0);
+	blk_put_request(rq);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(ide_queue_pc_tail);
+
+int ide_do_test_unit_ready(ide_drive_t *drive, struct gendisk *disk)
+{
+	struct ide_atapi_pc pc;
+
+	ide_init_pc(&pc);
+	pc.c[0] = TEST_UNIT_READY;
+
+	return ide_queue_pc_tail(drive, disk, &pc);
+}
+EXPORT_SYMBOL_GPL(ide_do_test_unit_ready);
+
+int ide_do_start_stop(ide_drive_t *drive, struct gendisk *disk, int start)
+{
+	struct ide_atapi_pc pc;
+
+	ide_init_pc(&pc);
+	pc.c[0] = START_STOP;
+	pc.c[4] = start;
+
+	if (drive->media == ide_tape)
+		pc.flags |= PC_FLAG_WAIT_FOR_DSC;
+
+	return ide_queue_pc_tail(drive, disk, &pc);
+}
+EXPORT_SYMBOL_GPL(ide_do_start_stop);
+
+int ide_set_media_lock(ide_drive_t *drive, struct gendisk *disk, int on)
+{
+	struct ide_atapi_pc pc;
+
+	if (drive->atapi_flags & IDE_AFLAG_NO_DOORLOCK)
+		return 0;
+
+	ide_init_pc(&pc);
+	pc.c[0] = ALLOW_MEDIUM_REMOVAL;
+	pc.c[4] = on;
+
+	return ide_queue_pc_tail(drive, disk, &pc);
+}
+EXPORT_SYMBOL_GPL(ide_set_media_lock);
+
 /* TODO: unify the code thus making some arguments go away */
 ide_startstop_t ide_pc_intr(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	ide_handler_t *handler, unsigned int timeout, ide_expiry_t *expiry,
 	void (*update_buffers)(ide_drive_t *, struct ide_atapi_pc *),
 	void (*retry_pc)(ide_drive_t *), void (*dsc_handle)(ide_drive_t *),
-	void (*io_buffers)(ide_drive_t *, struct ide_atapi_pc *, unsigned, int))
+	int (*io_buffers)(ide_drive_t *, struct ide_atapi_pc *, unsigned, int))
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq = hwif->hwgroup->rq;
@@ -41,7 +230,7 @@
 
 	if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) {
 		if (hwif->dma_ops->dma_end(drive) ||
-		    (drive->media == ide_tape && !scsi && (stat & ERR_STAT))) {
+		    (drive->media == ide_tape && !scsi && (stat & ATA_ERR))) {
 			if (drive->media == ide_floppy && !scsi)
 				printk(KERN_ERR "%s: DMA %s error\n",
 					drive->name, rq_data_dir(pc->rq)
@@ -56,7 +245,7 @@
 	}
 
 	/* No more interrupts */
-	if ((stat & DRQ_STAT) == 0) {
+	if ((stat & ATA_DRQ) == 0) {
 		debug_log("Packet command completed, %d bytes transferred\n",
 			  pc->xferred);
 
@@ -65,10 +254,10 @@
 		local_irq_enable_in_hardirq();
 
 		if (drive->media == ide_tape && !scsi &&
-		    (stat & ERR_STAT) && rq->cmd[0] == REQUEST_SENSE)
-			stat &= ~ERR_STAT;
+		    (stat & ATA_ERR) && rq->cmd[0] == REQUEST_SENSE)
+			stat &= ~ATA_ERR;
 
-		if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) {
+		if ((stat & ATA_ERR) || (pc->flags & PC_FLAG_DMA_ERROR)) {
 			/* Error detected */
 			debug_log("%s: I/O error\n", drive->name);
 
@@ -95,7 +284,7 @@
 cmd_finished:
 		pc->error = 0;
 		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) &&
-		    (stat & SEEK_STAT) == 0) {
+		    (stat & ATA_DSC) == 0) {
 			dsc_handle(drive);
 			return ide_stopped;
 		}
@@ -117,17 +306,18 @@
 	/* Get the number of bytes to transfer on this interrupt. */
 	ide_read_bcount_and_ireason(drive, &bcount, &ireason);
 
-	if (ireason & CD) {
+	if (ireason & ATAPI_COD) {
 		printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__);
 		return ide_do_reset(drive);
 	}
 
-	if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) {
+	if (((ireason & ATAPI_IO) == ATAPI_IO) ==
+		!!(pc->flags & PC_FLAG_WRITING)) {
 		/* Hopefully, we will never get here */
 		printk(KERN_ERR "%s: We wanted to %s, but the device wants us "
 				"to %s!\n", drive->name,
-				(ireason & IO) ? "Write" : "Read",
-				(ireason & IO) ? "Read" : "Write");
+				(ireason & ATAPI_IO) ? "Write" : "Read",
+				(ireason & ATAPI_IO) ? "Read" : "Write");
 		return ide_do_reset(drive);
 	}
 
@@ -171,9 +361,14 @@
 
 	if ((drive->media == ide_floppy && !scsi && !pc->buf) ||
 	    (drive->media == ide_tape && !scsi && pc->bh) ||
-	    (scsi && pc->sg))
-		io_buffers(drive, pc, bcount, !!(pc->flags & PC_FLAG_WRITING));
-	else
+	    (scsi && pc->sg)) {
+		int done = io_buffers(drive, pc, bcount,
+				  !!(pc->flags & PC_FLAG_WRITING));
+
+		/* FIXME: don't do partial completions */
+		if (drive->media == ide_floppy && !scsi)
+			ide_end_request(drive, 1, done >> 9);
+	} else
 		xferfunc(drive, NULL, pc->cur_pos, bcount);
 
 	/* Update the current position */
@@ -205,7 +400,8 @@
 {
 	int retries = 100;
 
-	while (retries-- && ((ireason & CD) == 0 || (ireason & IO))) {
+	while (retries-- && ((ireason & ATAPI_COD) == 0 ||
+		(ireason & ATAPI_IO))) {
 		printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing "
 				"a packet command, retrying\n", drive->name);
 		udelay(100);
@@ -214,8 +410,8 @@
 			printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing "
 					"a packet command, ignoring\n",
 					drive->name);
-			ireason |= CD;
-			ireason &= ~IO;
+			ireason |= ATAPI_COD;
+			ireason &= ~ATAPI_IO;
 		}
 	}
 
@@ -231,7 +427,7 @@
 	ide_startstop_t startstop;
 	u8 ireason;
 
-	if (ide_wait_stat(&startstop, drive, DRQ_STAT, BUSY_STAT, WAIT_READY)) {
+	if (ide_wait_stat(&startstop, drive, ATA_DRQ, ATA_BUSY, WAIT_READY)) {
 		printk(KERN_ERR "%s: Strange, packet command initiated yet "
 				"DRQ isn't asserted\n", drive->name);
 		return startstop;
@@ -241,7 +437,7 @@
 	if (drive->media == ide_tape && !drive->scsi)
 		ireason = ide_wait_ireason(drive, ireason);
 
-	if ((ireason & CD) == 0 || (ireason & IO)) {
+	if ((ireason & ATAPI_COD) == 0 || (ireason & ATAPI_IO)) {
 		printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing "
 				"a packet command\n", drive->name);
 		return ide_do_reset(drive);
@@ -303,7 +499,7 @@
 
 	/* Issue the packet command */
 	if (drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT) {
-		ide_execute_command(drive, WIN_PACKETCMD, handler,
+		ide_execute_command(drive, ATA_CMD_PACKET, handler,
 				    timeout, NULL);
 		return ide_started;
 	} else {
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 03c2cb6..465a92c 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -436,7 +436,7 @@
 			ide_dump_status_no_sense(drive, "media error (blank)",
 						 stat);
 			do_end_request = 1;
-		} else if ((err & ~ABRT_ERR) != 0) {
+		} else if ((err & ~ATA_ABORTED) != 0) {
 			/* go to the default handler for other errors */
 			ide_error(drive, "cdrom_decode_status", stat);
 			return 1;
@@ -457,7 +457,7 @@
 		 * If we got a CHECK_CONDITION status, queue
 		 * a request sense command.
 		 */
-		if (stat & ERR_STAT)
+		if (stat & ATA_ERR)
 			cdrom_queue_request_sense(drive, NULL, NULL);
 	} else {
 		blk_dump_rq_flags(rq, "ide-cd: bad rq");
@@ -468,7 +468,7 @@
 	return 1;
 
 end_request:
-	if (stat & ERR_STAT) {
+	if (stat & ATA_ERR) {
 		unsigned long flags;
 
 		spin_lock_irqsave(&ide_lock, flags);
@@ -541,7 +541,7 @@
 			drive->waiting_for_dma = 0;
 
 		/* packet command */
-		ide_execute_command(drive, WIN_PACKETCMD, handler,
+		ide_execute_command(drive, ATA_CMD_PACKET, handler,
 				    ATAPI_WAIT_PC, cdrom_timer_expiry);
 		return ide_started;
 	} else {
@@ -574,7 +574,7 @@
 		 */
 
 		/* check for errors */
-		if (cdrom_decode_status(drive, DRQ_STAT, NULL))
+		if (cdrom_decode_status(drive, ATA_DRQ, NULL))
 			return ide_stopped;
 
 		/* ok, next interrupt will be DMA interrupt */
@@ -582,8 +582,8 @@
 			drive->waiting_for_dma = 1;
 	} else {
 		/* otherwise, we must wait for DRQ to get set */
-		if (ide_wait_stat(&startstop, drive, DRQ_STAT,
-				BUSY_STAT, WAIT_READY))
+		if (ide_wait_stat(&startstop, drive, ATA_DRQ,
+				  ATA_BUSY, WAIT_READY))
 			return startstop;
 	}
 
@@ -938,7 +938,7 @@
 		thislen = len;
 
 	/* If DRQ is clear, the command has completed. */
-	if ((stat & DRQ_STAT) == 0) {
+	if ((stat & ATA_DRQ) == 0) {
 		if (blk_fs_request(rq)) {
 			/*
 			 * If we're not done reading/writing, complain.
@@ -1164,13 +1164,12 @@
 	if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) {
 		struct request_queue *q = drive->queue;
 		unsigned int alignment;
-		unsigned long addr;
-		unsigned long stack_mask = ~(THREAD_SIZE - 1);
+		char *buf;
 
 		if (rq->bio)
-			addr = (unsigned long)bio_data(rq->bio);
+			buf = bio_data(rq->bio);
 		else
-			addr = (unsigned long)rq->data;
+			buf = rq->data;
 
 		info->dma = drive->using_dma;
 
@@ -1181,11 +1180,8 @@
 		 * separate masks.
 		 */
 		alignment = queue_dma_alignment(q) | q->dma_pad_mask;
-		if (addr & alignment || rq->data_len & alignment)
-			info->dma = 0;
-
-		if (!((addr & stack_mask) ^
-		      ((unsigned long)current->stack & stack_mask)))
+		if ((unsigned long)buf & alignment || rq->data_len & alignment
+		    || object_is_on_stack(buf))
 			info->dma = 0;
 	}
 }
@@ -1206,7 +1202,7 @@
 			unsigned long elapsed = jiffies - info->start_seek;
 			int stat = hwif->tp_ops->read_status(hwif);
 
-			if ((stat & SEEK_STAT) != SEEK_STAT) {
+			if ((stat & ATA_DSC) != ATA_DSC) {
 				if (elapsed < IDECD_SEEK_TIMEOUT) {
 					ide_stall_queue(drive,
 							IDECD_SEEK_TIMER);
@@ -1813,13 +1809,12 @@
 	{ NULL, 0, NULL, NULL }
 };
 
-static void ide_cdrom_add_settings(ide_drive_t *drive)
-{
-	ide_add_setting(drive, "dsc_overlap", SETTING_RW, TYPE_BYTE, 0, 1, 1, 1,
-			&drive->dsc_overlap, NULL);
-}
-#else
-static inline void ide_cdrom_add_settings(ide_drive_t *drive) { ; }
+ide_devset_rw_field(dsc_overlap, dsc_overlap);
+
+static const struct ide_proc_devset idecd_settings[] = {
+	IDE_PROC_DEVSET(dsc_overlap, 0, 1),
+	{ 0 },
+};
 #endif
 
 static const struct cd_list_entry ide_cd_quirks_list[] = {
@@ -1866,14 +1861,14 @@
 	{ NULL, NULL, 0 }
 };
 
-static unsigned int ide_cd_flags(struct hd_driveid *id)
+static unsigned int ide_cd_flags(u16 *id)
 {
 	const struct cd_list_entry *cle = ide_cd_quirks_list;
 
 	while (cle->id_model) {
-		if (strcmp(cle->id_model, id->model) == 0 &&
+		if (strcmp(cle->id_model, (char *)&id[ATA_ID_PROD]) == 0 &&
 		    (cle->id_firmware == NULL ||
-		     strstr(id->fw_rev, cle->id_firmware)))
+		     strstr((char *)&id[ATA_ID_FW_REV], cle->id_firmware)))
 			return cle->cd_flags;
 		cle++;
 	}
@@ -1885,7 +1880,8 @@
 {
 	struct cdrom_info *cd = drive->driver_data;
 	struct cdrom_device_info *cdi = &cd->devinfo;
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
+	char *fw_rev = (char *)&id[ATA_ID_FW_REV];
 	int nslots;
 
 	blk_queue_prep_rq(drive->queue, ide_cdrom_prep_fn);
@@ -1900,15 +1896,15 @@
 	drive->atapi_flags = IDE_AFLAG_MEDIA_CHANGED | IDE_AFLAG_NO_EJECT |
 		       ide_cd_flags(id);
 
-	if ((id->config & 0x0060) == 0x20)
+	if ((id[ATA_ID_CONFIG] & 0x0060) == 0x20)
 		drive->atapi_flags |= IDE_AFLAG_DRQ_INTERRUPT;
 
 	if ((drive->atapi_flags & IDE_AFLAG_VERTOS_300_SSD) &&
-	    id->fw_rev[4] == '1' && id->fw_rev[6] <= '2')
+	    fw_rev[4] == '1' && fw_rev[6] <= '2')
 		drive->atapi_flags |= (IDE_AFLAG_TOCTRACKS_AS_BCD |
 				     IDE_AFLAG_TOCADDR_AS_BCD);
 	else if ((drive->atapi_flags & IDE_AFLAG_VERTOS_600_ESD) &&
-		 id->fw_rev[4] == '1' && id->fw_rev[6] <= '2')
+		 fw_rev[4] == '1' && fw_rev[6] <= '2')
 		drive->atapi_flags |= IDE_AFLAG_TOCTRACKS_AS_BCD;
 	else if (drive->atapi_flags & IDE_AFLAG_SANYO_3CD)
 		/* 3 => use CD in slot 0 */
@@ -1927,7 +1923,8 @@
 		cd->devinfo.handle = NULL;
 		return 1;
 	}
-	ide_cdrom_add_settings(drive);
+
+	ide_proc_register_driver(drive, cd->driver);
 	return 0;
 }
 
@@ -1972,12 +1969,12 @@
 	.remove			= ide_cd_remove,
 	.version		= IDECD_VERSION,
 	.media			= ide_cdrom,
-	.supports_dsc_overlap	= 1,
 	.do_request		= ide_cd_do_request,
 	.end_request		= ide_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
 	.proc			= idecd_proc,
+	.settings		= idecd_settings,
 #endif
 };
 
@@ -2112,10 +2109,10 @@
 
 	if (!strstr("ide-cdrom", drive->driver_req))
 		goto failed;
-	if (!drive->present)
-		goto failed;
+
 	if (drive->media != ide_cdrom && drive->media != ide_optical)
 		goto failed;
+
 	/* skip drives that we were told to ignore */
 	if (ignore != NULL) {
 		if (strstr(ignore, drive->name)) {
@@ -2137,8 +2134,6 @@
 
 	ide_init_disk(g, drive);
 
-	ide_proc_register_driver(drive, &ide_cdrom_driver);
-
 	kref_init(&info->kref);
 
 	info->drive = drive;
@@ -2153,7 +2148,6 @@
 	g->driverfs_dev = &drive->gendev;
 	g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE;
 	if (ide_cdrom_setup(drive)) {
-		ide_proc_unregister_driver(drive, &ide_cdrom_driver);
 		ide_cd_release(&info->kref);
 		goto failed;
 	}
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 33ea8c0..01846f2 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -30,10 +30,8 @@
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/leds.h>
-
-#define _IDE_DISK
-
 #include <linux/ide.h>
+#include <linux/hdreg.h>
 
 #include <asm/byteorder.h>
 #include <asm/irq.h>
@@ -90,68 +88,19 @@
 	mutex_unlock(&idedisk_ref_mutex);
 }
 
-/*
- * lba_capacity_is_ok() performs a sanity check on the claimed "lba_capacity"
- * value for this drive (from its reported identification information).
- *
- * Returns:	1 if lba_capacity looks sensible
- *		0 otherwise
- *
- * It is called only once for each drive.
- */
-static int lba_capacity_is_ok(struct hd_driveid *id)
-{
-	unsigned long lba_sects, chs_sects, head, tail;
-
-	/* No non-LBA info .. so valid! */
-	if (id->cyls == 0)
-		return 1;
-
-	/*
-	 * The ATA spec tells large drives to return
-	 * C/H/S = 16383/16/63 independent of their size.
-	 * Some drives can be jumpered to use 15 heads instead of 16.
-	 * Some drives can be jumpered to use 4092 cyls instead of 16383.
-	 */
-	if ((id->cyls == 16383
-	     || (id->cyls == 4092 && id->cur_cyls == 16383)) &&
-	    id->sectors == 63 &&
-	    (id->heads == 15 || id->heads == 16) &&
-	    (id->lba_capacity >= 16383*63*id->heads))
-		return 1;
-
-	lba_sects   = id->lba_capacity;
-	chs_sects   = id->cyls * id->heads * id->sectors;
-
-	/* perform a rough sanity check on lba_sects:  within 10% is OK */
-	if ((lba_sects - chs_sects) < chs_sects/10)
-		return 1;
-
-	/* some drives have the word order reversed */
-	head = ((lba_sects >> 16) & 0xffff);
-	tail = (lba_sects & 0xffff);
-	lba_sects = (head | (tail << 16));
-	if ((lba_sects - chs_sects) < chs_sects/10) {
-		id->lba_capacity = lba_sects;
-		return 1;	/* lba_capacity is (now) good */
-	}
-
-	return 0;	/* lba_capacity value may be bad */
-}
-
 static const u8 ide_rw_cmds[] = {
-	WIN_MULTREAD,
-	WIN_MULTWRITE,
-	WIN_MULTREAD_EXT,
-	WIN_MULTWRITE_EXT,
-	WIN_READ,
-	WIN_WRITE,
-	WIN_READ_EXT,
-	WIN_WRITE_EXT,
-	WIN_READDMA,
-	WIN_WRITEDMA,
-	WIN_READDMA_EXT,
-	WIN_WRITEDMA_EXT,
+	ATA_CMD_READ_MULTI,
+	ATA_CMD_WRITE_MULTI,
+	ATA_CMD_READ_MULTI_EXT,
+	ATA_CMD_WRITE_MULTI_EXT,
+	ATA_CMD_PIO_READ,
+	ATA_CMD_PIO_WRITE,
+	ATA_CMD_PIO_READ_EXT,
+	ATA_CMD_PIO_WRITE_EXT,
+	ATA_CMD_READ,
+	ATA_CMD_WRITE,
+	ATA_CMD_READ_EXT,
+	ATA_CMD_WRITE_EXT,
 };
 
 static const u8 ide_data_phases[] = {
@@ -322,9 +271,9 @@
 	/* Create IDE/ATA command request structure */
 	memset(&args, 0, sizeof(ide_task_t));
 	if (lba48)
-		tf->command = WIN_READ_NATIVE_MAX_EXT;
+		tf->command = ATA_CMD_READ_NATIVE_MAX_EXT;
 	else
-		tf->command = WIN_READ_NATIVE_MAX;
+		tf->command = ATA_CMD_READ_NATIVE_MAX;
 	tf->device  = ATA_LBA;
 	args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
 	if (lba48)
@@ -359,10 +308,10 @@
 		tf->hob_lbal = (addr_req >>= 8) & 0xff;
 		tf->hob_lbam = (addr_req >>= 8) & 0xff;
 		tf->hob_lbah = (addr_req >>= 8) & 0xff;
-		tf->command  = WIN_SET_MAX_EXT;
+		tf->command  = ATA_CMD_SET_MAX_EXT;
 	} else {
 		tf->device   = (addr_req >>= 8) & 0x0f;
-		tf->command  = WIN_SET_MAX;
+		tf->command  = ATA_CMD_SET_MAX;
 	}
 	tf->device |= ATA_LBA;
 	args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
@@ -385,25 +334,6 @@
 }
 
 /*
- * Bits 10 of command_set_1 and cfs_enable_1 must be equal,
- * so on non-buggy drives we need test only one.
- * However, we should also check whether these fields are valid.
- */
-static inline int idedisk_supports_hpa(const struct hd_driveid *id)
-{
-	return (id->command_set_1 & 0x0400) && (id->cfs_enable_1 & 0x0400);
-}
-
-/*
- * The same here.
- */
-static inline int idedisk_supports_lba48(const struct hd_driveid *id)
-{
-	return (id->command_set_2 & 0x0400) && (id->cfs_enable_2 & 0x0400)
-	       && id->lba_capacity_2;
-}
-
-/*
  * Some disks report total number of sectors instead of
  * maximum sector address.  We list them here.
  */
@@ -417,7 +347,7 @@
 static void idedisk_check_hpa(ide_drive_t *drive)
 {
 	unsigned long long capacity, set_max;
-	int lba48 = idedisk_supports_lba48(drive->id);
+	int lba48 = ata_id_lba48_enabled(drive->id);
 
 	capacity = drive->capacity64;
 
@@ -453,23 +383,23 @@
 
 static void init_idedisk_capacity(ide_drive_t *drive)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 	/*
 	 * If this drive supports the Host Protected Area feature set,
 	 * then we may need to change our opinion about the drive's capacity.
 	 */
-	int hpa = idedisk_supports_hpa(id);
+	int hpa = ata_id_hpa_enabled(id);
 
-	if (idedisk_supports_lba48(id)) {
+	if (ata_id_lba48_enabled(id)) {
 		/* drive speaks 48-bit LBA */
 		drive->select.b.lba = 1;
-		drive->capacity64 = id->lba_capacity_2;
+		drive->capacity64 = ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
 		if (hpa)
 			idedisk_check_hpa(drive);
-	} else if ((id->capability & 2) && lba_capacity_is_ok(id)) {
+	} else if (ata_id_has_lba(id) && ata_id_is_lba_capacity_ok(id)) {
 		/* drive speaks 28-bit LBA */
 		drive->select.b.lba = 1;
-		drive->capacity64 = id->lba_capacity;
+		drive->capacity64 = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
 		if (hpa)
 			idedisk_check_hpa(drive);
 	} else {
@@ -480,7 +410,7 @@
 
 static sector_t idedisk_capacity(ide_drive_t *drive)
 {
-	return drive->capacity64 - drive->sect0;
+	return drive->capacity64;
 }
 
 #ifdef CONFIG_IDE_PROC_FS
@@ -490,10 +420,10 @@
 	struct ide_taskfile *tf = &args.tf;
 
 	memset(&args, 0, sizeof(ide_task_t));
-	tf->feature = SMART_ENABLE;
-	tf->lbam    = SMART_LCYL_PASS;
-	tf->lbah    = SMART_HCYL_PASS;
-	tf->command = WIN_SMART;
+	tf->feature = ATA_SMART_ENABLE;
+	tf->lbam    = ATA_SMART_LBAM_PASS;
+	tf->lbah    = ATA_SMART_LBAH_PASS;
+	tf->command = ATA_CMD_SMART;
 	args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
 	return ide_no_data_taskfile(drive, &args);
 }
@@ -506,9 +436,9 @@
 	memset(&args, 0, sizeof(ide_task_t));
 	tf->feature = sub_cmd;
 	tf->nsect   = 0x01;
-	tf->lbam    = SMART_LCYL_PASS;
-	tf->lbah    = SMART_HCYL_PASS;
-	tf->command = WIN_SMART;
+	tf->lbam    = ATA_SMART_LBAM_PASS;
+	tf->lbah    = ATA_SMART_LBAH_PASS;
+	tf->command = ATA_CMD_SMART;
 	args.tf_flags	= IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
 	args.data_phase	= TASKFILE_IN;
 	(void) smart_enable(drive);
@@ -523,7 +453,7 @@
 	int		len;
 
 	if (drive->id_read)
-		len = sprintf(out, "%i\n", drive->id->buf_size / 2);
+		len = sprintf(out, "%i\n", drive->id[ATA_ID_BUF_SIZE] / 2);
 	else
 		len = sprintf(out, "(none)\n");
 
@@ -549,13 +479,14 @@
 
 	if (get_smart_data(drive, page, sub_cmd) == 0) {
 		unsigned short *val = (unsigned short *) page;
-		char *out = ((char *)val) + (SECTOR_WORDS * 4);
+		char *out = (char *)val + SECTOR_SIZE;
+
 		page = out;
 		do {
 			out += sprintf(out, "%04x%c", le16_to_cpu(*val),
 				       (++i & 7) ? ' ' : '\n');
 			val += 1;
-		} while (i < (SECTOR_WORDS * 2));
+		} while (i < SECTOR_SIZE / 2);
 		len = out - page;
 	}
 
@@ -566,14 +497,14 @@
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
 	return proc_idedisk_read_smart(page, start, off, count, eof, data,
-				       SMART_READ_VALUES);
+				       ATA_SMART_READ_VALUES);
 }
 
 static int proc_idedisk_read_st
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
 	return proc_idedisk_read_smart(page, start, off, count, eof, data,
-				       SMART_READ_THRESHOLDS);
+				       ATA_SMART_READ_THRESHOLDS);
 }
 
 static ide_proc_entry_t idedisk_proc[] = {
@@ -595,11 +526,11 @@
 	BUG_ON(task == NULL);
 
 	memset(task, 0, sizeof(*task));
-	if (ide_id_has_flush_cache_ext(drive->id) &&
+	if (ata_id_flush_ext_enabled(drive->id) &&
 	    (drive->capacity64 >= (1UL << 28)))
-		task->tf.command = WIN_FLUSH_CACHE_EXT;
+		task->tf.command = ATA_CMD_FLUSH_EXT;
 	else
-		task->tf.command = WIN_FLUSH_CACHE;
+		task->tf.command = ATA_CMD_FLUSH;
 	task->tf_flags	 = IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
 			   IDE_TFLAG_DYN;
 	task->data_phase = TASKFILE_NO_DATA;
@@ -609,6 +540,8 @@
 	rq->special = task;
 }
 
+ide_devset_get(multcount, mult_count);
+
 /*
  * This is tightly woven into the driver->do_special can not touch.
  * DON'T do it again until a total personality rewrite is committed.
@@ -618,7 +551,7 @@
 	struct request *rq;
 	int error;
 
-	if (arg < 0 || arg > drive->id->max_multsect)
+	if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff))
 		return -EINVAL;
 
 	if (drive->special.b.set_multmode)
@@ -635,22 +568,21 @@
 	return (drive->mult_count == arg) ? 0 : -EIO;
 }
 
+ide_devset_get(nowerr, nowerr);
+
 static int set_nowerr(ide_drive_t *drive, int arg)
 {
 	if (arg < 0 || arg > 1)
 		return -EINVAL;
 
-	if (ide_spin_wait_hwgroup(drive))
-		return -EBUSY;
 	drive->nowerr = arg;
 	drive->bad_wstat = arg ? BAD_R_STAT : BAD_W_STAT;
-	spin_unlock_irq(&ide_lock);
 	return 0;
 }
 
 static void update_ordered(ide_drive_t *drive)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 	unsigned ordered = QUEUE_ORDERED_NONE;
 	prepare_flush_fn *prep_fn = NULL;
 
@@ -666,9 +598,9 @@
 		 * not available so we don't need to recheck that.
 		 */
 		capacity = idedisk_capacity(drive);
-		barrier = ide_id_has_flush_cache(id) && !drive->noflush &&
+		barrier = ata_id_flush_enabled(id) && !drive->noflush &&
 			(drive->addressing == 0 || capacity <= (1ULL << 28) ||
-			 ide_id_has_flush_cache_ext(id));
+			 ata_id_flush_ext_enabled(id));
 
 		printk(KERN_INFO "%s: cache flushes %ssupported\n",
 		       drive->name, barrier ? "" : "not ");
@@ -683,7 +615,9 @@
 	blk_queue_ordered(drive->queue, ordered, prep_fn);
 }
 
-static int write_cache(ide_drive_t *drive, int arg)
+ide_devset_get(wcache, wcache);
+
+static int set_wcache(ide_drive_t *drive, int arg)
 {
 	ide_task_t args;
 	int err = 1;
@@ -691,11 +625,11 @@
 	if (arg < 0 || arg > 1)
 		return -EINVAL;
 
-	if (ide_id_has_flush_cache(drive->id)) {
+	if (ata_id_flush_enabled(drive->id)) {
 		memset(&args, 0, sizeof(ide_task_t));
 		args.tf.feature = arg ?
-			SETFEATURES_EN_WCACHE : SETFEATURES_DIS_WCACHE;
-		args.tf.command = WIN_SETFEATURES;
+			SETFEATURES_WC_ON : SETFEATURES_WC_OFF;
+		args.tf.command = ATA_CMD_SET_FEATURES;
 		args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
 		err = ide_no_data_taskfile(drive, &args);
 		if (err == 0)
@@ -712,14 +646,16 @@
 	ide_task_t args;
 
 	memset(&args, 0, sizeof(ide_task_t));
-	if (ide_id_has_flush_cache_ext(drive->id))
-		args.tf.command = WIN_FLUSH_CACHE_EXT;
+	if (ata_id_flush_ext_enabled(drive->id))
+		args.tf.command = ATA_CMD_FLUSH_EXT;
 	else
-		args.tf.command = WIN_FLUSH_CACHE;
+		args.tf.command = ATA_CMD_FLUSH;
 	args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
 	return ide_no_data_taskfile(drive, &args);
 }
 
+ide_devset_get(acoustic, acoustic);
+
 static int set_acoustic(ide_drive_t *drive, int arg)
 {
 	ide_task_t args;
@@ -728,22 +664,24 @@
 		return -EINVAL;
 
 	memset(&args, 0, sizeof(ide_task_t));
-	args.tf.feature = arg ? SETFEATURES_EN_AAM : SETFEATURES_DIS_AAM;
+	args.tf.feature = arg ? SETFEATURES_AAM_ON : SETFEATURES_AAM_OFF;
 	args.tf.nsect   = arg;
-	args.tf.command = WIN_SETFEATURES;
+	args.tf.command = ATA_CMD_SET_FEATURES;
 	args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
 	ide_no_data_taskfile(drive, &args);
 	drive->acoustic = arg;
 	return 0;
 }
 
+ide_devset_get(addressing, addressing);
+
 /*
  * drive->addressing:
  *	0: 28-bit
  *	1: 48-bit
  *	2: 48-bit capable doing 28-bit
  */
-static int set_lba_addressing(ide_drive_t *drive, int arg)
+static int set_addressing(ide_drive_t *drive, int arg)
 {
 	if (arg < 0 || arg > 2)
 		return -EINVAL;
@@ -753,52 +691,54 @@
 	if (drive->hwif->host_flags & IDE_HFLAG_NO_LBA48)
 		return 0;
 
-	if (!idedisk_supports_lba48(drive->id))
+	if (ata_id_lba48_enabled(drive->id) == 0)
 		return -EIO;
+
 	drive->addressing = arg;
+
 	return 0;
 }
 
-#ifdef CONFIG_IDE_PROC_FS
-static void idedisk_add_settings(ide_drive_t *drive)
-{
-	struct hd_driveid *id = drive->id;
+ide_devset_rw(acoustic, acoustic);
+ide_devset_rw(address, addressing);
+ide_devset_rw(multcount, multcount);
+ide_devset_rw(wcache, wcache);
 
-	ide_add_setting(drive, "bios_cyl", SETTING_RW, TYPE_INT, 0, 65535, 1, 1,
-			&drive->bios_cyl, NULL);
-	ide_add_setting(drive, "bios_head", SETTING_RW, TYPE_BYTE, 0, 255, 1, 1,
-			&drive->bios_head, NULL);
-	ide_add_setting(drive, "bios_sect", SETTING_RW, TYPE_BYTE, 0, 63, 1, 1,
-			&drive->bios_sect, NULL);
-	ide_add_setting(drive, "address", SETTING_RW, TYPE_BYTE, 0, 2, 1, 1,
-			&drive->addressing, set_lba_addressing);
-	ide_add_setting(drive, "multcount", SETTING_RW, TYPE_BYTE, 0,
-			id->max_multsect, 1, 1, &drive->mult_count,
-			set_multcount);
-	ide_add_setting(drive, "nowerr", SETTING_RW, TYPE_BYTE, 0, 1, 1, 1,
-			&drive->nowerr, set_nowerr);
-	ide_add_setting(drive, "lun", SETTING_RW, TYPE_INT, 0, 7, 1, 1,
-			&drive->lun, NULL);
-	ide_add_setting(drive, "wcache", SETTING_RW, TYPE_BYTE, 0, 1, 1, 1,
-			&drive->wcache, write_cache);
-	ide_add_setting(drive, "acoustic", SETTING_RW, TYPE_BYTE, 0, 254, 1, 1,
-			&drive->acoustic, set_acoustic);
-	ide_add_setting(drive, "failures", SETTING_RW, TYPE_INT, 0, 65535, 1, 1,
-			&drive->failures, NULL);
-	ide_add_setting(drive, "max_failures", SETTING_RW, TYPE_INT, 0, 65535,
-			1, 1, &drive->max_failures, NULL);
-}
-#else
-static inline void idedisk_add_settings(ide_drive_t *drive) { ; }
+ide_devset_rw_sync(nowerr, nowerr);
+
+#ifdef CONFIG_IDE_PROC_FS
+ide_devset_rw_field(bios_cyl, bios_cyl);
+ide_devset_rw_field(bios_head, bios_head);
+ide_devset_rw_field(bios_sect, bios_sect);
+ide_devset_rw_field(failures, failures);
+ide_devset_rw_field(lun, lun);
+ide_devset_rw_field(max_failures, max_failures);
+
+static const struct ide_proc_devset idedisk_settings[] = {
+	IDE_PROC_DEVSET(acoustic,	0,   254),
+	IDE_PROC_DEVSET(address,	0,     2),
+	IDE_PROC_DEVSET(bios_cyl,	0, 65535),
+	IDE_PROC_DEVSET(bios_head,	0,   255),
+	IDE_PROC_DEVSET(bios_sect,	0,    63),
+	IDE_PROC_DEVSET(failures,	0, 65535),
+	IDE_PROC_DEVSET(lun,		0,     7),
+	IDE_PROC_DEVSET(max_failures,	0, 65535),
+	IDE_PROC_DEVSET(multcount,	0,    16),
+	IDE_PROC_DEVSET(nowerr,		0,     1),
+	IDE_PROC_DEVSET(wcache,		0,     1),
+	{ 0 },
+};
 #endif
 
 static void idedisk_setup(ide_drive_t *drive)
 {
+	struct ide_disk_obj *idkp = drive->driver_data;
 	ide_hwif_t *hwif = drive->hwif;
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
+	char *m = (char *)&id[ATA_ID_PROD];
 	unsigned long long capacity;
 
-	idedisk_add_settings(drive);
+	ide_proc_register_driver(drive, idkp->driver);
 
 	if (drive->id_read == 0)
 		return;
@@ -807,11 +747,11 @@
 		/*
 		 * Removable disks (eg. SYQUEST); ignore 'WD' drives
 		 */
-		if (id->model[0] != 'W' || id->model[1] != 'D')
+		if (m[0] != 'W' || m[1] != 'D')
 			drive->doorlocking = 1;
 	}
 
-	(void)set_lba_addressing(drive, 1);
+	(void)set_addressing(drive, 1);
 
 	if (drive->addressing == 1) {
 		int max_s = 2048;
@@ -853,8 +793,7 @@
 	capacity = idedisk_capacity(drive);
 
 	if (!drive->forced_geom) {
-
-		if (idedisk_supports_lba48(drive->id)) {
+		if (ata_id_lba48_enabled(drive->id)) {
 			/* compatibility */
 			drive->bios_sect = 63;
 			drive->bios_head = 255;
@@ -880,22 +819,22 @@
 			 drive->name, capacity, sectors_to_MB(capacity));
 
 	/* Only print cache size when it was specified */
-	if (id->buf_size)
-		printk(KERN_CONT " w/%dKiB Cache", id->buf_size / 2);
+	if (id[ATA_ID_BUF_SIZE])
+		printk(KERN_CONT " w/%dKiB Cache", id[ATA_ID_BUF_SIZE] / 2);
 
 	printk(KERN_CONT ", CHS=%d/%d/%d\n",
 			 drive->bios_cyl, drive->bios_head, drive->bios_sect);
 
 	/* write cache enabled? */
-	if ((id->csfo & 1) || (id->cfs_enable_1 & (1 << 5)))
+	if ((id[ATA_ID_CSFO] & 1) || ata_id_wcache_enabled(id))
 		drive->wcache = 1;
 
-	write_cache(drive, 1);
+	set_wcache(drive, 1);
 }
 
 static void ide_cacheflush_p(ide_drive_t *drive)
 {
-	if (!drive->wcache || !ide_id_has_flush_cache(drive->id))
+	if (!drive->wcache || ata_id_flush_enabled(drive->id) == 0)
 		return;
 
 	if (do_idedisk_flushcache(drive))
@@ -937,7 +876,7 @@
  */
 static void ide_disk_resume(ide_drive_t *drive)
 {
-	if (idedisk_supports_hpa(drive->id))
+	if (ata_id_hpa_enabled(drive->id))
 		init_idedisk_capacity(drive);
 }
 
@@ -980,12 +919,12 @@
 	.shutdown		= ide_device_shutdown,
 	.version		= IDEDISK_VERSION,
 	.media			= ide_disk,
-	.supports_dsc_overlap	= 0,
 	.do_request		= ide_do_rw_disk,
 	.end_request		= ide_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
 	.proc			= idedisk_proc,
+	.settings		= idedisk_settings,
 #endif
 };
 
@@ -994,7 +933,7 @@
 	ide_task_t task;
 
 	memset(&task, 0, sizeof(task));
-	task.tf.command = on ? WIN_DOORLOCK : WIN_DOORUNLOCK;
+	task.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK;
 	task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
 
 	return ide_no_data_taskfile(drive, &task);
@@ -1059,52 +998,28 @@
 	return 0;
 }
 
+static const struct ide_ioctl_devset ide_disk_ioctl_settings[] = {
+{ HDIO_GET_ADDRESS,	HDIO_SET_ADDRESS,   &ide_devset_address   },
+{ HDIO_GET_MULTCOUNT,	HDIO_SET_MULTCOUNT, &ide_devset_multcount },
+{ HDIO_GET_NOWERR,	HDIO_SET_NOWERR,    &ide_devset_nowerr	  },
+{ HDIO_GET_WCACHE,	HDIO_SET_WCACHE,    &ide_devset_wcache	  },
+{ HDIO_GET_ACOUSTIC,	HDIO_SET_ACOUSTIC,  &ide_devset_acoustic  },
+{ 0 }
+};
+
 static int idedisk_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
-	unsigned long flags;
 	struct block_device *bdev = inode->i_bdev;
 	struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk);
 	ide_drive_t *drive = idkp->drive;
-	int err, (*setfunc)(ide_drive_t *, int);
-	u8 *val;
+	int err;
 
-	switch (cmd) {
-	case HDIO_GET_ADDRESS:	 val = &drive->addressing;	goto read_val;
-	case HDIO_GET_MULTCOUNT: val = &drive->mult_count;	goto read_val;
-	case HDIO_GET_NOWERR:	 val = &drive->nowerr;		goto read_val;
-	case HDIO_GET_WCACHE:	 val = &drive->wcache;		goto read_val;
-	case HDIO_GET_ACOUSTIC:	 val = &drive->acoustic;	goto read_val;
-	case HDIO_SET_ADDRESS:	 setfunc = set_lba_addressing;	goto set_val;
-	case HDIO_SET_MULTCOUNT: setfunc = set_multcount;	goto set_val;
-	case HDIO_SET_NOWERR:	 setfunc = set_nowerr;		goto set_val;
-	case HDIO_SET_WCACHE:	 setfunc = write_cache;		goto set_val;
-	case HDIO_SET_ACOUSTIC:	 setfunc = set_acoustic;	goto set_val;
-	}
+	err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_disk_ioctl_settings);
+	if (err != -EOPNOTSUPP)
+		return err;
 
 	return generic_ide_ioctl(drive, file, bdev, cmd, arg);
-
-read_val:
-	mutex_lock(&ide_setting_mtx);
-	spin_lock_irqsave(&ide_lock, flags);
-	err = *val;
-	spin_unlock_irqrestore(&ide_lock, flags);
-	mutex_unlock(&ide_setting_mtx);
-	return err >= 0 ? put_user(err, (long __user *)arg) : err;
-
-set_val:
-	if (bdev != bdev->bd_contains)
-		err = -EINVAL;
-	else {
-		if (!capable(CAP_SYS_ADMIN))
-			err = -EACCES;
-		else {
-			mutex_lock(&ide_setting_mtx);
-			err = setfunc(drive, arg);
-			mutex_unlock(&ide_setting_mtx);
-		}
-	}
-	return err;
 }
 
 static int idedisk_media_changed(struct gendisk *disk)
@@ -1148,8 +1063,7 @@
 	/* strstr("foo", "") is non-NULL */
 	if (!strstr("ide-disk", drive->driver_req))
 		goto failed;
-	if (!drive->present)
-		goto failed;
+
 	if (drive->media != ide_disk)
 		goto failed;
 
@@ -1163,8 +1077,6 @@
 
 	ide_init_disk(g, drive);
 
-	ide_proc_register_driver(drive, &idedisk_driver);
-
 	kref_init(&idkp->kref);
 
 	idkp->drive = drive;
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 3fa07c0..ef2f150 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -106,7 +106,7 @@
 	dma_stat = hwif->dma_ops->dma_end(drive);
 	stat = hwif->tp_ops->read_status(hwif);
 
-	if (OK_STAT(stat,DRIVE_READY,drive->bad_wstat|DRQ_STAT)) {
+	if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) {
 		if (!dma_stat) {
 			struct request *rq = HWGROUP(drive)->rq;
 
@@ -288,7 +288,7 @@
 static int config_drive_for_dma (ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 
 	if (drive->media != ide_disk) {
 		if (hwif->host_flags & IDE_HFLAG_NO_ATAPI_DMA)
@@ -299,16 +299,17 @@
 	 * Enable DMA on any drive that has
 	 * UltraDMA (mode 0/1/2/3/4/5/6) enabled
 	 */
-	if ((id->field_valid & 4) && ((id->dma_ultra >> 8) & 0x7f))
+	if ((id[ATA_ID_FIELD_VALID] & 4) &&
+	    ((id[ATA_ID_UDMA_MODES] >> 8) & 0x7f))
 		return 1;
 
 	/*
 	 * Enable DMA on any drive that has mode2 DMA
 	 * (multi or single) enabled
 	 */
-	if (id->field_valid & 2)	/* regular DMA */
-		if ((id->dma_mword & 0x404) == 0x404 ||
-		    (id->dma_1word & 0x404) == 0x404)
+	if (id[ATA_ID_FIELD_VALID] & 2)	/* regular DMA */
+		if ((id[ATA_ID_MWDMA_MODES] & 0x404) == 0x404 ||
+		    (id[ATA_ID_SWDMA_MODES] & 0x404) == 0x404)
 			return 1;
 
 	/* Consult the list of known "good" drives */
@@ -591,12 +592,12 @@
 
 int __ide_dma_bad_drive (ide_drive_t *drive)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 
 	int blacklist = ide_in_drive_list(id, drive_blacklist);
 	if (blacklist) {
 		printk(KERN_WARNING "%s: Disabling (U)DMA for %s (blacklisted)\n",
-				    drive->name, id->model);
+				    drive->name, (char *)&id[ATA_ID_PROD]);
 		return blacklist;
 	}
 	return 0;
@@ -612,21 +613,21 @@
 
 static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 	ide_hwif_t *hwif = drive->hwif;
 	const struct ide_port_ops *port_ops = hwif->port_ops;
 	unsigned int mask = 0;
 
 	switch(base) {
 	case XFER_UDMA_0:
-		if ((id->field_valid & 4) == 0)
+		if ((id[ATA_ID_FIELD_VALID] & 4) == 0)
 			break;
 
 		if (port_ops && port_ops->udma_filter)
 			mask = port_ops->udma_filter(drive);
 		else
 			mask = hwif->ultra_mask;
-		mask &= id->dma_ultra;
+		mask &= id[ATA_ID_UDMA_MODES];
 
 		/*
 		 * avoid false cable warning from eighty_ninty_three()
@@ -637,19 +638,19 @@
 		}
 		break;
 	case XFER_MW_DMA_0:
-		if ((id->field_valid & 2) == 0)
+		if ((id[ATA_ID_FIELD_VALID] & 2) == 0)
 			break;
 		if (port_ops && port_ops->mdma_filter)
 			mask = port_ops->mdma_filter(drive);
 		else
 			mask = hwif->mwdma_mask;
-		mask &= id->dma_mword;
+		mask &= id[ATA_ID_MWDMA_MODES];
 		break;
 	case XFER_SW_DMA_0:
-		if (id->field_valid & 2) {
-			mask = id->dma_1word & hwif->swdma_mask;
-		} else if (id->tDMA) {
-			u8 mode = id->tDMA;
+		if (id[ATA_ID_FIELD_VALID] & 2) {
+			mask = id[ATA_ID_SWDMA_MODES] & hwif->swdma_mask;
+		} else if (id[ATA_ID_OLD_DMA_MODES] >> 8) {
+			u8 mode = id[ATA_ID_OLD_DMA_MODES] >> 8;
 
 			/*
 			 * if the mode is valid convert it to the mask
@@ -706,7 +707,8 @@
 		/*
 		 * is this correct?
 		 */
-		if (ide_dma_good_drive(drive) && drive->id->eide_dma_time < 150)
+		if (ide_dma_good_drive(drive) &&
+		    drive->id[ATA_ID_EIDE_DMA_TIME] < 150)
 			mode = XFER_MW_DMA_1;
 	}
 
@@ -725,7 +727,7 @@
 	ide_hwif_t *hwif = drive->hwif;
 	u8 speed;
 
-	if (drive->nodma || (drive->id->capability & 1) == 0)
+	if (drive->nodma || ata_id_has_dma(drive->id) == 0)
 		return 0;
 
 	/* consult the list of known "bad" drives */
@@ -767,13 +769,15 @@
 
 int ide_id_dma_bug(ide_drive_t *drive)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 
-	if (id->field_valid & 4) {
-		if ((id->dma_ultra >> 8) && (id->dma_mword >> 8))
+	if (id[ATA_ID_FIELD_VALID] & 4) {
+		if ((id[ATA_ID_UDMA_MODES] >> 8) &&
+		    (id[ATA_ID_MWDMA_MODES] >> 8))
 			goto err_out;
-	} else if (id->field_valid & 2) {
-		if ((id->dma_mword >> 8) && (id->dma_1word >> 8))
+	} else if (id[ATA_ID_FIELD_VALID] & 2) {
+		if ((id[ATA_ID_MWDMA_MODES] >> 8) &&
+		    (id[ATA_ID_SWDMA_MODES] >> 8))
 			goto err_out;
 	}
 	return 0;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index e9034c0..d36f155 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -15,6 +15,8 @@
  * Documentation/ide/ChangeLog.ide-floppy.1996-2002
  */
 
+#define DRV_NAME "ide-floppy"
+
 #define IDEFLOPPY_VERSION "1.00"
 
 #include <linux/module.h>
@@ -31,8 +33,10 @@
 #include <linux/slab.h>
 #include <linux/cdrom.h>
 #include <linux/ide.h>
+#include <linux/hdreg.h>
 #include <linux/bitops.h>
 #include <linux/mutex.h>
+#include <linux/scatterlist.h>
 
 #include <scsi/scsi_ioctl.h>
 
@@ -42,6 +46,8 @@
 #include <linux/io.h>
 #include <asm/unaligned.h>
 
+#include "ide-floppy.h"
+
 /* define to see debug info */
 #define IDEFLOPPY_DEBUG_LOG		0
 
@@ -55,102 +61,23 @@
 #define debug_log(fmt, args...) do {} while (0)
 #endif
 
-
-/* Some drives require a longer irq timeout. */
-#define IDEFLOPPY_WAIT_CMD		(5 * WAIT_CMD)
-
 /*
  * After each failed packet command we issue a request sense command and retry
  * the packet command IDEFLOPPY_MAX_PC_RETRIES times.
  */
 #define IDEFLOPPY_MAX_PC_RETRIES	3
 
-/*
- * With each packet command, we allocate a buffer of IDEFLOPPY_PC_BUFFER_SIZE
- * bytes.
- */
-#define IDEFLOPPY_PC_BUFFER_SIZE	256
-
-/*
- * In various places in the driver, we need to allocate storage for packet
- * commands and requests, which will remain valid while	we leave the driver to
- * wait for an interrupt or a timeout event.
- */
-#define IDEFLOPPY_PC_STACK		(10 + IDEFLOPPY_MAX_PC_RETRIES)
-
 /* format capacities descriptor codes */
 #define CAPACITY_INVALID	0x00
 #define CAPACITY_UNFORMATTED	0x01
 #define CAPACITY_CURRENT	0x02
 #define CAPACITY_NO_CARTRIDGE	0x03
 
-/*
- * Most of our global data which we need to save even as we leave the driver
- * due to an interrupt or a timer event is stored in a variable of type
- * idefloppy_floppy_t, defined below.
- */
-typedef struct ide_floppy_obj {
-	ide_drive_t	*drive;
-	ide_driver_t	*driver;
-	struct gendisk	*disk;
-	struct kref	kref;
-	unsigned int	openers;	/* protected by BKL for now */
-
-	/* Current packet command */
-	struct ide_atapi_pc *pc;
-	/* Last failed packet command */
-	struct ide_atapi_pc *failed_pc;
-	/* Packet command stack */
-	struct ide_atapi_pc pc_stack[IDEFLOPPY_PC_STACK];
-	/* Next free packet command storage space */
-	int pc_stack_index;
-	struct request rq_stack[IDEFLOPPY_PC_STACK];
-	/* We implement a circular array */
-	int rq_stack_index;
-
-	/* Last error information */
-	u8 sense_key, asc, ascq;
-	/* delay this long before sending packet command */
-	u8 ticks;
-	int progress_indication;
-
-	/* Device information */
-	/* Current format */
-	int blocks, block_size, bs_factor;
-	/* Last format capacity descriptor */
-	u8 cap_desc[8];
-	/* Copy of the flexible disk page */
-	u8 flexible_disk_page[32];
-	/* Write protect */
-	int wp;
-	/* Supports format progress report */
-	int srfp;
-} idefloppy_floppy_t;
-
 #define IDEFLOPPY_TICKS_DELAY	HZ/20	/* default delay for ZIP 100 (50ms) */
 
-/* Defines for the MODE SENSE command */
-#define MODE_SENSE_CURRENT		0x00
-#define MODE_SENSE_CHANGEABLE		0x01
-#define MODE_SENSE_DEFAULT		0x02
-#define MODE_SENSE_SAVED		0x03
-
-/* IOCTLs used in low-level formatting. */
-#define	IDEFLOPPY_IOCTL_FORMAT_SUPPORTED	0x4600
-#define	IDEFLOPPY_IOCTL_FORMAT_GET_CAPACITY	0x4601
-#define	IDEFLOPPY_IOCTL_FORMAT_START		0x4602
-#define IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS	0x4603
-
 /* Error code returned in rq->errors to the higher part of the driver. */
 #define	IDEFLOPPY_ERROR_GENERAL		101
 
-/*
- * Pages of the SELECT SENSE / MODE SENSE packet commands.
- * See SFF-8070i spec.
- */
-#define	IDEFLOPPY_CAPABILITIES_PAGE	0x1b
-#define IDEFLOPPY_FLEXIBLE_DISK_PAGE	0x05
-
 static DEFINE_MUTEX(idefloppy_ref_mutex);
 
 #define to_ide_floppy(obj) container_of(obj, struct ide_floppy_obj, kref)
@@ -219,44 +146,6 @@
 	return 0;
 }
 
-static void ide_floppy_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				  unsigned int bcount, int direction)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct request *rq = pc->rq;
-	struct req_iterator iter;
-	struct bio_vec *bvec;
-	unsigned long flags;
-	int count, done = 0;
-	char *data;
-
-	rq_for_each_segment(bvec, rq, iter) {
-		if (!bcount)
-			break;
-
-		count = min(bvec->bv_len, bcount);
-
-		data = bvec_kmap_irq(bvec, &flags);
-		if (direction)
-			hwif->tp_ops->output_data(drive, NULL, data, count);
-		else
-			hwif->tp_ops->input_data(drive, NULL, data, count);
-		bvec_kunmap_irq(data, &flags);
-
-		bcount -= count;
-		pc->b_count += count;
-		done += count;
-	}
-
-	idefloppy_end_request(drive, 1, done >> 9);
-
-	if (bcount) {
-		printk(KERN_ERR "%s: leftover data in %s, bcount == %d\n",
-				drive->name, __func__, bcount);
-		ide_pad_transfer(drive, direction, bcount);
-	}
-}
-
 static void idefloppy_update_buffers(ide_drive_t *drive,
 				struct ide_atapi_pc *pc)
 {
@@ -267,43 +156,6 @@
 		idefloppy_end_request(drive, 1, 0);
 }
 
-/*
- * Generate a new packet command request in front of the request queue, before
- * the current request so that it will be processed immediately, on the next
- * pass through the driver.
- */
-static void idefloppy_queue_pc_head(ide_drive_t *drive, struct ide_atapi_pc *pc,
-		struct request *rq)
-{
-	struct ide_floppy_obj *floppy = drive->driver_data;
-
-	blk_rq_init(NULL, rq);
-	rq->buffer = (char *) pc;
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd_flags |= REQ_PREEMPT;
-	rq->rq_disk = floppy->disk;
-	memcpy(rq->cmd, pc->c, 12);
-	ide_do_drive_cmd(drive, rq);
-}
-
-static struct ide_atapi_pc *idefloppy_next_pc_storage(ide_drive_t *drive)
-{
-	idefloppy_floppy_t *floppy = drive->driver_data;
-
-	if (floppy->pc_stack_index == IDEFLOPPY_PC_STACK)
-		floppy->pc_stack_index = 0;
-	return (&floppy->pc_stack[floppy->pc_stack_index++]);
-}
-
-static struct request *idefloppy_next_rq_storage(ide_drive_t *drive)
-{
-	idefloppy_floppy_t *floppy = drive->driver_data;
-
-	if (floppy->rq_stack_index == IDEFLOPPY_PC_STACK)
-		floppy->rq_stack_index = 0;
-	return (&floppy->rq_stack[floppy->rq_stack_index++]);
-}
-
 static void ide_floppy_callback(ide_drive_t *drive)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
@@ -341,16 +193,9 @@
 	idefloppy_end_request(drive, uptodate, 0);
 }
 
-static void idefloppy_init_pc(struct ide_atapi_pc *pc)
+void ide_floppy_create_request_sense_cmd(struct ide_atapi_pc *pc)
 {
-	memset(pc, 0, sizeof(*pc));
-	pc->buf = pc->pc_buf;
-	pc->buf_size = IDEFLOPPY_PC_BUFFER_SIZE;
-}
-
-static void idefloppy_create_request_sense_cmd(struct ide_atapi_pc *pc)
-{
-	idefloppy_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = GPCMD_REQUEST_SENSE;
 	pc->c[4] = 255;
 	pc->req_xfer = 18;
@@ -362,14 +207,13 @@
  */
 static void idefloppy_retry_pc(ide_drive_t *drive)
 {
-	struct ide_atapi_pc *pc;
-	struct request *rq;
+	struct ide_floppy_obj *floppy = drive->driver_data;
+	struct request *rq = &floppy->request_sense_rq;
+	struct ide_atapi_pc *pc = &floppy->request_sense_pc;
 
 	(void)ide_read_error(drive);
-	pc = idefloppy_next_pc_storage(drive);
-	rq = idefloppy_next_rq_storage(drive);
-	idefloppy_create_request_sense_cmd(pc);
-	idefloppy_queue_pc_head(drive, pc, rq);
+	ide_floppy_create_request_sense_cmd(pc);
+	ide_queue_pc_head(drive, floppy->disk, pc, rq);
 }
 
 /* The usual interrupt handler called during a packet command. */
@@ -378,8 +222,8 @@
 	idefloppy_floppy_t *floppy = drive->driver_data;
 
 	return ide_pc_intr(drive, floppy->pc, idefloppy_pc_intr,
-			   IDEFLOPPY_WAIT_CMD, NULL, idefloppy_update_buffers,
-			   idefloppy_retry_pc, NULL, ide_floppy_io_buffers);
+			   WAIT_FLOPPY_CMD, NULL, idefloppy_update_buffers,
+			   idefloppy_retry_pc, NULL, ide_io_buffers);
 }
 
 /*
@@ -396,10 +240,9 @@
 	drive->hwif->tp_ops->output_data(drive, NULL, floppy->pc->c, 12);
 
 	/* Timeout for the packet command */
-	return IDEFLOPPY_WAIT_CMD;
+	return WAIT_FLOPPY_CMD;
 }
 
-
 /*
  * Called as an interrupt (or directly). When the device says it's ready for a
  * packet, we schedule the packet transfer to occur about 2-3 ticks later in
@@ -424,7 +267,7 @@
 		timeout = floppy->ticks;
 		expiry = &idefloppy_transfer_pc;
 	} else {
-		timeout = IDEFLOPPY_WAIT_CMD;
+		timeout = WAIT_FLOPPY_CMD;
 		expiry = NULL;
 	}
 
@@ -474,58 +317,27 @@
 	pc->retries++;
 
 	return ide_issue_pc(drive, pc, idefloppy_start_pc_transfer,
-			    IDEFLOPPY_WAIT_CMD, NULL);
+			    WAIT_FLOPPY_CMD, NULL);
 }
 
-static void idefloppy_create_prevent_cmd(struct ide_atapi_pc *pc, int prevent)
+void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *pc)
 {
-	debug_log("creating prevent removal command, prevent = %d\n", prevent);
-
-	idefloppy_init_pc(pc);
-	pc->c[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
-	pc->c[4] = prevent;
-}
-
-static void idefloppy_create_read_capacity_cmd(struct ide_atapi_pc *pc)
-{
-	idefloppy_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = GPCMD_READ_FORMAT_CAPACITIES;
 	pc->c[7] = 255;
 	pc->c[8] = 255;
 	pc->req_xfer = 255;
 }
 
-static void idefloppy_create_format_unit_cmd(struct ide_atapi_pc *pc, int b,
-		int l, int flags)
-{
-	idefloppy_init_pc(pc);
-	pc->c[0] = GPCMD_FORMAT_UNIT;
-	pc->c[1] = 0x17;
-
-	memset(pc->buf, 0, 12);
-	pc->buf[1] = 0xA2;
-	/* Default format list header, u8 1: FOV/DCRT/IMM bits set */
-
-	if (flags & 1)				/* Verify bit on... */
-		pc->buf[1] ^= 0x20;		/* ... turn off DCRT bit */
-	pc->buf[3] = 8;
-
-	put_unaligned(cpu_to_be32(b), (unsigned int *)(&pc->buf[4]));
-	put_unaligned(cpu_to_be32(l), (unsigned int *)(&pc->buf[8]));
-	pc->buf_size = 12;
-	pc->flags |= PC_FLAG_WRITING;
-}
-
 /* A mode sense command is used to "sense" floppy parameters. */
-static void idefloppy_create_mode_sense_cmd(struct ide_atapi_pc *pc,
-		u8 page_code, u8 type)
+void ide_floppy_create_mode_sense_cmd(struct ide_atapi_pc *pc, u8 page_code)
 {
 	u16 length = 8; /* sizeof(Mode Parameter Header) = 8 Bytes */
 
-	idefloppy_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = GPCMD_MODE_SENSE_10;
 	pc->c[1] = 0;
-	pc->c[2] = page_code + (type << 6);
+	pc->c[2] = page_code;
 
 	switch (page_code) {
 	case IDEFLOPPY_CAPABILITIES_PAGE:
@@ -542,13 +354,6 @@
 	pc->req_xfer = length;
 }
 
-static void idefloppy_create_start_stop_cmd(struct ide_atapi_pc *pc, int start)
-{
-	idefloppy_init_pc(pc);
-	pc->c[0] = GPCMD_START_STOP_UNIT;
-	pc->c[4] = start;
-}
-
 static void idefloppy_create_rw_cmd(idefloppy_floppy_t *floppy,
 				    struct ide_atapi_pc *pc, struct request *rq,
 				    unsigned long sector)
@@ -560,7 +365,7 @@
 	debug_log("create_rw10_cmd: block == %d, blocks == %d\n",
 		block, blocks);
 
-	idefloppy_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = cmd == READ ? GPCMD_READ_10 : GPCMD_WRITE_10;
 	put_unaligned(cpu_to_be16(blocks), (unsigned short *)&pc->c[7]);
 	put_unaligned(cpu_to_be32(block), (unsigned int *) &pc->c[2]);
@@ -568,7 +373,7 @@
 	memcpy(rq->cmd, pc->c, 12);
 
 	pc->rq = rq;
-	pc->b_count = cmd == READ ? 0 : rq->bio->bi_size;
+	pc->b_count = 0;
 	if (rq->cmd_flags & REQ_RW)
 		pc->flags |= PC_FLAG_WRITING;
 	pc->buf = NULL;
@@ -579,10 +384,10 @@
 static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy,
 		struct ide_atapi_pc *pc, struct request *rq)
 {
-	idefloppy_init_pc(pc);
+	ide_init_pc(pc);
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
 	pc->rq = rq;
-	pc->b_count = rq->data_len;
+	pc->b_count = 0;
 	if (rq->data_len && rq_data_dir(rq) == WRITE)
 		pc->flags |= PC_FLAG_WRITING;
 	pc->buf = rq->data;
@@ -599,15 +404,17 @@
 		struct request *rq, sector_t block_s)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
+	ide_hwif_t *hwif = drive->hwif;
 	struct ide_atapi_pc *pc;
 	unsigned long block = (unsigned long)block_s;
 
-	debug_log("dev: %s, cmd_type: %x, errors: %d\n",
-			rq->rq_disk ? rq->rq_disk->disk_name : "?",
-			rq->cmd_type, rq->errors);
-	debug_log("sector: %ld, nr_sectors: %ld, "
-			"current_nr_sectors: %d\n", (long)rq->sector,
-			rq->nr_sectors, rq->current_nr_sectors);
+	debug_log("%s: dev: %s, cmd: 0x%x, cmd_type: %x, errors: %d\n",
+		  __func__, rq->rq_disk ? rq->rq_disk->disk_name : "?",
+		  rq->cmd[0], rq->cmd_type, rq->errors);
+
+	debug_log("%s: sector: %ld, nr_sectors: %ld, current_nr_sectors: %d\n",
+		  __func__, (long)rq->sector, rq->nr_sectors,
+		  rq->current_nr_sectors);
 
 	if (rq->errors >= ERROR_MAX) {
 		if (floppy->failed_pc)
@@ -626,12 +433,12 @@
 			idefloppy_end_request(drive, 0, 0);
 			return ide_stopped;
 		}
-		pc = idefloppy_next_pc_storage(drive);
+		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(floppy, pc, rq, block);
 	} else if (blk_special_request(rq)) {
 		pc = (struct ide_atapi_pc *) rq->buffer;
 	} else if (blk_pc_request(rq)) {
-		pc = idefloppy_next_pc_storage(drive);
+		pc = &floppy->queued_pc;
 		idefloppy_blockpc_cmd(floppy, pc, rq);
 	} else {
 		blk_dump_rq_flags(rq,
@@ -640,54 +447,46 @@
 		return ide_stopped;
 	}
 
+	ide_init_sg_cmd(drive, rq);
+	ide_map_sg(drive, rq);
+
+	pc->sg = hwif->sg_table;
+	pc->sg_cnt = hwif->sg_nents;
+
 	pc->rq = rq;
 
 	return idefloppy_issue_pc(drive, pc);
 }
 
 /*
- * Add a special packet command request to the tail of the request queue,
- * and wait for it to be serviced.
- */
-static int idefloppy_queue_pc_tail(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-	struct ide_floppy_obj *floppy = drive->driver_data;
-	struct request *rq;
-	int error;
-
-	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->buffer = (char *) pc;
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	memcpy(rq->cmd, pc->c, 12);
-	error = blk_execute_rq(drive->queue, floppy->disk, rq, 0);
-	blk_put_request(rq);
-
-	return error;
-}
-
-/*
  * Look at the flexible disk page parameters. We ignore the CHS capacity
  * parameters and use the LBA parameters instead.
  */
 static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
+	struct gendisk *disk = floppy->disk;
 	struct ide_atapi_pc pc;
 	u8 *page;
 	int capacity, lba_capacity;
 	u16 transfer_rate, sector_size, cyls, rpm;
 	u8 heads, sectors;
 
-	idefloppy_create_mode_sense_cmd(&pc, IDEFLOPPY_FLEXIBLE_DISK_PAGE,
-					MODE_SENSE_CURRENT);
+	ide_floppy_create_mode_sense_cmd(&pc, IDEFLOPPY_FLEXIBLE_DISK_PAGE);
 
-	if (idefloppy_queue_pc_tail(drive, &pc)) {
+	if (ide_queue_pc_tail(drive, disk, &pc)) {
 		printk(KERN_ERR "ide-floppy: Can't get flexible disk page"
 				" parameters\n");
 		return 1;
 	}
-	floppy->wp = !!(pc.buf[3] & 0x80);
-	set_disk_ro(floppy->disk, floppy->wp);
+
+	if (pc.buf[3] & 0x80)
+		drive->atapi_flags |= IDE_AFLAG_WP;
+	else
+		drive->atapi_flags &= ~IDE_AFLAG_WP;
+
+	set_disk_ro(disk, !!(drive->atapi_flags & IDE_AFLAG_WP));
+
 	page = &pc.buf[8];
 
 	transfer_rate = be16_to_cpup((__be16 *)&pc.buf[8 + 2]);
@@ -721,23 +520,6 @@
 	return 0;
 }
 
-static int idefloppy_get_sfrp_bit(ide_drive_t *drive)
-{
-	idefloppy_floppy_t *floppy = drive->driver_data;
-	struct ide_atapi_pc pc;
-
-	floppy->srfp = 0;
-	idefloppy_create_mode_sense_cmd(&pc, IDEFLOPPY_CAPABILITIES_PAGE,
-						 MODE_SENSE_CURRENT);
-
-	pc.flags |= PC_FLAG_SUPPRESS_ERROR;
-	if (idefloppy_queue_pc_tail(drive, &pc))
-		return 1;
-
-	floppy->srfp = pc.buf[8 + 2] & 0x40;
-	return (0);
-}
-
 /*
  * Determine if a media is present in the floppy drive, and if so, its LBA
  * capacity.
@@ -745,6 +527,7 @@
 static int ide_floppy_get_capacity(ide_drive_t *drive)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
+	struct gendisk *disk = floppy->disk;
 	struct ide_atapi_pc pc;
 	u8 *cap_desc;
 	u8 header_len, desc_cnt;
@@ -756,8 +539,8 @@
 	floppy->bs_factor = 1;
 	set_capacity(floppy->disk, 0);
 
-	idefloppy_create_read_capacity_cmd(&pc);
-	if (idefloppy_queue_pc_tail(drive, &pc)) {
+	ide_floppy_create_read_capacity_cmd(&pc);
+	if (ide_queue_pc_tail(drive, disk, &pc)) {
 		printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n");
 		return 1;
 	}
@@ -832,127 +615,11 @@
 	if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE))
 		(void) ide_floppy_get_flexible_disk_page(drive);
 
-	set_capacity(floppy->disk, floppy->blocks * floppy->bs_factor);
+	set_capacity(disk, floppy->blocks * floppy->bs_factor);
+
 	return rc;
 }
 
-/*
- * Obtain the list of formattable capacities.
- * Very similar to ide_floppy_get_capacity, except that we push the capacity
- * descriptors to userland, instead of our own structures.
- *
- * Userland gives us the following structure:
- *
- * struct idefloppy_format_capacities {
- *	int nformats;
- *	struct {
- *		int nblocks;
- *		int blocksize;
- *	} formats[];
- * };
- *
- * userland initializes nformats to the number of allocated formats[] records.
- * On exit we set nformats to the number of records we've actually initialized.
- */
-
-static int ide_floppy_get_format_capacities(ide_drive_t *drive, int __user *arg)
-{
-	struct ide_atapi_pc pc;
-	u8 header_len, desc_cnt;
-	int i, blocks, length, u_array_size, u_index;
-	int __user *argp;
-
-	if (get_user(u_array_size, arg))
-		return (-EFAULT);
-
-	if (u_array_size <= 0)
-		return (-EINVAL);
-
-	idefloppy_create_read_capacity_cmd(&pc);
-	if (idefloppy_queue_pc_tail(drive, &pc)) {
-		printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n");
-		return (-EIO);
-	}
-	header_len = pc.buf[3];
-	desc_cnt = header_len / 8; /* capacity descriptor of 8 bytes */
-
-	u_index = 0;
-	argp = arg + 1;
-
-	/*
-	 * We always skip the first capacity descriptor.  That's the current
-	 * capacity.  We are interested in the remaining descriptors, the
-	 * formattable capacities.
-	 */
-	for (i = 1; i < desc_cnt; i++) {
-		unsigned int desc_start = 4 + i*8;
-
-		if (u_index >= u_array_size)
-			break;	/* User-supplied buffer too small */
-
-		blocks = be32_to_cpup((__be32 *)&pc.buf[desc_start]);
-		length = be16_to_cpup((__be16 *)&pc.buf[desc_start + 6]);
-
-		if (put_user(blocks, argp))
-			return(-EFAULT);
-		++argp;
-
-		if (put_user(length, argp))
-			return (-EFAULT);
-		++argp;
-
-		++u_index;
-	}
-
-	if (put_user(u_index, arg))
-		return (-EFAULT);
-	return (0);
-}
-
-/*
- * Get ATAPI_FORMAT_UNIT progress indication.
- *
- * Userland gives a pointer to an int.  The int is set to a progress
- * indicator 0-65536, with 65536=100%.
- *
- * If the drive does not support format progress indication, we just check
- * the dsc bit, and return either 0 or 65536.
- */
-
-static int idefloppy_get_format_progress(ide_drive_t *drive, int __user *arg)
-{
-	idefloppy_floppy_t *floppy = drive->driver_data;
-	struct ide_atapi_pc pc;
-	int progress_indication = 0x10000;
-
-	if (floppy->srfp) {
-		idefloppy_create_request_sense_cmd(&pc);
-		if (idefloppy_queue_pc_tail(drive, &pc))
-			return (-EIO);
-
-		if (floppy->sense_key == 2 &&
-		    floppy->asc == 4 &&
-		    floppy->ascq == 4)
-			progress_indication = floppy->progress_indication;
-
-		/* Else assume format_unit has finished, and we're at 0x10000 */
-	} else {
-		ide_hwif_t *hwif = drive->hwif;
-		unsigned long flags;
-		u8 stat;
-
-		local_irq_save(flags);
-		stat = hwif->tp_ops->read_status(hwif);
-		local_irq_restore(flags);
-
-		progress_indication = ((stat & SEEK_STAT) == 0) ? 0 : 0x10000;
-	}
-	if (put_user(progress_indication, arg))
-		return (-EFAULT);
-
-	return (0);
-}
-
 static sector_t idefloppy_capacity(ide_drive_t *drive)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
@@ -961,73 +628,42 @@
 	return capacity;
 }
 
-/*
- * Check whether we can support a drive, based on the ATAPI IDENTIFY command
- * results.
- */
-static int idefloppy_identify_device(ide_drive_t *drive, struct hd_driveid *id)
+#ifdef CONFIG_IDE_PROC_FS
+ide_devset_rw_field(bios_cyl, bios_cyl);
+ide_devset_rw_field(bios_head, bios_head);
+ide_devset_rw_field(bios_sect, bios_sect);
+
+static int get_ticks(ide_drive_t *drive)
 {
-	u8 gcw[2];
-	u8 device_type, protocol, removable, drq_type, packet_size;
+	idefloppy_floppy_t *floppy = drive->driver_data;
+	return floppy->ticks;
+}
 
-	*((u16 *) &gcw) = id->config;
-
-	device_type =  gcw[1] & 0x1F;
-	removable   = (gcw[0] & 0x80) >> 7;
-	protocol    = (gcw[1] & 0xC0) >> 6;
-	drq_type    = (gcw[0] & 0x60) >> 5;
-	packet_size =  gcw[0] & 0x03;
-
-#ifdef CONFIG_PPC
-	/* kludge for Apple PowerBook internal zip */
-	if (device_type == 5 &&
-	    !strstr(id->model, "CD-ROM") && strstr(id->model, "ZIP"))
-		device_type = 0;
-#endif
-
-	if (protocol != 2)
-		printk(KERN_ERR "ide-floppy: Protocol (0x%02x) is not ATAPI\n",
-			protocol);
-	else if (device_type != 0)
-		printk(KERN_ERR "ide-floppy: Device type (0x%02x) is not set "
-				"to floppy\n", device_type);
-	else if (!removable)
-		printk(KERN_ERR "ide-floppy: The removable flag is not set\n");
-	else if (drq_type == 3)
-		printk(KERN_ERR "ide-floppy: Sorry, DRQ type (0x%02x) not "
-				"supported\n", drq_type);
-	else if (packet_size != 0)
-		printk(KERN_ERR "ide-floppy: Packet size (0x%02x) is not 12 "
-				"bytes\n", packet_size);
-	else
-		return 1;
+static int set_ticks(ide_drive_t *drive, int arg)
+{
+	idefloppy_floppy_t *floppy = drive->driver_data;
+	floppy->ticks = arg;
 	return 0;
 }
 
-#ifdef CONFIG_IDE_PROC_FS
-static void idefloppy_add_settings(ide_drive_t *drive)
-{
-	idefloppy_floppy_t *floppy = drive->driver_data;
+IDE_DEVSET(ticks, DS_SYNC, get_ticks, set_ticks);
 
-	ide_add_setting(drive, "bios_cyl", SETTING_RW, TYPE_INT, 0, 1023, 1, 1,
-			&drive->bios_cyl, NULL);
-	ide_add_setting(drive, "bios_head", SETTING_RW, TYPE_BYTE, 0, 255, 1, 1,
-			&drive->bios_head, NULL);
-	ide_add_setting(drive, "bios_sect", SETTING_RW,	TYPE_BYTE, 0,  63, 1, 1,
-			&drive->bios_sect, NULL);
-	ide_add_setting(drive, "ticks",	   SETTING_RW, TYPE_BYTE, 0, 255, 1, 1,
-			&floppy->ticks,	 NULL);
-}
-#else
-static inline void idefloppy_add_settings(ide_drive_t *drive) { ; }
+static const struct ide_proc_devset idefloppy_settings[] = {
+	IDE_PROC_DEVSET(bios_cyl,  0, 1023),
+	IDE_PROC_DEVSET(bios_head, 0,  255),
+	IDE_PROC_DEVSET(bios_sect, 0,   63),
+	IDE_PROC_DEVSET(ticks,	   0,  255),
+	{ 0 },
+};
 #endif
 
 static void idefloppy_setup(ide_drive_t *drive, idefloppy_floppy_t *floppy)
 {
+	u16 *id = drive->id;
 	u8 gcw[2];
 
-	*((u16 *) &gcw) = drive->id->config;
-	floppy->pc = floppy->pc_stack;
+	*((u16 *)&gcw) = id[ATA_ID_CONFIG];
+
 	drive->pc_callback = ide_floppy_callback;
 
 	if (((gcw[0] & 0x60) >> 5) == 1)
@@ -1041,7 +677,7 @@
 	 * it. It should be fixed as of version 1.9, but to be on the safe side
 	 * we'll leave the limitation below for the 2.2.x tree.
 	 */
-	if (!strncmp(drive->id->model, "IOMEGA ZIP 100 ATAPI", 20)) {
+	if (!strncmp((char *)&id[ATA_ID_PROD], "IOMEGA ZIP 100 ATAPI", 20)) {
 		drive->atapi_flags |= IDE_AFLAG_ZIP_DRIVE;
 		/* This value will be visible in the /proc/ide/hdx/settings */
 		floppy->ticks = IDEFLOPPY_TICKS_DELAY;
@@ -1052,13 +688,16 @@
 	 * Guess what? The IOMEGA Clik! drive also needs the above fix. It makes
 	 * nasty clicking noises without it, so please don't remove this.
 	 */
-	if (strncmp(drive->id->model, "IOMEGA Clik!", 11) == 0) {
+	if (strncmp((char *)&id[ATA_ID_PROD], "IOMEGA Clik!", 11) == 0) {
 		blk_queue_max_sectors(drive->queue, 64);
 		drive->atapi_flags |= IDE_AFLAG_CLIK_DRIVE;
+		/* IOMEGA Clik! drives do not support lock/unlock commands */
+		drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
 	}
 
 	(void) ide_floppy_get_capacity(drive);
-	idefloppy_add_settings(drive);
+
+	ide_proc_register_driver(drive, floppy->driver);
 }
 
 static void ide_floppy_remove(ide_drive_t *drive)
@@ -1115,12 +754,12 @@
 	.remove			= ide_floppy_remove,
 	.version		= IDEFLOPPY_VERSION,
 	.media			= ide_floppy,
-	.supports_dsc_overlap	= 0,
 	.do_request		= idefloppy_do_request,
 	.end_request		= idefloppy_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
 	.proc			= idefloppy_proc,
+	.settings		= idefloppy_settings,
 #endif
 };
 
@@ -1129,7 +768,6 @@
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct ide_floppy_obj *floppy;
 	ide_drive_t *drive;
-	struct ide_atapi_pc pc;
 	int ret = 0;
 
 	debug_log("Reached %s\n", __func__);
@@ -1146,13 +784,8 @@
 		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
 		/* Just in case */
 
-		idefloppy_init_pc(&pc);
-		pc.c[0] = GPCMD_TEST_UNIT_READY;
-
-		if (idefloppy_queue_pc_tail(drive, &pc)) {
-			idefloppy_create_start_stop_cmd(&pc, 1);
-			(void) idefloppy_queue_pc_tail(drive, &pc);
-		}
+		if (ide_do_test_unit_ready(drive, disk))
+			ide_do_start_stop(drive, disk, 1);
 
 		if (ide_floppy_get_capacity(drive)
 		   && (filp->f_flags & O_NDELAY) == 0
@@ -1166,16 +799,13 @@
 			goto out_put_floppy;
 		}
 
-		if (floppy->wp && (filp->f_mode & 2)) {
+		if ((drive->atapi_flags & IDE_AFLAG_WP) && (filp->f_mode & 2)) {
 			ret = -EROFS;
 			goto out_put_floppy;
 		}
+
 		drive->atapi_flags |= IDE_AFLAG_MEDIA_CHANGED;
-		/* IOMEGA Clik! drives do not support lock/unlock commands */
-		if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE)) {
-			idefloppy_create_prevent_cmd(&pc, 1);
-			(void) idefloppy_queue_pc_tail(drive, &pc);
-		}
+		ide_set_media_lock(drive, disk, 1);
 		check_disk_change(inode->i_bdev);
 	} else if (drive->atapi_flags & IDE_AFLAG_FORMAT_IN_PROGRESS) {
 		ret = -EBUSY;
@@ -1194,17 +824,11 @@
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct ide_floppy_obj *floppy = ide_floppy_g(disk);
 	ide_drive_t *drive = floppy->drive;
-	struct ide_atapi_pc pc;
 
 	debug_log("Reached %s\n", __func__);
 
 	if (floppy->openers == 1) {
-		/* IOMEGA Clik! drives do not support lock/unlock commands */
-		if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE)) {
-			idefloppy_create_prevent_cmd(&pc, 0);
-			(void) idefloppy_queue_pc_tail(drive, &pc);
-		}
-
+		ide_set_media_lock(drive, disk, 0);
 		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
 	}
 
@@ -1230,80 +854,20 @@
 			       unsigned long arg, unsigned int cmd)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
+	struct gendisk *disk = floppy->disk;
+	int prevent = (arg && cmd != CDROMEJECT) ? 1 : 0;
 
 	if (floppy->openers > 1)
 		return -EBUSY;
 
-	/* The IOMEGA Clik! Drive doesn't support this command -
-	 * no room for an eject mechanism */
-	if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE)) {
-		int prevent = arg ? 1 : 0;
+	ide_set_media_lock(drive, disk, prevent);
 
-		if (cmd == CDROMEJECT)
-			prevent = 0;
-
-		idefloppy_create_prevent_cmd(pc, prevent);
-		(void) idefloppy_queue_pc_tail(floppy->drive, pc);
-	}
-
-	if (cmd == CDROMEJECT) {
-		idefloppy_create_start_stop_cmd(pc, 2);
-		(void) idefloppy_queue_pc_tail(floppy->drive, pc);
-	}
+	if (cmd == CDROMEJECT)
+		ide_do_start_stop(drive, disk, 2);
 
 	return 0;
 }
 
-static int ide_floppy_format_unit(idefloppy_floppy_t *floppy,
-				  int __user *arg)
-{
-	struct ide_atapi_pc pc;
-	ide_drive_t *drive = floppy->drive;
-	int blocks, length, flags, err = 0;
-
-	if (floppy->openers > 1) {
-		/* Don't format if someone is using the disk */
-		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
-		return -EBUSY;
-	}
-
-	drive->atapi_flags |= IDE_AFLAG_FORMAT_IN_PROGRESS;
-
-	/*
-	 * Send ATAPI_FORMAT_UNIT to the drive.
-	 *
-	 * Userland gives us the following structure:
-	 *
-	 * struct idefloppy_format_command {
-	 *        int nblocks;
-	 *        int blocksize;
-	 *        int flags;
-	 *        } ;
-	 *
-	 * flags is a bitmask, currently, the only defined flag is:
-	 *
-	 *        0x01 - verify media after format.
-	 */
-	if (get_user(blocks, arg) ||
-			get_user(length, arg+1) ||
-			get_user(flags, arg+2)) {
-		err = -EFAULT;
-		goto out;
-	}
-
-	(void) idefloppy_get_sfrp_bit(drive);
-	idefloppy_create_format_unit_cmd(&pc, blocks, length, flags);
-
-	if (idefloppy_queue_pc_tail(drive, &pc))
-		err = -EIO;
-
-out:
-	if (err)
-		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
-	return err;
-}
-
-
 static int idefloppy_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
@@ -1314,23 +878,12 @@
 	void __user *argp = (void __user *)arg;
 	int err;
 
-	switch (cmd) {
-	case CDROMEJECT:
-		/* fall through */
-	case CDROM_LOCKDOOR:
+	if (cmd == CDROMEJECT || cmd == CDROM_LOCKDOOR)
 		return ide_floppy_lockdoor(drive, &pc, arg, cmd);
-	case IDEFLOPPY_IOCTL_FORMAT_SUPPORTED:
-		return 0;
-	case IDEFLOPPY_IOCTL_FORMAT_GET_CAPACITY:
-		return ide_floppy_get_format_capacities(drive, argp);
-	case IDEFLOPPY_IOCTL_FORMAT_START:
-		if (!(file->f_mode & 2))
-			return -EPERM;
 
-		return ide_floppy_format_unit(floppy, (int __user *)arg);
-	case IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS:
-		return idefloppy_get_format_progress(drive, argp);
-	}
+	err = ide_floppy_format_ioctl(drive, file, cmd, argp);
+	if (err != -ENOTTY)
+		return err;
 
 	/*
 	 * skip SCSI_IOCTL_SEND_COMMAND (deprecated)
@@ -1339,8 +892,6 @@
 	if (cmd != CDROM_SEND_PACKET && cmd != SCSI_IOCTL_SEND_COMMAND)
 		err = scsi_cmd_ioctl(file, bdev->bd_disk->queue,
 					bdev->bd_disk, cmd, argp);
-	else
-		err = -ENOTTY;
 
 	if (err == -ENOTTY)
 		err = generic_ide_ioctl(drive, file, bdev, cmd, arg);
@@ -1388,11 +939,11 @@
 
 	if (!strstr("ide-floppy", drive->driver_req))
 		goto failed;
-	if (!drive->present)
-		goto failed;
+
 	if (drive->media != ide_floppy)
 		goto failed;
-	if (!idefloppy_identify_device(drive, drive->id)) {
+
+	if (!ide_check_atapi_device(drive, DRV_NAME)) {
 		printk(KERN_ERR "ide-floppy: %s: not supported by this version"
 				" of ide-floppy\n", drive->name);
 		goto failed;
@@ -1410,8 +961,6 @@
 
 	ide_init_disk(g, drive);
 
-	ide_proc_register_driver(drive, &idefloppy_driver);
-
 	kref_init(&floppy->kref);
 
 	floppy->drive = drive;
@@ -1450,6 +999,7 @@
 }
 
 MODULE_ALIAS("ide:*m-floppy*");
+MODULE_ALIAS("ide-floppy");
 module_init(idefloppy_init);
 module_exit(idefloppy_exit);
 MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide-floppy.h b/drivers/ide/ide-floppy.h
new file mode 100644
index 0000000..ecadc2b
--- /dev/null
+++ b/drivers/ide/ide-floppy.h
@@ -0,0 +1,63 @@
+#ifndef __IDE_FLOPPY_H
+#define __IDE_FLOPPY_H
+
+/*
+ * Most of our global data which we need to save even as we leave the driver
+ * due to an interrupt or a timer event is stored in a variable of type
+ * idefloppy_floppy_t, defined below.
+ */
+typedef struct ide_floppy_obj {
+	ide_drive_t	*drive;
+	ide_driver_t	*driver;
+	struct gendisk	*disk;
+	struct kref	kref;
+	unsigned int	openers;	/* protected by BKL for now */
+
+	/* Current packet command */
+	struct ide_atapi_pc *pc;
+	/* Last failed packet command */
+	struct ide_atapi_pc *failed_pc;
+	/* used for blk_{fs,pc}_request() requests */
+	struct ide_atapi_pc queued_pc;
+
+	struct ide_atapi_pc request_sense_pc;
+	struct request request_sense_rq;
+
+	/* Last error information */
+	u8 sense_key, asc, ascq;
+	/* delay this long before sending packet command */
+	u8 ticks;
+	int progress_indication;
+
+	/* Device information */
+	/* Current format */
+	int blocks, block_size, bs_factor;
+	/* Last format capacity descriptor */
+	u8 cap_desc[8];
+	/* Copy of the flexible disk page */
+	u8 flexible_disk_page[32];
+} idefloppy_floppy_t;
+
+/*
+ * Pages of the SELECT SENSE / MODE SENSE packet commands.
+ * See SFF-8070i spec.
+ */
+#define	IDEFLOPPY_CAPABILITIES_PAGE	0x1b
+#define IDEFLOPPY_FLEXIBLE_DISK_PAGE	0x05
+
+/* IOCTLs used in low-level formatting. */
+#define	IDEFLOPPY_IOCTL_FORMAT_SUPPORTED	0x4600
+#define	IDEFLOPPY_IOCTL_FORMAT_GET_CAPACITY	0x4601
+#define	IDEFLOPPY_IOCTL_FORMAT_START		0x4602
+#define IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS	0x4603
+
+/* ide-floppy.c */
+void ide_floppy_create_mode_sense_cmd(struct ide_atapi_pc *, u8);
+void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *);
+void ide_floppy_create_request_sense_cmd(struct ide_atapi_pc *);
+
+/* ide-floppy_ioctl.c */
+int ide_floppy_format_ioctl(ide_drive_t *, struct file *, unsigned int,
+			    void __user *);
+
+#endif /*__IDE_FLOPPY_H */
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
new file mode 100644
index 0000000..5ffc451
--- /dev/null
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -0,0 +1,243 @@
+/*
+ * ide-floppy IOCTLs handling.
+ */
+
+#include <linux/kernel.h>
+#include <linux/ide.h>
+#include <linux/cdrom.h>
+
+#include <asm/unaligned.h>
+
+#include <scsi/scsi_ioctl.h>
+
+#include "ide-floppy.h"
+
+/*
+ * Obtain the list of formattable capacities.
+ * Very similar to ide_floppy_get_capacity, except that we push the capacity
+ * descriptors to userland, instead of our own structures.
+ *
+ * Userland gives us the following structure:
+ *
+ * struct idefloppy_format_capacities {
+ *	int nformats;
+ *	struct {
+ *		int nblocks;
+ *		int blocksize;
+ *	} formats[];
+ * };
+ *
+ * userland initializes nformats to the number of allocated formats[] records.
+ * On exit we set nformats to the number of records we've actually initialized.
+ */
+
+static int ide_floppy_get_format_capacities(ide_drive_t *drive, int __user *arg)
+{
+	struct ide_floppy_obj *floppy = drive->driver_data;
+	struct ide_atapi_pc pc;
+	u8 header_len, desc_cnt;
+	int i, blocks, length, u_array_size, u_index;
+	int __user *argp;
+
+	if (get_user(u_array_size, arg))
+		return -EFAULT;
+
+	if (u_array_size <= 0)
+		return -EINVAL;
+
+	ide_floppy_create_read_capacity_cmd(&pc);
+	if (ide_queue_pc_tail(drive, floppy->disk, &pc)) {
+		printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n");
+		return -EIO;
+	}
+
+	header_len = pc.buf[3];
+	desc_cnt = header_len / 8; /* capacity descriptor of 8 bytes */
+
+	u_index = 0;
+	argp = arg + 1;
+
+	/*
+	 * We always skip the first capacity descriptor.  That's the current
+	 * capacity.  We are interested in the remaining descriptors, the
+	 * formattable capacities.
+	 */
+	for (i = 1; i < desc_cnt; i++) {
+		unsigned int desc_start = 4 + i*8;
+
+		if (u_index >= u_array_size)
+			break;	/* User-supplied buffer too small */
+
+		blocks = be32_to_cpup((__be32 *)&pc.buf[desc_start]);
+		length = be16_to_cpup((__be16 *)&pc.buf[desc_start + 6]);
+
+		if (put_user(blocks, argp))
+			return -EFAULT;
+
+		++argp;
+
+		if (put_user(length, argp))
+			return -EFAULT;
+
+		++argp;
+
+		++u_index;
+	}
+
+	if (put_user(u_index, arg))
+		return -EFAULT;
+
+	return 0;
+}
+
+static void ide_floppy_create_format_unit_cmd(struct ide_atapi_pc *pc, int b,
+		int l, int flags)
+{
+	ide_init_pc(pc);
+	pc->c[0] = GPCMD_FORMAT_UNIT;
+	pc->c[1] = 0x17;
+
+	memset(pc->buf, 0, 12);
+	pc->buf[1] = 0xA2;
+	/* Default format list header, u8 1: FOV/DCRT/IMM bits set */
+
+	if (flags & 1)				/* Verify bit on... */
+		pc->buf[1] ^= 0x20;		/* ... turn off DCRT bit */
+	pc->buf[3] = 8;
+
+	put_unaligned(cpu_to_be32(b), (unsigned int *)(&pc->buf[4]));
+	put_unaligned(cpu_to_be32(l), (unsigned int *)(&pc->buf[8]));
+	pc->buf_size = 12;
+	pc->flags |= PC_FLAG_WRITING;
+}
+
+static int ide_floppy_get_sfrp_bit(ide_drive_t *drive)
+{
+	idefloppy_floppy_t *floppy = drive->driver_data;
+	struct ide_atapi_pc pc;
+
+	drive->atapi_flags &= ~IDE_AFLAG_SRFP;
+
+	ide_floppy_create_mode_sense_cmd(&pc, IDEFLOPPY_CAPABILITIES_PAGE);
+	pc.flags |= PC_FLAG_SUPPRESS_ERROR;
+
+	if (ide_queue_pc_tail(drive, floppy->disk, &pc))
+		return 1;
+
+	if (pc.buf[8 + 2] & 0x40)
+		drive->atapi_flags |= IDE_AFLAG_SRFP;
+
+	return 0;
+}
+
+static int ide_floppy_format_unit(ide_drive_t *drive, int __user *arg)
+{
+	idefloppy_floppy_t *floppy = drive->driver_data;
+	struct ide_atapi_pc pc;
+	int blocks, length, flags, err = 0;
+
+	if (floppy->openers > 1) {
+		/* Don't format if someone is using the disk */
+		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
+		return -EBUSY;
+	}
+
+	drive->atapi_flags |= IDE_AFLAG_FORMAT_IN_PROGRESS;
+
+	/*
+	 * Send ATAPI_FORMAT_UNIT to the drive.
+	 *
+	 * Userland gives us the following structure:
+	 *
+	 * struct idefloppy_format_command {
+	 *        int nblocks;
+	 *        int blocksize;
+	 *        int flags;
+	 *        } ;
+	 *
+	 * flags is a bitmask, currently, the only defined flag is:
+	 *
+	 *        0x01 - verify media after format.
+	 */
+	if (get_user(blocks, arg) ||
+			get_user(length, arg+1) ||
+			get_user(flags, arg+2)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	(void)ide_floppy_get_sfrp_bit(drive);
+	ide_floppy_create_format_unit_cmd(&pc, blocks, length, flags);
+
+	if (ide_queue_pc_tail(drive, floppy->disk, &pc))
+		err = -EIO;
+
+out:
+	if (err)
+		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
+	return err;
+}
+
+/*
+ * Get ATAPI_FORMAT_UNIT progress indication.
+ *
+ * Userland gives a pointer to an int.  The int is set to a progress
+ * indicator 0-65536, with 65536=100%.
+ *
+ * If the drive does not support format progress indication, we just check
+ * the dsc bit, and return either 0 or 65536.
+ */
+
+static int ide_floppy_get_format_progress(ide_drive_t *drive, int __user *arg)
+{
+	idefloppy_floppy_t *floppy = drive->driver_data;
+	struct ide_atapi_pc pc;
+	int progress_indication = 0x10000;
+
+	if (drive->atapi_flags & IDE_AFLAG_SRFP) {
+		ide_floppy_create_request_sense_cmd(&pc);
+		if (ide_queue_pc_tail(drive, floppy->disk, &pc))
+			return -EIO;
+
+		if (floppy->sense_key == 2 &&
+		    floppy->asc == 4 &&
+		    floppy->ascq == 4)
+			progress_indication = floppy->progress_indication;
+
+		/* Else assume format_unit has finished, and we're at 0x10000 */
+	} else {
+		ide_hwif_t *hwif = drive->hwif;
+		unsigned long flags;
+		u8 stat;
+
+		local_irq_save(flags);
+		stat = hwif->tp_ops->read_status(hwif);
+		local_irq_restore(flags);
+
+		progress_indication = ((stat & ATA_DSC) == 0) ? 0 : 0x10000;
+	}
+
+	if (put_user(progress_indication, arg))
+		return -EFAULT;
+
+	return 0;
+}
+
+int ide_floppy_format_ioctl(ide_drive_t *drive, struct file *file,
+			    unsigned int cmd, void __user *argp)
+{
+	switch (cmd) {
+	case IDEFLOPPY_IOCTL_FORMAT_SUPPORTED:
+		return 0;
+	case IDEFLOPPY_IOCTL_FORMAT_GET_CAPACITY:
+		return ide_floppy_get_format_capacities(drive, argp);
+	case IDEFLOPPY_IOCTL_FORMAT_START:
+		if (!(file->f_mode & 2))
+			return -EPERM;
+		return ide_floppy_format_unit(drive, (int __user *)argp);
+	case IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS:
+		return ide_floppy_get_format_progress(drive, argp);
+	default:
+		return -ENOTTY;
+	}
+}
diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c
index 8fe8b5b..0a3cb0c 100644
--- a/drivers/ide/ide-generic.c
+++ b/drivers/ide/ide-generic.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/ide.h>
+#include <linux/pci_ids.h>
 
 /* FIXME: convert m32r to use ide_platform host driver */
 #ifdef CONFIG_M32R
@@ -27,7 +28,7 @@
 
 #define DRV_NAME	"ide_generic"
 
-static int probe_mask = 0x03;
+static int probe_mask;
 module_param(probe_mask, int, 0);
 MODULE_PARM_DESC(probe_mask, "probe mask for legacy ISA IDE ports");
 
@@ -100,19 +101,65 @@
 static const int legacy_irqs[]  = { 14, 15, 11, 10, 8, 12 };
 #endif
 
+static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary)
+{
+	struct pci_dev *p = NULL;
+	u16 val;
+
+	for_each_pci_dev(p) {
+
+		if (pci_resource_start(p, 0) == 0x1f0)
+			*primary = 1;
+		if (pci_resource_start(p, 2) == 0x170)
+			*secondary = 1;
+
+		/* Cyrix CS55{1,2}0 pre SFF MWDMA ATA on the bridge */
+		if (p->vendor == PCI_VENDOR_ID_CYRIX &&
+		    (p->device == PCI_DEVICE_ID_CYRIX_5510 ||
+		     p->device == PCI_DEVICE_ID_CYRIX_5520))
+			*primary = *secondary = 1;
+
+		/* Intel MPIIX - PIO ATA on non PCI side of bridge */
+		if (p->vendor == PCI_VENDOR_ID_INTEL &&
+		    p->device == PCI_DEVICE_ID_INTEL_82371MX) {
+
+			pci_read_config_word(p, 0x6C, &val);
+			if (val & 0x8000) {
+				/* ATA port enabled */
+				if (val & 0x4000)
+					*secondary = 1;
+				else
+					*primary = 1;
+			}
+		}
+	}
+}
+
 static int __init ide_generic_init(void)
 {
 	hw_regs_t hw[MAX_HWIFS], *hws[MAX_HWIFS];
 	struct ide_host *host;
 	unsigned long io_addr;
-	int i, rc;
+	int i, rc, primary = 0, secondary = 0;
 
 #ifdef CONFIG_MIPS
 	if (!ide_probe_legacy())
 		return -ENODEV;
 #endif
-	printk(KERN_INFO DRV_NAME ": please use \"probe_mask=0x3f\" module "
-			 "parameter for probing all legacy ISA IDE ports\n");
+	ide_generic_check_pci_legacy_iobases(&primary, &secondary);
+
+	if (!probe_mask) {
+		printk(KERN_INFO DRV_NAME ": please use \"probe_mask=0x3f\" "
+		     "module parameter for probing all legacy ISA IDE ports\n");
+
+		if (primary == 0)
+			probe_mask |= 0x1;
+
+		if (secondary == 0)
+			probe_mask |= 0x2;
+	} else
+		printk(KERN_INFO DRV_NAME ": enforcing probing of I/O ports "
+			"upon user request\n");
 
 	memset(hws, 0, sizeof(hw_regs_t *) * MAX_HWIFS);
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index a896a28..1c51949 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -40,6 +40,7 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/ide.h>
+#include <linux/hdreg.h>
 #include <linux/completion.h>
 #include <linux/reboot.h>
 #include <linux/cdrom.h>
@@ -183,18 +184,18 @@
 		if (drive->media != ide_disk)
 			break;
 		/* Not supported? Switch to next step now. */
-		if (!drive->wcache || !ide_id_has_flush_cache(drive->id)) {
+		if (!drive->wcache || ata_id_flush_enabled(drive->id) == 0) {
 			ide_complete_power_step(drive, rq, 0, 0);
 			return ide_stopped;
 		}
-		if (ide_id_has_flush_cache_ext(drive->id))
-			args->tf.command = WIN_FLUSH_CACHE_EXT;
+		if (ata_id_flush_ext_enabled(drive->id))
+			args->tf.command = ATA_CMD_FLUSH_EXT;
 		else
-			args->tf.command = WIN_FLUSH_CACHE;
+			args->tf.command = ATA_CMD_FLUSH;
 		goto out_do_tf;
 
 	case idedisk_pm_standby:	/* Suspend step 2 (standby) */
-		args->tf.command = WIN_STANDBYNOW1;
+		args->tf.command = ATA_CMD_STANDBYNOW1;
 		goto out_do_tf;
 
 	case idedisk_pm_restore_pio:	/* Resume step 1 (restore PIO) */
@@ -209,7 +210,7 @@
 		return ide_stopped;
 
 	case idedisk_pm_idle:		/* Resume step 2 (idle) */
-		args->tf.command = WIN_IDLEIMMEDIATE;
+		args->tf.command = ATA_CMD_IDLEIMMEDIATE;
 		goto out_do_tf;
 
 	case ide_pm_restore_dma:	/* Resume step 3 (restore DMA) */
@@ -322,7 +323,7 @@
 		ide_task_t *task = (ide_task_t *)rq->special;
 
 		if (rq->errors == 0)
-			rq->errors = !OK_STAT(stat, READY_STAT, BAD_STAT);
+			rq->errors = !OK_STAT(stat, ATA_DRDY, BAD_STAT);
 
 		if (task) {
 			struct ide_taskfile *tf = &task->tf;
@@ -373,29 +374,29 @@
 {
 	ide_hwif_t *hwif = drive->hwif;
 
-	if (stat & BUSY_STAT || ((stat & WRERR_STAT) && !drive->nowerr)) {
+	if ((stat & ATA_BUSY) || ((stat & ATA_DF) && !drive->nowerr)) {
 		/* other bits are useless when BUSY */
 		rq->errors |= ERROR_RESET;
-	} else if (stat & ERR_STAT) {
+	} else if (stat & ATA_ERR) {
 		/* err has different meaning on cdrom and tape */
-		if (err == ABRT_ERR) {
+		if (err == ATA_ABORTED) {
 			if (drive->select.b.lba &&
-			    /* some newer drives don't support WIN_SPECIFY */
-			    hwif->tp_ops->read_status(hwif) == WIN_SPECIFY)
+			    /* some newer drives don't support ATA_CMD_INIT_DEV_PARAMS */
+			    hwif->tp_ops->read_status(hwif) == ATA_CMD_INIT_DEV_PARAMS)
 				return ide_stopped;
 		} else if ((err & BAD_CRC) == BAD_CRC) {
 			/* UDMA crc error, just retry the operation */
 			drive->crc_count++;
-		} else if (err & (BBD_ERR | ECC_ERR)) {
+		} else if (err & (ATA_BBK | ATA_UNC)) {
 			/* retries won't help these */
 			rq->errors = ERROR_MAX;
-		} else if (err & TRK0_ERR) {
+		} else if (err & ATA_TRK0NF) {
 			/* help it find track zero */
 			rq->errors |= ERROR_RECAL;
 		}
 	}
 
-	if ((stat & DRQ_STAT) && rq_data_dir(rq) == READ &&
+	if ((stat & ATA_DRQ) && rq_data_dir(rq) == READ &&
 	    (hwif->host_flags & IDE_HFLAG_ERROR_STOPS_FIFO) == 0) {
 		int nsect = drive->mult_count ? drive->mult_count : 1;
 
@@ -407,7 +408,7 @@
 		return ide_stopped;
 	}
 
-	if (hwif->tp_ops->read_status(hwif) & (BUSY_STAT | DRQ_STAT))
+	if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
 		rq->errors |= ERROR_RESET;
 
 	if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
@@ -427,16 +428,16 @@
 {
 	ide_hwif_t *hwif = drive->hwif;
 
-	if (stat & BUSY_STAT || ((stat & WRERR_STAT) && !drive->nowerr)) {
+	if ((stat & ATA_BUSY) || ((stat & ATA_DF) && !drive->nowerr)) {
 		/* other bits are useless when BUSY */
 		rq->errors |= ERROR_RESET;
 	} else {
 		/* add decoding error stuff */
 	}
 
-	if (hwif->tp_ops->read_status(hwif) & (BUSY_STAT | DRQ_STAT))
+	if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
 		/* force an abort */
-		hwif->tp_ops->exec_command(hwif, WIN_IDLEIMMEDIATE);
+		hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE);
 
 	if (rq->errors >= ERROR_MAX) {
 		ide_kill_rq(drive, rq);
@@ -509,19 +510,19 @@
 	tf->lbam    = drive->cyl;
 	tf->lbah    = drive->cyl >> 8;
 	tf->device  = ((drive->head - 1) | drive->select.all) & ~ATA_LBA;
-	tf->command = WIN_SPECIFY;
+	tf->command = ATA_CMD_INIT_DEV_PARAMS;
 }
 
 static void ide_tf_set_restore_cmd(ide_drive_t *drive, struct ide_taskfile *tf)
 {
 	tf->nsect   = drive->sect;
-	tf->command = WIN_RESTORE;
+	tf->command = ATA_CMD_RESTORE;
 }
 
 static void ide_tf_set_setmult_cmd(ide_drive_t *drive, struct ide_taskfile *tf)
 {
 	tf->nsect   = drive->mult_req;
-	tf->command = WIN_SETMULT;
+	tf->command = ATA_CMD_SET_MULTI;
 }
 
 static ide_startstop_t ide_disk_special(ide_drive_t *drive)
@@ -540,8 +541,6 @@
 		ide_tf_set_restore_cmd(drive, &args.tf);
 	} else if (s->b.set_multmode) {
 		s->b.set_multmode = 0;
-		if (drive->mult_req > drive->id->max_multsect)
-			drive->mult_req = drive->id->max_multsect;
 		ide_tf_set_setmult_cmd(drive, &args.tf);
 	} else if (s->all) {
 		int special = s->all;
@@ -586,9 +585,10 @@
  *	do_special		-	issue some special commands
  *	@drive: drive the command is for
  *
- *	do_special() is used to issue WIN_SPECIFY, WIN_RESTORE, and WIN_SETMULT
- *	commands to a drive.  It used to do much more, but has been scaled
- *	back.
+ *	do_special() is used to issue ATA_CMD_INIT_DEV_PARAMS,
+ *	ATA_CMD_RESTORE and ATA_CMD_SET_MULTI commands to a drive.
+ *
+ *	It used to do much more, but has been scaled back.
  */
 
 static ide_startstop_t do_special (ide_drive_t *drive)
@@ -716,9 +716,49 @@
  	return ide_stopped;
 }
 
+int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
+		       int arg)
+{
+	struct request_queue *q = drive->queue;
+	struct request *rq;
+	int ret = 0;
+
+	if (!(setting->flags & DS_SYNC))
+		return setting->set(drive, arg);
+
+	rq = blk_get_request(q, READ, GFP_KERNEL);
+	if (!rq)
+		return -ENOMEM;
+
+	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_len = 5;
+	rq->cmd[0] = REQ_DEVSET_EXEC;
+	*(int *)&rq->cmd[1] = arg;
+	rq->special = setting->set;
+
+	if (blk_execute_rq(q, NULL, rq, 0))
+		ret = rq->errors;
+	blk_put_request(rq);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ide_devset_execute);
+
 static ide_startstop_t ide_special_rq(ide_drive_t *drive, struct request *rq)
 {
 	switch (rq->cmd[0]) {
+	case REQ_DEVSET_EXEC:
+	{
+		int err, (*setfunc)(ide_drive_t *, int) = rq->special;
+
+		err = setfunc(drive, *(int *)&rq->cmd[1]);
+		if (err)
+			rq->errors = err;
+		else
+			err = 1;
+		ide_end_request(drive, err, 0);
+		return ide_stopped;
+	}
 	case REQ_DRIVE_RESET:
 		return ide_do_reset(drive);
 	default:
@@ -766,9 +806,7 @@
  *	start_request	-	start of I/O and command issuing for IDE
  *
  *	start_request() initiates handling of a new I/O request. It
- *	accepts commands and I/O (read/write) requests. It also does
- *	the final remapping for weird stuff like EZDrive. Once 
- *	device mapper can work sector level the EZDrive stuff can go away
+ *	accepts commands and I/O (read/write) requests.
  *
  *	FIXME: this function needs a rename
  */
@@ -776,7 +814,6 @@
 static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 {
 	ide_startstop_t startstop;
-	sector_t block;
 
 	BUG_ON(!blk_rq_started(rq));
 
@@ -791,21 +828,12 @@
 		goto kill_rq;
 	}
 
-	block    = rq->sector;
-	if (blk_fs_request(rq) &&
-	    (drive->media == ide_disk || drive->media == ide_floppy)) {
-		block += drive->sect0;
-	}
-	/* Yecch - this will shift the entire interval,
-	   possibly killing some innocent following sector */
-	if (block == 0 && drive->remap_0_to_1 == 1)
-		block = 1;  /* redirect MBR access to EZ-Drive partn table */
-
 	if (blk_pm_request(rq))
 		ide_check_pm_state(drive, rq);
 
 	SELECT_DRIVE(drive);
-	if (ide_wait_stat(&startstop, drive, drive->ready_stat, BUSY_STAT|DRQ_STAT, WAIT_READY)) {
+	if (ide_wait_stat(&startstop, drive, drive->ready_stat,
+			  ATA_BUSY | ATA_DRQ, WAIT_READY)) {
 		printk(KERN_ERR "%s: drive not ready for command\n", drive->name);
 		return startstop;
 	}
@@ -844,7 +872,8 @@
 			return ide_special_rq(drive, rq);
 
 		drv = *(ide_driver_t **)rq->rq_disk->private_data;
-		return drv->do_request(drive, rq, block);
+
+		return drv->do_request(drive, rq, rq->sector);
 	}
 	return do_special(drive);
 kill_rq:
@@ -1325,7 +1354,7 @@
 		if (hwif->irq == irq) {
 			stat = hwif->tp_ops->read_status(hwif);
 
-			if (!OK_STAT(stat, READY_STAT, BAD_STAT)) {
+			if (!OK_STAT(stat, ATA_DRDY, BAD_STAT)) {
 				/* Try to not flood the console with msgs */
 				static unsigned long last_msgtime, count;
 				++count;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
new file mode 100644
index 0000000..cf01564
--- /dev/null
+++ b/drivers/ide/ide-ioctls.c
@@ -0,0 +1,290 @@
+/*
+ * IDE ioctls handling.
+ */
+
+#include <linux/hdreg.h>
+#include <linux/ide.h>
+
+static const struct ide_ioctl_devset ide_ioctl_settings[] = {
+{ HDIO_GET_32BIT,	 HDIO_SET_32BIT,	&ide_devset_io_32bit  },
+{ HDIO_GET_KEEPSETTINGS, HDIO_SET_KEEPSETTINGS,	&ide_devset_keepsettings },
+{ HDIO_GET_UNMASKINTR,	 HDIO_SET_UNMASKINTR,	&ide_devset_unmaskirq },
+{ HDIO_GET_DMA,		 HDIO_SET_DMA,		&ide_devset_using_dma },
+{ -1,			 HDIO_SET_PIO_MODE,	&ide_devset_pio_mode  },
+{ 0 }
+};
+
+int ide_setting_ioctl(ide_drive_t *drive, struct block_device *bdev,
+		      unsigned int cmd, unsigned long arg,
+		      const struct ide_ioctl_devset *s)
+{
+	const struct ide_devset *ds;
+	unsigned long flags;
+	int err = -EOPNOTSUPP;
+
+	for (; (ds = s->setting); s++) {
+		if (ds->get && s->get_ioctl == cmd)
+			goto read_val;
+		else if (ds->set && s->set_ioctl == cmd)
+			goto set_val;
+	}
+
+	return err;
+
+read_val:
+	mutex_lock(&ide_setting_mtx);
+	spin_lock_irqsave(&ide_lock, flags);
+	err = ds->get(drive);
+	spin_unlock_irqrestore(&ide_lock, flags);
+	mutex_unlock(&ide_setting_mtx);
+	return err >= 0 ? put_user(err, (long __user *)arg) : err;
+
+set_val:
+	if (bdev != bdev->bd_contains)
+		err = -EINVAL;
+	else {
+		if (!capable(CAP_SYS_ADMIN))
+			err = -EACCES;
+		else {
+			mutex_lock(&ide_setting_mtx);
+			err = ide_devset_execute(drive, ds, arg);
+			mutex_unlock(&ide_setting_mtx);
+		}
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(ide_setting_ioctl);
+
+static int ide_get_identity_ioctl(ide_drive_t *drive, unsigned int cmd,
+				  unsigned long arg)
+{
+	u16 *id = NULL;
+	int size = (cmd == HDIO_GET_IDENTITY) ? (ATA_ID_WORDS * 2) : 142;
+	int rc = 0;
+
+	if (drive->id_read == 0) {
+		rc = -ENOMSG;
+		goto out;
+	}
+
+	id = kmalloc(size, GFP_KERNEL);
+	if (id == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(id, drive->id, size);
+	ata_id_to_hd_driveid(id);
+
+	if (copy_to_user((void __user *)arg, id, size))
+		rc = -EFAULT;
+
+	kfree(id);
+out:
+	return rc;
+}
+
+static int ide_get_nice_ioctl(ide_drive_t *drive, unsigned long arg)
+{
+	return put_user((drive->dsc_overlap << IDE_NICE_DSC_OVERLAP) |
+			(drive->nice1 << IDE_NICE_1), (long __user *)arg);
+}
+
+static int ide_set_nice_ioctl(ide_drive_t *drive, unsigned long arg)
+{
+	if (arg != (arg & ((1 << IDE_NICE_DSC_OVERLAP) | (1 << IDE_NICE_1))))
+		return -EPERM;
+
+	if (((arg >> IDE_NICE_DSC_OVERLAP) & 1) &&
+	    (drive->media == ide_disk || drive->media == ide_floppy ||
+	     drive->scsi))
+		return -EPERM;
+
+	drive->dsc_overlap = (arg >> IDE_NICE_DSC_OVERLAP) & 1;
+	drive->nice1 = (arg >> IDE_NICE_1) & 1;
+
+	return 0;
+}
+
+static int ide_cmd_ioctl(ide_drive_t *drive, unsigned cmd, unsigned long arg)
+{
+	u8 *buf = NULL;
+	int bufsize = 0, err = 0;
+	u8 args[4], xfer_rate = 0;
+	ide_task_t tfargs;
+	struct ide_taskfile *tf = &tfargs.tf;
+	u16 *id = drive->id;
+
+	if (NULL == (void *) arg) {
+		struct request *rq;
+
+		rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+		rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
+		err = blk_execute_rq(drive->queue, NULL, rq, 0);
+		blk_put_request(rq);
+
+		return err;
+	}
+
+	if (copy_from_user(args, (void __user *)arg, 4))
+		return -EFAULT;
+
+	memset(&tfargs, 0, sizeof(ide_task_t));
+	tf->feature = args[2];
+	if (args[0] == ATA_CMD_SMART) {
+		tf->nsect = args[3];
+		tf->lbal  = args[1];
+		tf->lbam  = 0x4f;
+		tf->lbah  = 0xc2;
+		tfargs.tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_IN_NSECT;
+	} else {
+		tf->nsect = args[1];
+		tfargs.tf_flags = IDE_TFLAG_OUT_FEATURE |
+				  IDE_TFLAG_OUT_NSECT | IDE_TFLAG_IN_NSECT;
+	}
+	tf->command = args[0];
+	tfargs.data_phase = args[3] ? TASKFILE_IN : TASKFILE_NO_DATA;
+
+	if (args[3]) {
+		tfargs.tf_flags |= IDE_TFLAG_IO_16BIT;
+		bufsize = SECTOR_SIZE * args[3];
+		buf = kzalloc(bufsize, GFP_KERNEL);
+		if (buf == NULL)
+			return -ENOMEM;
+	}
+
+	if (tf->command == ATA_CMD_SET_FEATURES &&
+	    tf->feature == SETFEATURES_XFER &&
+	    tf->nsect >= XFER_SW_DMA_0 &&
+	    (id[ATA_ID_UDMA_MODES] ||
+	     id[ATA_ID_MWDMA_MODES] ||
+	     id[ATA_ID_SWDMA_MODES])) {
+		xfer_rate = args[1];
+		if (tf->nsect > XFER_UDMA_2 && !eighty_ninty_three(drive)) {
+			printk(KERN_WARNING "%s: UDMA speeds >UDMA33 cannot "
+					    "be set\n", drive->name);
+			goto abort;
+		}
+	}
+
+	err = ide_raw_taskfile(drive, &tfargs, buf, args[3]);
+
+	args[0] = tf->status;
+	args[1] = tf->error;
+	args[2] = tf->nsect;
+
+	if (!err && xfer_rate) {
+		/* active-retuning-calls future */
+		ide_set_xfer_rate(drive, xfer_rate);
+		ide_driveid_update(drive);
+	}
+abort:
+	if (copy_to_user((void __user *)arg, &args, 4))
+		err = -EFAULT;
+	if (buf) {
+		if (copy_to_user((void __user *)(arg + 4), buf, bufsize))
+			err = -EFAULT;
+		kfree(buf);
+	}
+	return err;
+}
+
+static int ide_task_ioctl(ide_drive_t *drive, unsigned cmd, unsigned long arg)
+{
+	void __user *p = (void __user *)arg;
+	int err = 0;
+	u8 args[7];
+	ide_task_t task;
+
+	if (copy_from_user(args, p, 7))
+		return -EFAULT;
+
+	memset(&task, 0, sizeof(task));
+	memcpy(&task.tf_array[7], &args[1], 6);
+	task.tf.command = args[0];
+	task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
+
+	err = ide_no_data_taskfile(drive, &task);
+
+	args[0] = task.tf.command;
+	memcpy(&args[1], &task.tf_array[7], 6);
+
+	if (copy_to_user(p, args, 7))
+		err = -EFAULT;
+
+	return err;
+}
+
+static int generic_drive_reset(ide_drive_t *drive)
+{
+	struct request *rq;
+	int ret = 0;
+
+	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_len = 1;
+	rq->cmd[0] = REQ_DRIVE_RESET;
+	rq->cmd_flags |= REQ_SOFTBARRIER;
+	if (blk_execute_rq(drive->queue, NULL, rq, 1))
+		ret = rq->errors;
+	blk_put_request(rq);
+	return ret;
+}
+
+int generic_ide_ioctl(ide_drive_t *drive, struct file *file,
+		      struct block_device *bdev,
+		      unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_ioctl_settings);
+	if (err != -EOPNOTSUPP)
+		return err;
+
+	switch (cmd) {
+	case HDIO_OBSOLETE_IDENTITY:
+	case HDIO_GET_IDENTITY:
+		if (bdev != bdev->bd_contains)
+			return -EINVAL;
+		return ide_get_identity_ioctl(drive, cmd, arg);
+	case HDIO_GET_NICE:
+		return ide_get_nice_ioctl(drive, arg);
+	case HDIO_SET_NICE:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return ide_set_nice_ioctl(drive, arg);
+#ifdef CONFIG_IDE_TASK_IOCTL
+	case HDIO_DRIVE_TASKFILE:
+		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
+			return -EACCES;
+		if (drive->media == ide_disk)
+			return ide_taskfile_ioctl(drive, cmd, arg);
+		return -ENOMSG;
+#endif
+	case HDIO_DRIVE_CMD:
+		if (!capable(CAP_SYS_RAWIO))
+			return -EACCES;
+		return ide_cmd_ioctl(drive, cmd, arg);
+	case HDIO_DRIVE_TASK:
+		if (!capable(CAP_SYS_RAWIO))
+			return -EACCES;
+		return ide_task_ioctl(drive, cmd, arg);
+	case HDIO_DRIVE_RESET:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return generic_drive_reset(drive);
+	case HDIO_GET_BUSSTATE:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		if (put_user(BUSSTATE_ON, (long __user *)arg))
+			return -EFAULT;
+		return 0;
+	case HDIO_SET_BUSSTATE:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return -EOPNOTSUPP;
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(generic_ide_ioctl);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 2cbadff..0a2fd3b 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -18,7 +18,6 @@
 #include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/bitops.h>
 #include <linux/nmi.h>
@@ -400,97 +399,14 @@
 	.output_data		= ide_output_data,
 };
 
-void ide_fix_driveid (struct hd_driveid *id)
+void ide_fix_driveid(u16 *id)
 {
 #ifndef __LITTLE_ENDIAN
 # ifdef __BIG_ENDIAN
 	int i;
-	u16 *stringcast;
 
-	id->config         = __le16_to_cpu(id->config);
-	id->cyls           = __le16_to_cpu(id->cyls);
-	id->reserved2      = __le16_to_cpu(id->reserved2);
-	id->heads          = __le16_to_cpu(id->heads);
-	id->track_bytes    = __le16_to_cpu(id->track_bytes);
-	id->sector_bytes   = __le16_to_cpu(id->sector_bytes);
-	id->sectors        = __le16_to_cpu(id->sectors);
-	id->vendor0        = __le16_to_cpu(id->vendor0);
-	id->vendor1        = __le16_to_cpu(id->vendor1);
-	id->vendor2        = __le16_to_cpu(id->vendor2);
-	stringcast = (u16 *)&id->serial_no[0];
-	for (i = 0; i < (20/2); i++)
-		stringcast[i] = __le16_to_cpu(stringcast[i]);
-	id->buf_type       = __le16_to_cpu(id->buf_type);
-	id->buf_size       = __le16_to_cpu(id->buf_size);
-	id->ecc_bytes      = __le16_to_cpu(id->ecc_bytes);
-	stringcast = (u16 *)&id->fw_rev[0];
-	for (i = 0; i < (8/2); i++)
-		stringcast[i] = __le16_to_cpu(stringcast[i]);
-	stringcast = (u16 *)&id->model[0];
-	for (i = 0; i < (40/2); i++)
-		stringcast[i] = __le16_to_cpu(stringcast[i]);
-	id->dword_io       = __le16_to_cpu(id->dword_io);
-	id->reserved50     = __le16_to_cpu(id->reserved50);
-	id->field_valid    = __le16_to_cpu(id->field_valid);
-	id->cur_cyls       = __le16_to_cpu(id->cur_cyls);
-	id->cur_heads      = __le16_to_cpu(id->cur_heads);
-	id->cur_sectors    = __le16_to_cpu(id->cur_sectors);
-	id->cur_capacity0  = __le16_to_cpu(id->cur_capacity0);
-	id->cur_capacity1  = __le16_to_cpu(id->cur_capacity1);
-	id->lba_capacity   = __le32_to_cpu(id->lba_capacity);
-	id->dma_1word      = __le16_to_cpu(id->dma_1word);
-	id->dma_mword      = __le16_to_cpu(id->dma_mword);
-	id->eide_pio_modes = __le16_to_cpu(id->eide_pio_modes);
-	id->eide_dma_min   = __le16_to_cpu(id->eide_dma_min);
-	id->eide_dma_time  = __le16_to_cpu(id->eide_dma_time);
-	id->eide_pio       = __le16_to_cpu(id->eide_pio);
-	id->eide_pio_iordy = __le16_to_cpu(id->eide_pio_iordy);
-	for (i = 0; i < 2; ++i)
-		id->words69_70[i] = __le16_to_cpu(id->words69_70[i]);
-	for (i = 0; i < 4; ++i)
-		id->words71_74[i] = __le16_to_cpu(id->words71_74[i]);
-	id->queue_depth    = __le16_to_cpu(id->queue_depth);
-	for (i = 0; i < 4; ++i)
-		id->words76_79[i] = __le16_to_cpu(id->words76_79[i]);
-	id->major_rev_num  = __le16_to_cpu(id->major_rev_num);
-	id->minor_rev_num  = __le16_to_cpu(id->minor_rev_num);
-	id->command_set_1  = __le16_to_cpu(id->command_set_1);
-	id->command_set_2  = __le16_to_cpu(id->command_set_2);
-	id->cfsse          = __le16_to_cpu(id->cfsse);
-	id->cfs_enable_1   = __le16_to_cpu(id->cfs_enable_1);
-	id->cfs_enable_2   = __le16_to_cpu(id->cfs_enable_2);
-	id->csf_default    = __le16_to_cpu(id->csf_default);
-	id->dma_ultra      = __le16_to_cpu(id->dma_ultra);
-	id->trseuc         = __le16_to_cpu(id->trseuc);
-	id->trsEuc         = __le16_to_cpu(id->trsEuc);
-	id->CurAPMvalues   = __le16_to_cpu(id->CurAPMvalues);
-	id->mprc           = __le16_to_cpu(id->mprc);
-	id->hw_config      = __le16_to_cpu(id->hw_config);
-	id->acoustic       = __le16_to_cpu(id->acoustic);
-	id->msrqs          = __le16_to_cpu(id->msrqs);
-	id->sxfert         = __le16_to_cpu(id->sxfert);
-	id->sal            = __le16_to_cpu(id->sal);
-	id->spg            = __le32_to_cpu(id->spg);
-	id->lba_capacity_2 = __le64_to_cpu(id->lba_capacity_2);
-	for (i = 0; i < 22; i++)
-		id->words104_125[i]   = __le16_to_cpu(id->words104_125[i]);
-	id->last_lun       = __le16_to_cpu(id->last_lun);
-	id->word127        = __le16_to_cpu(id->word127);
-	id->dlf            = __le16_to_cpu(id->dlf);
-	id->csfo           = __le16_to_cpu(id->csfo);
-	for (i = 0; i < 26; i++)
-		id->words130_155[i] = __le16_to_cpu(id->words130_155[i]);
-	id->word156        = __le16_to_cpu(id->word156);
-	for (i = 0; i < 3; i++)
-		id->words157_159[i] = __le16_to_cpu(id->words157_159[i]);
-	id->cfa_power      = __le16_to_cpu(id->cfa_power);
-	for (i = 0; i < 15; i++)
-		id->words161_175[i] = __le16_to_cpu(id->words161_175[i]);
-	for (i = 0; i < 30; i++)
-		id->words176_205[i] = __le16_to_cpu(id->words176_205[i]);
-	for (i = 0; i < 49; i++)
-		id->words206_254[i] = __le16_to_cpu(id->words206_254[i]);
-	id->integrity_word  = __le16_to_cpu(id->integrity_word);
+	for (i = 0; i < 256; i++)
+		id[i] = __le16_to_cpu(id[i]);
 # else
 #  error "Please fix <asm/byteorder.h>"
 # endif
@@ -501,19 +417,21 @@
  * ide_fixstring() cleans up and (optionally) byte-swaps a text string,
  * removing leading/trailing blanks and compressing internal blanks.
  * It is primarily used to tidy up the model name/number fields as
- * returned by the WIN_[P]IDENTIFY commands.
+ * returned by the ATA_CMD_ID_ATA[PI] commands.
  */
 
 void ide_fixstring (u8 *s, const int bytecount, const int byteswap)
 {
-	u8 *p = s, *end = &s[bytecount & ~1]; /* bytecount must be even */
+	u8 *p, *end = &s[bytecount & ~1]; /* bytecount must be even */
 
 	if (byteswap) {
 		/* convert from big-endian to host byte order */
-		for (p = end ; p != s;)
-			be16_to_cpus((u16 *)(p -= 2));
+		for (p = s ; p != end ; p += 2)
+			be16_to_cpus((u16 *) p);
 	}
+
 	/* strip leading blanks */
+	p = s;
 	while (s != end && *s == ' ')
 		++s;
 	/* compress internal blanks and strip trailing blanks */
@@ -556,7 +474,7 @@
 		/* Note: this may clear a pending IRQ!! */
 		stat = hwif->tp_ops->read_status(hwif);
 
-	if (stat & BUSY_STAT)
+	if (stat & ATA_BUSY)
 		/* drive busy:  definitely not interrupting */
 		return 0;
 
@@ -588,10 +506,10 @@
 	udelay(1);	/* spec allows drive 400ns to assert "BUSY" */
 	stat = tp_ops->read_status(hwif);
 
-	if (stat & BUSY_STAT) {
+	if (stat & ATA_BUSY) {
 		local_irq_set(flags);
 		timeout += jiffies;
-		while ((stat = tp_ops->read_status(hwif)) & BUSY_STAT) {
+		while ((stat = tp_ops->read_status(hwif)) & ATA_BUSY) {
 			if (time_after(jiffies, timeout)) {
 				/*
 				 * One last read after the timeout in case
@@ -599,7 +517,7 @@
 				 * progress during the timeout..
 				 */
 				stat = tp_ops->read_status(hwif);
-				if (!(stat & BUSY_STAT))
+				if ((stat & ATA_BUSY) == 0)
 					break;
 
 				local_irq_restore(flags);
@@ -660,18 +578,18 @@
 /**
  *	ide_in_drive_list	-	look for drive in black/white list
  *	@id: drive identifier
- *	@drive_table: list to inspect
+ *	@table: list to inspect
  *
  *	Look for a drive in the blacklist and the whitelist tables
  *	Returns 1 if the drive is found in the table.
  */
 
-int ide_in_drive_list(struct hd_driveid *id, const struct drive_list_entry *drive_table)
+int ide_in_drive_list(u16 *id, const struct drive_list_entry *table)
 {
-	for ( ; drive_table->id_model; drive_table++)
-		if ((!strcmp(drive_table->id_model, id->model)) &&
-		    (!drive_table->id_firmware ||
-		     strstr(id->fw_rev, drive_table->id_firmware)))
+	for ( ; table->id_model; table++)
+		if ((!strcmp(table->id_model, (char *)&id[ATA_ID_PROD])) &&
+		    (!table->id_firmware ||
+		     strstr((char *)&id[ATA_ID_FW_REV], table->id_firmware)))
 			return 1;
 	return 0;
 }
@@ -702,7 +620,7 @@
 u8 eighty_ninty_three (ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 	int ivb = ide_in_drive_list(id, ivb_list);
 
 	if (hwif->cbl == ATA_CBL_PATA40_SHORT)
@@ -712,7 +630,7 @@
 		printk(KERN_DEBUG "%s: skipping word 93 validity check\n",
 				  drive->name);
 
-	if (ide_dev_is_sata(id) && !ivb)
+	if (ata_id_is_sata(id) && !ivb)
 		return 1;
 
 	if (hwif->cbl != ATA_CBL_PATA80 && !ivb)
@@ -724,7 +642,8 @@
 	 * - force bit13 (80c cable present) check also for !ivb devices
 	 *   (unless the slave device is pre-ATA3)
 	 */
-	if ((id->hw_config & 0x4000) || (ivb && (id->hw_config & 0x2000)))
+	if ((id[ATA_ID_HW_CONFIG] & 0x4000) ||
+	    (ivb && (id[ATA_ID_HW_CONFIG] & 0x2000)))
 		return 1;
 
 no_80w:
@@ -745,8 +664,8 @@
 {
 	ide_hwif_t *hwif = drive->hwif;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	struct hd_driveid *id;
-	unsigned long timeout, flags;
+	u16 *id;
+	unsigned long flags;
 	u8 stat;
 
 	/*
@@ -757,29 +676,24 @@
 	SELECT_MASK(drive, 1);
 	tp_ops->set_irq(hwif, 0);
 	msleep(50);
-	tp_ops->exec_command(hwif, WIN_IDENTIFY);
-	timeout = jiffies + WAIT_WORSTCASE;
-	do {
-		if (time_after(jiffies, timeout)) {
-			SELECT_MASK(drive, 0);
-			return 0;	/* drive timed-out */
-		}
+	tp_ops->exec_command(hwif, ATA_CMD_ID_ATA);
 
-		msleep(50);	/* give drive a breather */
-		stat = tp_ops->read_altstatus(hwif);
-	} while (stat & BUSY_STAT);
+	if (ide_busy_sleep(hwif, WAIT_WORSTCASE, 1)) {
+		SELECT_MASK(drive, 0);
+		return 0;
+	}
 
-	msleep(50);	/* wait for IRQ and DRQ_STAT */
+	msleep(50);	/* wait for IRQ and ATA_DRQ */
 	stat = tp_ops->read_status(hwif);
 
-	if (!OK_STAT(stat, DRQ_STAT, BAD_R_STAT)) {
+	if (!OK_STAT(stat, ATA_DRQ, BAD_R_STAT)) {
 		SELECT_MASK(drive, 0);
 		printk("%s: CHECK for good STATUS\n", drive->name);
 		return 0;
 	}
 	local_irq_save(flags);
 	SELECT_MASK(drive, 0);
-	id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC);
+	id = kmalloc(SECTOR_SIZE, GFP_ATOMIC);
 	if (!id) {
 		local_irq_restore(flags);
 		return 0;
@@ -789,16 +703,16 @@
 	local_irq_enable();
 	local_irq_restore(flags);
 	ide_fix_driveid(id);
-	if (id) {
-		drive->id->dma_ultra = id->dma_ultra;
-		drive->id->dma_mword = id->dma_mword;
-		drive->id->dma_1word = id->dma_1word;
-		/* anything more ? */
-		kfree(id);
 
-		if (drive->using_dma && ide_id_dma_bug(drive))
-			ide_dma_off(drive);
-	}
+	drive->id[ATA_ID_UDMA_MODES]  = id[ATA_ID_UDMA_MODES];
+	drive->id[ATA_ID_MWDMA_MODES] = id[ATA_ID_MWDMA_MODES];
+	drive->id[ATA_ID_SWDMA_MODES] = id[ATA_ID_SWDMA_MODES];
+	/* anything more ? */
+
+	kfree(id);
+
+	if (drive->using_dma && ide_id_dma_bug(drive))
+		ide_dma_off(drive);
 
 	return 1;
 }
@@ -807,6 +721,7 @@
 {
 	ide_hwif_t *hwif = drive->hwif;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
+	u16 *id = drive->id, i;
 	int error = 0;
 	u8 stat;
 	ide_task_t task;
@@ -817,7 +732,7 @@
 #endif
 
 	/* Skip setting PIO flow-control modes on pre-EIDE drives */
-	if ((speed & 0xf8) == XFER_PIO_0 && !(drive->id->capability & 0x08))
+	if ((speed & 0xf8) == XFER_PIO_0 && ata_id_has_iordy(drive->id) == 0)
 		goto skip;
 
 	/*
@@ -851,13 +766,13 @@
 
 	tp_ops->tf_load(drive, &task);
 
-	tp_ops->exec_command(hwif, WIN_SETFEATURES);
+	tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES);
 
 	if (drive->quirk_list == 2)
 		tp_ops->set_irq(hwif, 1);
 
 	error = __ide_wait_stat(drive, drive->ready_stat,
-				BUSY_STAT|DRQ_STAT|ERR_STAT,
+				ATA_BUSY | ATA_DRQ | ATA_ERR,
 				WAIT_CMD, &stat);
 
 	SELECT_MASK(drive, 0);
@@ -869,9 +784,9 @@
 		return error;
 	}
 
-	drive->id->dma_ultra &= ~0xFF00;
-	drive->id->dma_mword &= ~0x0F00;
-	drive->id->dma_1word &= ~0x0F00;
+	id[ATA_ID_UDMA_MODES]  &= ~0xFF00;
+	id[ATA_ID_MWDMA_MODES] &= ~0x0F00;
+	id[ATA_ID_SWDMA_MODES] &= ~0x0F00;
 
  skip:
 #ifdef CONFIG_BLK_DEV_IDEDMA
@@ -881,23 +796,17 @@
 		ide_dma_off_quietly(drive);
 #endif
 
-	switch(speed) {
-		case XFER_UDMA_7:   drive->id->dma_ultra |= 0x8080; break;
-		case XFER_UDMA_6:   drive->id->dma_ultra |= 0x4040; break;
-		case XFER_UDMA_5:   drive->id->dma_ultra |= 0x2020; break;
-		case XFER_UDMA_4:   drive->id->dma_ultra |= 0x1010; break;
-		case XFER_UDMA_3:   drive->id->dma_ultra |= 0x0808; break;
-		case XFER_UDMA_2:   drive->id->dma_ultra |= 0x0404; break;
-		case XFER_UDMA_1:   drive->id->dma_ultra |= 0x0202; break;
-		case XFER_UDMA_0:   drive->id->dma_ultra |= 0x0101; break;
-		case XFER_MW_DMA_2: drive->id->dma_mword |= 0x0404; break;
-		case XFER_MW_DMA_1: drive->id->dma_mword |= 0x0202; break;
-		case XFER_MW_DMA_0: drive->id->dma_mword |= 0x0101; break;
-		case XFER_SW_DMA_2: drive->id->dma_1word |= 0x0404; break;
-		case XFER_SW_DMA_1: drive->id->dma_1word |= 0x0202; break;
-		case XFER_SW_DMA_0: drive->id->dma_1word |= 0x0101; break;
-		default: break;
+	if (speed >= XFER_UDMA_0) {
+		i = 1 << (speed - XFER_UDMA_0);
+		id[ATA_ID_UDMA_MODES] |= (i << 8 | i);
+	} else if (speed >= XFER_MW_DMA_0) {
+		i = 1 << (speed - XFER_MW_DMA_0);
+		id[ATA_ID_MWDMA_MODES] |= (i << 8 | i);
+	} else if (speed >= XFER_SW_DMA_0) {
+		i = 1 << (speed - XFER_SW_DMA_0);
+		id[ATA_ID_SWDMA_MODES] |= (i << 8 | i);
 	}
+
 	if (!drive->init_speed)
 		drive->init_speed = speed;
 	drive->current_speed = speed;
@@ -977,7 +886,7 @@
 	unsigned long flags;
 
 	spin_lock_irqsave(&ide_lock, flags);
-	hwif->tp_ops->exec_command(hwif, WIN_PACKETCMD);
+	hwif->tp_ops->exec_command(hwif, ATA_CMD_PACKET);
 	ndelay(400);
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
@@ -1010,7 +919,7 @@
 	udelay (10);
 	stat = hwif->tp_ops->read_status(hwif);
 
-	if (OK_STAT(stat, 0, BUSY_STAT))
+	if (OK_STAT(stat, 0, ATA_BUSY))
 		printk("%s: ATAPI reset complete\n", drive->name);
 	else {
 		if (time_before(jiffies, hwgroup->poll_timeout)) {
@@ -1056,7 +965,7 @@
 
 	tmp = hwif->tp_ops->read_status(hwif);
 
-	if (!OK_STAT(tmp, 0, BUSY_STAT)) {
+	if (!OK_STAT(tmp, 0, ATA_BUSY)) {
 		if (time_before(jiffies, hwgroup->poll_timeout)) {
 			ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL);
 			/* continue polling */
@@ -1102,7 +1011,7 @@
 
 static void ide_disk_pre_reset(ide_drive_t *drive)
 {
-	int legacy = (drive->id->cfs_enable_2 & 0x0400) ? 0 : 1;
+	int legacy = (drive->id[ATA_ID_CFS_ENABLE_2] & 0x0400) ? 0 : 1;
 
 	drive->special.all = 0;
 	drive->special.b.set_geometry = legacy;
@@ -1187,7 +1096,7 @@
 		pre_reset(drive);
 		SELECT_DRIVE(drive);
 		udelay (20);
-		tp_ops->exec_command(hwif, WIN_SRST);
+		tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET);
 		ndelay(400);
 		hwgroup->poll_timeout = jiffies + WAIT_WORSTCASE;
 		hwgroup->polling = 1;
@@ -1270,7 +1179,7 @@
 		 */
 		mdelay(1);
 		stat = hwif->tp_ops->read_status(hwif);
-		if ((stat & BUSY_STAT) == 0)
+		if ((stat & ATA_BUSY) == 0)
 			return 0;
 		/*
 		 * Assume a value of 0xff means nothing is connected to
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 97fefab..ed426dd 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -2,7 +2,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/bitops.h>
 
@@ -90,29 +89,31 @@
 
 u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode)
 {
-	int pio_mode;
-	struct hd_driveid* id = drive->id;
-	int overridden  = 0;
+	u16 *id = drive->id;
+	int pio_mode = -1, overridden = 0;
 
 	if (mode_wanted != 255)
 		return min_t(u8, mode_wanted, max_mode);
 
-	if ((drive->hwif->host_flags & IDE_HFLAG_PIO_NO_BLACKLIST) == 0 &&
-	    (pio_mode = ide_scan_pio_blacklist(id->model)) != -1) {
+	if ((drive->hwif->host_flags & IDE_HFLAG_PIO_NO_BLACKLIST) == 0)
+		pio_mode = ide_scan_pio_blacklist((char *)&id[ATA_ID_PROD]);
+
+	if (pio_mode != -1) {
 		printk(KERN_INFO "%s: is on PIO blacklist\n", drive->name);
 	} else {
-		pio_mode = id->tPIO;
+		pio_mode = id[ATA_ID_OLD_PIO_MODES] >> 8;
 		if (pio_mode > 2) {	/* 2 is maximum allowed tPIO value */
 			pio_mode = 2;
 			overridden = 1;
 		}
-		if (id->field_valid & 2) {	  /* drive implements ATA2? */
-			if (id->capability & 8) { /* IORDY supported? */
-				if (id->eide_pio_modes & 7) {
+
+		if (id[ATA_ID_FIELD_VALID] & 2) {	      /* ATA2? */
+			if (ata_id_has_iordy(id)) {
+				if (id[ATA_ID_PIO_MODES] & 7) {
 					overridden = 0;
-					if (id->eide_pio_modes & 4)
+					if (id[ATA_ID_PIO_MODES] & 4)
 						pio_mode = 5;
-					else if (id->eide_pio_modes & 2)
+					else if (id[ATA_ID_PIO_MODES] & 2)
 						pio_mode = 4;
 					else
 						pio_mode = 3;
@@ -338,16 +339,16 @@
 static void ide_dump_ata_error(ide_drive_t *drive, u8 err)
 {
 	printk("{ ");
-	if (err & ABRT_ERR)	printk("DriveStatusError ");
-	if (err & ICRC_ERR)
-		printk((err & ABRT_ERR) ? "BadCRC " : "BadSector ");
-	if (err & ECC_ERR)	printk("UncorrectableError ");
-	if (err & ID_ERR)	printk("SectorIdNotFound ");
-	if (err & TRK0_ERR)	printk("TrackZeroNotFound ");
-	if (err & MARK_ERR)	printk("AddrMarkNotFound ");
+	if (err & ATA_ABORTED)	printk("DriveStatusError ");
+	if (err & ATA_ICRC)
+		printk((err & ATA_ABORTED) ? "BadCRC " : "BadSector ");
+	if (err & ATA_UNC)	printk("UncorrectableError ");
+	if (err & ATA_IDNF)	printk("SectorIdNotFound ");
+	if (err & ATA_TRK0NF)	printk("TrackZeroNotFound ");
+	if (err & ATA_AMNF)	printk("AddrMarkNotFound ");
 	printk("}");
-	if ((err & (BBD_ERR | ABRT_ERR)) == BBD_ERR ||
-	    (err & (ECC_ERR|ID_ERR|MARK_ERR))) {
+	if ((err & (ATA_BBK | ATA_ABORTED)) == ATA_BBK ||
+	    (err & (ATA_UNC | ATA_IDNF | ATA_AMNF))) {
 		ide_dump_sector(drive);
 		if (HWGROUP(drive) && HWGROUP(drive)->rq)
 			printk(", sector=%llu",
@@ -359,12 +360,12 @@
 static void ide_dump_atapi_error(ide_drive_t *drive, u8 err)
 {
 	printk("{ ");
-	if (err & ILI_ERR)	printk("IllegalLengthIndication ");
-	if (err & EOM_ERR)	printk("EndOfMedia ");
-	if (err & ABRT_ERR)	printk("AbortedCommand ");
-	if (err & MCR_ERR)	printk("MediaChangeRequested ");
-	if (err & LFS_ERR)	printk("LastFailedSense=0x%02x ",
-				       (err & LFS_ERR) >> 4);
+	if (err & ATAPI_ILI)	printk("IllegalLengthIndication ");
+	if (err & ATAPI_EOM)	printk("EndOfMedia ");
+	if (err & ATA_ABORTED)	printk("AbortedCommand ");
+	if (err & ATA_MCR)	printk("MediaChangeRequested ");
+	if (err & ATAPI_LFS)	printk("LastFailedSense=0x%02x ",
+				       (err & ATAPI_LFS) >> 4);
 	printk("}\n");
 }
 
@@ -386,19 +387,19 @@
 
 	local_irq_save(flags);
 	printk("%s: %s: status=0x%02x { ", drive->name, msg, stat);
-	if (stat & BUSY_STAT)
+	if (stat & ATA_BUSY)
 		printk("Busy ");
 	else {
-		if (stat & READY_STAT)	printk("DriveReady ");
-		if (stat & WRERR_STAT)	printk("DeviceFault ");
-		if (stat & SEEK_STAT)	printk("SeekComplete ");
-		if (stat & DRQ_STAT)	printk("DataRequest ");
-		if (stat & ECC_STAT)	printk("CorrectedError ");
-		if (stat & INDEX_STAT)	printk("Index ");
-		if (stat & ERR_STAT)	printk("Error ");
+		if (stat & ATA_DRDY)	printk("DriveReady ");
+		if (stat & ATA_DF)	printk("DeviceFault ");
+		if (stat & ATA_DSC)	printk("SeekComplete ");
+		if (stat & ATA_DRQ)	printk("DataRequest ");
+		if (stat & ATA_CORR)	printk("CorrectedError ");
+		if (stat & ATA_IDX)	printk("Index ");
+		if (stat & ATA_ERR)	printk("Error ");
 	}
 	printk("}\n");
-	if ((stat & (BUSY_STAT|ERR_STAT)) == ERR_STAT) {
+	if ((stat & (ATA_BUSY | ATA_ERR)) == ATA_ERR) {
 		err = ide_read_error(drive);
 		printk("%s: %s: error=0x%02x ", drive->name, msg, err);
 		if (drive->media == ide_disk)
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 70aa86c..06575a1 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -50,59 +50,54 @@
  
 static void generic_id(ide_drive_t *drive)
 {
-	drive->id->cyls = drive->cyl;
-	drive->id->heads = drive->head;
-	drive->id->sectors = drive->sect;
-	drive->id->cur_cyls = drive->cyl;
-	drive->id->cur_heads = drive->head;
-	drive->id->cur_sectors = drive->sect;
+	u16 *id = drive->id;
+
+	id[ATA_ID_CUR_CYLS]	= id[ATA_ID_CYLS]	= drive->cyl;
+	id[ATA_ID_CUR_HEADS]	= id[ATA_ID_HEADS]	= drive->head;
+	id[ATA_ID_CUR_SECTORS]	= id[ATA_ID_SECTORS]	= drive->sect;
 }
 
 static void ide_disk_init_chs(ide_drive_t *drive)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 
 	/* Extract geometry if we did not already have one for the drive */
 	if (!drive->cyl || !drive->head || !drive->sect) {
-		drive->cyl  = drive->bios_cyl  = id->cyls;
-		drive->head = drive->bios_head = id->heads;
-		drive->sect = drive->bios_sect = id->sectors;
+		drive->cyl  = drive->bios_cyl  = id[ATA_ID_CYLS];
+		drive->head = drive->bios_head = id[ATA_ID_HEADS];
+		drive->sect = drive->bios_sect = id[ATA_ID_SECTORS];
 	}
 
 	/* Handle logical geometry translation by the drive */
-	if ((id->field_valid & 1) && id->cur_cyls &&
-	    id->cur_heads && (id->cur_heads <= 16) && id->cur_sectors) {
-		drive->cyl  = id->cur_cyls;
-		drive->head = id->cur_heads;
-		drive->sect = id->cur_sectors;
+	if (ata_id_current_chs_valid(id)) {
+		drive->cyl  = id[ATA_ID_CUR_CYLS];
+		drive->head = id[ATA_ID_CUR_HEADS];
+		drive->sect = id[ATA_ID_CUR_SECTORS];
 	}
 
 	/* Use physical geometry if what we have still makes no sense */
-	if (drive->head > 16 && id->heads && id->heads <= 16) {
-		drive->cyl  = id->cyls;
-		drive->head = id->heads;
-		drive->sect = id->sectors;
+	if (drive->head > 16 && id[ATA_ID_HEADS] && id[ATA_ID_HEADS] <= 16) {
+		drive->cyl  = id[ATA_ID_CYLS];
+		drive->head = id[ATA_ID_HEADS];
+		drive->sect = id[ATA_ID_SECTORS];
 	}
 }
 
 static void ide_disk_init_mult_count(ide_drive_t *drive)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
+	u8 max_multsect = id[ATA_ID_MAX_MULTSECT] & 0xff;
 
-	drive->mult_count = 0;
-	if (id->max_multsect) {
-#ifdef CONFIG_IDEDISK_MULTI_MODE
-		id->multsect = ((id->max_multsect/2) > 1) ? id->max_multsect : 0;
-		id->multsect_valid = id->multsect ? 1 : 0;
-		drive->mult_req = id->multsect_valid ? id->max_multsect : 0;
-		drive->special.b.set_multmode = drive->mult_req ? 1 : 0;
-#else	/* original, pre IDE-NFG, per request of AC */
-		drive->mult_req = 0;
-		if (drive->mult_req > id->max_multsect)
-			drive->mult_req = id->max_multsect;
-		if (drive->mult_req || ((id->multsect_valid & 1) && id->multsect))
+	if (max_multsect) {
+		if ((max_multsect / 2) > 1)
+			id[ATA_ID_MULTSECT] = max_multsect | 0x100;
+		else
+			id[ATA_ID_MULTSECT] &= ~0x1ff;
+
+		drive->mult_req = id[ATA_ID_MULTSECT] & 0xff;
+
+		if (drive->mult_req)
 			drive->special.b.set_multmode = 1;
-#endif
 	}
 }
 
@@ -119,10 +114,10 @@
 static inline void do_identify (ide_drive_t *drive, u8 cmd)
 {
 	ide_hwif_t *hwif = HWIF(drive);
-	int bswap = 1;
-	struct hd_driveid *id;
+	u16 *id = drive->id;
+	char *m = (char *)&id[ATA_ID_PROD];
+	int bswap = 1, is_cfa;
 
-	id = drive->id;
 	/* read 512 bytes of id info */
 	hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
 
@@ -135,27 +130,28 @@
 	ide_fix_driveid(id);
 
 	/*
-	 *  WIN_IDENTIFY returns little-endian info,
-	 *  WIN_PIDENTIFY *usually* returns little-endian info.
+	 *  ATA_CMD_ID_ATA returns little-endian info,
+	 *  ATA_CMD_ID_ATAPI *usually* returns little-endian info.
 	 */
-	if (cmd == WIN_PIDENTIFY) {
-		if ((id->model[0] == 'N' && id->model[1] == 'E') /* NEC */
-		 || (id->model[0] == 'F' && id->model[1] == 'X') /* Mitsumi */
-		 || (id->model[0] == 'P' && id->model[1] == 'i'))/* Pioneer */
+	if (cmd == ATA_CMD_ID_ATAPI) {
+		if ((m[0] == 'N' && m[1] == 'E') ||  /* NEC */
+		    (m[0] == 'F' && m[1] == 'X') ||  /* Mitsumi */
+		    (m[0] == 'P' && m[1] == 'i'))    /* Pioneer */
 			/* Vertos drives may still be weird */
-			bswap ^= 1;	
+			bswap ^= 1;
 	}
-	ide_fixstring(id->model,     sizeof(id->model),     bswap);
-	ide_fixstring(id->fw_rev,    sizeof(id->fw_rev),    bswap);
-	ide_fixstring(id->serial_no, sizeof(id->serial_no), bswap);
+
+	ide_fixstring(m, ATA_ID_PROD_LEN, bswap);
+	ide_fixstring((char *)&id[ATA_ID_FW_REV], ATA_ID_FW_REV_LEN, bswap);
+	ide_fixstring((char *)&id[ATA_ID_SERNO], ATA_ID_SERNO_LEN, bswap);
 
 	/* we depend on this a lot! */
-	id->model[sizeof(id->model)-1] = '\0';
+	m[ATA_ID_PROD_LEN - 1] = '\0';
 
-	if (strstr(id->model, "E X A B Y T E N E S T"))
+	if (strstr(m, "E X A B Y T E N E S T"))
 		goto err_misc;
 
-	printk(KERN_INFO "%s: %s, ", drive->name, id->model);
+	printk(KERN_INFO "%s: %s, ", drive->name, m);
 
 	drive->present = 1;
 	drive->dead = 0;
@@ -163,16 +159,16 @@
 	/*
 	 * Check for an ATAPI device
 	 */
-	if (cmd == WIN_PIDENTIFY) {
-		u8 type = (id->config >> 8) & 0x1f;
+	if (cmd == ATA_CMD_ID_ATAPI) {
+		u8 type = (id[ATA_ID_CONFIG] >> 8) & 0x1f;
 
 		printk(KERN_CONT "ATAPI ");
 		switch (type) {
 			case ide_floppy:
-				if (!strstr(id->model, "CD-ROM")) {
-					if (!strstr(id->model, "oppy") &&
-					    !strstr(id->model, "poyp") &&
-					    !strstr(id->model, "ZIP"))
+				if (!strstr(m, "CD-ROM")) {
+					if (!strstr(m, "oppy") &&
+					    !strstr(m, "poyp") &&
+					    !strstr(m, "ZIP"))
 						printk(KERN_CONT "cdrom or floppy?, assuming ");
 					if (drive->media != ide_cdrom) {
 						printk(KERN_CONT "FLOPPY");
@@ -186,8 +182,7 @@
 				drive->removable = 1;
 #ifdef CONFIG_PPC
 				/* kludge for Apple PowerBook internal zip */
-				if (!strstr(id->model, "CD-ROM") &&
-				    strstr(id->model, "ZIP")) {
+				if (!strstr(m, "CD-ROM") && strstr(m, "ZIP")) {
 					printk(KERN_CONT "FLOPPY");
 					type = ide_floppy;
 					break;
@@ -217,18 +212,15 @@
 	 * Not an ATAPI device: looks like a "regular" hard disk
 	 */
 
-	/*
-	 * 0x848a = CompactFlash device
-	 * These are *not* removable in Linux definition of the term
-	 */
+	is_cfa = ata_id_is_cfa(id);
 
-	if ((id->config != 0x848a) && (id->config & (1<<7)))
+	/* CF devices are *not* removable in Linux definition of the term */
+	if (is_cfa == 0 && (id[ATA_ID_CONFIG] & (1 << 7)))
 		drive->removable = 1;
 
 	drive->media = ide_disk;
 
-	printk(KERN_CONT "%s DISK drive\n",
-		(id->config == 0x848a) ? "CFA" : "ATA");
+	printk(KERN_CONT "%s DISK drive\n", is_cfa ? "CFA" : "ATA");
 
 	return;
 
@@ -268,7 +260,7 @@
 	if (io_ports->ctl_addr) {
 		a = tp_ops->read_altstatus(hwif);
 		s = tp_ops->read_status(hwif);
-		if ((a ^ s) & ~INDEX_STAT)
+		if ((a ^ s) & ~ATA_IDX)
 			/* ancient Seagate drives, broken interfaces */
 			printk(KERN_INFO "%s: probing with STATUS(0x%02x) "
 					 "instead of ALTSTATUS(0x%02x)\n",
@@ -281,7 +273,7 @@
 	/* set features register for atapi
 	 * identify command to be sure of reply
 	 */
-	if (cmd == WIN_PIDENTIFY) {
+	if (cmd == ATA_CMD_ID_ATAPI) {
 		ide_task_t task;
 
 		memset(&task, 0, sizeof(task));
@@ -294,24 +286,16 @@
 	/* ask drive for ID */
 	tp_ops->exec_command(hwif, cmd);
 
-	timeout = ((cmd == WIN_IDENTIFY) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2;
-	timeout += jiffies;
-	do {
-		if (time_after(jiffies, timeout)) {
-			/* drive timed-out */
-			return 1;
-		}
-		/* give drive a breather */
-		msleep(50);
-		s = use_altstatus ? tp_ops->read_altstatus(hwif)
-				  : tp_ops->read_status(hwif);
-	} while (s & BUSY_STAT);
+	timeout = ((cmd == ATA_CMD_ID_ATA) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2;
 
-	/* wait for IRQ and DRQ_STAT */
+	if (ide_busy_sleep(hwif, timeout, use_altstatus))
+		return 1;
+
+	/* wait for IRQ and ATA_DRQ */
 	msleep(50);
 	s = tp_ops->read_status(hwif);
 
-	if (OK_STAT(s, DRQ_STAT, BAD_R_STAT)) {
+	if (OK_STAT(s, ATA_DRQ, BAD_R_STAT)) {
 		unsigned long flags;
 
 		/* local CPU only; some systems need this */
@@ -387,19 +371,21 @@
 	return retval;
 }
 
-static int ide_busy_sleep(ide_hwif_t *hwif)
+int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus)
 {
-	unsigned long timeout = jiffies + WAIT_WORSTCASE;
 	u8 stat;
 
+	timeout += jiffies;
+
 	do {
-		msleep(50);
-		stat = hwif->tp_ops->read_status(hwif);
-		if ((stat & BUSY_STAT) == 0)
+		msleep(50);	/* give drive a breather */
+		stat = altstatus ? hwif->tp_ops->read_altstatus(hwif)
+				 : hwif->tp_ops->read_status(hwif);
+		if ((stat & ATA_BUSY) == 0)
 			return 0;
 	} while (time_before(jiffies, timeout));
 
-	return 1;
+	return 1;	/* drive timed-out */
 }
 
 static u8 ide_read_device(ide_drive_t *drive)
@@ -444,13 +430,13 @@
 
 	if (drive->present) {
 		/* avoid waiting for inappropriate probes */
-		if ((drive->media != ide_disk) && (cmd == WIN_IDENTIFY))
+		if (drive->media != ide_disk && cmd == ATA_CMD_ID_ATA)
 			return 4;
 	}
 #ifdef DEBUG
 	printk(KERN_INFO "probing for %s: present=%d, media=%d, probetype=%s\n",
 		drive->name, drive->present, drive->media,
-		(cmd == WIN_IDENTIFY) ? "ATA" : "ATAPI");
+		(cmd == ATA_CMD_ID_ATA) ? "ATA" : "ATAPI");
 #endif
 
 	/* needed for some systems
@@ -464,7 +450,7 @@
 		if (drive->select.b.unit != 0) {
 			/* exit with drive0 selected */
 			SELECT_DRIVE(&hwif->drives[0]);
-			/* allow BUSY_STAT to assert & clear */
+			/* allow ATA_BUSY to assert & clear */
 			msleep(50);
 		}
 		/* no i/f present: mmm.. this should be a 4 -ml */
@@ -473,8 +459,8 @@
 
 	stat = tp_ops->read_status(hwif);
 
-	if (OK_STAT(stat, READY_STAT, BUSY_STAT) ||
-	    drive->present || cmd == WIN_PIDENTIFY) {
+	if (OK_STAT(stat, ATA_DRDY, ATA_BUSY) ||
+	    drive->present || cmd == ATA_CMD_ID_ATAPI) {
 		/* send cmd and wait */
 		if ((rc = try_to_identify(drive, cmd))) {
 			/* failed: try again */
@@ -483,17 +469,17 @@
 
 		stat = tp_ops->read_status(hwif);
 
-		if (stat == (BUSY_STAT | READY_STAT))
+		if (stat == (ATA_BUSY | ATA_DRDY))
 			return 4;
 
-		if (rc == 1 && cmd == WIN_PIDENTIFY) {
+		if (rc == 1 && cmd == ATA_CMD_ID_ATAPI) {
 			printk(KERN_ERR "%s: no response (status = 0x%02x), "
 					"resetting drive\n", drive->name, stat);
 			msleep(50);
 			SELECT_DRIVE(drive);
 			msleep(50);
-			tp_ops->exec_command(hwif, WIN_SRST);
-			(void)ide_busy_sleep(hwif);
+			tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET);
+			(void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0);
 			rc = try_to_identify(drive, cmd);
 		}
 
@@ -526,13 +512,14 @@
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 	u8 stat;
 
-	printk(KERN_INFO "%s: enabling %s -- ", hwif->name, drive->id->model);
+	printk(KERN_INFO "%s: enabling %s -- ",
+		hwif->name, (char *)&drive->id[ATA_ID_PROD]);
 
 	SELECT_DRIVE(drive);
 	msleep(50);
-	tp_ops->exec_command(hwif, EXABYTE_ENABLE_NEST);
+	tp_ops->exec_command(hwif, ATA_EXABYTE_ENABLE_NEST);
 
-	if (ide_busy_sleep(hwif)) {
+	if (ide_busy_sleep(hwif, WAIT_WORSTCASE, 0)) {
 		printk(KERN_CONT "failed (timeout)\n");
 		return;
 	}
@@ -545,12 +532,6 @@
 		printk(KERN_CONT "failed (status = 0x%02x)\n", stat);
 	else
 		printk(KERN_CONT "success\n");
-
-	/* if !(success||timed-out) */
-	if (do_probe(drive, WIN_IDENTIFY) >= 2) {
-		/* look for ATAPI device */
-		(void) do_probe(drive, WIN_PIDENTIFY);
-	}
 }
 
 /**
@@ -567,6 +548,8 @@
  
 static inline u8 probe_for_drive (ide_drive_t *drive)
 {
+	char *m;
+
 	/*
 	 *	In order to keep things simple we have an id
 	 *	block for all drives at all times. If the device
@@ -576,29 +559,34 @@
 	 *	Also note that 0 everywhere means "can't do X"
 	 */
  
-	drive->id = kzalloc(SECTOR_WORDS *4, GFP_KERNEL);
+	drive->id = kzalloc(SECTOR_SIZE, GFP_KERNEL);
 	drive->id_read = 0;
 	if(drive->id == NULL)
 	{
 		printk(KERN_ERR "ide: out of memory for id data.\n");
 		return 0;
 	}
-	strcpy(drive->id->model, "UNKNOWN");
-	
+
+	m = (char *)&drive->id[ATA_ID_PROD];
+	strcpy(m, "UNKNOWN");
+
 	/* skip probing? */
-	if (!drive->noprobe)
-	{
+	if (!drive->noprobe) {
+retry:
 		/* if !(success||timed-out) */
-		if (do_probe(drive, WIN_IDENTIFY) >= 2) {
+		if (do_probe(drive, ATA_CMD_ID_ATA) >= 2)
 			/* look for ATAPI device */
-			(void) do_probe(drive, WIN_PIDENTIFY);
-		}
+			(void)do_probe(drive, ATA_CMD_ID_ATAPI);
+
 		if (!drive->present)
 			/* drive not found */
 			return 0;
-		if (strstr(drive->id->model, "E X A B Y T E N E S T"))
+
+		if (strstr(m, "E X A B Y T E N E S T")) {
 			enable_nest(drive);
-	
+			goto retry;
+		}
+
 		/* identification failed? */
 		if (!drive->id_read) {
 			if (drive->media == ide_disk) {
@@ -740,36 +728,38 @@
 
 /**
  *	ide_undecoded_slave	-	look for bad CF adapters
- *	@drive1: drive
+ *	@dev1: slave device
  *
  *	Analyse the drives on the interface and attempt to decide if we
  *	have the same drive viewed twice. This occurs with crap CF adapters
  *	and PCMCIA sometimes.
  */
 
-void ide_undecoded_slave(ide_drive_t *drive1)
+void ide_undecoded_slave(ide_drive_t *dev1)
 {
-	ide_drive_t *drive0 = &drive1->hwif->drives[0];
+	ide_drive_t *dev0 = &dev1->hwif->drives[0];
 
-	if ((drive1->dn & 1) == 0 || drive0->present == 0)
+	if ((dev1->dn & 1) == 0 || dev0->present == 0)
 		return;
 
 	/* If the models don't match they are not the same product */
-	if (strcmp(drive0->id->model, drive1->id->model))
+	if (strcmp((char *)&dev0->id[ATA_ID_PROD],
+		   (char *)&dev1->id[ATA_ID_PROD]))
 		return;
 
 	/* Serial numbers do not match */
-	if (strncmp(drive0->id->serial_no, drive1->id->serial_no, 20))
+	if (strncmp((char *)&dev0->id[ATA_ID_SERNO],
+		    (char *)&dev1->id[ATA_ID_SERNO], ATA_ID_SERNO_LEN))
 		return;
 
 	/* No serial number, thankfully very rare for CF */
-	if (drive0->id->serial_no[0] == 0)
+	if (*(char *)&dev0->id[ATA_ID_SERNO] == 0)
 		return;
 
 	/* Appears to be an IDE flash adapter with decode bugs */
 	printk(KERN_WARNING "ide-probe: ignoring undecoded slave\n");
 
-	drive1->present = 0;
+	dev1->present = 0;
 }
 
 EXPORT_SYMBOL_GPL(ide_undecoded_slave);
@@ -853,7 +843,7 @@
 		if (hwif->host_flags & IDE_HFLAG_NO_IO_32BIT)
 			drive->no_io_32bit = 1;
 		else
-			drive->no_io_32bit = drive->id->dword_io ? 1 : 0;
+			drive->no_io_32bit = drive->id[ATA_ID_DWORD_IO] ? 1 : 0;
 	}
 }
 
@@ -1037,11 +1027,6 @@
 	ide_hwgroup_t *hwgroup;
 	ide_hwif_t *match = NULL;
 
-
-	BUG_ON(in_interrupt());
-	BUG_ON(irqs_disabled());	
-	BUG_ON(hwif == NULL);
-
 	mutex_lock(&ide_cfg_mtx);
 	hwif->hwgroup = NULL;
 #if MAX_HWIFS > 1
@@ -1116,7 +1101,8 @@
 		sa = IRQF_SHARED;
 #endif /* __mc68000__ */
 
-		if (IDE_CHIPSET_IS_PCI(hwif->chipset))
+		if (hwif->chipset == ide_pci || hwif->chipset == ide_cmd646 ||
+		    hwif->chipset == ide_ali14xx)
 			sa = IRQF_SHARED;
 
 		if (io_ports->ctl_addr)
@@ -1344,8 +1330,6 @@
 		if (!drive->present)
 			continue;
 
-		ide_add_generic_settings(drive);
-
 		snprintf(dev->bus_id, BUS_ID_SIZE, "%u.%u", hwif->index, i);
 		dev->parent = &hwif->gendev;
 		dev->bus = &ide_bus_type;
@@ -1602,8 +1586,10 @@
 	if (hws[0])
 		host->dev[0] = hws[0]->dev;
 
-	if (d)
+	if (d) {
+		host->init_chipset = d->init_chipset;
 		host->host_flags = d->host_flags;
+	}
 
 	return host;
 }
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index f66c9c3..e7030a4 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -12,14 +12,6 @@
  * "settings" files.  e.g.    "cat /proc/ide0/hda/settings"
  * To write a new value "val" into a specific setting "name", use:
  *   echo "name:val" >/proc/ide/ide0/hda/settings
- *
- * Also useful, "cat /proc/ide0/hda/[identify, smart_values,
- * smart_thresholds, capabilities]" will issue an IDENTIFY /
- * PACKET_IDENTIFY / SMART_READ_VALUES / SMART_READ_THRESHOLDS /
- * SENSE CAPABILITIES command to /dev/hda, and then dump out the
- * returned data as 256 16-bit words.  The "hdparm" utility will
- * be updated someday soon to use this mechanism.
- *
  */
 
 #include <linux/module.h>
@@ -31,7 +23,6 @@
 #include <linux/mm.h>
 #include <linux/pci.h>
 #include <linux/ctype.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/seq_file.h>
 
@@ -109,13 +100,14 @@
 
 		err = taskfile_lib_get_identify(drive, page);
 		if (!err) {
-			char *out = ((char *)page) + (SECTOR_WORDS * 4);
+			char *out = (char *)page + SECTOR_SIZE;
+
 			page = out;
 			do {
 				out += sprintf(out, "%04x%c",
 					le16_to_cpup(val), (++i & 7) ? ' ' : '\n');
 				val += 1;
-			} while (i < (SECTOR_WORDS * 2));
+			} while (i < SECTOR_SIZE / 2);
 			len = out - page;
 		}
 	}
@@ -123,140 +115,25 @@
 }
 
 /**
- *	__ide_add_setting	-	add an ide setting option
- *	@drive: drive to use
- *	@name: setting name
- *	@rw: true if the function is read write
- *	@data_type: type of data
- *	@min: range minimum
- *	@max: range maximum
- *	@mul_factor: multiplication scale
- *	@div_factor: divison scale
- *	@data: private data field
- *	@set: setting
- *	@auto_remove: setting auto removal flag
- *
- *	Removes the setting named from the device if it is present.
- *	The function takes the settings_lock to protect against
- *	parallel changes. This function must not be called from IRQ
- *	context. Returns 0 on success or -1 on failure.
- *
- *	BUGS: This code is seriously over-engineered. There is also
- *	magic about how the driver specific features are setup. If
- *	a driver is attached we assume the driver settings are auto
- *	remove.
- */
-
-static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set, int auto_remove)
-{
-	ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL;
-
-	mutex_lock(&ide_setting_mtx);
-	while ((*p) && strcmp((*p)->name, name) < 0)
-		p = &((*p)->next);
-	if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL)
-		goto abort;
-	if ((setting->name = kmalloc(strlen(name) + 1, GFP_KERNEL)) == NULL)
-		goto abort;
-	strcpy(setting->name, name);
-	setting->rw = rw;
-	setting->data_type = data_type;
-	setting->min = min;
-	setting->max = max;
-	setting->mul_factor = mul_factor;
-	setting->div_factor = div_factor;
-	setting->data = data;
-	setting->set = set;
-
-	setting->next = *p;
-	if (auto_remove)
-		setting->auto_remove = 1;
-	*p = setting;
-	mutex_unlock(&ide_setting_mtx);
-	return 0;
-abort:
-	mutex_unlock(&ide_setting_mtx);
-	kfree(setting);
-	return -1;
-}
-
-int ide_add_setting(ide_drive_t *drive, const char *name, int rw, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set)
-{
-	return __ide_add_setting(drive, name, rw, data_type, min, max, mul_factor, div_factor, data, set, 1);
-}
-
-EXPORT_SYMBOL(ide_add_setting);
-
-/**
- *	__ide_remove_setting	-	remove an ide setting option
- *	@drive: drive to use
+ *	ide_find_setting	-	find a specific setting
+ *	@st: setting table pointer
  *	@name: setting name
  *
- *	Removes the setting named from the device if it is present.
- *	The caller must hold the setting semaphore.
- */
-
-static void __ide_remove_setting(ide_drive_t *drive, char *name)
-{
-	ide_settings_t **p, *setting;
-
-	p = (ide_settings_t **) &drive->settings;
-
-	while ((*p) && strcmp((*p)->name, name))
-		p = &((*p)->next);
-	setting = (*p);
-	if (setting == NULL)
-		return;
-
-	(*p) = setting->next;
-
-	kfree(setting->name);
-	kfree(setting);
-}
-
-/**
- *	auto_remove_settings	-	remove driver specific settings
- *	@drive: drive
- *
- *	Automatically remove all the driver specific settings for this
- *	drive. This function may not be called from IRQ context. The
- *	caller must hold ide_setting_mtx.
- */
-
-static void auto_remove_settings(ide_drive_t *drive)
-{
-	ide_settings_t *setting;
-repeat:
-	setting = drive->settings;
-	while (setting) {
-		if (setting->auto_remove) {
-			__ide_remove_setting(drive, setting->name);
-			goto repeat;
-		}
-		setting = setting->next;
-	}
-}
-
-/**
- *	ide_find_setting_by_name	-	find a drive specific setting
- *	@drive: drive to scan
- *	@name: setting name
- *
- *	Scan's the device setting table for a matching entry and returns
+ *	Scan's the setting table for a matching entry and returns
  *	this or NULL if no entry is found. The caller must hold the
  *	setting semaphore
  */
 
-static ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name)
+static
+const struct ide_proc_devset *ide_find_setting(const struct ide_proc_devset *st,
+					       char *name)
 {
-	ide_settings_t *setting = drive->settings;
-
-	while (setting) {
-		if (strcmp(setting->name, name) == 0)
+	while (st->name) {
+		if (strcmp(st->name, name) == 0)
 			break;
-		setting = setting->next;
+		st++;
 	}
-	return setting;
+	return st->name ? st : NULL;
 }
 
 /**
@@ -272,26 +149,20 @@
  *	be told apart
  */
 
-static int ide_read_setting(ide_drive_t *drive, ide_settings_t *setting)
+static int ide_read_setting(ide_drive_t *drive,
+			    const struct ide_proc_devset *setting)
 {
-	int		val = -EINVAL;
-	unsigned long	flags;
+	const struct ide_devset *ds = setting->setting;
+	int val = -EINVAL;
 
-	if ((setting->rw & SETTING_READ)) {
+	if (ds->get) {
+		unsigned long flags;
+
 		spin_lock_irqsave(&ide_lock, flags);
-		switch (setting->data_type) {
-		case TYPE_BYTE:
-			val = *((u8 *) setting->data);
-			break;
-		case TYPE_SHORT:
-			val = *((u16 *) setting->data);
-			break;
-		case TYPE_INT:
-			val = *((u32 *) setting->data);
-			break;
-		}
+		val = ds->get(drive);
 		spin_unlock_irqrestore(&ide_lock, flags);
 	}
+
 	return val;
 }
 
@@ -313,33 +184,23 @@
  *	The current scheme of polling is kludgy, though safe enough.
  */
 
-static int ide_write_setting(ide_drive_t *drive, ide_settings_t *setting, int val)
+static int ide_write_setting(ide_drive_t *drive,
+			     const struct ide_proc_devset *setting, int val)
 {
+	const struct ide_devset *ds = setting->setting;
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (setting->set)
-		return setting->set(drive, val);
-	if (!(setting->rw & SETTING_WRITE))
+	if (!ds->set)
 		return -EPERM;
-	if (val < setting->min || val > setting->max)
+	if ((ds->flags & DS_SYNC)
+	    && (val < setting->min || val > setting->max))
 		return -EINVAL;
-	if (ide_spin_wait_hwgroup(drive))
-		return -EBUSY;
-	switch (setting->data_type) {
-	case TYPE_BYTE:
-		*((u8 *) setting->data) = val;
-		break;
-	case TYPE_SHORT:
-		*((u16 *) setting->data) = val;
-		break;
-	case TYPE_INT:
-		*((u32 *) setting->data) = val;
-		break;
-	}
-	spin_unlock_irq(&ide_lock);
-	return 0;
+	return ide_devset_execute(drive, ds, val);
 }
 
+ide_devset_get(xfer_rate, current_speed);
+
 static int set_xfer_rate (ide_drive_t *drive, int arg)
 {
 	ide_task_t task;
@@ -349,7 +210,7 @@
 		return -EINVAL;
 
 	memset(&task, 0, sizeof(task));
-	task.tf.command = WIN_SETFEATURES;
+	task.tf.command = ATA_CMD_SET_FEATURES;
 	task.tf.feature = SETFEATURES_XFER;
 	task.tf.nsect   = (u8)arg;
 	task.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT |
@@ -364,29 +225,23 @@
 	return err;
 }
 
-/**
- *	ide_add_generic_settings	-	generic ide settings
- *	@drive: drive being configured
- *
- *	Add the generic parts of the system settings to the /proc files.
- *	The caller must not be holding the ide_setting_mtx.
- */
+ide_devset_rw(current_speed, xfer_rate);
+ide_devset_rw_field(init_speed, init_speed);
+ide_devset_rw_field(nice1, nice1);
+ide_devset_rw_field(number, dn);
 
-void ide_add_generic_settings (ide_drive_t *drive)
-{
-/*
- *			  drive		setting name		read/write access				data type	min	max				mul_factor	div_factor	data pointer			set function
- */
-	__ide_add_setting(drive,	"io_32bit",		drive->no_io_32bit ? SETTING_READ : SETTING_RW,	TYPE_BYTE,	0,	1 + (SUPPORT_VLB_SYNC << 1),	1,		1,		&drive->io_32bit,		set_io_32bit,	0);
-	__ide_add_setting(drive,	"keepsettings",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->keep_settings,		NULL,		0);
-	__ide_add_setting(drive,	"nice1",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->nice1,			NULL,		0);
-	__ide_add_setting(drive,	"pio_mode",		SETTING_WRITE,					TYPE_BYTE,	0,	255,				1,		1,		NULL,				set_pio_mode,	0);
-	__ide_add_setting(drive,	"unmaskirq",		drive->no_unmask ? SETTING_READ : SETTING_RW,	TYPE_BYTE,	0,	1,				1,		1,		&drive->unmask,			NULL,		0);
-	__ide_add_setting(drive,	"using_dma",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->using_dma,		set_using_dma,	0);
-	__ide_add_setting(drive,	"init_speed",		SETTING_RW,					TYPE_BYTE,	0,	70,				1,		1,		&drive->init_speed,		NULL,		0);
-	__ide_add_setting(drive,	"current_speed",	SETTING_RW,					TYPE_BYTE,	0,	70,				1,		1,		&drive->current_speed,		set_xfer_rate,	0);
-	__ide_add_setting(drive,	"number",		SETTING_RW,					TYPE_BYTE,	0,	3,				1,		1,		&drive->dn,			NULL,		0);
-}
+static const struct ide_proc_devset ide_generic_settings[] = {
+	IDE_PROC_DEVSET(current_speed, 0, 70),
+	IDE_PROC_DEVSET(init_speed, 0, 70),
+	IDE_PROC_DEVSET(io_32bit,  0, 1 + (SUPPORT_VLB_SYNC << 1)),
+	IDE_PROC_DEVSET(keepsettings, 0, 1),
+	IDE_PROC_DEVSET(nice1, 0, 1),
+	IDE_PROC_DEVSET(number, 0, 3),
+	IDE_PROC_DEVSET(pio_mode, 0, 255),
+	IDE_PROC_DEVSET(unmaskirq, 0, 1),
+	IDE_PROC_DEVSET(using_dma, 0, 1),
+	{ 0 },
+};
 
 static void proc_ide_settings_warn(void)
 {
@@ -403,19 +258,32 @@
 static int proc_ide_read_settings
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
+	const struct ide_proc_devset *setting, *g, *d;
+	const struct ide_devset *ds;
 	ide_drive_t	*drive = (ide_drive_t *) data;
-	ide_settings_t	*setting = (ide_settings_t *) drive->settings;
 	char		*out = page;
 	int		len, rc, mul_factor, div_factor;
 
 	proc_ide_settings_warn();
 
 	mutex_lock(&ide_setting_mtx);
+	g = ide_generic_settings;
+	d = drive->settings;
 	out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n");
 	out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n");
-	while (setting) {
-		mul_factor = setting->mul_factor;
-		div_factor = setting->div_factor;
+	while (g->name || (d && d->name)) {
+		/* read settings in the alphabetical order */
+		if (g->name && d && d->name) {
+			if (strcmp(d->name, g->name) < 0)
+				setting = d++;
+			else
+				setting = g++;
+		} else if (d && d->name) {
+			setting = d++;
+		} else
+			setting = g++;
+		mul_factor = setting->mulf ? setting->mulf(drive) : 1;
+		div_factor = setting->divf ? setting->divf(drive) : 1;
 		out += sprintf(out, "%-24s", setting->name);
 		rc = ide_read_setting(drive, setting);
 		if (rc >= 0)
@@ -423,12 +291,12 @@
 		else
 			out += sprintf(out, "%-16s", "write-only");
 		out += sprintf(out, "%-16d%-16d", (setting->min * mul_factor + div_factor - 1) / div_factor, setting->max * mul_factor / div_factor);
-		if (setting->rw & SETTING_READ)
+		ds = setting->setting;
+		if (ds->get)
 			out += sprintf(out, "r");
-		if (setting->rw & SETTING_WRITE)
+		if (ds->set)
 			out += sprintf(out, "w");
 		out += sprintf(out, "\n");
-		setting = setting->next;
 	}
 	len = out - page;
 	mutex_unlock(&ide_setting_mtx);
@@ -442,9 +310,10 @@
 {
 	ide_drive_t	*drive = (ide_drive_t *) data;
 	char		name[MAX_LEN + 1];
-	int		for_real = 0;
+	int		for_real = 0, mul_factor, div_factor;
 	unsigned long	n;
-	ide_settings_t	*setting;
+
+	const struct ide_proc_devset *setting;
 	char *buf, *s;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -512,13 +381,21 @@
 			}
 
 			mutex_lock(&ide_setting_mtx);
-			setting = ide_find_setting_by_name(drive, name);
+			/* generic settings first, then driver specific ones */
+			setting = ide_find_setting(ide_generic_settings, name);
 			if (!setting) {
-				mutex_unlock(&ide_setting_mtx);
-				goto parse_error;
+				if (drive->settings)
+					setting = ide_find_setting(drive->settings, name);
+				if (!setting) {
+					mutex_unlock(&ide_setting_mtx);
+					goto parse_error;
+				}
 			}
-			if (for_real)
-				ide_write_setting(drive, setting, val * setting->div_factor / setting->mul_factor);
+			if (for_real) {
+				mul_factor = setting->mulf ? setting->mulf(drive) : 1;
+				div_factor = setting->divf ? setting->divf(drive) : 1;
+				ide_write_setting(drive, setting, val * div_factor / mul_factor);
+			}
 			mutex_unlock(&ide_setting_mtx);
 		}
 	} while (!for_real++);
@@ -561,11 +438,10 @@
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
 	ide_drive_t	*drive = (ide_drive_t *) data;
-	struct hd_driveid *id = drive->id;
+	char		*m = (char *)&drive->id[ATA_ID_PROD];
 	int		len;
 
-	len = sprintf(page, "%.40s\n",
-		(id && id->model[0]) ? (char *)id->model : "(none)");
+	len = sprintf(page, "%.40s\n", m[0] ? m : "(none)");
 	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
 }
 
@@ -690,6 +566,10 @@
 
 void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver)
 {
+	mutex_lock(&ide_setting_mtx);
+	drive->settings = driver->settings;
+	mutex_unlock(&ide_setting_mtx);
+
 	ide_add_proc_entries(drive->proc, driver->proc, drive);
 }
 
@@ -726,7 +606,7 @@
 	 * OTOH both ide_{read,write}_setting are only ever used under
 	 * ide_setting_mtx.
 	 */
-	auto_remove_settings(drive);
+	drive->settings = NULL;
 	spin_unlock_irqrestore(&ide_lock, flags);
 	mutex_unlock(&ide_setting_mtx);
 }
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 3833189..f8c84df 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -15,6 +15,8 @@
  * Documentation/ide/ChangeLog.ide-tape.1995-2002
  */
 
+#define DRV_NAME "ide-tape"
+
 #define IDETAPE_VERSION "1.20"
 
 #include <linux/module.h>
@@ -54,8 +56,6 @@
 	DBG_CHRDEV =		(1 << 2),
 	/* all remaining procedures */
 	DBG_PROCS =		(1 << 3),
-	/* buffer alloc info (pc_stack & rq_stack) */
-	DBG_PCRQ_STACK =	(1 << 4),
 };
 
 /* define to see debug info */
@@ -81,26 +81,6 @@
 #define IDETAPE_MAX_PC_RETRIES		3
 
 /*
- * With each packet command, we allocate a buffer of IDETAPE_PC_BUFFER_SIZE
- * bytes. This is used for several packet commands (Not for READ/WRITE commands)
- */
-#define IDETAPE_PC_BUFFER_SIZE		256
-
-/*
- *	In various places in the driver, we need to allocate storage
- *	for packet commands and requests, which will remain valid while
- *	we leave the driver to wait for an interrupt or a timeout event.
- */
-#define IDETAPE_PC_STACK		(10 + IDETAPE_MAX_PC_RETRIES)
-
-/*
- * Some drives (for example, Seagate STT3401A Travan) require a very long
- * timeout, because they don't return an interrupt or clear their busy bit
- * until after the command completes (even retension commands).
- */
-#define IDETAPE_WAIT_CMD		(900*HZ)
-
-/*
  * The following parameter is used to select the point in the internal tape fifo
  * in which we will start to refill the buffer. Decreasing the following
  * parameter will improve the system's latency and interactive response, while
@@ -172,20 +152,6 @@
 #define IDETAPE_LU_RETENSION_MASK	2
 #define IDETAPE_LU_EOT_MASK		4
 
-/*
- * Special requests for our block device strategy routine.
- *
- * In order to service a character device command, we add special requests to
- * the tail of our block device request queue and wait for their completion.
- */
-
-enum {
-	REQ_IDETAPE_PC1		= (1 << 0), /* packet command (first stage) */
-	REQ_IDETAPE_PC2		= (1 << 1), /* packet command (second stage) */
-	REQ_IDETAPE_READ	= (1 << 2),
-	REQ_IDETAPE_WRITE	= (1 << 3),
-};
-
 /* Error codes returned in rq->errors to the higher part of the driver. */
 #define IDETAPE_ERROR_GENERAL		101
 #define IDETAPE_ERROR_FILEMARK		102
@@ -206,13 +172,6 @@
 	struct kref	kref;
 
 	/*
-	 *	Since a typical character device operation requires more
-	 *	than one packet command, we provide here enough memory
-	 *	for the maximum of interconnected packet commands.
-	 *	The packet commands are stored in the circular array pc_stack.
-	 *	pc_stack_index points to the last used entry, and warps around
-	 *	to the start when we get to the last array entry.
-	 *
 	 *	pc points to the current processed packet command.
 	 *
 	 *	failed_pc points to the last failed packet command, or contains
@@ -224,13 +183,11 @@
 	struct ide_atapi_pc *pc;
 	/* Last failed packet command */
 	struct ide_atapi_pc *failed_pc;
-	/* Packet command stack */
-	struct ide_atapi_pc pc_stack[IDETAPE_PC_STACK];
-	/* Next free packet command storage space */
-	int pc_stack_index;
-	struct request rq_stack[IDETAPE_PC_STACK];
-	/* We implement a circular array */
-	int rq_stack_index;
+	/* used by REQ_IDETAPE_{READ,WRITE} requests */
+	struct ide_atapi_pc queued_pc;
+
+	struct ide_atapi_pc request_sense_pc;
+	struct request request_sense_rq;
 
 	/*
 	 * DSC polling variables.
@@ -451,47 +408,6 @@
 }
 
 /*
- *	idetape_next_pc_storage returns a pointer to a place in which we can
- *	safely store a packet command, even though we intend to leave the
- *	driver. A storage space for a maximum of IDETAPE_PC_STACK packet
- *	commands is allocated at initialization time.
- */
-static struct ide_atapi_pc *idetape_next_pc_storage(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	debug_log(DBG_PCRQ_STACK, "pc_stack_index=%d\n", tape->pc_stack_index);
-
-	if (tape->pc_stack_index == IDETAPE_PC_STACK)
-		tape->pc_stack_index = 0;
-	return (&tape->pc_stack[tape->pc_stack_index++]);
-}
-
-/*
- *	idetape_next_rq_storage is used along with idetape_next_pc_storage.
- *	Since we queue packet commands in the request queue, we need to
- *	allocate a request, along with the allocation of a packet command.
- */
-
-/**************************************************************
- *                                                            *
- *  This should get fixed to use kmalloc(.., GFP_ATOMIC)      *
- *  followed later on by kfree().   -ml                       *
- *                                                            *
- **************************************************************/
-
-static struct request *idetape_next_rq_storage(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	debug_log(DBG_PCRQ_STACK, "rq_stack_index=%d\n", tape->rq_stack_index);
-
-	if (tape->rq_stack_index == IDETAPE_PC_STACK)
-		tape->rq_stack_index = 0;
-	return (&tape->rq_stack[tape->rq_stack_index++]);
-}
-
-/*
  * called on each failed packet command retry to analyze the request sense. We
  * currently do not utilize this information.
  */
@@ -667,61 +583,14 @@
 	idetape_end_request(drive, uptodate, 0);
 }
 
-static void idetape_init_pc(struct ide_atapi_pc *pc)
-{
-	memset(pc->c, 0, 12);
-	pc->retries = 0;
-	pc->flags = 0;
-	pc->req_xfer = 0;
-	pc->buf = pc->pc_buf;
-	pc->buf_size = IDETAPE_PC_BUFFER_SIZE;
-	pc->bh = NULL;
-	pc->b_data = NULL;
-}
-
 static void idetape_create_request_sense_cmd(struct ide_atapi_pc *pc)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = REQUEST_SENSE;
 	pc->c[4] = 20;
 	pc->req_xfer = 20;
 }
 
-static void idetape_init_rq(struct request *rq, u8 cmd)
-{
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd[13] = cmd;
-}
-
-/*
- * Generate a new packet command request in front of the request queue, before
- * the current request, so that it will be processed immediately, on the next
- * pass through the driver. The function below is called from the request
- * handling part of the driver (the "bottom" part). Safe storage for the request
- * should be allocated with ide_tape_next_{pc,rq}_storage() prior to that.
- *
- * Memory for those requests is pre-allocated at initialization time, and is
- * limited to IDETAPE_PC_STACK requests. We assume that we have enough space for
- * the maximum possible number of inter-dependent packet commands.
- *
- * The higher level of the driver - The ioctl handler and the character device
- * handling functions should queue request to the lower level part and wait for
- * their completion using idetape_queue_pc_tail or idetape_queue_rw_tail.
- */
-static void idetape_queue_pc_head(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				  struct request *rq)
-{
-	struct ide_tape_obj *tape = drive->driver_data;
-
-	idetape_init_rq(rq, REQ_IDETAPE_PC1);
-	rq->cmd_flags |= REQ_PREEMPT;
-	rq->buffer = (char *) pc;
-	rq->rq_disk = tape->disk;
-	memcpy(rq->cmd, pc->c, 12);
-	ide_do_drive_cmd(drive, rq);
-}
-
 /*
  *	idetape_retry_pc is called when an error was detected during the
  *	last packet command. We queue a request sense packet command in
@@ -729,15 +598,14 @@
  */
 static void idetape_retry_pc(ide_drive_t *drive)
 {
-	struct ide_atapi_pc *pc;
-	struct request *rq;
+	struct ide_tape_obj *tape = drive->driver_data;
+	struct request *rq = &tape->request_sense_rq;
+	struct ide_atapi_pc *pc = &tape->request_sense_pc;
 
 	(void)ide_read_error(drive);
-	pc = idetape_next_pc_storage(drive);
-	rq = idetape_next_rq_storage(drive);
 	idetape_create_request_sense_cmd(pc);
 	set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
-	idetape_queue_pc_head(drive, pc, rq);
+	ide_queue_pc_head(drive, tape->disk, pc, rq);
 }
 
 /*
@@ -766,13 +634,15 @@
 	idetape_postpone_request(drive);
 }
 
-static void ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
+static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 				unsigned int bcount, int write)
 {
 	if (write)
 		idetape_output_buffers(drive, pc, bcount);
 	else
 		idetape_input_buffers(drive, pc, bcount);
+
+	return bcount;
 }
 
 /*
@@ -786,7 +656,7 @@
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-	return ide_pc_intr(drive, tape->pc, idetape_pc_intr, IDETAPE_WAIT_CMD,
+	return ide_pc_intr(drive, tape->pc, idetape_pc_intr, WAIT_TAPE_CMD,
 			   NULL, idetape_update_buffers, idetape_retry_pc,
 			   ide_tape_handle_dsc, ide_tape_io_buffers);
 }
@@ -832,7 +702,7 @@
 	idetape_tape_t *tape = drive->driver_data;
 
 	return ide_transfer_pc(drive, tape->pc, idetape_pc_intr,
-			       IDETAPE_WAIT_CMD, NULL);
+			       WAIT_TAPE_CMD, NULL);
 }
 
 static ide_startstop_t idetape_issue_pc(ide_drive_t *drive,
@@ -881,13 +751,13 @@
 	pc->retries++;
 
 	return ide_issue_pc(drive, pc, idetape_transfer_pc,
-			    IDETAPE_WAIT_CMD, NULL);
+			    WAIT_TAPE_CMD, NULL);
 }
 
 /* A mode sense command is used to "sense" tape parameters. */
 static void idetape_create_mode_sense_cmd(struct ide_atapi_pc *pc, u8 page_code)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = MODE_SENSE;
 	if (page_code != IDETAPE_BLOCK_DESCRIPTOR)
 		/* DBD = 1 - Don't return block descriptors */
@@ -920,8 +790,8 @@
 
 	stat = hwif->tp_ops->read_status(hwif);
 
-	if (stat & SEEK_STAT) {
-		if (stat & ERR_STAT) {
+	if (stat & ATA_DSC) {
+		if (stat & ATA_ERR) {
 			/* Error detected */
 			if (pc->c[0] != TEST_UNIT_READY)
 				printk(KERN_ERR "ide-tape: %s: I/O error, ",
@@ -946,7 +816,7 @@
 	struct idetape_bh *bh = (struct idetape_bh *)rq->special;
 	unsigned int length = rq->current_nr_sectors;
 
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
 	pc->bh = bh;
@@ -978,9 +848,10 @@
 	struct request *postponed_rq = tape->postponed_rq;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %ld, nr_sectors: %ld,"
-			" current_nr_sectors: %d\n",
-			rq->sector, rq->nr_sectors, rq->current_nr_sectors);
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu,"
+			" current_nr_sectors: %u\n",
+			(unsigned long long)rq->sector, rq->nr_sectors,
+			rq->current_nr_sectors);
 
 	if (!blk_special_request(rq)) {
 		/* We do not support buffer cache originated requests. */
@@ -1021,7 +892,7 @@
 	}
 
 	if (!test_and_clear_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags) &&
-	    (stat & SEEK_STAT) == 0) {
+	    (stat & ATA_DSC) == 0) {
 		if (postponed_rq == NULL) {
 			tape->dsc_polling_start = jiffies;
 			tape->dsc_poll_freq = tape->best_dsc_rw_freq;
@@ -1043,12 +914,12 @@
 		return ide_stopped;
 	}
 	if (rq->cmd[13] & REQ_IDETAPE_READ) {
-		pc = idetape_next_pc_storage(drive);
+		pc = &tape->queued_pc;
 		ide_tape_create_rw_cmd(tape, pc, rq, READ_6);
 		goto out;
 	}
 	if (rq->cmd[13] & REQ_IDETAPE_WRITE) {
-		pc = idetape_next_pc_storage(drive);
+		pc = &tape->queued_pc;
 		ide_tape_create_rw_cmd(tape, pc, rq, WRITE_6);
 		goto out;
 	}
@@ -1235,77 +1106,30 @@
 static void idetape_create_write_filemark_cmd(ide_drive_t *drive,
 		struct ide_atapi_pc *pc, int write_filemark)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = WRITE_FILEMARKS;
 	pc->c[4] = write_filemark;
 	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
 }
 
-static void idetape_create_test_unit_ready_cmd(struct ide_atapi_pc *pc)
-{
-	idetape_init_pc(pc);
-	pc->c[0] = TEST_UNIT_READY;
-}
-
-/*
- * We add a special packet command request to the tail of the request queue, and
- * wait for it to be serviced. This is not to be called from within the request
- * handling part of the driver! We allocate here data on the stack and it is
- * valid until the request is finished. This is not the case for the bottom part
- * of the driver, where we are always leaving the functions to wait for an
- * interrupt or a timer event.
- *
- * From the bottom part of the driver, we should allocate safe memory using
- * idetape_next_pc_storage() and ide_tape_next_rq_storage(), and add the request
- * to the request list without waiting for it to be serviced! In that case, we
- * usually use idetape_queue_pc_head().
- */
-static int idetape_queue_pc_tail(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-	struct ide_tape_obj *tape = drive->driver_data;
-	struct request *rq;
-	int error;
-
-	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd[13] = REQ_IDETAPE_PC1;
-	rq->buffer = (char *)pc;
-	memcpy(rq->cmd, pc->c, 12);
-	error = blk_execute_rq(drive->queue, tape->disk, rq, 0);
-	blk_put_request(rq);
-	return error;
-}
-
-static void idetape_create_load_unload_cmd(ide_drive_t *drive,
-		struct ide_atapi_pc *pc, int cmd)
-{
-	idetape_init_pc(pc);
-	pc->c[0] = START_STOP;
-	pc->c[4] = cmd;
-	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
-}
-
 static int idetape_wait_ready(ide_drive_t *drive, unsigned long timeout)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc pc;
+	struct gendisk *disk = tape->disk;
 	int load_attempted = 0;
 
 	/* Wait for the tape to become ready */
 	set_bit(IDE_AFLAG_MEDIUM_PRESENT, &drive->atapi_flags);
 	timeout += jiffies;
 	while (time_before(jiffies, timeout)) {
-		idetape_create_test_unit_ready_cmd(&pc);
-		if (!idetape_queue_pc_tail(drive, &pc))
+		if (ide_do_test_unit_ready(drive, disk) == 0)
 			return 0;
 		if ((tape->sense_key == 2 && tape->asc == 4 && tape->ascq == 2)
 		    || (tape->asc == 0x3A)) {
 			/* no media */
 			if (load_attempted)
 				return -ENOMEDIUM;
-			idetape_create_load_unload_cmd(drive, &pc,
-							IDETAPE_LU_LOAD_MASK);
-			idetape_queue_pc_tail(drive, &pc);
+			ide_do_start_stop(drive, disk, IDETAPE_LU_LOAD_MASK);
 			load_attempted = 1;
 		/* not about to be ready */
 		} else if (!(tape->sense_key == 2 && tape->asc == 4 &&
@@ -1318,11 +1142,12 @@
 
 static int idetape_flush_tape_buffers(ide_drive_t *drive)
 {
+	struct ide_tape_obj *tape = drive->driver_data;
 	struct ide_atapi_pc pc;
 	int rc;
 
 	idetape_create_write_filemark_cmd(drive, &pc, 0);
-	rc = idetape_queue_pc_tail(drive, &pc);
+	rc = ide_queue_pc_tail(drive, tape->disk, &pc);
 	if (rc)
 		return rc;
 	idetape_wait_ready(drive, 60 * 5 * HZ);
@@ -1331,7 +1156,7 @@
 
 static void idetape_create_read_position_cmd(struct ide_atapi_pc *pc)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = READ_POSITION;
 	pc->req_xfer = 20;
 }
@@ -1345,7 +1170,7 @@
 	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	idetape_create_read_position_cmd(&pc);
-	if (idetape_queue_pc_tail(drive, &pc))
+	if (ide_queue_pc_tail(drive, tape->disk, &pc))
 		return -1;
 	position = tape->first_frame;
 	return position;
@@ -1355,7 +1180,7 @@
 		struct ide_atapi_pc *pc,
 		unsigned int block, u8 partition, int skip)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = POSITION_TO_ELEMENT;
 	pc->c[1] = 2;
 	put_unaligned(cpu_to_be32(block), (unsigned int *) &pc->c[3]);
@@ -1363,21 +1188,6 @@
 	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
 }
 
-static int idetape_create_prevent_cmd(ide_drive_t *drive,
-		struct ide_atapi_pc *pc, int prevent)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	/* device supports locking according to capabilities page */
-	if (!(tape->caps[6] & 0x01))
-		return 0;
-
-	idetape_init_pc(pc);
-	pc->c[0] = ALLOW_MEDIUM_REMOVAL;
-	pc->c[4] = prevent;
-	return 1;
-}
-
 static void __ide_tape_discard_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -1405,6 +1215,7 @@
 		u8 partition, int skip)
 {
 	idetape_tape_t *tape = drive->driver_data;
+	struct gendisk *disk = tape->disk;
 	int retval;
 	struct ide_atapi_pc pc;
 
@@ -1412,12 +1223,12 @@
 		__ide_tape_discard_merge_buffer(drive);
 	idetape_wait_ready(drive, 60 * 5 * HZ);
 	idetape_create_locate_cmd(drive, &pc, block, partition, skip);
-	retval = idetape_queue_pc_tail(drive, &pc);
+	retval = ide_queue_pc_tail(drive, disk, &pc);
 	if (retval)
 		return (retval);
 
 	idetape_create_read_position_cmd(&pc);
-	return (idetape_queue_pc_tail(drive, &pc));
+	return ide_queue_pc_tail(drive, disk, &pc);
 }
 
 static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
@@ -1477,7 +1288,7 @@
 
 static void idetape_create_inquiry_cmd(struct ide_atapi_pc *pc)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = INQUIRY;
 	pc->c[4] = 254;
 	pc->req_xfer = 254;
@@ -1486,14 +1297,14 @@
 static void idetape_create_rewind_cmd(ide_drive_t *drive,
 		struct ide_atapi_pc *pc)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = REZERO_UNIT;
 	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
 }
 
 static void idetape_create_erase_cmd(struct ide_atapi_pc *pc)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = ERASE;
 	pc->c[1] = 1;
 	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
@@ -1501,7 +1312,7 @@
 
 static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 {
-	idetape_init_pc(pc);
+	ide_init_pc(pc);
 	pc->c[0] = SPACE;
 	put_unaligned(cpu_to_be32(count), (unsigned int *) &pc->c[1]);
 	pc->c[1] = cmd;
@@ -1664,20 +1475,20 @@
  */
 static int idetape_rewind_tape(ide_drive_t *drive)
 {
+	struct ide_tape_obj *tape = drive->driver_data;
+	struct gendisk *disk = tape->disk;
 	int retval;
 	struct ide_atapi_pc pc;
-	idetape_tape_t *tape;
-	tape = drive->driver_data;
 
 	debug_log(DBG_SENSE, "Enter %s\n", __func__);
 
 	idetape_create_rewind_cmd(drive, &pc);
-	retval = idetape_queue_pc_tail(drive, &pc);
+	retval = ide_queue_pc_tail(drive, disk, &pc);
 	if (retval)
 		return retval;
 
 	idetape_create_read_position_cmd(&pc);
-	retval = idetape_queue_pc_tail(drive, &pc);
+	retval = ide_queue_pc_tail(drive, disk, &pc);
 	if (retval)
 		return retval;
 	return 0;
@@ -1720,6 +1531,7 @@
 					int mt_count)
 {
 	idetape_tape_t *tape = drive->driver_data;
+	struct gendisk *disk = tape->disk;
 	struct ide_atapi_pc pc;
 	int retval, count = 0;
 	int sprev = !!(tape->caps[4] & 0x20);
@@ -1744,7 +1556,7 @@
 	case MTBSF:
 		idetape_create_space_cmd(&pc, mt_count - count,
 					 IDETAPE_SPACE_OVER_FILEMARK);
-		return idetape_queue_pc_tail(drive, &pc);
+		return ide_queue_pc_tail(drive, disk, &pc);
 	case MTFSFM:
 	case MTBSFM:
 		if (!sprev)
@@ -1933,11 +1745,12 @@
 
 static int idetape_write_filemark(ide_drive_t *drive)
 {
+	struct ide_tape_obj *tape = drive->driver_data;
 	struct ide_atapi_pc pc;
 
 	/* Write a filemark */
 	idetape_create_write_filemark_cmd(drive, &pc, 1);
-	if (idetape_queue_pc_tail(drive, &pc)) {
+	if (ide_queue_pc_tail(drive, tape->disk, &pc)) {
 		printk(KERN_ERR "ide-tape: Couldn't write a filemark\n");
 		return -EIO;
 	}
@@ -1960,6 +1773,7 @@
 static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count)
 {
 	idetape_tape_t *tape = drive->driver_data;
+	struct gendisk *disk = tape->disk;
 	struct ide_atapi_pc pc;
 	int i, retval;
 
@@ -1996,9 +1810,7 @@
 		return 0;
 	case MTLOAD:
 		ide_tape_discard_merge_buffer(drive, 0);
-		idetape_create_load_unload_cmd(drive, &pc,
-					       IDETAPE_LU_LOAD_MASK);
-		return idetape_queue_pc_tail(drive, &pc);
+		return ide_do_start_stop(drive, disk, IDETAPE_LU_LOAD_MASK);
 	case MTUNLOAD:
 	case MTOFFL:
 		/*
@@ -2006,14 +1818,11 @@
 		 * attempting to eject.
 		 */
 		if (tape->door_locked) {
-			if (idetape_create_prevent_cmd(drive, &pc, 0))
-				if (!idetape_queue_pc_tail(drive, &pc))
-					tape->door_locked = DOOR_UNLOCKED;
+			if (!ide_set_media_lock(drive, disk, 0))
+				tape->door_locked = DOOR_UNLOCKED;
 		}
 		ide_tape_discard_merge_buffer(drive, 0);
-		idetape_create_load_unload_cmd(drive, &pc,
-					      !IDETAPE_LU_LOAD_MASK);
-		retval = idetape_queue_pc_tail(drive, &pc);
+		retval = ide_do_start_stop(drive, disk, !IDETAPE_LU_LOAD_MASK);
 		if (!retval)
 			clear_bit(IDE_AFLAG_MEDIUM_PRESENT, &drive->atapi_flags);
 		return retval;
@@ -2022,16 +1831,15 @@
 		return idetape_flush_tape_buffers(drive);
 	case MTRETEN:
 		ide_tape_discard_merge_buffer(drive, 0);
-		idetape_create_load_unload_cmd(drive, &pc,
+		return ide_do_start_stop(drive, disk,
 			IDETAPE_LU_RETENSION_MASK | IDETAPE_LU_LOAD_MASK);
-		return idetape_queue_pc_tail(drive, &pc);
 	case MTEOM:
 		idetape_create_space_cmd(&pc, 0, IDETAPE_SPACE_TO_EOD);
-		return idetape_queue_pc_tail(drive, &pc);
+		return ide_queue_pc_tail(drive, disk, &pc);
 	case MTERASE:
 		(void)idetape_rewind_tape(drive);
 		idetape_create_erase_cmd(&pc);
-		return idetape_queue_pc_tail(drive, &pc);
+		return ide_queue_pc_tail(drive, disk, &pc);
 	case MTSETBLK:
 		if (mt_count) {
 			if (mt_count < tape->blk_size ||
@@ -2052,17 +1860,13 @@
 	case MTFSR:
 	case MTBSR:
 	case MTLOCK:
-		if (!idetape_create_prevent_cmd(drive, &pc, 1))
-			return 0;
-		retval = idetape_queue_pc_tail(drive, &pc);
+		retval = ide_set_media_lock(drive, disk, 1);
 		if (retval)
 			return retval;
 		tape->door_locked = DOOR_EXPLICITLY_LOCKED;
 		return 0;
 	case MTUNLOCK:
-		if (!idetape_create_prevent_cmd(drive, &pc, 0))
-			return 0;
-		retval = idetape_queue_pc_tail(drive, &pc);
+		retval = ide_set_media_lock(drive, disk, 0);
 		if (retval)
 			return retval;
 		tape->door_locked = DOOR_UNLOCKED;
@@ -2144,7 +1948,7 @@
 	struct ide_atapi_pc pc;
 
 	idetape_create_mode_sense_cmd(&pc, IDETAPE_BLOCK_DESCRIPTOR);
-	if (idetape_queue_pc_tail(drive, &pc)) {
+	if (ide_queue_pc_tail(drive, tape->disk, &pc)) {
 		printk(KERN_ERR "ide-tape: Can't get block descriptor\n");
 		if (tape->blk_size == 0) {
 			printk(KERN_WARNING "ide-tape: Cannot deal with zero "
@@ -2164,7 +1968,6 @@
 	unsigned int minor = iminor(inode), i = minor & ~0xc0;
 	ide_drive_t *drive;
 	idetape_tape_t *tape;
-	struct ide_atapi_pc pc;
 	int retval;
 
 	if (i >= MAX_HWIFS * MAX_DRIVES)
@@ -2227,11 +2030,9 @@
 
 	/* Lock the tape drive door so user can't eject. */
 	if (tape->chrdev_dir == IDETAPE_DIR_NONE) {
-		if (idetape_create_prevent_cmd(drive, &pc, 1)) {
-			if (!idetape_queue_pc_tail(drive, &pc)) {
-				if (tape->door_locked != DOOR_EXPLICITLY_LOCKED)
-					tape->door_locked = DOOR_LOCKED;
-			}
+		if (!ide_set_media_lock(drive, tape->disk, 1)) {
+			if (tape->door_locked != DOOR_EXPLICITLY_LOCKED)
+				tape->door_locked = DOOR_LOCKED;
 		}
 	}
 	unlock_kernel();
@@ -2264,7 +2065,6 @@
 {
 	struct ide_tape_obj *tape = ide_tape_f(filp);
 	ide_drive_t *drive = tape->drive;
-	struct ide_atapi_pc pc;
 	unsigned int minor = iminor(inode);
 
 	lock_kernel();
@@ -2283,10 +2083,8 @@
 		(void) idetape_rewind_tape(drive);
 	if (tape->chrdev_dir == IDETAPE_DIR_NONE) {
 		if (tape->door_locked == DOOR_LOCKED) {
-			if (idetape_create_prevent_cmd(drive, &pc, 0)) {
-				if (!idetape_queue_pc_tail(drive, &pc))
-					tape->door_locked = DOOR_UNLOCKED;
-			}
+			if (!ide_set_media_lock(drive, tape->disk, 0))
+				tape->door_locked = DOOR_UNLOCKED;
 		}
 	}
 	clear_bit(IDE_AFLAG_BUSY, &drive->atapi_flags);
@@ -2295,45 +2093,6 @@
 	return 0;
 }
 
-/*
- * check the contents of the ATAPI IDENTIFY command results. We return:
- *
- * 1 - If the tape can be supported by us, based on the information we have so
- * far.
- *
- * 0 - If this tape driver is not currently supported by us.
- */
-static int idetape_identify_device(ide_drive_t *drive)
-{
-	u8 gcw[2], protocol, device_type, removable, packet_size;
-
-	if (drive->id_read == 0)
-		return 1;
-
-	*((unsigned short *) &gcw) = drive->id->config;
-
-	protocol	=   (gcw[1] & 0xC0) >> 6;
-	device_type	=    gcw[1] & 0x1F;
-	removable	= !!(gcw[0] & 0x80);
-	packet_size	=    gcw[0] & 0x3;
-
-	/* Check that we can support this device */
-	if (protocol != 2)
-		printk(KERN_ERR "ide-tape: Protocol (0x%02x) is not ATAPI\n",
-				protocol);
-	else if (device_type != 1)
-		printk(KERN_ERR "ide-tape: Device type (0x%02x) is not set "
-				"to tape\n", device_type);
-	else if (!removable)
-		printk(KERN_ERR "ide-tape: The removable flag is not set\n");
-	else if (packet_size != 0) {
-		printk(KERN_ERR "ide-tape: Packet size (0x%02x) is not 12"
-				" bytes\n", packet_size);
-	} else
-		return 1;
-	return 0;
-}
-
 static void idetape_get_inquiry_results(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -2341,7 +2100,7 @@
 	char fw_rev[4], vendor_id[8], product_id[16];
 
 	idetape_create_inquiry_cmd(&pc);
-	if (idetape_queue_pc_tail(drive, &pc)) {
+	if (ide_queue_pc_tail(drive, tape->disk, &pc)) {
 		printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n",
 				tape->name);
 		return;
@@ -2370,7 +2129,7 @@
 	u8 speed, max_speed;
 
 	idetape_create_mode_sense_cmd(&pc, IDETAPE_CAPABILITIES_PAGE);
-	if (idetape_queue_pc_tail(drive, &pc)) {
+	if (ide_queue_pc_tail(drive, tape->disk, &pc)) {
 		printk(KERN_ERR "ide-tape: Can't get tape parameters - assuming"
 				" some default values\n");
 		tape->blk_size = 512;
@@ -2402,6 +2161,11 @@
 	}
 
 	memcpy(&tape->caps, caps, 20);
+
+	/* device lacks locking support according to capabilities page */
+	if ((caps[6] & 1) == 0)
+		drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+
 	if (caps[7] & 0x02)
 		tape->blk_size = 512;
 	else if (caps[7] & 0x04)
@@ -2409,28 +2173,56 @@
 }
 
 #ifdef CONFIG_IDE_PROC_FS
-static void idetape_add_settings(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	ide_add_setting(drive, "buffer", SETTING_READ, TYPE_SHORT, 0, 0xffff,
-			1, 2, (u16 *)&tape->caps[16], NULL);
-	ide_add_setting(drive, "speed", SETTING_READ, TYPE_SHORT, 0, 0xffff,
-			1, 1, (u16 *)&tape->caps[14], NULL);
-	ide_add_setting(drive, "buffer_size", SETTING_READ, TYPE_INT, 0, 0xffff,
-			1, 1024, &tape->buffer_size, NULL);
-	ide_add_setting(drive, "tdsc", SETTING_RW, TYPE_INT, IDETAPE_DSC_RW_MIN,
-			IDETAPE_DSC_RW_MAX, 1000, HZ, &tape->best_dsc_rw_freq,
-			NULL);
-	ide_add_setting(drive, "dsc_overlap", SETTING_RW, TYPE_BYTE, 0, 1, 1,
-			1, &drive->dsc_overlap, NULL);
-	ide_add_setting(drive, "avg_speed", SETTING_READ, TYPE_INT, 0, 0xffff,
-			1, 1, &tape->avg_speed, NULL);
-	ide_add_setting(drive, "debug_mask", SETTING_RW, TYPE_INT, 0, 0xffff, 1,
-			1, &tape->debug_mask, NULL);
+#define ide_tape_devset_get(name, field) \
+static int get_##name(ide_drive_t *drive) \
+{ \
+	idetape_tape_t *tape = drive->driver_data; \
+	return tape->field; \
 }
-#else
-static inline void idetape_add_settings(ide_drive_t *drive) { ; }
+
+#define ide_tape_devset_set(name, field) \
+static int set_##name(ide_drive_t *drive, int arg) \
+{ \
+	idetape_tape_t *tape = drive->driver_data; \
+	tape->field = arg; \
+	return 0; \
+}
+
+#define ide_tape_devset_rw_field(_name, _field) \
+ide_tape_devset_get(_name, _field) \
+ide_tape_devset_set(_name, _field) \
+IDE_DEVSET(_name, DS_SYNC, get_##_name, set_##_name)
+
+#define ide_tape_devset_r_field(_name, _field) \
+ide_tape_devset_get(_name, _field) \
+IDE_DEVSET(_name, 0, get_##_name, NULL)
+
+static int mulf_tdsc(ide_drive_t *drive)	{ return 1000; }
+static int divf_tdsc(ide_drive_t *drive)	{ return   HZ; }
+static int divf_buffer(ide_drive_t *drive)	{ return    2; }
+static int divf_buffer_size(ide_drive_t *drive)	{ return 1024; }
+
+ide_devset_rw_field(dsc_overlap, dsc_overlap);
+
+ide_tape_devset_rw_field(debug_mask, debug_mask);
+ide_tape_devset_rw_field(tdsc, best_dsc_rw_freq);
+
+ide_tape_devset_r_field(avg_speed, avg_speed);
+ide_tape_devset_r_field(speed, caps[14]);
+ide_tape_devset_r_field(buffer, caps[16]);
+ide_tape_devset_r_field(buffer_size, buffer_size);
+
+static const struct ide_proc_devset idetape_settings[] = {
+	__IDE_PROC_DEVSET(avg_speed,	0, 0xffff, NULL, NULL),
+	__IDE_PROC_DEVSET(buffer,	0, 0xffff, NULL, divf_buffer),
+	__IDE_PROC_DEVSET(buffer_size,	0, 0xffff, NULL, divf_buffer_size),
+	__IDE_PROC_DEVSET(debug_mask,	0, 0xffff, NULL, NULL),
+	__IDE_PROC_DEVSET(dsc_overlap,	0,      1, NULL, NULL),
+	__IDE_PROC_DEVSET(speed,	0, 0xffff, NULL, NULL),
+	__IDE_PROC_DEVSET(tdsc,		IDETAPE_DSC_RW_MIN, IDETAPE_DSC_RW_MAX,
+					mulf_tdsc, divf_tdsc),
+	{ 0 },
+};
 #endif
 
 /*
@@ -2462,15 +2254,15 @@
 		drive->dsc_overlap = 0;
 	}
 	/* Seagate Travan drives do not support DSC overlap. */
-	if (strstr(drive->id->model, "Seagate STT3401"))
+	if (strstr((char *)&drive->id[ATA_ID_PROD], "Seagate STT3401"))
 		drive->dsc_overlap = 0;
 	tape->minor = minor;
 	tape->name[0] = 'h';
 	tape->name[1] = 't';
 	tape->name[2] = '0' + minor;
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
-	tape->pc = tape->pc_stack;
-	*((unsigned short *) &gcw) = drive->id->config;
+
+	*((u16 *)&gcw) = drive->id[ATA_ID_CONFIG];
 
 	/* Command packet DRQ type */
 	if (((gcw[0] & 0x60) >> 5) == 1)
@@ -2512,7 +2304,7 @@
 		tape->best_dsc_rw_freq * 1000 / HZ,
 		drive->using_dma ? ", DMA":"");
 
-	idetape_add_settings(drive);
+	ide_proc_register_driver(drive, tape->driver);
 }
 
 static void ide_tape_remove(ide_drive_t *drive)
@@ -2577,12 +2369,12 @@
 	.remove			= ide_tape_remove,
 	.version		= IDETAPE_VERSION,
 	.media			= ide_tape,
-	.supports_dsc_overlap 	= 1,
 	.do_request		= idetape_do_request,
 	.end_request		= idetape_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
 	.proc			= idetape_proc,
+	.settings		= idetape_settings,
 #endif
 };
 
@@ -2645,11 +2437,11 @@
 
 	if (!strstr("ide-tape", drive->driver_req))
 		goto failed;
-	if (!drive->present)
-		goto failed;
+
 	if (drive->media != ide_tape)
 		goto failed;
-	if (!idetape_identify_device(drive)) {
+
+	if (drive->id_read == 1 && !ide_check_atapi_device(drive, DRV_NAME)) {
 		printk(KERN_ERR "ide-tape: %s: not supported by this version of"
 				" the driver\n", drive->name);
 		goto failed;
@@ -2667,8 +2459,6 @@
 
 	ide_init_disk(g, drive);
 
-	ide_proc_register_driver(drive, &idetape_driver);
-
 	kref_init(&tape->kref);
 
 	tape->drive = drive;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 7fb6f1c..487b18b 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -44,9 +44,9 @@
 	memset(&args, 0, sizeof(ide_task_t));
 	args.tf.nsect = 0x01;
 	if (drive->media == ide_disk)
-		args.tf.command = WIN_IDENTIFY;
+		args.tf.command = ATA_CMD_ID_ATA;
 	else
-		args.tf.command = WIN_PIDENTIFY;
+		args.tf.command = ATA_CMD_ID_ATAPI;
 	args.tf_flags	= IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
 	args.data_phase	= TASKFILE_IN;
 	return ide_raw_taskfile(drive, &args, buf, 1);
@@ -99,12 +99,17 @@
 	case TASKFILE_NO_DATA:
 		if (handler == NULL)
 			handler = task_no_data_intr;
-		/* WIN_{SPECIFY,RESTORE,SETMULT} use custom handlers */
 		if (task->tf_flags & IDE_TFLAG_CUSTOM_HANDLER) {
 			switch (tf->command) {
-			case WIN_SPECIFY: handler = set_geometry_intr;	break;
-			case WIN_RESTORE: handler = recal_intr;		break;
-			case WIN_SETMULT: handler = set_multmode_intr;	break;
+			case ATA_CMD_INIT_DEV_PARAMS:
+				handler = set_geometry_intr;
+				break;
+			case ATA_CMD_RESTORE:
+				handler = recal_intr;
+				break;
+			case ATA_CMD_SET_MULTI:
+				handler = set_multmode_intr;
+				break;
 			}
 		}
 		ide_execute_command(drive, tf->command, handler,
@@ -121,7 +126,7 @@
 EXPORT_SYMBOL_GPL(do_rw_taskfile);
 
 /*
- * set_multmode_intr() is invoked on completion of a WIN_SETMULT cmd.
+ * set_multmode_intr() is invoked on completion of a ATA_CMD_SET_MULTI cmd.
  */
 static ide_startstop_t set_multmode_intr(ide_drive_t *drive)
 {
@@ -131,7 +136,7 @@
 	local_irq_enable_in_hardirq();
 	stat = hwif->tp_ops->read_status(hwif);
 
-	if (OK_STAT(stat, READY_STAT, BAD_STAT))
+	if (OK_STAT(stat, ATA_DRDY, BAD_STAT))
 		drive->mult_count = drive->mult_req;
 	else {
 		drive->mult_req = drive->mult_count = 0;
@@ -142,7 +147,7 @@
 }
 
 /*
- * set_geometry_intr() is invoked on completion of a WIN_SPECIFY cmd.
+ * set_geometry_intr() is invoked on completion of a ATA_CMD_INIT_DEV_PARAMS cmd.
  */
 static ide_startstop_t set_geometry_intr(ide_drive_t *drive)
 {
@@ -154,15 +159,15 @@
 
 	while (1) {
 		stat = hwif->tp_ops->read_status(hwif);
-		if ((stat & BUSY_STAT) == 0 || retries-- == 0)
+		if ((stat & ATA_BUSY) == 0 || retries-- == 0)
 			break;
 		udelay(10);
 	};
 
-	if (OK_STAT(stat, READY_STAT, BAD_STAT))
+	if (OK_STAT(stat, ATA_DRDY, BAD_STAT))
 		return ide_stopped;
 
-	if (stat & (ERR_STAT|DRQ_STAT))
+	if (stat & (ATA_ERR | ATA_DRQ))
 		return ide_error(drive, "set_geometry_intr", stat);
 
 	ide_set_handler(drive, &set_geometry_intr, WAIT_WORSTCASE, NULL);
@@ -170,7 +175,7 @@
 }
 
 /*
- * recal_intr() is invoked on completion of a WIN_RESTORE (recalibrate) cmd.
+ * recal_intr() is invoked on completion of a ATA_CMD_RESTORE (recalibrate) cmd.
  */
 static ide_startstop_t recal_intr(ide_drive_t *drive)
 {
@@ -180,7 +185,7 @@
 	local_irq_enable_in_hardirq();
 	stat = hwif->tp_ops->read_status(hwif);
 
-	if (!OK_STAT(stat, READY_STAT, BAD_STAT))
+	if (!OK_STAT(stat, ATA_DRDY, BAD_STAT))
 		return ide_error(drive, "recal_intr", stat);
 	return ide_stopped;
 }
@@ -197,7 +202,7 @@
 	local_irq_enable_in_hardirq();
 	stat = hwif->tp_ops->read_status(hwif);
 
-	if (!OK_STAT(stat, READY_STAT, BAD_STAT))
+	if (!OK_STAT(stat, ATA_DRDY, BAD_STAT))
 		return ide_error(drive, "task_no_data_intr", stat);
 		/* calls ide_end_drive_cmd */
 
@@ -220,13 +225,13 @@
 	for (retries = 0; retries < 1000; retries++) {
 		stat = hwif->tp_ops->read_status(hwif);
 
-		if (stat & BUSY_STAT)
+		if (stat & ATA_BUSY)
 			udelay(10);
 		else
 			break;
 	}
 
-	if (stat & BUSY_STAT)
+	if (stat & ATA_BUSY)
 		printk(KERN_ERR "%s: drive still BUSY!\n", drive->name);
 
 	return stat;
@@ -385,7 +390,7 @@
 static ide_startstop_t task_in_unexpected(ide_drive_t *drive, struct request *rq, u8 stat)
 {
 	/* Command all done? */
-	if (OK_STAT(stat, READY_STAT, BUSY_STAT)) {
+	if (OK_STAT(stat, ATA_DRDY, ATA_BUSY)) {
 		task_end_request(drive, rq, stat);
 		return ide_stopped;
 	}
@@ -405,11 +410,11 @@
 	u8 stat = hwif->tp_ops->read_status(hwif);
 
 	/* Error? */
-	if (stat & ERR_STAT)
+	if (stat & ATA_ERR)
 		return task_error(drive, rq, __func__, stat);
 
 	/* Didn't want any data? Odd. */
-	if (!(stat & DRQ_STAT))
+	if ((stat & ATA_DRQ) == 0)
 		return task_in_unexpected(drive, rq, stat);
 
 	ide_pio_datablock(drive, rq, 0);
@@ -442,7 +447,7 @@
 		return task_error(drive, rq, __func__, stat);
 
 	/* Deal with unexpected ATA data phase. */
-	if (((stat & DRQ_STAT) == 0) ^ !hwif->nleft)
+	if (((stat & ATA_DRQ) == 0) ^ !hwif->nleft)
 		return task_error(drive, rq, __func__, stat);
 
 	if (!hwif->nleft) {
@@ -461,7 +466,7 @@
 {
 	ide_startstop_t startstop;
 
-	if (ide_wait_stat(&startstop, drive, DRQ_STAT,
+	if (ide_wait_stat(&startstop, drive, ATA_DRQ,
 			  drive->bad_wstat, WAIT_DRQ)) {
 		printk(KERN_ERR "%s: no DRQ after issuing %sWRITE%s\n",
 				drive->name,
@@ -721,110 +726,3 @@
 	return err;
 }
 #endif
-
-int ide_cmd_ioctl (ide_drive_t *drive, unsigned int cmd, unsigned long arg)
-{
-	u8 *buf = NULL;
-	int bufsize = 0, err = 0;
-	u8 args[4], xfer_rate = 0;
-	ide_task_t tfargs;
-	struct ide_taskfile *tf = &tfargs.tf;
-	struct hd_driveid *id = drive->id;
-
-	if (NULL == (void *) arg) {
-		struct request *rq;
-
-		rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-		rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-		err = blk_execute_rq(drive->queue, NULL, rq, 0);
-		blk_put_request(rq);
-
-		return err;
-	}
-
-	if (copy_from_user(args, (void __user *)arg, 4))
-		return -EFAULT;
-
-	memset(&tfargs, 0, sizeof(ide_task_t));
-	tf->feature = args[2];
-	if (args[0] == WIN_SMART) {
-		tf->nsect = args[3];
-		tf->lbal  = args[1];
-		tf->lbam  = 0x4f;
-		tf->lbah  = 0xc2;
-		tfargs.tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_IN_NSECT;
-	} else {
-		tf->nsect = args[1];
-		tfargs.tf_flags = IDE_TFLAG_OUT_FEATURE |
-				  IDE_TFLAG_OUT_NSECT | IDE_TFLAG_IN_NSECT;
-	}
-	tf->command = args[0];
-	tfargs.data_phase = args[3] ? TASKFILE_IN : TASKFILE_NO_DATA;
-
-	if (args[3]) {
-		tfargs.tf_flags |= IDE_TFLAG_IO_16BIT;
-		bufsize = SECTOR_WORDS * 4 * args[3];
-		buf = kzalloc(bufsize, GFP_KERNEL);
-		if (buf == NULL)
-			return -ENOMEM;
-	}
-
-	if (tf->command == WIN_SETFEATURES &&
-	    tf->feature == SETFEATURES_XFER &&
-	    tf->nsect >= XFER_SW_DMA_0 &&
-	    (id->dma_ultra || id->dma_mword || id->dma_1word)) {
-		xfer_rate = args[1];
-		if (tf->nsect > XFER_UDMA_2 && !eighty_ninty_three(drive)) {
-			printk(KERN_WARNING "%s: UDMA speeds >UDMA33 cannot "
-					    "be set\n", drive->name);
-			goto abort;
-		}
-	}
-
-	err = ide_raw_taskfile(drive, &tfargs, buf, args[3]);
-
-	args[0] = tf->status;
-	args[1] = tf->error;
-	args[2] = tf->nsect;
-
-	if (!err && xfer_rate) {
-		/* active-retuning-calls future */
-		ide_set_xfer_rate(drive, xfer_rate);
-		ide_driveid_update(drive);
-	}
-abort:
-	if (copy_to_user((void __user *)arg, &args, 4))
-		err = -EFAULT;
-	if (buf) {
-		if (copy_to_user((void __user *)(arg + 4), buf, bufsize))
-			err = -EFAULT;
-		kfree(buf);
-	}
-	return err;
-}
-
-int ide_task_ioctl (ide_drive_t *drive, unsigned int cmd, unsigned long arg)
-{
-	void __user *p = (void __user *)arg;
-	int err = 0;
-	u8 args[7];
-	ide_task_t task;
-
-	if (copy_from_user(args, p, 7))
-		return -EFAULT;
-
-	memset(&task, 0, sizeof(task));
-	memcpy(&task.tf_array[7], &args[1], 6);
-	task.tf.command = args[0];
-	task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
-
-	err = ide_no_data_taskfile(drive, &task);
-
-	args[0] = task.tf.command;
-	memcpy(&args[1], &task.tf_array[7], 6);
-
-	if (copy_to_user(p, args, 7))
-		err = -EFAULT;
-
-	return err;
-}
diff --git a/drivers/ide/ide-timings.c b/drivers/ide/ide-timings.c
index 8c2f832..81f527a 100644
--- a/drivers/ide/ide-timings.c
+++ b/drivers/ide/ide-timings.c
@@ -22,7 +22,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/module.h>
 
@@ -78,15 +77,15 @@
 
 u16 ide_pio_cycle_time(ide_drive_t *drive, u8 pio)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 	struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio);
 	u16 cycle = 0;
 
-	if (id->field_valid & 2) {
-		if (id->capability & 8)
-			cycle = id->eide_pio_iordy;
+	if (id[ATA_ID_FIELD_VALID] & 2) {
+		if (ata_id_has_iordy(drive->id))
+			cycle = id[ATA_ID_EIDE_PIO_IORDY];
 		else
-			cycle = id->eide_pio;
+			cycle = id[ATA_ID_EIDE_PIO];
 
 		/* conservative "downgrade" for all pre-ATA2 drives */
 		if (pio < 3 && cycle < t->cycle)
@@ -138,7 +137,7 @@
 int ide_timing_compute(ide_drive_t *drive, u8 speed,
 		       struct ide_timing *t, int T, int UT)
 {
-	struct hd_driveid *id = drive->id;
+	u16 *id = drive->id;
 	struct ide_timing *s, p;
 
 	/*
@@ -157,16 +156,15 @@
 	 * If the drive is an EIDE drive, it can tell us it needs extended
 	 * PIO/MWDMA cycle timing.
 	 */
-	if (id && id->field_valid & 2) {	/* EIDE drive */
-
+	if (id[ATA_ID_FIELD_VALID] & 2) {	/* EIDE drive */
 		memset(&p, 0, sizeof(p));
 
 		if (speed <= XFER_PIO_2)
-			p.cycle = p.cyc8b = id->eide_pio;
+			p.cycle = p.cyc8b = id[ATA_ID_EIDE_PIO];
 		else if (speed <= XFER_PIO_5)
-			p.cycle = p.cyc8b = id->eide_pio_iordy;
+			p.cycle = p.cyc8b = id[ATA_ID_EIDE_PIO_IORDY];
 		else if (speed >= XFER_MW_DMA_0 && speed <= XFER_MW_DMA_2)
-			p.cycle = id->eide_dma_min;
+			p.cycle = id[ATA_ID_EIDE_DMA_MIN];
 
 		ide_timing_merge(&p, t, t, IDE_TIMING_CYCLE | IDE_TIMING_CYC8B);
 	}
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 7724516..9dcf5ae 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -44,8 +44,6 @@
  *  inspiration from lots of linux users, esp.  hamish@zot.apana.org.au
  */
 
-#define _IDE_C			/* Tell ide.h it's really us */
-
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/string.h>
@@ -58,6 +56,7 @@
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
+#include <linux/hdreg.h>
 #include <linux/completion.h>
 #include <linux/device.h>
 
@@ -97,8 +96,6 @@
 	hwif->name[2]	= 'e';
 	hwif->name[3]	= '0' + index;
 
-	hwif->bus_state	= BUSSTATE_ON;
-
 	init_completion(&hwif->gendev_rel_comp);
 
 	hwif->tp_ops = &default_tp_ops;
@@ -119,7 +116,7 @@
 		drive->media			= ide_disk;
 		drive->select.all		= (unit<<4)|0xa0;
 		drive->hwif			= hwif;
-		drive->ready_stat		= READY_STAT;
+		drive->ready_stat		= ATA_DRDY;
 		drive->bad_wstat		= BAD_W_STAT;
 		drive->special.b.recalibrate	= 1;
 		drive->special.b.set_geometry	= 1;
@@ -253,42 +250,9 @@
 
 DEFINE_MUTEX(ide_setting_mtx);
 
-EXPORT_SYMBOL_GPL(ide_setting_mtx);
+ide_devset_get(io_32bit, io_32bit);
 
-/**
- *	ide_spin_wait_hwgroup	-	wait for group
- *	@drive: drive in the group
- *
- *	Wait for an IDE device group to go non busy and then return
- *	holding the ide_lock which guards the hwgroup->busy status
- *	and right to use it.
- */
-
-int ide_spin_wait_hwgroup (ide_drive_t *drive)
-{
-	ide_hwgroup_t *hwgroup = HWGROUP(drive);
-	unsigned long timeout = jiffies + (3 * HZ);
-
-	spin_lock_irq(&ide_lock);
-
-	while (hwgroup->busy) {
-		unsigned long lflags;
-		spin_unlock_irq(&ide_lock);
-		local_irq_set(lflags);
-		if (time_after(jiffies, timeout)) {
-			local_irq_restore(lflags);
-			printk(KERN_ERR "%s: channel busy\n", drive->name);
-			return -EBUSY;
-		}
-		local_irq_restore(lflags);
-		spin_lock_irq(&ide_lock);
-	}
-	return 0;
-}
-
-EXPORT_SYMBOL(ide_spin_wait_hwgroup);
-
-int set_io_32bit(ide_drive_t *drive, int arg)
+static int set_io_32bit(ide_drive_t *drive, int arg)
 {
 	if (drive->no_io_32bit)
 		return -EPERM;
@@ -296,53 +260,39 @@
 	if (arg < 0 || arg > 1 + (SUPPORT_VLB_SYNC << 1))
 		return -EINVAL;
 
-	if (ide_spin_wait_hwgroup(drive))
-		return -EBUSY;
-
 	drive->io_32bit = arg;
 
-	spin_unlock_irq(&ide_lock);
-
 	return 0;
 }
 
+ide_devset_get(ksettings, keep_settings);
+
 static int set_ksettings(ide_drive_t *drive, int arg)
 {
 	if (arg < 0 || arg > 1)
 		return -EINVAL;
 
-	if (ide_spin_wait_hwgroup(drive))
-		return -EBUSY;
 	drive->keep_settings = arg;
-	spin_unlock_irq(&ide_lock);
 
 	return 0;
 }
 
-int set_using_dma(ide_drive_t *drive, int arg)
+ide_devset_get(using_dma, using_dma);
+
+static int set_using_dma(ide_drive_t *drive, int arg)
 {
 #ifdef CONFIG_BLK_DEV_IDEDMA
-	ide_hwif_t *hwif = drive->hwif;
 	int err = -EPERM;
 
 	if (arg < 0 || arg > 1)
 		return -EINVAL;
 
-	if (!drive->id || !(drive->id->capability & 1))
+	if (ata_id_has_dma(drive->id) == 0)
 		goto out;
 
-	if (hwif->dma_ops == NULL)
+	if (drive->hwif->dma_ops == NULL)
 		goto out;
 
-	err = -EBUSY;
-	if (ide_spin_wait_hwgroup(drive))
-		goto out;
-	/*
-	 * set ->busy flag, unlock and let it ride
-	 */
-	hwif->hwgroup->busy = 1;
-	spin_unlock_irq(&ide_lock);
-
 	err = 0;
 
 	if (arg) {
@@ -351,12 +301,6 @@
 	} else
 		ide_dma_off(drive);
 
-	/*
-	 * lock, clear ->busy flag and unlock before leaving
-	 */
-	spin_lock_irq(&ide_lock);
-	hwif->hwgroup->busy = 0;
-	spin_unlock_irq(&ide_lock);
 out:
 	return err;
 #else
@@ -367,7 +311,7 @@
 #endif
 }
 
-int set_pio_mode(ide_drive_t *drive, int arg)
+static int set_pio_mode(ide_drive_t *drive, int arg)
 {
 	struct request *rq;
 	ide_hwif_t *hwif = drive->hwif;
@@ -395,6 +339,8 @@
 	return 0;
 }
 
+ide_devset_get(unmaskirq, unmask);
+
 static int set_unmaskirq(ide_drive_t *drive, int arg)
 {
 	if (drive->no_unmask)
@@ -403,14 +349,20 @@
 	if (arg < 0 || arg > 1)
 		return -EINVAL;
 
-	if (ide_spin_wait_hwgroup(drive))
-		return -EBUSY;
 	drive->unmask = arg;
-	spin_unlock_irq(&ide_lock);
 
 	return 0;
 }
 
+#define ide_gen_devset_rw(_name, _func) \
+__IDE_DEVSET(_name, DS_SYNC, get_##_func, set_##_func)
+
+ide_gen_devset_rw(io_32bit, io_32bit);
+ide_gen_devset_rw(keepsettings, ksettings);
+ide_gen_devset_rw(unmaskirq, unmaskirq);
+ide_gen_devset_rw(using_dma, using_dma);
+__IDE_DEVSET(pio_mode, 0, NULL, set_pio_mode);
+
 static int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 {
 	ide_drive_t *drive = dev->driver_data;
@@ -486,138 +438,6 @@
 	return err;
 }
 
-static int generic_drive_reset(ide_drive_t *drive)
-{
-	struct request *rq;
-	int ret = 0;
-
-	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd_len = 1;
-	rq->cmd[0] = REQ_DRIVE_RESET;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
-	if (blk_execute_rq(drive->queue, NULL, rq, 1))
-		ret = rq->errors;
-	blk_put_request(rq);
-	return ret;
-}
-
-int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device *bdev,
-			unsigned int cmd, unsigned long arg)
-{
-	unsigned long flags;
-	ide_driver_t *drv;
-	void __user *p = (void __user *)arg;
-	int err = 0, (*setfunc)(ide_drive_t *, int);
-	u8 *val;
-
-	switch (cmd) {
-	case HDIO_GET_32BIT:	    val = &drive->io_32bit;	 goto read_val;
-	case HDIO_GET_KEEPSETTINGS: val = &drive->keep_settings; goto read_val;
-	case HDIO_GET_UNMASKINTR:   val = &drive->unmask;	 goto read_val;
-	case HDIO_GET_DMA:	    val = &drive->using_dma;	 goto read_val;
-	case HDIO_SET_32BIT:	    setfunc = set_io_32bit;	 goto set_val;
-	case HDIO_SET_KEEPSETTINGS: setfunc = set_ksettings;	 goto set_val;
-	case HDIO_SET_PIO_MODE:	    setfunc = set_pio_mode;	 goto set_val;
-	case HDIO_SET_UNMASKINTR:   setfunc = set_unmaskirq;	 goto set_val;
-	case HDIO_SET_DMA:	    setfunc = set_using_dma;	 goto set_val;
-	}
-
-	switch (cmd) {
-		case HDIO_OBSOLETE_IDENTITY:
-		case HDIO_GET_IDENTITY:
-			if (bdev != bdev->bd_contains)
-				return -EINVAL;
-			if (drive->id_read == 0)
-				return -ENOMSG;
-			if (copy_to_user(p, drive->id, (cmd == HDIO_GET_IDENTITY) ? sizeof(*drive->id) : 142))
-				return -EFAULT;
-			return 0;
-
-		case HDIO_GET_NICE:
-			return put_user(drive->dsc_overlap	<<	IDE_NICE_DSC_OVERLAP	|
-					drive->atapi_overlap	<<	IDE_NICE_ATAPI_OVERLAP	|
-					drive->nice1 << IDE_NICE_1,
-					(long __user *) arg);
-#ifdef CONFIG_IDE_TASK_IOCTL
-		case HDIO_DRIVE_TASKFILE:
-		        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
-				return -EACCES;
-			switch(drive->media) {
-				case ide_disk:
-					return ide_taskfile_ioctl(drive, cmd, arg);
-				default:
-					return -ENOMSG;
-			}
-#endif /* CONFIG_IDE_TASK_IOCTL */
-
-		case HDIO_DRIVE_CMD:
-			if (!capable(CAP_SYS_RAWIO))
-				return -EACCES;
-			return ide_cmd_ioctl(drive, cmd, arg);
-
-		case HDIO_DRIVE_TASK:
-			if (!capable(CAP_SYS_RAWIO))
-				return -EACCES;
-			return ide_task_ioctl(drive, cmd, arg);
-		case HDIO_SET_NICE:
-			if (!capable(CAP_SYS_ADMIN)) return -EACCES;
-			if (arg != (arg & ((1 << IDE_NICE_DSC_OVERLAP) | (1 << IDE_NICE_1))))
-				return -EPERM;
-			drive->dsc_overlap = (arg >> IDE_NICE_DSC_OVERLAP) & 1;
-			drv = *(ide_driver_t **)bdev->bd_disk->private_data;
-			if (drive->dsc_overlap && !drv->supports_dsc_overlap) {
-				drive->dsc_overlap = 0;
-				return -EPERM;
-			}
-			drive->nice1 = (arg >> IDE_NICE_1) & 1;
-			return 0;
-		case HDIO_DRIVE_RESET:
-			if (!capable(CAP_SYS_ADMIN))
-				return -EACCES;
-
-			return generic_drive_reset(drive);
-
-		case HDIO_GET_BUSSTATE:
-			if (!capable(CAP_SYS_ADMIN))
-				return -EACCES;
-			if (put_user(HWIF(drive)->bus_state, (long __user *)arg))
-				return -EFAULT;
-			return 0;
-
-		case HDIO_SET_BUSSTATE:
-			if (!capable(CAP_SYS_ADMIN))
-				return -EACCES;
-			return -EOPNOTSUPP;
-		default:
-			return -EINVAL;
-	}
-
-read_val:
-	mutex_lock(&ide_setting_mtx);
-	spin_lock_irqsave(&ide_lock, flags);
-	err = *val;
-	spin_unlock_irqrestore(&ide_lock, flags);
-	mutex_unlock(&ide_setting_mtx);
-	return err >= 0 ? put_user(err, (long __user *)arg) : err;
-
-set_val:
-	if (bdev != bdev->bd_contains)
-		err = -EINVAL;
-	else {
-		if (!capable(CAP_SYS_ADMIN))
-			err = -EACCES;
-		else {
-			mutex_lock(&ide_setting_mtx);
-			err = setfunc(drive, arg);
-			mutex_unlock(&ide_setting_mtx);
-		}
-	}
-	return err;
-}
-
-EXPORT_SYMBOL(generic_ide_ioctl);
-
 /**
  * ide_device_get	-	get an additional reference to a ide_drive_t
  * @drive:	device to get a reference to
@@ -710,21 +530,21 @@
 			  char *buf)
 {
 	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", drive->id->model);
+	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_PROD]);
 }
 
 static ssize_t firmware_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
 	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", drive->id->fw_rev);
+	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_FW_REV]);
 }
 
 static ssize_t serial_show(struct device *dev, struct device_attribute *attr,
 			   char *buf)
 {
 	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", drive->id->serial_no);
+	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_SERNO]);
 }
 
 static struct device_attribute ide_dev_attrs[] = {
@@ -841,7 +661,7 @@
 static unsigned int ide_nowerr;
 
 module_param_call(nowerr, ide_set_dev_param_mask, NULL, &ide_nowerr, 0);
-MODULE_PARM_DESC(nowerr, "ignore the WRERR_STAT bit for a device");
+MODULE_PARM_DESC(nowerr, "ignore the ATA_DF bit for a device");
 
 static unsigned int ide_cdroms;
 
@@ -906,7 +726,7 @@
 		drive->noprobe = 1;
 	}
 	if (ide_nowerr & (1 << i)) {
-		printk(KERN_INFO "ide: ignoring the WRERR_STAT bit for %s\n",
+		printk(KERN_INFO "ide: ignoring the ATA_DF bit for %s\n",
 				 drive->name);
 		drive->bad_wstat = BAD_R_STAT;
 	}
@@ -927,7 +747,7 @@
 				 drive->cyl, drive->head, drive->sect);
 		drive->present = 1;
 		drive->media = ide_disk;
-		drive->ready_stat = READY_STAT;
+		drive->ready_stat = ATA_DRDY;
 	}
 }
 
diff --git a/drivers/ide/legacy/ali14xx.c b/drivers/ide/legacy/ali14xx.c
index 4ec1973..7276c96 100644
--- a/drivers/ide/legacy/ali14xx.c
+++ b/drivers/ide/legacy/ali14xx.c
@@ -43,7 +43,6 @@
 #include <linux/mm.h>
 #include <linux/ioport.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
diff --git a/drivers/ide/legacy/buddha.c b/drivers/ide/legacy/buddha.c
index 7c2afa9..c5a3c9e 100644
--- a/drivers/ide/legacy/buddha.c
+++ b/drivers/ide/legacy/buddha.c
@@ -20,7 +20,6 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/zorro.h>
 #include <linux/ide.h>
 #include <linux/init.h>
diff --git a/drivers/ide/legacy/dtc2278.c b/drivers/ide/legacy/dtc2278.c
index af791a0..689b2e4 100644
--- a/drivers/ide/legacy/dtc2278.c
+++ b/drivers/ide/legacy/dtc2278.c
@@ -10,7 +10,6 @@
 #include <linux/mm.h>
 #include <linux/ioport.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
diff --git a/drivers/ide/legacy/falconide.c b/drivers/ide/legacy/falconide.c
index 724f950..39d500d 100644
--- a/drivers/ide/legacy/falconide.c
+++ b/drivers/ide/legacy/falconide.c
@@ -13,7 +13,6 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
diff --git a/drivers/ide/legacy/gayle.c b/drivers/ide/legacy/gayle.c
index 51ba085..6915068 100644
--- a/drivers/ide/legacy/gayle.c
+++ b/drivers/ide/legacy/gayle.c
@@ -12,7 +12,6 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 #include <linux/zorro.h>
diff --git a/drivers/ide/legacy/ht6560b.c b/drivers/ide/legacy/ht6560b.c
index 98f7c95e..5123ea2 100644
--- a/drivers/ide/legacy/ht6560b.c
+++ b/drivers/ide/legacy/ht6560b.c
@@ -24,7 +24,6 @@
 #include <linux/mm.h>
 #include <linux/ioport.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
diff --git a/drivers/ide/legacy/ide-cs.c b/drivers/ide/legacy/ide-cs.c
index 21bfac1..ee6fc30 100644
--- a/drivers/ide/legacy/ide-cs.c
+++ b/drivers/ide/legacy/ide-cs.c
@@ -38,7 +38,6 @@
 #include <linux/timer.h>
 #include <linux/ioport.h>
 #include <linux/ide.h>
-#include <linux/hdreg.h>
 #include <linux/major.h>
 #include <linux/delay.h>
 #include <asm/io.h>
diff --git a/drivers/ide/legacy/macide.c b/drivers/ide/legacy/macide.c
index a0bb167..43f97cc 100644
--- a/drivers/ide/legacy/macide.c
+++ b/drivers/ide/legacy/macide.c
@@ -15,7 +15,6 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/ide.h>
 
diff --git a/drivers/ide/legacy/q40ide.c b/drivers/ide/legacy/q40ide.c
index 4abd8fc..4af4a8c 100644
--- a/drivers/ide/legacy/q40ide.c
+++ b/drivers/ide/legacy/q40ide.c
@@ -14,8 +14,6 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
-
 #include <linux/ide.h>
 
     /*
diff --git a/drivers/ide/legacy/qd65xx.c b/drivers/ide/legacy/qd65xx.c
index 2338f34..ec408b3 100644
--- a/drivers/ide/legacy/qd65xx.c
+++ b/drivers/ide/legacy/qd65xx.c
@@ -27,7 +27,6 @@
 #include <linux/mm.h>
 #include <linux/ioport.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 #include <asm/system.h>
@@ -151,12 +150,14 @@
 		int *active_time, int *recovery_time)
 {
 	struct qd65xx_timing_s *p;
-	char model[40];
+	char *m = (char *)&drive->id[ATA_ID_PROD];
+	char model[ATA_ID_PROD_LEN];
 
-	if (!*drive->id->model) return 0;
+	if (*m == 0)
+		return 0;
 
-	strncpy(model,drive->id->model,40);
-	ide_fixstring(model,40,1); /* byte-swap */
+	strncpy(model, m, ATA_ID_PROD_LEN);
+	ide_fixstring(model, ATA_ID_PROD_LEN, 1); /* byte-swap */
 
 	for (p = qd65xx_timing ; p->offset != -1 ; p++) {
 		if (!strncmp(p->model, model+p->offset, 4)) {
@@ -185,20 +186,20 @@
 
 static void qd6500_set_pio_mode(ide_drive_t *drive, const u8 pio)
 {
+	u16 *id = drive->id;
 	int active_time   = 175;
 	int recovery_time = 415; /* worst case values from the dos driver */
 
 	/*
 	 * FIXME: use "pio" value
 	 */
-	if (drive->id && !qd_find_disk_type(drive, &active_time, &recovery_time)
-		&& drive->id->tPIO && (drive->id->field_valid & 0x02)
-		&& drive->id->eide_pio >= 240) {
-
+	if (!qd_find_disk_type(drive, &active_time, &recovery_time) &&
+	    (id[ATA_ID_OLD_PIO_MODES] & 0xff) && (id[ATA_ID_FIELD_VALID] & 2) &&
+	    id[ATA_ID_EIDE_PIO] >= 240) {
 		printk(KERN_INFO "%s: PIO mode%d\n", drive->name,
-				drive->id->tPIO);
+			id[ATA_ID_OLD_PIO_MODES] & 0xff);
 		active_time = 110;
-		recovery_time = drive->id->eide_pio - 120;
+		recovery_time = drive->id[ATA_ID_EIDE_PIO] - 120;
 	}
 
 	qd_set_timing(drive, qd6500_compute_timing(HWIF(drive), active_time, recovery_time));
diff --git a/drivers/ide/legacy/umc8672.c b/drivers/ide/legacy/umc8672.c
index b54a14a..1da076e 100644
--- a/drivers/ide/legacy/umc8672.c
+++ b/drivers/ide/legacy/umc8672.c
@@ -45,7 +45,6 @@
 #include <linux/mm.h>
 #include <linux/ioport.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index 3187215..e7475ba 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -7,7 +7,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -140,7 +139,7 @@
 	drive->hwif->port_ops->set_dma_mode(drive, pio + XFER_PIO_0);
 }
 
-static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev)
+static unsigned int init_chipset_aec62xx(struct pci_dev *dev)
 {
 	/* These are necessary to get AEC6280 Macintosh cards to work */
 	if ((dev->device == PCI_DEVICE_ID_ARTOP_ATP865) ||
@@ -308,6 +307,8 @@
 	.id_table	= aec62xx_pci_tbl,
 	.probe		= aec62xx_init_one,
 	.remove		= __devexit_p(aec62xx_remove),
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init aec62xx_ide_init(void)
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index d647526..053c752 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -31,7 +31,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 #include <linux/dmi.h>
@@ -134,8 +133,8 @@
 	if (m5229_revision > 0x20 && m5229_revision < 0xC2) {
 		if (drive->media != ide_disk)
 			return 0;
-		if (chip_is_1543c_e && strstr(drive->id->model, "WDC ") &&
-		    wdc_udma == 0)
+		if (wdc_udma == 0 && chip_is_1543c_e &&
+		    strstr((char *)&drive->id[ATA_ID_PROD], "WDC "))
 			return 0;
 	}
 
@@ -214,7 +213,7 @@
  *	appropriate also sets up the 1533 southbridge.
  */
 
-static unsigned int __devinit init_chipset_ali15x3(struct pci_dev *dev)
+static unsigned int init_chipset_ali15x3(struct pci_dev *dev)
 {
 	unsigned long flags;
 	u8 tmpbyte;
@@ -582,6 +581,8 @@
 	.id_table	= alim15x3_pci_tbl,
 	.probe		= alim15x3_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init ali15x3_ide_init(void)
diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index 1e66a96..824471f 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -112,13 +112,13 @@
 	amd_set_drive(drive, XFER_PIO_0 + pio);
 }
 
-static void __devinit amd7409_cable_detect(struct pci_dev *dev)
+static void amd7409_cable_detect(struct pci_dev *dev)
 {
 	/* no host side cable detection */
 	amd_80w = 0x03;
 }
 
-static void __devinit amd7411_cable_detect(struct pci_dev *dev)
+static void amd7411_cable_detect(struct pci_dev *dev)
 {
 	int i;
 	u32 u = 0;
@@ -140,7 +140,7 @@
  * The initialization callback.  Initialize drive independent registers.
  */
 
-static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev)
+static unsigned int init_chipset_amd74xx(struct pci_dev *dev)
 {
 	u8 t = 0, offset = amd_offset(dev);
 
@@ -324,6 +324,8 @@
 	.id_table	= amd74xx_pci_tbl,
 	.probe		= amd74xx_probe,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init amd74xx_ide_init(void)
diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index 41f6cb6..e443703 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -7,7 +7,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -188,6 +187,8 @@
 	.id_table	= atiixp_pci_tbl,
 	.probe		= atiixp_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init atiixp_ide_init(void)
diff --git a/drivers/ide/pci/cmd640.c b/drivers/ide/pci/cmd640.c
index e6c6200..7f39cdb 100644
--- a/drivers/ide/pci/cmd640.c
+++ b/drivers/ide/pci/cmd640.c
@@ -103,7 +103,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -375,6 +374,21 @@
 }
 #endif
 
+static void __set_prefetch_mode(ide_drive_t *drive, int mode)
+{
+	if (mode) {	/* want prefetch on? */
+#if CMD640_PREFETCH_MASKS
+		drive->no_unmask = 1;
+		drive->unmask = 0;
+#endif
+		drive->no_io_32bit = 0;
+	} else {
+		drive->no_unmask = 0;
+		drive->no_io_32bit = 1;
+		drive->io_32bit = 0;
+	}
+}
+
 #ifndef CONFIG_BLK_DEV_CMD640_ENHANCED
 /*
  * Check whether prefetch is on for a drive,
@@ -384,19 +398,10 @@
 {
 	u8 b = get_cmd640_reg(prefetch_regs[index]);
 
-	if (b & prefetch_masks[index]) {	/* is prefetch off? */
-		drive->no_unmask = 0;
-		drive->no_io_32bit = 1;
-		drive->io_32bit = 0;
-	} else {
-#if CMD640_PREFETCH_MASKS
-		drive->no_unmask = 1;
-		drive->unmask = 0;
-#endif
-		drive->no_io_32bit = 0;
-	}
+	__set_prefetch_mode(drive, (b & prefetch_masks[index]) ? 0 : 1);
 }
 #else
+
 /*
  * Sets prefetch mode for a drive.
  */
@@ -408,19 +413,11 @@
 
 	spin_lock_irqsave(&cmd640_lock, flags);
 	b = __get_cmd640_reg(reg);
-	if (mode) {	/* want prefetch on? */
-#if CMD640_PREFETCH_MASKS
-		drive->no_unmask = 1;
-		drive->unmask = 0;
-#endif
-		drive->no_io_32bit = 0;
+	__set_prefetch_mode(drive, mode);
+	if (mode)
 		b &= ~prefetch_masks[index];	/* enable prefetch */
-	} else {
-		drive->no_unmask = 0;
-		drive->no_io_32bit = 1;
-		drive->io_32bit = 0;
+	else
 		b |= prefetch_masks[index];	/* disable prefetch */
-	}
 	__put_cmd640_reg(reg, b);
 	spin_unlock_irqrestore(&cmd640_lock, flags);
 }
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index e064398..456dee1 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -13,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -332,7 +331,7 @@
 	return (dma_stat & 7) != 4;
 }
 
-static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev)
+static unsigned int init_chipset_cmd64x(struct pci_dev *dev)
 {
 	u8 mrdmode = 0;
 
@@ -511,6 +510,8 @@
 	.id_table	= cmd64x_pci_tbl,
 	.probe		= cmd64x_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init cmd64x_ide_init(void)
diff --git a/drivers/ide/pci/cs5520.c b/drivers/ide/pci/cs5520.c
index 151844f..d6341f7 100644
--- a/drivers/ide/pci/cs5520.c
+++ b/drivers/ide/pci/cs5520.c
@@ -35,7 +35,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
@@ -150,6 +149,8 @@
 	.name		= "Cyrix_IDE",
 	.id_table	= cs5520_pci_tbl,
 	.probe		= cs5520_init_one,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init cs5520_ide_init(void)
diff --git a/drivers/ide/pci/cs5530.c b/drivers/ide/pci/cs5530.c
index f235db8..da42fa7 100644
--- a/drivers/ide/pci/cs5530.c
+++ b/drivers/ide/pci/cs5530.c
@@ -15,7 +15,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/ide.h>
@@ -81,17 +80,19 @@
 static u8 cs5530_udma_filter(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
-	ide_drive_t *mate = &hwif->drives[(drive->dn & 1) ^ 1];
-	struct hd_driveid *mateid = mate->id;
+	ide_drive_t *mate = ide_get_pair_dev(drive);
+	u16 *mateid = mate->id;
 	u8 mask = hwif->ultra_mask;
 
-	if (mate->present == 0)
+	if (mate == NULL)
 		goto out;
 
-	if ((mateid->capability & 1) && __ide_dma_bad_drive(mate) == 0) {
-		if ((mateid->field_valid & 4) && (mateid->dma_ultra & 7))
+	if (ata_id_has_dma(mateid) && __ide_dma_bad_drive(mate) == 0) {
+		if ((mateid[ATA_ID_FIELD_VALID] & 4) &&
+		    (mateid[ATA_ID_UDMA_MODES] & 7))
 			goto out;
-		if ((mateid->field_valid & 2) && (mateid->dma_mword & 7))
+		if ((mateid[ATA_ID_FIELD_VALID] & 2) &&
+		    (mateid[ATA_ID_MWDMA_MODES] & 7))
 			mask = 0;
 	}
 out:
@@ -133,7 +134,7 @@
  *	Initialize the cs5530 bridge for reliable IDE DMA operation.
  */
 
-static unsigned int __devinit init_chipset_cs5530(struct pci_dev *dev)
+static unsigned int init_chipset_cs5530(struct pci_dev *dev)
 {
 	struct pci_dev *master_0 = NULL, *cs5530_0 = NULL;
 
@@ -271,6 +272,8 @@
 	.id_table	= cs5530_pci_tbl,
 	.probe		= cs5530_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init cs5530_ide_init(void)
diff --git a/drivers/ide/pci/cs5535.c b/drivers/ide/pci/cs5535.c
index dd3dc23..1e5bc59 100644
--- a/drivers/ide/pci/cs5535.c
+++ b/drivers/ide/pci/cs5535.c
@@ -80,12 +80,12 @@
 
 	/* Set the PIO timings */
 	if (speed < XFER_SW_DMA_0) {
-		ide_drive_t *pair = ide_get_paired_drive(drive);
+		ide_drive_t *pair = ide_get_pair_dev(drive);
 		u8 cmd, pioa;
 
 		cmd = pioa = speed - XFER_PIO_0;
 
-		if (pair->present) {
+		if (pair) {
 			u8 piob = ide_get_best_pio_mode(pair, 255, 4);
 
 			if (piob < cmd)
@@ -193,10 +193,12 @@
 MODULE_DEVICE_TABLE(pci, cs5535_pci_tbl);
 
 static struct pci_driver driver = {
-	.name       = "CS5535_IDE",
-	.id_table   = cs5535_pci_tbl,
-	.probe      = cs5535_init_one,
-	.remove     = ide_pci_remove,
+	.name		= "CS5535_IDE",
+	.id_table	= cs5535_pci_tbl,
+	.probe		= cs5535_init_one,
+	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init cs5535_ide_init(void)
diff --git a/drivers/ide/pci/cy82c693.c b/drivers/ide/pci/cy82c693.c
index e6d8ee8..69820e9 100644
--- a/drivers/ide/pci/cy82c693.c
+++ b/drivers/ide/pci/cy82c693.c
@@ -332,7 +332,7 @@
 /*
  * this function is called during init and is used to setup the cy82c693 chip
  */
-static unsigned int __devinit init_chipset_cy82c693(struct pci_dev *dev)
+static unsigned int init_chipset_cy82c693(struct pci_dev *dev)
 {
 	if (PCI_FUNC(dev->devfn) != 1)
 		return 0;
@@ -448,6 +448,8 @@
 	.id_table	= cy82c693_pci_tbl,
 	.probe		= cy82c693_init_one,
 	.remove		= __devexit_p(cy82c693_remove),
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init cy82c693_ide_init(void)
diff --git a/drivers/ide/pci/delkin_cb.c b/drivers/ide/pci/delkin_cb.c
index f84bfb4..83b63b3 100644
--- a/drivers/ide/pci/delkin_cb.c
+++ b/drivers/ide/pci/delkin_cb.c
@@ -19,7 +19,6 @@
 
 #include <linux/types.h>
 #include <linux/module.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 #include <linux/pci.h>
diff --git a/drivers/ide/pci/generic.c b/drivers/ide/pci/generic.c
index b07d4f4..092b238 100644
--- a/drivers/ide/pci/generic.c
+++ b/drivers/ide/pci/generic.c
@@ -22,7 +22,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
 #include <linux/init.h>
@@ -172,6 +171,8 @@
 	.id_table	= generic_pci_tbl,
 	.probe		= generic_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init generic_ide_init(void)
diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c
index 6009b0b..644de29 100644
--- a/drivers/ide/pci/hpt34x.c
+++ b/drivers/ide/pci/hpt34x.c
@@ -27,7 +27,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/ioport.h>
-#include <linux/hdreg.h>
 #include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/init.h>
@@ -79,7 +78,7 @@
  */
 #define	HPT34X_PCI_INIT_REG		0x80
 
-static unsigned int __devinit init_chipset_hpt34x(struct pci_dev *dev)
+static unsigned int init_chipset_hpt34x(struct pci_dev *dev)
 {
 	int i = 0;
 	unsigned long hpt34xIoBase = pci_resource_start(dev, 4);
@@ -172,6 +171,8 @@
 	.id_table	= hpt34x_pci_tbl,
 	.probe		= hpt34x_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init hpt34x_ide_init(void)
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index c37ab17..a194022 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -122,7 +122,6 @@
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/init.h>
@@ -605,10 +604,10 @@
 
 static int check_in_drive_list(ide_drive_t *drive, const char **list)
 {
-	struct hd_driveid *id = drive->id;
+	char *m = (char *)&drive->id[ATA_ID_PROD];
 
 	while (*list)
-		if (!strcmp(*list++,id->model))
+		if (!strcmp(*list++, m))
 			return 1;
 	return 0;
 }
@@ -655,7 +654,7 @@
 	case HPT372A:
 	case HPT372N:
 	case HPT374 :
-		if (ide_dev_is_sata(drive->id))
+		if (ata_id_is_sata(drive->id))
 			mask &= ~0x0e;
 		/* Fall thru */
 	default:
@@ -675,7 +674,7 @@
 	case HPT372A:
 	case HPT372N:
 	case HPT374 :
-		if (ide_dev_is_sata(drive->id))
+		if (ata_id_is_sata(drive->id))
 			return 0x00;
 		/* Fall thru */
 	default:
@@ -731,11 +730,11 @@
 
 static void hpt3xx_quirkproc(ide_drive_t *drive)
 {
-	struct hd_driveid *id	= drive->id;
+	char *m			= (char *)&drive->id[ATA_ID_PROD];
 	const  char **list	= quirk_drives;
 
 	while (*list)
-		if (strstr(id->model, *list++)) {
+		if (strstr(m, *list++)) {
 			drive->quirk_list = 1;
 			return;
 		}
@@ -944,7 +943,7 @@
  *	Perform a calibration cycle on the DPLL.
  *	Returns 1 if this succeeds
  */
-static int __devinit hpt37x_calibrate_dpll(struct pci_dev *dev, u16 f_low, u16 f_high)
+static int hpt37x_calibrate_dpll(struct pci_dev *dev, u16 f_low, u16 f_high)
 {
 	u32 dpll = (f_high << 16) | f_low | 0x100;
 	u8  scr2;
@@ -972,7 +971,37 @@
 	return 1;
 }
 
-static unsigned int __devinit init_chipset_hpt366(struct pci_dev *dev)
+static void hpt3xx_disable_fast_irq(struct pci_dev *dev, u8 mcr_addr)
+{
+	struct ide_host *host	= pci_get_drvdata(dev);
+	struct hpt_info *info	= host->host_priv + (&dev->dev == host->dev[1]);
+	u8  chip_type		= info->chip_type;
+	u8  new_mcr, old_mcr	= 0;
+
+	/*
+	 * Disable the "fast interrupt" prediction.  Don't hold off
+	 * on interrupts. (== 0x01 despite what the docs say)
+	 */
+	pci_read_config_byte(dev, mcr_addr + 1, &old_mcr);
+
+	if (chip_type >= HPT374)
+		new_mcr = old_mcr & ~0x07;
+	else if (chip_type >= HPT370) {
+		new_mcr = old_mcr;
+		new_mcr &= ~0x02;
+#ifdef HPT_DELAY_INTERRUPT
+		new_mcr &= ~0x01;
+#else
+		new_mcr |=  0x01;
+#endif
+	} else					/* HPT366 and HPT368  */
+		new_mcr = old_mcr & ~0x80;
+
+	if (new_mcr != old_mcr)
+		pci_write_config_byte(dev, mcr_addr + 1, new_mcr);
+}
+
+static unsigned int init_chipset_hpt366(struct pci_dev *dev)
 {
 	unsigned long io_base	= pci_resource_start(dev, 4);
 	struct hpt_info *info	= hpt3xx_get_info(&dev->dev);
@@ -1209,9 +1238,11 @@
 	 * NOTE: This register is only writeable via I/O space.
 	 */
 	if (chip_type == HPT371N && clock == ATA_CLOCK_66MHZ)
-
 		outb(inb(io_base + 0x9c) | 0x04, io_base + 0x9c);
 
+	hpt3xx_disable_fast_irq(dev, 0x50);
+	hpt3xx_disable_fast_irq(dev, 0x54);
+
 	return dev->irq;
 }
 
@@ -1265,7 +1296,6 @@
 	struct hpt_info *info	= hpt3xx_get_info(hwif->dev);
 	int serialize		= HPT_SERIALIZE_IO;
 	u8  chip_type		= info->chip_type;
-	u8  new_mcr, old_mcr	= 0;
 
 	/* Cache the channel's MISC. control registers' offset */
 	hwif->select_data	= hwif->channel ? 0x54 : 0x50;
@@ -1288,29 +1318,6 @@
 	/* Serialize access to this device if needed */
 	if (serialize && hwif->mate)
 		hwif->serialized = hwif->mate->serialized = 1;
-
-	/*
-	 * Disable the "fast interrupt" prediction.  Don't hold off
-	 * on interrupts. (== 0x01 despite what the docs say)
-	 */
-	pci_read_config_byte(dev, hwif->select_data + 1, &old_mcr);
-
-	if (info->chip_type >= HPT374)
-		new_mcr = old_mcr & ~0x07;
-	else if (info->chip_type >= HPT370) {
-		new_mcr = old_mcr;
-		new_mcr &= ~0x02;
-
-#ifdef HPT_DELAY_INTERRUPT
-		new_mcr &= ~0x01;
-#else
-		new_mcr |=  0x01;
-#endif
-	} else					/* HPT366 and HPT368  */
-		new_mcr = old_mcr & ~0x80;
-
-	if (new_mcr != old_mcr)
-		pci_write_config_byte(dev, hwif->select_data + 1, new_mcr);
 }
 
 static int __devinit init_dma_hpt366(ide_hwif_t *hwif,
@@ -1620,6 +1627,8 @@
 	.id_table	= hpt366_pci_tbl,
 	.probe		= hpt366_init_one,
 	.remove		= __devexit_p(hpt366_remove),
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init hpt366_ide_init(void)
diff --git a/drivers/ide/pci/it8213.c b/drivers/ide/pci/it8213.c
index 652e47d..0954ccd 100644
--- a/drivers/ide/pci/it8213.c
+++ b/drivers/ide/pci/it8213.c
@@ -10,7 +10,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -195,6 +194,8 @@
 	.id_table	= it8213_pci_tbl,
 	.probe		= it8213_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init it8213_ide_init(void)
diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c
index 4a1508a..46edd08 100644
--- a/drivers/ide/pci/it821x.c
+++ b/drivers/ide/pci/it821x.c
@@ -63,7 +63,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -446,8 +445,7 @@
 static void it821x_quirkproc(ide_drive_t *drive)
 {
 	struct it821x_dev *itdev = ide_get_hwifdata(drive->hwif);
-	struct hd_driveid *id = drive->id;
-	u16 *idbits = (u16 *)drive->id;
+	u16 *id = drive->id;
 
 	if (!itdev->smart) {
 		/*
@@ -466,36 +464,36 @@
 	 */
 
 		/* Check for RAID v native */
-		if(strstr(id->model, "Integrated Technology Express")) {
+		if (strstr((char *)&id[ATA_ID_PROD],
+			   "Integrated Technology Express")) {
 			/* In raid mode the ident block is slightly buggy
 			   We need to set the bits so that the IDE layer knows
 			   LBA28. LBA48 and DMA ar valid */
-			id->capability |= 3;		/* LBA28, DMA */
-			id->command_set_2 |= 0x0400;	/* LBA48 valid */
-			id->cfs_enable_2 |= 0x0400;	/* LBA48 on */
+			id[ATA_ID_CAPABILITY]    |= (3 << 8); /* LBA28, DMA */
+			id[ATA_ID_COMMAND_SET_2] |= 0x0400;   /* LBA48 valid */
+			id[ATA_ID_CFS_ENABLE_2]  |= 0x0400;   /* LBA48 on */
 			/* Reporting logic */
 			printk(KERN_INFO "%s: IT8212 %sRAID %d volume",
-				drive->name,
-				idbits[147] ? "Bootable ":"",
-				idbits[129]);
-				if(idbits[129] != 1)
-					printk("(%dK stripe)", idbits[146]);
-				printk(".\n");
+				drive->name, id[147] ? "Bootable " : "",
+				id[ATA_ID_CSFO]);
+			if (id[ATA_ID_CSFO] != 1)
+				printk(KERN_CONT "(%dK stripe)", id[146]);
+			printk(KERN_CONT ".\n");
 		} else {
 			/* Non RAID volume. Fixups to stop the core code
 			   doing unsupported things */
-			id->field_valid &= 3;
-			id->queue_depth = 0;
-			id->command_set_1 = 0;
-			id->command_set_2 &= 0xC400;
-			id->cfsse &= 0xC000;
-			id->cfs_enable_1 = 0;
-			id->cfs_enable_2 &= 0xC400;
-			id->csf_default &= 0xC000;
-			id->word127 = 0;
-			id->dlf = 0;
-			id->csfo = 0;
-			id->cfa_power = 0;
+			id[ATA_ID_FIELD_VALID]	 &= 3;
+			id[ATA_ID_QUEUE_DEPTH]	  = 0;
+			id[ATA_ID_COMMAND_SET_1]  = 0;
+			id[ATA_ID_COMMAND_SET_2] &= 0xC400;
+			id[ATA_ID_CFSSE]	 &= 0xC000;
+			id[ATA_ID_CFS_ENABLE_1]	  = 0;
+			id[ATA_ID_CFS_ENABLE_2]	 &= 0xC400;
+			id[ATA_ID_CSF_DEFAULT]	 &= 0xC000;
+			id[127]			  = 0;
+			id[ATA_ID_DLF]		  = 0;
+			id[ATA_ID_CSFO]		  = 0;
+			id[ATA_ID_CFA_POWER]	  = 0;
 			printk(KERN_INFO "%s: Performing identify fixups.\n",
 				drive->name);
 		}
@@ -505,8 +503,8 @@
 		 * IDE core that DMA is supported (it821x hardware
 		 * takes care of DMA mode programming).
 		 */
-		if (id->capability & 1) {
-			id->dma_mword |= 0x0101;
+		if (ata_id_has_dma(id)) {
+			id[ATA_ID_MWDMA_MODES] |= 0x0101;
 			drive->current_speed = XFER_MW_DMA_0;
 		}
 	}
@@ -588,7 +586,7 @@
 	hwif->mwdma_mask = ATA_MWDMA2;
 }
 
-static void __devinit it8212_disable_raid(struct pci_dev *dev)
+static void it8212_disable_raid(struct pci_dev *dev)
 {
 	/* Reset local CPU, and set BIOS not ready */
 	pci_write_config_byte(dev, 0x5E, 0x01);
@@ -605,7 +603,7 @@
 	pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x20);
 }
 
-static unsigned int __devinit init_chipset_it821x(struct pci_dev *dev)
+static unsigned int init_chipset_it821x(struct pci_dev *dev)
 {
 	u8 conf;
 	static char *mode[2] = { "pass through", "smart" };
@@ -687,6 +685,8 @@
 	.id_table	= it821x_pci_tbl,
 	.probe		= it821x_init_one,
 	.remove		= __devexit_p(it821x_remove),
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init it821x_ide_init(void)
diff --git a/drivers/ide/pci/jmicron.c b/drivers/ide/pci/jmicron.c
index bb9d09d..acd6471 100644
--- a/drivers/ide/pci/jmicron.c
+++ b/drivers/ide/pci/jmicron.c
@@ -8,7 +8,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -155,6 +154,8 @@
 	.id_table	= jmicron_pci_tbl,
 	.probe		= jmicron_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init jmicron_ide_init(void)
diff --git a/drivers/ide/pci/ns87415.c b/drivers/ide/pci/ns87415.c
index ffefcd1..53bd645 100644
--- a/drivers/ide/pci/ns87415.c
+++ b/drivers/ide/pci/ns87415.c
@@ -11,7 +11,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/ide.h>
@@ -274,9 +273,9 @@
 		do {
 			udelay(50);
 			stat = hwif->tp_ops->read_status(hwif);
-                	if (stat == 0xff)
-                        	break;
-        	} while ((stat & BUSY_STAT) && --timeout);
+			if (stat == 0xff)
+				break;
+		} while ((stat & ATA_BUSY) && --timeout);
 #endif
 	}
 
@@ -340,6 +339,8 @@
 	.id_table	= ns87415_pci_tbl,
 	.probe		= ns87415_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init ns87415_ide_init(void)
diff --git a/drivers/ide/pci/opti621.c b/drivers/ide/pci/opti621.c
index e28e672..3de11dd 100644
--- a/drivers/ide/pci/opti621.c
+++ b/drivers/ide/pci/opti621.c
@@ -85,7 +85,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 
 #include <asm/io.h>
@@ -137,7 +136,7 @@
 static void opti621_set_pio_mode(ide_drive_t *drive, const u8 pio)
 {
 	ide_hwif_t *hwif = drive->hwif;
-	ide_drive_t *pair = ide_get_paired_drive(drive);
+	ide_drive_t *pair = ide_get_pair_dev(drive);
 	unsigned long flags;
 	u8 tim, misc, addr_pio = pio, clk;
 
@@ -153,7 +152,7 @@
 
 	drive->drive_data = XFER_PIO_0 + pio;
 
-	if (pair->present) {
+	if (pair) {
 		if (pair->drive_data && pair->drive_data < drive->drive_data)
 			addr_pio = pair->drive_data - XFER_PIO_0;
 	}
@@ -226,6 +225,8 @@
 	.id_table	= opti621_pci_tbl,
 	.probe		= opti621_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init opti621_ide_init(void)
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index d477da6..9fc5996 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -19,7 +19,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/ide.h>
@@ -203,10 +202,10 @@
 
 static void pdcnew_quirkproc(ide_drive_t *drive)
 {
-	const char **list, *model = drive->id->model;
+	const char **list, *m = (char *)&drive->id[ATA_ID_PROD];
 
 	for (list = pdc_quirk_drives; *list != NULL; list++)
-		if (strstr(model, *list) != NULL) {
+		if (strstr(m, *list) != NULL) {
 			drive->quirk_list = 2;
 			return;
 		}
@@ -227,7 +226,7 @@
  * read_counter - Read the byte count registers
  * @dma_base: for the port address
  */
-static long __devinit read_counter(u32 dma_base)
+static long read_counter(u32 dma_base)
 {
 	u32  pri_dma_base = dma_base, sec_dma_base = dma_base + 0x08;
 	u8   cnt0, cnt1, cnt2, cnt3;
@@ -267,7 +266,7 @@
  * @dma_base: for the port address
  * E.g. 16949000 on 33 MHz PCI bus, i.e. half of the PCI clock.
  */
-static long __devinit detect_pll_input_clock(unsigned long dma_base)
+static long detect_pll_input_clock(unsigned long dma_base)
 {
 	struct timeval start_time, end_time;
 	long start_count, end_count;
@@ -310,7 +309,7 @@
 }
 
 #ifdef CONFIG_PPC_PMAC
-static void __devinit apple_kiwi_init(struct pci_dev *pdev)
+static void apple_kiwi_init(struct pci_dev *pdev)
 {
 	struct device_node *np = pci_device_to_OF_node(pdev);
 	u8 conf;
@@ -326,7 +325,7 @@
 }
 #endif /* CONFIG_PPC_PMAC */
 
-static unsigned int __devinit init_chipset_pdcnew(struct pci_dev *dev)
+static unsigned int init_chipset_pdcnew(struct pci_dev *dev)
 {
 	const char *name = DRV_NAME;
 	unsigned long dma_base = pci_resource_start(dev, 4);
@@ -567,6 +566,8 @@
 	.id_table	= pdc202new_pci_tbl,
 	.probe		= pdc202new_init_one,
 	.remove		= __devexit_p(pdc202new_remove),
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init pdc202new_ide_init(void)
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index de9a274..cb6d2a0 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -13,7 +13,6 @@
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/blkdev.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/ide.h>
@@ -86,7 +85,7 @@
 		 * Prefetch_EN / IORDY_EN / PA[3:0] bits of register A
 		 */
 		AP &= ~0x3f;
-		if (drive->id->capability & 4)
+		if (ata_id_iordy_disable(drive->id))
 			AP |= 0x20;	/* set IORDY_EN bit */
 		if (drive->media == ide_disk)
 			AP |= 0x10;	/* set Prefetch_EN bit */
@@ -154,10 +153,10 @@
 
 static void pdc202xx_quirkproc(ide_drive_t *drive)
 {
-	const char **list, *model = drive->id->model;
+	const char **list, *m = (char *)&drive->id[ATA_ID_PROD];
 
 	for (list = pdc_quirk_drives; *list != NULL; list++)
-		if (strstr(model, *list) != NULL) {
+		if (strstr(m, *list) != NULL) {
 			drive->quirk_list = 2;
 			return;
 		}
@@ -265,7 +264,7 @@
 	ide_dma_timeout(drive);
 }
 
-static unsigned int __devinit init_chipset_pdc202xx(struct pci_dev *dev)
+static unsigned int init_chipset_pdc202xx(struct pci_dev *dev)
 {
 	unsigned long dmabase = pci_resource_start(dev, 4);
 	u8 udma_speed_flag = 0, primary_mode = 0, secondary_mode = 0;
@@ -432,6 +431,8 @@
 	.id_table	= pdc202xx_pci_tbl,
 	.probe		= pdc202xx_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init pdc202xx_ide_init(void)
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 30cfc81..a06c03f 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -48,7 +48,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -205,7 +204,7 @@
  *	out to be nice and simple.
  */
 
-static unsigned int __devinit init_chipset_ich(struct pci_dev *dev)
+static unsigned int init_chipset_ich(struct pci_dev *dev)
 {
 	u32 extra = 0;
 
@@ -450,6 +449,8 @@
 	.id_table	= piix_pci_tbl,
 	.probe		= piix_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init piix_ide_init(void)
diff --git a/drivers/ide/pci/rz1000.c b/drivers/ide/pci/rz1000.c
index 8d11ee8..c117a06 100644
--- a/drivers/ide/pci/rz1000.c
+++ b/drivers/ide/pci/rz1000.c
@@ -16,7 +16,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
 #include <linux/init.h>
diff --git a/drivers/ide/pci/sc1200.c b/drivers/ide/pci/sc1200.c
index 8efaed1..bdc1fed 100644
--- a/drivers/ide/pci/sc1200.c
+++ b/drivers/ide/pci/sc1200.c
@@ -14,7 +14,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/ide.h>
@@ -104,17 +103,19 @@
 static u8 sc1200_udma_filter(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
-	ide_drive_t *mate = &hwif->drives[(drive->dn & 1) ^ 1];
-	struct hd_driveid *mateid = mate->id;
+	ide_drive_t *mate = ide_get_pair_dev(drive);
+	u16 *mateid = mate->id;
 	u8 mask = hwif->ultra_mask;
 
-	if (mate->present == 0)
+	if (mate == NULL)
 		goto out;
 
-	if ((mateid->capability & 1) && __ide_dma_bad_drive(mate) == 0) {
-		if ((mateid->field_valid & 4) && (mateid->dma_ultra & 7))
+	if (ata_id_has_dma(mateid) && __ide_dma_bad_drive(mate) == 0) {
+		if ((mateid[ATA_ID_FIELD_VALID] & 4) &&
+		    (mateid[ATA_ID_UDMA_MODES] & 7))
 			goto out;
-		if ((mateid->field_valid & 2) && (mateid->dma_mword & 7))
+		if ((mateid[ATA_ID_FIELD_VALID] & 2) &&
+		    (mateid[ATA_ID_MWDMA_MODES] & 7))
 			mask = 0;
 	}
 out:
diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c
index 44cccd1..e92a874 100644
--- a/drivers/ide/pci/scc_pata.c
+++ b/drivers/ide/pci/scc_pata.c
@@ -26,7 +26,6 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -400,7 +399,7 @@
 	/* errata A308 workaround: Step5 (check data loss) */
 	/* We don't check non ide_disk because it is limited to UDMA4 */
 	if (!(in_be32((void __iomem *)hwif->io_ports.ctl_addr)
-	      & ERR_STAT) &&
+	      & ATA_ERR) &&
 	    drive->media == ide_disk && drive->current_speed > XFER_UDMA_4) {
 		reg = in_be32((void __iomem *)intsts_port);
 		if (!(reg & INTSTS_ACTEINT)) {
@@ -504,7 +503,7 @@
 
 	/* SCC errata A252,A308 workaround: Step4 */
 	if ((in_be32((void __iomem *)hwif->io_ports.ctl_addr)
-	     & ERR_STAT) &&
+	     & ATA_ERR) &&
 	    (int_stat & INTSTS_INTRQ))
 		return 1;
 
diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index c3bdc6e..3dff2ae 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -32,7 +32,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -57,8 +56,10 @@
 
 static int check_in_drive_lists (ide_drive_t *drive, const char **list)
 {
+	char *m = (char *)&drive->id[ATA_ID_PROD];
+
 	while (*list)
-		if (!strcmp(*list++, drive->id->model))
+		if (!strcmp(*list++, m))
 			return 1;
 	return 0;
 }
@@ -174,7 +175,7 @@
 	pci_write_config_byte(dev, 0x54, ultra_enable);
 }
 
-static unsigned int __devinit init_chipset_svwks(struct pci_dev *dev)
+static unsigned int init_chipset_svwks(struct pci_dev *dev)
 {
 	unsigned int reg;
 	u8 btr;
@@ -447,6 +448,8 @@
 	.id_table	= svwks_pci_tbl,
 	.probe		= svwks_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init svwks_ide_init(void)
diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c
index 681306c..1017fb4 100644
--- a/drivers/ide/pci/sgiioc4.c
+++ b/drivers/ide/pci/sgiioc4.c
@@ -22,7 +22,6 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
-#include <linux/hdreg.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/ioport.h>
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index db2b88a..174a873 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -39,7 +39,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 #include <linux/io.h>
@@ -223,7 +222,9 @@
 
 static u8 sil_sata_udma_filter(ide_drive_t *drive)
 {
-	return strstr(drive->id->model, "Maxtor") ? ATA_UDMA5 : ATA_UDMA6;
+	char *m = (char *)&drive->id[ATA_ID_PROD];
+
+	return strstr(m, "Maxtor") ? ATA_UDMA5 : ATA_UDMA6;
 }
 
 /**
@@ -243,7 +244,7 @@
 
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	ide_drive_t *pair	= ide_get_paired_drive(drive);
+	ide_drive_t *pair	= ide_get_pair_dev(drive);
 	u32 speedt		= 0;
 	u16 speedp		= 0;
 	unsigned long addr	= siimage_seldev(drive, 0x04);
@@ -257,7 +258,7 @@
 	u8 unit			= drive->select.b.unit;
 
 	/* trim *taskfile* PIO to the slowest of the master/slave */
-	if (pair->present) {
+	if (pair) {
 		u8 pair_pio = ide_get_best_pio_mode(pair, 255, 4);
 
 		if (pair_pio < tf_pio)
@@ -462,7 +463,7 @@
  *	to 133 MHz clocking if the system isn't already set up to do it.
  */
 
-static unsigned int __devinit init_chipset_siimage(struct pci_dev *dev)
+static unsigned int init_chipset_siimage(struct pci_dev *dev)
 {
 	struct ide_host *host = pci_get_drvdata(dev);
 	void __iomem *ioaddr = host->host_priv;
@@ -616,8 +617,8 @@
 
 static int is_dev_seagate_sata(ide_drive_t *drive)
 {
-	const char *s	= &drive->id->model[0];
-	unsigned len	= strnlen(s, sizeof(drive->id->model));
+	const char *s	= (const char *)&drive->id[ATA_ID_PROD];
+	unsigned len	= strnlen(s, ATA_ID_PROD_LEN);
 
 	if ((len > 4) && (!memcmp(s, "ST", 2)))
 		if ((!memcmp(s + len - 2, "AS", 2)) ||
@@ -833,6 +834,8 @@
 	.id_table	= siimage_pci_tbl,
 	.probe		= siimage_init_one,
 	.remove		= __devexit_p(siimage_remove),
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init siimage_ide_init(void)
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 5efe21d..734dd41 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -47,7 +47,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/ide.h>
@@ -448,7 +447,7 @@
 	return chipset_family;
 }
 
-static unsigned int __devinit init_chipset_sis5513(struct pci_dev *dev)
+static unsigned int init_chipset_sis5513(struct pci_dev *dev)
 {
 	/* Make general config ops here
 	   1/ tell IDE channels to operate in Compatibility mode only
@@ -611,6 +610,8 @@
 	.id_table	= sis5513_pci_tbl,
 	.probe		= sis5513_init_one,
 	.remove		= __devexit_p(sis5513_remove),
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init sis5513_ide_init(void)
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index 73905bc..37a6b7b 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -17,7 +17,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
 
@@ -62,7 +61,7 @@
 	if (cmd_off == 0)
 		cmd_off = 1;
 
-	if (pio > 2 || ide_dev_has_iordy(drive->id))
+	if (pio > 2 || ata_id_has_iordy(drive->id))
 		iordy = 0x40;
 
 	return (cmd_on - 1) << 8 | (cmd_off - 1) | iordy;
@@ -272,7 +271,7 @@
  * channel 0 here at least, but channel 1 has to be enabled by
  * firmware or arch code. We still set both to 16 bits mode.
  */
-static unsigned int __devinit init_chipset_sl82c105(struct pci_dev *dev)
+static unsigned int init_chipset_sl82c105(struct pci_dev *dev)
 {
 	u32 val;
 
@@ -351,6 +350,8 @@
 	.id_table	= sl82c105_pci_tbl,
 	.probe		= sl82c105_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init sl82c105_ide_init(void)
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index 866d6c6..a9551a1 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
-#include <linux/hdreg.h>
 #include <linux/ide.h>
 #include <linux/init.h>
 
@@ -160,6 +159,8 @@
 	.id_table	= slc90e66_pci_tbl,
 	.probe		= slc90e66_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init slc90e66_ide_init(void)
diff --git a/drivers/ide/pci/triflex.c b/drivers/ide/pci/triflex.c
index b77ec35..be8715d 100644
--- a/drivers/ide/pci/triflex.c
+++ b/drivers/ide/pci/triflex.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
 #include <linux/init.h>
@@ -120,6 +119,8 @@
 	.id_table	= triflex_pci_tbl,
 	.probe		= triflex_init_one,
 	.remove		= ide_pci_remove,
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init triflex_ide_init(void)
diff --git a/drivers/ide/pci/trm290.c b/drivers/ide/pci/trm290.c
index fd28b49..4dfbc6a 100644
--- a/drivers/ide/pci/trm290.c
+++ b/drivers/ide/pci/trm290.c
@@ -135,7 +135,6 @@
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
 #include <linux/init.h>
-#include <linux/hdreg.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
 
diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c
index 94fb9ab..acacdaa 100644
--- a/drivers/ide/pci/via82cxxx.c
+++ b/drivers/ide/pci/via82cxxx.c
@@ -154,7 +154,7 @@
 static void via_set_drive(ide_drive_t *drive, const u8 speed)
 {
 	ide_hwif_t *hwif = drive->hwif;
-	ide_drive_t *peer = hwif->drives + (~drive->dn & 1);
+	ide_drive_t *peer = ide_get_pair_dev(drive);
 	struct pci_dev *dev = to_pci_dev(hwif->dev);
 	struct ide_host *host = pci_get_drvdata(dev);
 	struct via82cxxx_dev *vdev = host->host_priv;
@@ -173,7 +173,7 @@
 
 	ide_timing_compute(drive, speed, &t, T, UT);
 
-	if (peer->present) {
+	if (peer) {
 		ide_timing_compute(peer, peer->current_speed, &p, T, UT);
 		ide_timing_merge(&p, &t, &t, IDE_TIMING_8BIT);
 	}
@@ -215,7 +215,7 @@
 /*
  * Check and handle 80-wire cable presence
  */
-static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
+static void via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
 {
 	int i;
 
@@ -267,7 +267,7 @@
  *	and initialize its drive independent registers.
  */
 
-static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev)
+static unsigned int init_chipset_via82cxxx(struct pci_dev *dev)
 {
 	struct ide_host *host = pci_get_drvdata(dev);
 	struct via82cxxx_dev *vdev = host->host_priv;
@@ -492,6 +492,8 @@
 	.id_table 	= via_pci_tbl,
 	.probe 		= via_init_one,
 	.remove		= __devexit_p(via_remove),
+	.suspend	= ide_pci_suspend,
+	.resume		= ide_pci_resume,
 };
 
 static int __init via_ide_init(void)
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index fa2be26..c3432da 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -669,9 +669,9 @@
 set_timings_mdma(ide_drive_t *drive, int intf_type, u32 *timings, u32 *timings2,
 		 	u8 speed)
 {
+	u16 *id = drive->id;
 	int cycleTime, accessTime = 0, recTime = 0;
 	unsigned accessTicks, recTicks;
-	struct hd_driveid *id = drive->id;
 	struct mdma_timings_t* tm = NULL;
 	int i;
 
@@ -686,8 +686,8 @@
 	}
 
 	/* Check if drive provides explicit DMA cycle time */
-	if ((id->field_valid & 2) && id->eide_dma_time)
-		cycleTime = max_t(int, id->eide_dma_time, cycleTime);
+	if ((id[ATA_ID_FIELD_VALID] & 2) && id[ATA_ID_EIDE_DMA_TIME])
+		cycleTime = max_t(int, id[ATA_ID_EIDE_DMA_TIME], cycleTime);
 
 	/* OHare limits according to some old Apple sources */	
 	if ((intf_type == controller_ohare) && (cycleTime < 150))
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index a8e9e8a..9f1f9163 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -659,3 +659,36 @@
 	pci_disable_device(dev);
 }
 EXPORT_SYMBOL_GPL(ide_pci_remove);
+
+#ifdef CONFIG_PM
+int ide_pci_suspend(struct pci_dev *dev, pm_message_t state)
+{
+	pci_save_state(dev);
+	pci_disable_device(dev);
+	pci_set_power_state(dev, pci_choose_state(dev, state));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ide_pci_suspend);
+
+int ide_pci_resume(struct pci_dev *dev)
+{
+	struct ide_host *host = pci_get_drvdata(dev);
+	int rc;
+
+	pci_set_power_state(dev, PCI_D0);
+
+	rc = pci_enable_device(dev);
+	if (rc)
+		return rc;
+
+	pci_restore_state(dev);
+	pci_set_master(dev);
+
+	if (host->init_chipset)
+		host->init_chipset(dev);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ide_pci_resume);
+#endif
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 7d63f8c..4b47f4e 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -26,6 +26,8 @@
 # Build Intel IOMMU support
 obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
 
+obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
+
 #
 # Some architectures use the generic PCI setup functions
 #
diff --git a/drivers/pci/dma_remapping.h b/drivers/pci/dma_remapping.h
new file mode 100644
index 0000000..bff5c65
--- /dev/null
+++ b/drivers/pci/dma_remapping.h
@@ -0,0 +1,157 @@
+#ifndef _DMA_REMAPPING_H
+#define _DMA_REMAPPING_H
+
+/*
+ * We need a fixed PAGE_SIZE of 4K irrespective of
+ * arch PAGE_SIZE for IOMMU page tables.
+ */
+#define PAGE_SHIFT_4K		(12)
+#define PAGE_SIZE_4K		(1UL << PAGE_SHIFT_4K)
+#define PAGE_MASK_4K		(((u64)-1) << PAGE_SHIFT_4K)
+#define PAGE_ALIGN_4K(addr)	(((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
+
+#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT_4K)
+#define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
+#define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
+
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+	u64	val;
+	u64	rsvd1;
+};
+#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+static inline bool root_present(struct root_entry *root)
+{
+	return (root->val & 1);
+}
+static inline void set_root_present(struct root_entry *root)
+{
+	root->val |= 1;
+}
+static inline void set_root_value(struct root_entry *root, unsigned long value)
+{
+	root->val |= value & PAGE_MASK_4K;
+}
+
+struct context_entry;
+static inline struct context_entry *
+get_context_addr_from_root(struct root_entry *root)
+{
+	return (struct context_entry *)
+		(root_present(root)?phys_to_virt(
+		root->val & PAGE_MASK_4K):
+		NULL);
+}
+
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: aval
+ * 8-23: domain id
+ */
+struct context_entry {
+	u64 lo;
+	u64 hi;
+};
+#define context_present(c) ((c).lo & 1)
+#define context_fault_disable(c) (((c).lo >> 1) & 1)
+#define context_translation_type(c) (((c).lo >> 2) & 3)
+#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
+#define context_address_width(c) ((c).hi &  7)
+#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
+
+#define context_set_present(c) do {(c).lo |= 1;} while (0)
+#define context_set_fault_enable(c) \
+	do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
+#define context_set_translation_type(c, val) \
+	do { \
+		(c).lo &= (((u64)-1) << 4) | 3; \
+		(c).lo |= ((val) & 3) << 2; \
+	} while (0)
+#define CONTEXT_TT_MULTI_LEVEL 0
+#define context_set_address_root(c, val) \
+	do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
+#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
+#define context_set_domain_id(c, val) \
+	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
+#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-11: available
+ * 12-63: Host physcial address
+ */
+struct dma_pte {
+	u64 val;
+};
+#define dma_clear_pte(p)	do {(p).val = 0;} while (0)
+
+#define DMA_PTE_READ (1)
+#define DMA_PTE_WRITE (2)
+
+#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
+#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
+#define dma_set_pte_prot(p, prot) \
+		do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
+#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define dma_set_pte_addr(p, addr) do {\
+		(p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+
+struct intel_iommu;
+
+struct dmar_domain {
+	int	id;			/* domain id */
+	struct intel_iommu *iommu;	/* back pointer to owning iommu */
+
+	struct list_head devices; 	/* all devices' list */
+	struct iova_domain iovad;	/* iova's that belong to this domain */
+
+	struct dma_pte	*pgd;		/* virtual address */
+	spinlock_t	mapping_lock;	/* page table lock */
+	int		gaw;		/* max guest address width */
+
+	/* adjusted guest address width, 0 is level 2 30-bit */
+	int		agaw;
+
+#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
+	int		flags;
+};
+
+/* PCI domain-device relationship */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct list_head global; /* link to global list */
+	u8 bus;			/* PCI bus numer */
+	u8 devfn;		/* PCI devfn number */
+	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
+	struct dmar_domain *domain; /* pointer to domain */
+};
+
+extern int init_dmars(void);
+extern void free_dmar_iommu(struct intel_iommu *iommu);
+
+extern int dmar_disabled;
+
+#ifndef CONFIG_DMAR_GFX_WA
+static inline void iommu_prepare_gfx_mapping(void)
+{
+	return;
+}
+#endif /* !CONFIG_DMAR_GFX_WA */
+
+#endif
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 8bf86ae..bd2c016 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -19,13 +19,16 @@
  * Author: Shaohua Li <shaohua.li@intel.com>
  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  *
- * This file implements early detection/parsing of DMA Remapping Devices
+ * This file implements early detection/parsing of Remapping Devices
  * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
  * tables.
+ *
+ * These routines are used by both DMA-remapping and Interrupt-remapping
  */
 
 #include <linux/pci.h>
 #include <linux/dmar.h>
+#include <linux/timer.h>
 #include "iova.h"
 #include "intel-iommu.h"
 
@@ -37,7 +40,6 @@
  * these units are not supported by the architecture.
  */
 LIST_HEAD(dmar_drhd_units);
-LIST_HEAD(dmar_rmrr_units);
 
 static struct acpi_table_header * __initdata dmar_tbl;
 
@@ -53,11 +55,6 @@
 		list_add(&drhd->list, &dmar_drhd_units);
 }
 
-static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
-{
-	list_add(&rmrr->list, &dmar_rmrr_units);
-}
-
 static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope,
 					   struct pci_dev **dev, u16 segment)
 {
@@ -172,19 +169,37 @@
 	struct acpi_dmar_hardware_unit *drhd;
 	struct dmar_drhd_unit *dmaru;
 	int ret = 0;
-	static int include_all;
 
 	dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL);
 	if (!dmaru)
 		return -ENOMEM;
 
+	dmaru->hdr = header;
 	drhd = (struct acpi_dmar_hardware_unit *)header;
 	dmaru->reg_base_addr = drhd->address;
 	dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
 
+	ret = alloc_iommu(dmaru);
+	if (ret) {
+		kfree(dmaru);
+		return ret;
+	}
+	dmar_register_drhd_unit(dmaru);
+	return 0;
+}
+
+static int __init
+dmar_parse_dev(struct dmar_drhd_unit *dmaru)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	static int include_all;
+	int ret;
+
+	drhd = (struct acpi_dmar_hardware_unit *) dmaru->hdr;
+
 	if (!dmaru->include_all)
 		ret = dmar_parse_dev_scope((void *)(drhd + 1),
-				((void *)drhd) + header->length,
+				((void *)drhd) + drhd->header.length,
 				&dmaru->devices_cnt, &dmaru->devices,
 				drhd->segment);
 	else {
@@ -197,37 +212,59 @@
 		include_all = 1;
 	}
 
-	if (ret || (dmaru->devices_cnt == 0 && !dmaru->include_all))
+	if (ret || (dmaru->devices_cnt == 0 && !dmaru->include_all)) {
+		list_del(&dmaru->list);
 		kfree(dmaru);
-	else
-		dmar_register_drhd_unit(dmaru);
+	}
 	return ret;
 }
 
+#ifdef CONFIG_DMAR
+LIST_HEAD(dmar_rmrr_units);
+
+static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
+{
+	list_add(&rmrr->list, &dmar_rmrr_units);
+}
+
+
 static int __init
 dmar_parse_one_rmrr(struct acpi_dmar_header *header)
 {
 	struct acpi_dmar_reserved_memory *rmrr;
 	struct dmar_rmrr_unit *rmrru;
-	int ret = 0;
 
 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
 	if (!rmrru)
 		return -ENOMEM;
 
+	rmrru->hdr = header;
 	rmrr = (struct acpi_dmar_reserved_memory *)header;
 	rmrru->base_address = rmrr->base_address;
 	rmrru->end_address = rmrr->end_address;
+
+	dmar_register_rmrr_unit(rmrru);
+	return 0;
+}
+
+static int __init
+rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
+{
+	struct acpi_dmar_reserved_memory *rmrr;
+	int ret;
+
+	rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
 	ret = dmar_parse_dev_scope((void *)(rmrr + 1),
-		((void *)rmrr) + header->length,
+		((void *)rmrr) + rmrr->header.length,
 		&rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
 
-	if (ret || (rmrru->devices_cnt == 0))
+	if (ret || (rmrru->devices_cnt == 0)) {
+		list_del(&rmrru->list);
 		kfree(rmrru);
-	else
-		dmar_register_rmrr_unit(rmrru);
+	}
 	return ret;
 }
+#endif
 
 static void __init
 dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
@@ -252,6 +289,7 @@
 	}
 }
 
+
 /**
  * parse_dmar_table - parses the DMA reporting table
  */
@@ -284,7 +322,9 @@
 			ret = dmar_parse_one_drhd(entry_header);
 			break;
 		case ACPI_DMAR_TYPE_RESERVED_MEMORY:
+#ifdef CONFIG_DMAR
 			ret = dmar_parse_one_rmrr(entry_header);
+#endif
 			break;
 		default:
 			printk(KERN_WARNING PREFIX
@@ -300,15 +340,77 @@
 	return ret;
 }
 
+int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
+			  struct pci_dev *dev)
+{
+	int index;
+
+	while (dev) {
+		for (index = 0; index < cnt; index++)
+			if (dev == devices[index])
+				return 1;
+
+		/* Check our parent */
+		dev = dev->bus->self;
+	}
+
+	return 0;
+}
+
+struct dmar_drhd_unit *
+dmar_find_matched_drhd_unit(struct pci_dev *dev)
+{
+	struct dmar_drhd_unit *drhd = NULL;
+
+	list_for_each_entry(drhd, &dmar_drhd_units, list) {
+		if (drhd->include_all || dmar_pci_device_match(drhd->devices,
+						drhd->devices_cnt, dev))
+			return drhd;
+	}
+
+	return NULL;
+}
+
+int __init dmar_dev_scope_init(void)
+{
+	struct dmar_drhd_unit *drhd;
+	int ret = -ENODEV;
+
+	for_each_drhd_unit(drhd) {
+		ret = dmar_parse_dev(drhd);
+		if (ret)
+			return ret;
+	}
+
+#ifdef CONFIG_DMAR
+	{
+		struct dmar_rmrr_unit *rmrr;
+		for_each_rmrr_units(rmrr) {
+			ret = rmrr_parse_dev(rmrr);
+			if (ret)
+				return ret;
+		}
+	}
+#endif
+
+	return ret;
+}
+
 
 int __init dmar_table_init(void)
 {
-
+	static int dmar_table_initialized;
 	int ret;
 
+	if (dmar_table_initialized)
+		return 0;
+
+	dmar_table_initialized = 1;
+
 	ret = parse_dmar_table();
 	if (ret) {
-		printk(KERN_INFO PREFIX "parse DMAR table failure.\n");
+		if (ret != -ENODEV)
+			printk(KERN_INFO PREFIX "parse DMAR table failure.\n");
 		return ret;
 	}
 
@@ -317,9 +419,14 @@
 		return -ENODEV;
 	}
 
+#ifdef CONFIG_DMAR
 	if (list_empty(&dmar_rmrr_units))
 		printk(KERN_INFO PREFIX "No RMRR found\n");
+#endif
 
+#ifdef CONFIG_INTR_REMAP
+	parse_ioapics_under_ir();
+#endif
 	return 0;
 }
 
@@ -341,3 +448,255 @@
 
 	return (ACPI_SUCCESS(status) ? 1 : 0);
 }
+
+void __init detect_intel_iommu(void)
+{
+	int ret;
+
+	ret = early_dmar_detect();
+
+#ifdef CONFIG_DMAR
+	{
+		struct acpi_table_dmar *dmar;
+		/*
+		 * for now we will disable dma-remapping when interrupt
+		 * remapping is enabled.
+		 * When support for queued invalidation for IOTLB invalidation
+		 * is added, we will not need this any more.
+		 */
+		dmar = (struct acpi_table_dmar *) dmar_tbl;
+		if (ret && cpu_has_x2apic && dmar->flags & 0x1) {
+			printk(KERN_INFO
+			       "Queued invalidation will be enabled to support "
+			       "x2apic and Intr-remapping.\n");
+			printk(KERN_INFO
+			       "Disabling IOMMU detection, because of missing "
+			       "queued invalidation support for IOTLB "
+			       "invalidation\n");
+			printk(KERN_INFO
+			       "Use \"nox2apic\", if you want to use Intel "
+			       " IOMMU for DMA-remapping and don't care about "
+			       " x2apic support\n");
+
+			dmar_disabled = 1;
+			return;
+		}
+
+		if (ret && !no_iommu && !iommu_detected && !swiotlb &&
+		    !dmar_disabled)
+			iommu_detected = 1;
+	}
+#endif
+}
+
+
+int alloc_iommu(struct dmar_drhd_unit *drhd)
+{
+	struct intel_iommu *iommu;
+	int map_size;
+	u32 ver;
+	static int iommu_allocated = 0;
+
+	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return -ENOMEM;
+
+	iommu->seq_id = iommu_allocated++;
+
+	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
+	if (!iommu->reg) {
+		printk(KERN_ERR "IOMMU: can't map the region\n");
+		goto error;
+	}
+	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
+	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
+
+	/* the registers might be more than one page */
+	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
+		cap_max_fault_reg_offset(iommu->cap));
+	map_size = PAGE_ALIGN_4K(map_size);
+	if (map_size > PAGE_SIZE_4K) {
+		iounmap(iommu->reg);
+		iommu->reg = ioremap(drhd->reg_base_addr, map_size);
+		if (!iommu->reg) {
+			printk(KERN_ERR "IOMMU: can't map the region\n");
+			goto error;
+		}
+	}
+
+	ver = readl(iommu->reg + DMAR_VER_REG);
+	pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
+		drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
+		iommu->cap, iommu->ecap);
+
+	spin_lock_init(&iommu->register_lock);
+
+	drhd->iommu = iommu;
+	return 0;
+error:
+	kfree(iommu);
+	return -1;
+}
+
+void free_iommu(struct intel_iommu *iommu)
+{
+	if (!iommu)
+		return;
+
+#ifdef CONFIG_DMAR
+	free_dmar_iommu(iommu);
+#endif
+
+	if (iommu->reg)
+		iounmap(iommu->reg);
+	kfree(iommu);
+}
+
+/*
+ * Reclaim all the submitted descriptors which have completed its work.
+ */
+static inline void reclaim_free_desc(struct q_inval *qi)
+{
+	while (qi->desc_status[qi->free_tail] == QI_DONE) {
+		qi->desc_status[qi->free_tail] = QI_FREE;
+		qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
+		qi->free_cnt++;
+	}
+}
+
+/*
+ * Submit the queued invalidation descriptor to the remapping
+ * hardware unit and wait for its completion.
+ */
+void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
+{
+	struct q_inval *qi = iommu->qi;
+	struct qi_desc *hw, wait_desc;
+	int wait_index, index;
+	unsigned long flags;
+
+	if (!qi)
+		return;
+
+	hw = qi->desc;
+
+	spin_lock(&qi->q_lock);
+	while (qi->free_cnt < 3) {
+		spin_unlock(&qi->q_lock);
+		cpu_relax();
+		spin_lock(&qi->q_lock);
+	}
+
+	index = qi->free_head;
+	wait_index = (index + 1) % QI_LENGTH;
+
+	qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE;
+
+	hw[index] = *desc;
+
+	wait_desc.low = QI_IWD_STATUS_DATA(2) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
+	wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]);
+
+	hw[wait_index] = wait_desc;
+
+	__iommu_flush_cache(iommu, &hw[index], sizeof(struct qi_desc));
+	__iommu_flush_cache(iommu, &hw[wait_index], sizeof(struct qi_desc));
+
+	qi->free_head = (qi->free_head + 2) % QI_LENGTH;
+	qi->free_cnt -= 2;
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+	/*
+	 * update the HW tail register indicating the presence of
+	 * new descriptors.
+	 */
+	writel(qi->free_head << 4, iommu->reg + DMAR_IQT_REG);
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+	while (qi->desc_status[wait_index] != QI_DONE) {
+		spin_unlock(&qi->q_lock);
+		cpu_relax();
+		spin_lock(&qi->q_lock);
+	}
+
+	qi->desc_status[index] = QI_DONE;
+
+	reclaim_free_desc(qi);
+	spin_unlock(&qi->q_lock);
+}
+
+/*
+ * Flush the global interrupt entry cache.
+ */
+void qi_global_iec(struct intel_iommu *iommu)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_IEC_TYPE;
+	desc.high = 0;
+
+	qi_submit_sync(&desc, iommu);
+}
+
+/*
+ * Enable Queued Invalidation interface. This is a must to support
+ * interrupt-remapping. Also used by DMA-remapping, which replaces
+ * register based IOTLB invalidation.
+ */
+int dmar_enable_qi(struct intel_iommu *iommu)
+{
+	u32 cmd, sts;
+	unsigned long flags;
+	struct q_inval *qi;
+
+	if (!ecap_qis(iommu->ecap))
+		return -ENOENT;
+
+	/*
+	 * queued invalidation is already setup and enabled.
+	 */
+	if (iommu->qi)
+		return 0;
+
+	iommu->qi = kmalloc(sizeof(*qi), GFP_KERNEL);
+	if (!iommu->qi)
+		return -ENOMEM;
+
+	qi = iommu->qi;
+
+	qi->desc = (void *)(get_zeroed_page(GFP_KERNEL));
+	if (!qi->desc) {
+		kfree(qi);
+		iommu->qi = 0;
+		return -ENOMEM;
+	}
+
+	qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_KERNEL);
+	if (!qi->desc_status) {
+		free_page((unsigned long) qi->desc);
+		kfree(qi);
+		iommu->qi = 0;
+		return -ENOMEM;
+	}
+
+	qi->free_head = qi->free_tail = 0;
+	qi->free_cnt = QI_LENGTH;
+
+	spin_lock_init(&qi->q_lock);
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+	/* write zero to the tail reg */
+	writel(0, iommu->reg + DMAR_IQT_REG);
+
+	dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc));
+
+	cmd = iommu->gcmd | DMA_GCMD_QIE;
+	iommu->gcmd |= DMA_GCMD_QIE;
+	writel(cmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+	return 0;
+}
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 6c4c1c3..389fdd6 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -49,8 +49,6 @@
 
 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
 
-#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
-
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
 
@@ -58,8 +56,6 @@
 
 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 
-static struct intel_iommu *g_iommus;
-
 #define HIGH_WATER_MARK 250
 struct deferred_flush_tables {
 	int next;
@@ -185,13 +181,6 @@
 	kmem_cache_free(iommu_iova_cache, iova);
 }
 
-static inline void __iommu_flush_cache(
-	struct intel_iommu *iommu, void *addr, int size)
-{
-	if (!ecap_coherent(iommu->ecap))
-		clflush_cache_range(addr, size);
-}
-
 /* Gets context entry for a given bus and devfn */
 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 		u8 bus, u8 devfn)
@@ -488,19 +477,6 @@
 	return 0;
 }
 
-#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
-{\
-	cycles_t start_time = get_cycles();\
-	while (1) {\
-		sts = op (iommu->reg + offset);\
-		if (cond)\
-			break;\
-		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
-			panic("DMAR hardware is malfunctioning\n");\
-		cpu_relax();\
-	}\
-}
-
 static void iommu_set_root_entry(struct intel_iommu *iommu)
 {
 	void *addr;
@@ -990,6 +966,8 @@
 		return -ENOMEM;
 	}
 
+	spin_lock_init(&iommu->lock);
+
 	/*
 	 * if Caching mode is set, then invalid translations are tagged
 	 * with domainid 0. Hence we need to pre-allocate it.
@@ -998,62 +976,15 @@
 		set_bit(0, iommu->domain_ids);
 	return 0;
 }
-static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
-					struct dmar_drhd_unit *drhd)
-{
-	int ret;
-	int map_size;
-	u32 ver;
 
-	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
-	if (!iommu->reg) {
-		printk(KERN_ERR "IOMMU: can't map the region\n");
-		goto error;
-	}
-	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
-	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
-
-	/* the registers might be more than one page */
-	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
-		cap_max_fault_reg_offset(iommu->cap));
-	map_size = PAGE_ALIGN_4K(map_size);
-	if (map_size > PAGE_SIZE_4K) {
-		iounmap(iommu->reg);
-		iommu->reg = ioremap(drhd->reg_base_addr, map_size);
-		if (!iommu->reg) {
-			printk(KERN_ERR "IOMMU: can't map the region\n");
-			goto error;
-		}
-	}
-
-	ver = readl(iommu->reg + DMAR_VER_REG);
-	pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
-		drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
-		iommu->cap, iommu->ecap);
-	ret = iommu_init_domains(iommu);
-	if (ret)
-		goto error_unmap;
-	spin_lock_init(&iommu->lock);
-	spin_lock_init(&iommu->register_lock);
-
-	drhd->iommu = iommu;
-	return iommu;
-error_unmap:
-	iounmap(iommu->reg);
-error:
-	kfree(iommu);
-	return NULL;
-}
 
 static void domain_exit(struct dmar_domain *domain);
-static void free_iommu(struct intel_iommu *iommu)
+
+void free_dmar_iommu(struct intel_iommu *iommu)
 {
 	struct dmar_domain *domain;
 	int i;
 
-	if (!iommu)
-		return;
-
 	i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
 	for (; i < cap_ndoms(iommu->cap); ) {
 		domain = iommu->domains[i];
@@ -1078,10 +1009,6 @@
 
 	/* free context mapping */
 	free_context_table(iommu);
-
-	if (iommu->reg)
-		iounmap(iommu->reg);
-	kfree(iommu);
 }
 
 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
@@ -1426,37 +1353,6 @@
 	return NULL;
 }
 
-static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
-     struct pci_dev *dev)
-{
-	int index;
-
-	while (dev) {
-		for (index = 0; index < cnt; index++)
-			if (dev == devices[index])
-				return 1;
-
-		/* Check our parent */
-		dev = dev->bus->self;
-	}
-
-	return 0;
-}
-
-static struct dmar_drhd_unit *
-dmar_find_matched_drhd_unit(struct pci_dev *dev)
-{
-	struct dmar_drhd_unit *drhd = NULL;
-
-	list_for_each_entry(drhd, &dmar_drhd_units, list) {
-		if (drhd->include_all || dmar_pci_device_match(drhd->devices,
-						drhd->devices_cnt, dev))
-			return drhd;
-	}
-
-	return NULL;
-}
-
 /* domain is initialized */
 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
 {
@@ -1729,8 +1625,6 @@
 	 * endfor
 	 */
 	for_each_drhd_unit(drhd) {
-		if (drhd->ignored)
-			continue;
 		g_num_of_iommus++;
 		/*
 		 * lock not needed as this is only incremented in the single
@@ -1739,12 +1633,6 @@
 		 */
 	}
 
-	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
-	if (!g_iommus) {
-		ret = -ENOMEM;
-		goto error;
-	}
-
 	deferred_flush = kzalloc(g_num_of_iommus *
 		sizeof(struct deferred_flush_tables), GFP_KERNEL);
 	if (!deferred_flush) {
@@ -1752,16 +1640,15 @@
 		goto error;
 	}
 
-	i = 0;
 	for_each_drhd_unit(drhd) {
 		if (drhd->ignored)
 			continue;
-		iommu = alloc_iommu(&g_iommus[i], drhd);
-		i++;
-		if (!iommu) {
-			ret = -ENOMEM;
+
+		iommu = drhd->iommu;
+
+		ret = iommu_init_domains(iommu);
+		if (ret)
 			goto error;
-		}
 
 		/*
 		 * TBD:
@@ -1845,7 +1732,6 @@
 		iommu = drhd->iommu;
 		free_iommu(iommu);
 	}
-	kfree(g_iommus);
 	return ret;
 }
 
@@ -2002,7 +1888,10 @@
 	/* just flush them all */
 	for (i = 0; i < g_num_of_iommus; i++) {
 		if (deferred_flush[i].next) {
-			iommu_flush_iotlb_global(&g_iommus[i], 0);
+			struct intel_iommu *iommu =
+				deferred_flush[i].domain[0]->iommu;
+
+			iommu_flush_iotlb_global(iommu, 0);
 			for (j = 0; j < deferred_flush[i].next; j++) {
 				__free_iova(&deferred_flush[i].domain[j]->iovad,
 						deferred_flush[i].iova[j]);
@@ -2032,7 +1921,8 @@
 	if (list_size == HIGH_WATER_MARK)
 		flush_unmaps();
 
-	iommu_id = dom->iommu - g_iommus;
+	iommu_id = dom->iommu->seq_id;
+
 	next = deferred_flush[iommu_id].next;
 	deferred_flush[iommu_id].domain[next] = dom;
 	deferred_flush[iommu_id].iova[next] = iova;
@@ -2348,38 +2238,6 @@
 
 }
 
-static int blacklist_iommu(const struct dmi_system_id *id)
-{
-	printk(KERN_INFO "%s detected; disabling IOMMU\n",
-	       id->ident);
-	dmar_disabled = 1;
-	return 0;
-}
-
-static struct dmi_system_id __initdata intel_iommu_dmi_table[] = {
-	{	/* Some DG33BU BIOS revisions advertised non-existent VT-d */
-		.callback = blacklist_iommu,
-		.ident = "Intel DG33BU",
-		{	DMI_MATCH(DMI_BOARD_VENDOR, "Intel Corporation"),
-			DMI_MATCH(DMI_BOARD_NAME, "DG33BU"),
-		}
-	},
-	{ }
-};
-
-
-void __init detect_intel_iommu(void)
-{
-	if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
-		return;
-	if (early_dmar_detect()) {
-		dmi_check_system(intel_iommu_dmi_table);
-		if (dmar_disabled)
-			return;
-		iommu_detected = 1;
-	}
-}
-
 static void __init init_no_remapping_devices(void)
 {
 	struct dmar_drhd_unit *drhd;
@@ -2426,12 +2284,19 @@
 {
 	int ret = 0;
 
-	if (no_iommu || swiotlb || dmar_disabled)
-		return -ENODEV;
-
 	if (dmar_table_init())
 		return 	-ENODEV;
 
+	if (dmar_dev_scope_init())
+		return 	-ENODEV;
+
+	/*
+	 * Check the need for DMA-remapping initialization now.
+	 * Above initialization will also be used by Interrupt-remapping.
+	 */
+	if (no_iommu || swiotlb || dmar_disabled)
+		return -ENODEV;
+
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h
index afc0ad9..2142c01 100644
--- a/drivers/pci/intel-iommu.h
+++ b/drivers/pci/intel-iommu.h
@@ -27,19 +27,8 @@
 #include <linux/sysdev.h>
 #include "iova.h"
 #include <linux/io.h>
-
-/*
- * We need a fixed PAGE_SIZE of 4K irrespective of
- * arch PAGE_SIZE for IOMMU page tables.
- */
-#define PAGE_SHIFT_4K		(12)
-#define PAGE_SIZE_4K		(1UL << PAGE_SHIFT_4K)
-#define PAGE_MASK_4K		(((u64)-1) << PAGE_SHIFT_4K)
-#define PAGE_ALIGN_4K(addr)	(((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
-
-#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT_4K)
-#define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
-#define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
+#include <asm/cacheflush.h>
+#include "dma_remapping.h"
 
 /*
  * Intel IOMMU register specification per version 1.0 public spec.
@@ -63,6 +52,11 @@
 #define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
 #define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
 #define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
+#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
+#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
+#define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
+#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
 
 #define OFFSET_STRIDE		(9)
 /*
@@ -126,6 +120,10 @@
 #define ecap_max_iotlb_offset(e) \
 	(ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
 #define ecap_coherent(e)	((e) & 0x1)
+#define ecap_qis(e)		((e) & 0x2)
+#define ecap_eim_support(e)	((e >> 4) & 0x1)
+#define ecap_ir_support(e)	((e >> 3) & 0x1)
+#define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
 
 
 /* IOTLB_REG */
@@ -141,6 +139,17 @@
 #define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
 #define DMA_TLB_MAX_SIZE (0x3f)
 
+/* INVALID_DESC */
+#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 3)
+#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 3)
+#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 3)
+#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
+#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
+#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
+#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
+#define DMA_ID_TLB_ADDR(addr)	(addr)
+#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)
+
 /* PMEN_REG */
 #define DMA_PMEN_EPM (((u32)1)<<31)
 #define DMA_PMEN_PRS (((u32)1)<<0)
@@ -151,6 +160,9 @@
 #define DMA_GCMD_SFL (((u32)1) << 29)
 #define DMA_GCMD_EAFL (((u32)1) << 28)
 #define DMA_GCMD_WBF (((u32)1) << 27)
+#define DMA_GCMD_QIE (((u32)1) << 26)
+#define DMA_GCMD_SIRTP (((u32)1) << 24)
+#define DMA_GCMD_IRE (((u32) 1) << 25)
 
 /* GSTS_REG */
 #define DMA_GSTS_TES (((u32)1) << 31)
@@ -158,6 +170,9 @@
 #define DMA_GSTS_FLS (((u32)1) << 29)
 #define DMA_GSTS_AFLS (((u32)1) << 28)
 #define DMA_GSTS_WBFS (((u32)1) << 27)
+#define DMA_GSTS_QIES (((u32)1) << 26)
+#define DMA_GSTS_IRTPS (((u32)1) << 24)
+#define DMA_GSTS_IRES (((u32)1) << 25)
 
 /* CCMD_REG */
 #define DMA_CCMD_ICC (((u64)1) << 63)
@@ -187,158 +202,106 @@
 #define dma_frcd_source_id(c) (c & 0xffff)
 #define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
 
-/*
- * 0: Present
- * 1-11: Reserved
- * 12-63: Context Ptr (12 - (haw-1))
- * 64-127: Reserved
- */
-struct root_entry {
-	u64	val;
-	u64	rsvd1;
-};
-#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
-static inline bool root_present(struct root_entry *root)
-{
-	return (root->val & 1);
-}
-static inline void set_root_present(struct root_entry *root)
-{
-	root->val |= 1;
-}
-static inline void set_root_value(struct root_entry *root, unsigned long value)
-{
-	root->val |= value & PAGE_MASK_4K;
+#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
+
+#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
+{\
+	cycles_t start_time = get_cycles();\
+	while (1) {\
+		sts = op (iommu->reg + offset);\
+		if (cond)\
+			break;\
+		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
+			panic("DMAR hardware is malfunctioning\n");\
+		cpu_relax();\
+	}\
 }
 
-struct context_entry;
-static inline struct context_entry *
-get_context_addr_from_root(struct root_entry *root)
-{
-	return (struct context_entry *)
-		(root_present(root)?phys_to_virt(
-		root->val & PAGE_MASK_4K):
-		NULL);
-}
+#define QI_LENGTH	256	/* queue length */
 
-/*
- * low 64 bits:
- * 0: present
- * 1: fault processing disable
- * 2-3: translation type
- * 12-63: address space root
- * high 64 bits:
- * 0-2: address width
- * 3-6: aval
- * 8-23: domain id
- */
-struct context_entry {
-	u64 lo;
-	u64 hi;
-};
-#define context_present(c) ((c).lo & 1)
-#define context_fault_disable(c) (((c).lo >> 1) & 1)
-#define context_translation_type(c) (((c).lo >> 2) & 3)
-#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
-#define context_address_width(c) ((c).hi &  7)
-#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
-
-#define context_set_present(c) do {(c).lo |= 1;} while (0)
-#define context_set_fault_enable(c) \
-	do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
-#define context_set_translation_type(c, val) \
-	do { \
-		(c).lo &= (((u64)-1) << 4) | 3; \
-		(c).lo |= ((val) & 3) << 2; \
-	} while (0)
-#define CONTEXT_TT_MULTI_LEVEL 0
-#define context_set_address_root(c, val) \
-	do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
-#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
-#define context_set_domain_id(c, val) \
-	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
-#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
-
-/*
- * 0: readable
- * 1: writable
- * 2-6: reserved
- * 7: super page
- * 8-11: available
- * 12-63: Host physcial address
- */
-struct dma_pte {
-	u64 val;
-};
-#define dma_clear_pte(p)	do {(p).val = 0;} while (0)
-
-#define DMA_PTE_READ (1)
-#define DMA_PTE_WRITE (2)
-
-#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
-#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
-#define dma_set_pte_prot(p, prot) \
-		do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
-#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
-#define dma_set_pte_addr(p, addr) do {\
-		(p).val |= ((addr) & PAGE_MASK_4K); } while (0)
-#define dma_pte_present(p) (((p).val & 3) != 0)
-
-struct intel_iommu;
-
-struct dmar_domain {
-	int	id;			/* domain id */
-	struct intel_iommu *iommu;	/* back pointer to owning iommu */
-
-	struct list_head devices; 	/* all devices' list */
-	struct iova_domain iovad;	/* iova's that belong to this domain */
-
-	struct dma_pte	*pgd;		/* virtual address */
-	spinlock_t	mapping_lock;	/* page table lock */
-	int		gaw;		/* max guest address width */
-
-	/* adjusted guest address width, 0 is level 2 30-bit */
-	int		agaw;
-
-#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
-	int		flags;
+enum {
+	QI_FREE,
+	QI_IN_USE,
+	QI_DONE
 };
 
-/* PCI domain-device relationship */
-struct device_domain_info {
-	struct list_head link;	/* link to domain siblings */
-	struct list_head global; /* link to global list */
-	u8 bus;			/* PCI bus numer */
-	u8 devfn;		/* PCI devfn number */
-	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
-	struct dmar_domain *domain; /* pointer to domain */
+#define QI_CC_TYPE		0x1
+#define QI_IOTLB_TYPE		0x2
+#define QI_DIOTLB_TYPE		0x3
+#define QI_IEC_TYPE		0x4
+#define QI_IWD_TYPE		0x5
+
+#define QI_IEC_SELECTIVE	(((u64)1) << 4)
+#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
+#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))
+
+#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
+#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
+
+struct qi_desc {
+	u64 low, high;
 };
 
-extern int init_dmars(void);
+struct q_inval {
+	spinlock_t      q_lock;
+	struct qi_desc  *desc;          /* invalidation queue */
+	int             *desc_status;   /* desc status */
+	int             free_head;      /* first free entry */
+	int             free_tail;      /* last free entry */
+	int             free_cnt;
+};
+
+#ifdef CONFIG_INTR_REMAP
+/* 1MB - maximum possible interrupt remapping table size */
+#define INTR_REMAP_PAGE_ORDER	8
+#define INTR_REMAP_TABLE_REG_SIZE	0xf
+
+#define INTR_REMAP_TABLE_ENTRIES	65536
+
+struct ir_table {
+	struct irte *base;
+};
+#endif
 
 struct intel_iommu {
 	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
 	u64		cap;
 	u64		ecap;
-	unsigned long 	*domain_ids; /* bitmap of domains */
-	struct dmar_domain **domains; /* ptr to domains */
 	int		seg;
 	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
-	spinlock_t	lock; /* protect context, domain ids */
 	spinlock_t	register_lock; /* protect register handling */
+	int		seq_id;	/* sequence id of the iommu */
+
+#ifdef CONFIG_DMAR
+	unsigned long 	*domain_ids; /* bitmap of domains */
+	struct dmar_domain **domains; /* ptr to domains */
+	spinlock_t	lock; /* protect context, domain ids */
 	struct root_entry *root_entry; /* virtual address */
 
 	unsigned int irq;
 	unsigned char name[7];    /* Device Name */
 	struct msi_msg saved_msg;
 	struct sys_device sysdev;
+#endif
+	struct q_inval  *qi;            /* Queued invalidation info */
+#ifdef CONFIG_INTR_REMAP
+	struct ir_table *ir_table;	/* Interrupt remapping info */
+#endif
 };
 
-#ifndef CONFIG_DMAR_GFX_WA
-static inline void iommu_prepare_gfx_mapping(void)
+static inline void __iommu_flush_cache(
+	struct intel_iommu *iommu, void *addr, int size)
 {
-	return;
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(addr, size);
 }
-#endif /* !CONFIG_DMAR_GFX_WA */
 
+extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+
+extern int alloc_iommu(struct dmar_drhd_unit *drhd);
+extern void free_iommu(struct intel_iommu *iommu);
+extern int dmar_enable_qi(struct intel_iommu *iommu);
+extern void qi_global_iec(struct intel_iommu *iommu);
+
+extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 #endif
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
new file mode 100644
index 0000000..bb642cc
--- /dev/null
+++ b/drivers/pci/intr_remapping.c
@@ -0,0 +1,471 @@
+#include <linux/dmar.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <asm/io_apic.h>
+#include "intel-iommu.h"
+#include "intr_remapping.h"
+
+static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
+static int ir_ioapic_num;
+int intr_remapping_enabled;
+
+static struct {
+	struct intel_iommu *iommu;
+	u16 irte_index;
+	u16 sub_handle;
+	u8  irte_mask;
+} irq_2_iommu[NR_IRQS];
+
+static DEFINE_SPINLOCK(irq_2_ir_lock);
+
+int irq_remapped(int irq)
+{
+	if (irq > NR_IRQS)
+		return 0;
+
+	if (!irq_2_iommu[irq].iommu)
+		return 0;
+
+	return 1;
+}
+
+int get_irte(int irq, struct irte *entry)
+{
+	int index;
+
+	if (!entry || irq > NR_IRQS)
+		return -1;
+
+	spin_lock(&irq_2_ir_lock);
+	if (!irq_2_iommu[irq].iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		return -1;
+	}
+
+	index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+	*entry = *(irq_2_iommu[irq].iommu->ir_table->base + index);
+
+	spin_unlock(&irq_2_ir_lock);
+	return 0;
+}
+
+int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
+{
+	struct ir_table *table = iommu->ir_table;
+	u16 index, start_index;
+	unsigned int mask = 0;
+	int i;
+
+	if (!count)
+		return -1;
+
+	/*
+	 * start the IRTE search from index 0.
+	 */
+	index = start_index = 0;
+
+	if (count > 1) {
+		count = __roundup_pow_of_two(count);
+		mask = ilog2(count);
+	}
+
+	if (mask > ecap_max_handle_mask(iommu->ecap)) {
+		printk(KERN_ERR
+		       "Requested mask %x exceeds the max invalidation handle"
+		       " mask value %Lx\n", mask,
+		       ecap_max_handle_mask(iommu->ecap));
+		return -1;
+	}
+
+	spin_lock(&irq_2_ir_lock);
+	do {
+		for (i = index; i < index + count; i++)
+			if  (table->base[i].present)
+				break;
+		/* empty index found */
+		if (i == index + count)
+			break;
+
+		index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
+
+		if (index == start_index) {
+			spin_unlock(&irq_2_ir_lock);
+			printk(KERN_ERR "can't allocate an IRTE\n");
+			return -1;
+		}
+	} while (1);
+
+	for (i = index; i < index + count; i++)
+		table->base[i].present = 1;
+
+	irq_2_iommu[irq].iommu = iommu;
+	irq_2_iommu[irq].irte_index =  index;
+	irq_2_iommu[irq].sub_handle = 0;
+	irq_2_iommu[irq].irte_mask = mask;
+
+	spin_unlock(&irq_2_ir_lock);
+
+	return index;
+}
+
+static void qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
+		   | QI_IEC_SELECTIVE;
+	desc.high = 0;
+
+	qi_submit_sync(&desc, iommu);
+}
+
+int map_irq_to_irte_handle(int irq, u16 *sub_handle)
+{
+	int index;
+
+	spin_lock(&irq_2_ir_lock);
+	if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		return -1;
+	}
+
+	*sub_handle = irq_2_iommu[irq].sub_handle;
+	index = irq_2_iommu[irq].irte_index;
+	spin_unlock(&irq_2_ir_lock);
+	return index;
+}
+
+int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
+{
+	spin_lock(&irq_2_ir_lock);
+	if (irq >= NR_IRQS || irq_2_iommu[irq].iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		return -1;
+	}
+
+	irq_2_iommu[irq].iommu = iommu;
+	irq_2_iommu[irq].irte_index = index;
+	irq_2_iommu[irq].sub_handle = subhandle;
+	irq_2_iommu[irq].irte_mask = 0;
+
+	spin_unlock(&irq_2_ir_lock);
+
+	return 0;
+}
+
+int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
+{
+	spin_lock(&irq_2_ir_lock);
+	if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		return -1;
+	}
+
+	irq_2_iommu[irq].iommu = NULL;
+	irq_2_iommu[irq].irte_index = 0;
+	irq_2_iommu[irq].sub_handle = 0;
+	irq_2_iommu[irq].irte_mask = 0;
+
+	spin_unlock(&irq_2_ir_lock);
+
+	return 0;
+}
+
+int modify_irte(int irq, struct irte *irte_modified)
+{
+	int index;
+	struct irte *irte;
+	struct intel_iommu *iommu;
+
+	spin_lock(&irq_2_ir_lock);
+	if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		return -1;
+	}
+
+	iommu = irq_2_iommu[irq].iommu;
+
+	index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+	irte = &iommu->ir_table->base[index];
+
+	set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1));
+	__iommu_flush_cache(iommu, irte, sizeof(*irte));
+
+	qi_flush_iec(iommu, index, 0);
+
+	spin_unlock(&irq_2_ir_lock);
+	return 0;
+}
+
+int flush_irte(int irq)
+{
+	int index;
+	struct intel_iommu *iommu;
+
+	spin_lock(&irq_2_ir_lock);
+	if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		return -1;
+	}
+
+	iommu = irq_2_iommu[irq].iommu;
+
+	index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+
+	qi_flush_iec(iommu, index, irq_2_iommu[irq].irte_mask);
+	spin_unlock(&irq_2_ir_lock);
+
+	return 0;
+}
+
+struct intel_iommu *map_ioapic_to_ir(int apic)
+{
+	int i;
+
+	for (i = 0; i < MAX_IO_APICS; i++)
+		if (ir_ioapic[i].id == apic)
+			return ir_ioapic[i].iommu;
+	return NULL;
+}
+
+struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+{
+	struct dmar_drhd_unit *drhd;
+
+	drhd = dmar_find_matched_drhd_unit(dev);
+	if (!drhd)
+		return NULL;
+
+	return drhd->iommu;
+}
+
+int free_irte(int irq)
+{
+	int index, i;
+	struct irte *irte;
+	struct intel_iommu *iommu;
+
+	spin_lock(&irq_2_ir_lock);
+	if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		return -1;
+	}
+
+	iommu = irq_2_iommu[irq].iommu;
+
+	index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+	irte = &iommu->ir_table->base[index];
+
+	if (!irq_2_iommu[irq].sub_handle) {
+		for (i = 0; i < (1 << irq_2_iommu[irq].irte_mask); i++)
+			set_64bit((unsigned long *)irte, 0);
+		qi_flush_iec(iommu, index, irq_2_iommu[irq].irte_mask);
+	}
+
+	irq_2_iommu[irq].iommu = NULL;
+	irq_2_iommu[irq].irte_index = 0;
+	irq_2_iommu[irq].sub_handle = 0;
+	irq_2_iommu[irq].irte_mask = 0;
+
+	spin_unlock(&irq_2_ir_lock);
+
+	return 0;
+}
+
+static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
+{
+	u64 addr;
+	u32 cmd, sts;
+	unsigned long flags;
+
+	addr = virt_to_phys((void *)iommu->ir_table->base);
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+
+	dmar_writeq(iommu->reg + DMAR_IRTA_REG,
+		    (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
+
+	/* Set interrupt-remapping table pointer */
+	cmd = iommu->gcmd | DMA_GCMD_SIRTP;
+	writel(cmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_IRTPS), sts);
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+	/*
+	 * global invalidation of interrupt entry cache before enabling
+	 * interrupt-remapping.
+	 */
+	qi_global_iec(iommu);
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+
+	/* Enable interrupt-remapping */
+	cmd = iommu->gcmd | DMA_GCMD_IRE;
+	iommu->gcmd |= DMA_GCMD_IRE;
+	writel(cmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_IRES), sts);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+
+static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
+{
+	struct ir_table *ir_table;
+	struct page *pages;
+
+	ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
+					     GFP_KERNEL);
+
+	if (!iommu->ir_table)
+		return -ENOMEM;
+
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, INTR_REMAP_PAGE_ORDER);
+
+	if (!pages) {
+		printk(KERN_ERR "failed to allocate pages of order %d\n",
+		       INTR_REMAP_PAGE_ORDER);
+		kfree(iommu->ir_table);
+		return -ENOMEM;
+	}
+
+	ir_table->base = page_address(pages);
+
+	iommu_set_intr_remapping(iommu, mode);
+	return 0;
+}
+
+int __init enable_intr_remapping(int eim)
+{
+	struct dmar_drhd_unit *drhd;
+	int setup = 0;
+
+	/*
+	 * check for the Interrupt-remapping support
+	 */
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		if (eim && !ecap_eim_support(iommu->ecap)) {
+			printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
+			       " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
+			return -1;
+		}
+	}
+
+	/*
+	 * Enable queued invalidation for all the DRHD's.
+	 */
+	for_each_drhd_unit(drhd) {
+		int ret;
+		struct intel_iommu *iommu = drhd->iommu;
+		ret = dmar_enable_qi(iommu);
+
+		if (ret) {
+			printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
+			       " invalidation, ecap %Lx, ret %d\n",
+			       drhd->reg_base_addr, iommu->ecap, ret);
+			return -1;
+		}
+	}
+
+	/*
+	 * Setup Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		if (setup_intr_remapping(iommu, eim))
+			goto error;
+
+		setup = 1;
+	}
+
+	if (!setup)
+		goto error;
+
+	intr_remapping_enabled = 1;
+
+	return 0;
+
+error:
+	/*
+	 * handle error condition gracefully here!
+	 */
+	return -1;
+}
+
+static int ir_parse_ioapic_scope(struct acpi_dmar_header *header,
+				 struct intel_iommu *iommu)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	struct acpi_dmar_device_scope *scope;
+	void *start, *end;
+
+	drhd = (struct acpi_dmar_hardware_unit *)header;
+
+	start = (void *)(drhd + 1);
+	end = ((void *)drhd) + header->length;
+
+	while (start < end) {
+		scope = start;
+		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
+			if (ir_ioapic_num == MAX_IO_APICS) {
+				printk(KERN_WARNING "Exceeded Max IO APICS\n");
+				return -1;
+			}
+
+			printk(KERN_INFO "IOAPIC id %d under DRHD base"
+			       " 0x%Lx\n", scope->enumeration_id,
+			       drhd->address);
+
+			ir_ioapic[ir_ioapic_num].iommu = iommu;
+			ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;
+			ir_ioapic_num++;
+		}
+		start += scope->length;
+	}
+
+	return 0;
+}
+
+/*
+ * Finds the assocaition between IOAPIC's and its Interrupt-remapping
+ * hardware unit.
+ */
+int __init parse_ioapics_under_ir(void)
+{
+	struct dmar_drhd_unit *drhd;
+	int ir_supported = 0;
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (ecap_ir_support(iommu->ecap)) {
+			if (ir_parse_ioapic_scope(drhd->hdr, iommu))
+				return -1;
+
+			ir_supported = 1;
+		}
+	}
+
+	if (ir_supported && ir_ioapic_num != nr_ioapics) {
+		printk(KERN_WARNING
+		       "Not all IO-APIC's listed under remapping hardware\n");
+		return -1;
+	}
+
+	return ir_supported;
+}
diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h
new file mode 100644
index 0000000..05f2635
--- /dev/null
+++ b/drivers/pci/intr_remapping.h
@@ -0,0 +1,8 @@
+#include "intel-iommu.h"
+
+struct ioapic_scope {
+	struct intel_iommu *iommu;
+	unsigned int id;
+};
+
+#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 81c16cb..90212ac 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -40,7 +40,6 @@
 #include <linux/ioport.h>
 #include <linux/blkdev.h>
 #include <linux/errno.h>
-#include <linux/hdreg.h>
 #include <linux/slab.h>
 #include <linux/ide.h>
 #include <linux/scatterlist.h>
@@ -131,50 +130,6 @@
 	return scsihost_to_idescsi(ide_drive->driver_data);
 }
 
-/*
- *	PIO data transfer routine using the scatter gather table.
- */
-static void ide_scsi_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				unsigned int bcount, int write)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	xfer_func_t *xf = write ? tp_ops->output_data : tp_ops->input_data;
-	char *buf;
-	int count;
-
-	while (bcount) {
-		count = min(pc->sg->length - pc->b_count, bcount);
-		if (PageHighMem(sg_page(pc->sg))) {
-			unsigned long flags;
-
-			local_irq_save(flags);
-			buf = kmap_atomic(sg_page(pc->sg), KM_IRQ0) +
-					  pc->sg->offset;
-			xf(drive, NULL, buf + pc->b_count, count);
-			kunmap_atomic(buf - pc->sg->offset, KM_IRQ0);
-			local_irq_restore(flags);
-		} else {
-			buf = sg_virt(pc->sg);
-			xf(drive, NULL, buf + pc->b_count, count);
-		}
-		bcount -= count; pc->b_count += count;
-		if (pc->b_count == pc->sg->length) {
-			if (!--pc->sg_cnt)
-				break;
-			pc->sg = sg_next(pc->sg);
-			pc->b_count = 0;
-		}
-	}
-
-	if (bcount) {
-		printk(KERN_ERR "%s: scatter gather table too small, %s\n",
-				drive->name, write ? "padding with zeros"
-						   : "discarding data");
-		ide_pad_transfer(drive, write, bcount);
-	}
-}
-
 static void ide_scsi_hex_dump(u8 *data, int len)
 {
 	print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, data, len, 0);
@@ -244,9 +199,9 @@
 {
 	ide_hwif_t *hwif = drive->hwif;
 
-	if (hwif->tp_ops->read_status(hwif) & (BUSY_STAT | DRQ_STAT))
+	if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
 		/* force an abort */
-		hwif->tp_ops->exec_command(hwif, WIN_IDLEIMMEDIATE);
+		hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE);
 
 	rq->errors++;
 
@@ -344,7 +299,7 @@
 
 	return ide_pc_intr(drive, pc, idescsi_pc_intr, get_timeout(pc),
 			   idescsi_expiry, NULL, NULL, NULL,
-			   ide_scsi_io_buffers);
+			   ide_io_buffers);
 }
 
 static ide_startstop_t idescsi_transfer_pc(ide_drive_t *drive)
@@ -430,21 +385,41 @@
 }
 
 #ifdef CONFIG_IDE_PROC_FS
-static void idescsi_add_settings(ide_drive_t *drive)
-{
-	idescsi_scsi_t *scsi = drive_to_idescsi(drive);
-
-/*
- *			drive	setting name	read/write	data type	min	max	mul_factor	div_factor	data pointer		set function
- */
-	ide_add_setting(drive,	"bios_cyl",	SETTING_RW,	TYPE_INT,	0,	1023,	1,		1,		&drive->bios_cyl,	NULL);
-	ide_add_setting(drive,	"bios_head",	SETTING_RW,	TYPE_BYTE,	0,	255,	1,		1,		&drive->bios_head,	NULL);
-	ide_add_setting(drive,	"bios_sect",	SETTING_RW,	TYPE_BYTE,	0,	63,	1,		1,		&drive->bios_sect,	NULL);
-	ide_add_setting(drive,	"transform",	SETTING_RW,	TYPE_INT,	0,	3,	1,		1,		&scsi->transform,	NULL);
-	ide_add_setting(drive,	"log",		SETTING_RW,	TYPE_INT,	0,	1,	1,		1,		&scsi->log,		NULL);
+#define ide_scsi_devset_get(name, field) \
+static int get_##name(ide_drive_t *drive) \
+{ \
+	idescsi_scsi_t *scsi = drive_to_idescsi(drive); \
+	return scsi->field; \
 }
-#else
-static inline void idescsi_add_settings(ide_drive_t *drive) { ; }
+
+#define ide_scsi_devset_set(name, field) \
+static int set_##name(ide_drive_t *drive, int arg) \
+{ \
+	idescsi_scsi_t *scsi = drive_to_idescsi(drive); \
+	scsi->field = arg; \
+	return 0; \
+}
+
+#define ide_scsi_devset_rw_field(_name, _field) \
+ide_scsi_devset_get(_name, _field); \
+ide_scsi_devset_set(_name, _field); \
+IDE_DEVSET(_name, DS_SYNC, get_##_name, set_##_name);
+
+ide_devset_rw_field(bios_cyl, bios_cyl);
+ide_devset_rw_field(bios_head, bios_head);
+ide_devset_rw_field(bios_sect, bios_sect);
+
+ide_scsi_devset_rw_field(transform, transform);
+ide_scsi_devset_rw_field(log, log);
+
+static const struct ide_proc_devset idescsi_settings[] = {
+	IDE_PROC_DEVSET(bios_cyl,  0, 1023),
+	IDE_PROC_DEVSET(bios_head, 0,  255),
+	IDE_PROC_DEVSET(bios_sect, 0,	63),
+	IDE_PROC_DEVSET(log,	   0,	 1),
+	IDE_PROC_DEVSET(transform, 0,	 3),
+	{ 0 },
+};
 #endif
 
 /*
@@ -452,7 +427,7 @@
  */
 static void idescsi_setup (ide_drive_t *drive, idescsi_scsi_t *scsi)
 {
-	if (drive->id && (drive->id->config & 0x0060) == 0x20)
+	if ((drive->id[ATA_ID_CONFIG] & 0x0060) == 0x20)
 		set_bit(IDE_AFLAG_DRQ_INTERRUPT, &drive->atapi_flags);
 	clear_bit(IDESCSI_SG_TRANSFORM, &scsi->transform);
 #if IDESCSI_DEBUG_LOG
@@ -461,7 +436,7 @@
 
 	drive->pc_callback = ide_scsi_callback;
 
-	idescsi_add_settings(drive);
+	ide_proc_register_driver(drive, scsi->driver);
 }
 
 static void ide_scsi_remove(ide_drive_t *drive)
@@ -503,12 +478,12 @@
 	.remove			= ide_scsi_remove,
 	.version		= IDESCSI_VERSION,
 	.media			= ide_scsi,
-	.supports_dsc_overlap	= 0,
 	.do_request		= idescsi_do_request,
 	.end_request		= idescsi_end_request,
 	.error                  = idescsi_atapi_error,
 #ifdef CONFIG_IDE_PROC_FS
 	.proc			= idescsi_proc,
+	.settings		= idescsi_settings,
 #endif
 };
 
@@ -811,6 +786,7 @@
 	struct gendisk *g;
 	static int warned;
 	int err = -ENOMEM;
+	u16 last_lun;
 
 	if (!warned && drive->media == ide_cdrom) {
 		printk(KERN_WARNING "ide-scsi is deprecated for cd burning! Use ide-cd and give dev=/dev/hdX as device\n");
@@ -821,7 +797,6 @@
 		return -ENODEV;
 
 	if (!strstr("ide-scsi", drive->driver_req) ||
-	    !drive->present ||
 	    drive->media == ide_disk ||
 	    !(host = scsi_host_alloc(&idescsi_template,sizeof(idescsi_scsi_t))))
 		return -ENODEV;
@@ -836,12 +811,12 @@
 
 	host->max_id = 1;
 
-	if (drive->id->last_lun)
-		debug_log("%s: id->last_lun=%u\n", drive->name,
-			  drive->id->last_lun);
+	last_lun = drive->id[ATA_ID_LAST_LUN];
+	if (last_lun)
+		debug_log("%s: last_lun=%u\n", drive->name, last_lun);
 
-	if ((drive->id->last_lun & 0x7) != 7)
-		host->max_lun = (drive->id->last_lun & 0x7) + 1;
+	if ((last_lun & 7) != 7)
+		host->max_lun = (last_lun & 7) + 1;
 	else
 		host->max_lun = 1;
 
@@ -852,7 +827,6 @@
 	idescsi->host = host;
 	idescsi->disk = g;
 	g->private_data = &idescsi->driver;
-	ide_proc_register_driver(drive, &idescsi_driver);
 	err = 0;
 	idescsi_setup(drive, idescsi);
 	g->fops = &idescsi_ops;
diff --git a/fs/Kconfig b/fs/Kconfig
index abccb5d..40183d9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,37 +136,51 @@
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
-config EXT4DEV_FS
-	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+config EXT4_FS
+	tristate "The Extended 4 (ext4) filesystem"
 	select JBD2
 	select CRC16
 	help
-	  Ext4dev is a predecessor filesystem of the next generation
-	  extended fs ext4, based on ext3 filesystem code. It will be
-	  renamed ext4 fs later, once ext4dev is mature and stabilized.
+	  This is the next generation of the ext3 filesystem.
 
 	  Unlike the change from ext2 filesystem to ext3 filesystem,
-	  the on-disk format of ext4dev is not the same as ext3 any more:
-	  it is based on extent maps and it supports 48-bit physical block
-	  numbers. These combined on-disk format changes will allow
-	  ext4dev/ext4 to handle more than 16 TB filesystem volumes --
-	  a hard limit that ext3 cannot overcome without changing the
-	  on-disk format.
+	  the on-disk format of ext4 is not forwards compatible with
+	  ext3; it is based on extent maps and it supports 48-bit
+	  physical block numbers.  The ext4 filesystem also supports delayed
+	  allocation, persistent preallocation, high resolution time stamps,
+	  and a number of other features to improve performance and speed
+	  up fsck time.  For more information, please see the web pages at
+	  http://ext4.wiki.kernel.org.
 
-	  Other than extent maps and 48-bit block numbers, ext4dev also is
-	  likely to have other new features such as persistent preallocation,
-	  high resolution time stamps, and larger file support etc.  These
-	  features will be added to ext4dev gradually.
+	  The ext4 filesystem will support mounting an ext3
+	  filesystem; while there will be some performance gains from
+	  the delayed allocation and inode table readahead, the best
+	  performance gains will require enabling ext4 features in the
+	  filesystem, or formating a new filesystem as an ext4
+	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
 	  module will be called ext4dev.
 
 	  If unsure, say N.
 
-config EXT4DEV_FS_XATTR
-	bool "Ext4dev extended attributes"
-	depends on EXT4DEV_FS
+config EXT4DEV_COMPAT
+	bool "Enable ext4dev compatibility"
+	depends on EXT4_FS
+	help
+	  Starting with 2.6.28, the name of the ext4 filesystem was
+	  renamed from ext4dev to ext4.  Unfortunately there are some
+	  lagecy userspace programs (such as klibc's fstype) have
+	  "ext4dev" hardcoded.  
+
+	  To enable backwards compatibility so that systems that are
+	  still expecting to mount ext4 filesystems using ext4dev,
+	  chose Y here.   This feature will go away by 2.6.31, so
+	  please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+	bool "Ext4 extended attributes"
+	depends on EXT4_FS
 	default y
 	help
 	  Extended attributes are name:value pairs associated with inodes by
@@ -175,11 +189,11 @@
 
 	  If unsure, say N.
 
-	  You need this for POSIX ACL support on ext4dev/ext4.
+	  You need this for POSIX ACL support on ext4.
 
-config EXT4DEV_FS_POSIX_ACL
-	bool "Ext4dev POSIX Access Control Lists"
-	depends on EXT4DEV_FS_XATTR
+config EXT4_FS_POSIX_ACL
+	bool "Ext4 POSIX Access Control Lists"
+	depends on EXT4_FS_XATTR
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -190,14 +204,14 @@
 
 	  If you don't know what Access Control Lists are, say N
 
-config EXT4DEV_FS_SECURITY
-	bool "Ext4dev Security Labels"
-	depends on EXT4DEV_FS_XATTR
+config EXT4_FS_SECURITY
+	bool "Ext4 Security Labels"
+	depends on EXT4_FS_XATTR
 	help
 	  Security labels support alternative access control models
 	  implemented by security modules like SELinux.  This option
 	  enables an extended attribute handler for file security
-	  labels in the ext4dev/ext4 filesystem.
+	  labels in the ext4 filesystem.
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
@@ -240,22 +254,22 @@
 	help
 	  This is a generic journaling layer for block devices that support
 	  both 32-bit and 64-bit block numbers.  It is currently used by
-	  the ext4dev/ext4 filesystem, but it could also be used to add
+	  the ext4 filesystem, but it could also be used to add
 	  journal support to other file systems or block devices such
 	  as RAID or LVM.
 
-	  If you are using ext4dev/ext4, you need to say Y here. If you are not
-	  using ext4dev/ext4 then you will probably want to say N.
+	  If you are using ext4, you need to say Y here. If you are not
+	  using ext4 then you will probably want to say N.
 
 	  To compile this device as a module, choose M here. The module will be
-	  called jbd2.  If you are compiling ext4dev/ext4 into the kernel,
+	  called jbd2.  If you are compiling ext4 into the kernel,
 	  you cannot compile this code as a module.
 
 config JBD2_DEBUG
-	bool "JBD2 (ext4dev/ext4) debugging support"
+	bool "JBD2 (ext4) debugging support"
 	depends on JBD2 && DEBUG_FS
 	help
-	  If you are using the ext4dev/ext4 journaled file system (or
+	  If you are using the ext4 journaled file system (or
 	  potentially any other filesystem/device using JBD2), this option
 	  allows you to enable debugging output while the system is running,
 	  in order to help track down any problems you are having.
@@ -270,9 +284,9 @@
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
+	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+	default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
+	default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
 
 config REISERFS_FS
 	tristate "Reiserfs support"
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5..de404b0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -69,7 +69,7 @@
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da..bae998c 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,8 @@
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
+extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len);
 int __ext2_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c..45ed071 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@
 #endif
 	.setattr	= ext2_setattr,
 	.permission	= ext2_permission,
+	.fiemap		= ext2_fiemap,
 };
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 991d6df..7658b33 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
+#include <linux/fiemap.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -704,6 +705,13 @@
 
 }
 
+int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext2_get_block);
+}
+
 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, ext2_get_block, wbc);
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913..3be1e06 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext3_permission,
+	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 507d868..ebfec4d 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/fiemap.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -981,6 +982,13 @@
 	return ret;
 }
 
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext3_get_block);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8c..a8ff003 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
 # Makefile for the linux ext4-filesystem routines.
 #
 
-obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		   ext4_jbd2.o migrate.o mballoc.o
 
-ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
-ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
-ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY)	+= xattr_security.o
+ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cd2b855..cb45257 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,18 +51,18 @@
 	}
 }
 
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
    if the ACL has not been cached */
 #define EXT4_ACL_NOT_CACHED ((void *)-1)
 
 /* acl.c */
-extern int ext4_permission (struct inode *, int);
-extern int ext4_acl_chmod (struct inode *);
-extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+extern int ext4_permission(struct inode *, int);
+extern int ext4_acl_chmod(struct inode *);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
-#else  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext4_permission NULL
 
@@ -77,5 +77,5 @@
 {
 	return 0;
 }
-#endif  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#endif  /* CONFIG_EXT4_FS_POSIX_ACL */
 
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e9fa960..bd2ece2 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,6 +83,7 @@
 	}
 	return used_blocks;
 }
+
 /* Initializes an uninitialized block bitmap if given, and returns the
  * number of blocks free in the group. */
 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -132,7 +133,7 @@
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -200,20 +201,20 @@
  * @bh:			pointer to the buffer head to store the block
  *			group descriptor
  */
-struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 					     ext4_group_t block_group,
-					     struct buffer_head ** bh)
+					     struct buffer_head **bh)
 {
 	unsigned long group_desc;
 	unsigned long offset;
-	struct ext4_group_desc * desc;
+	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= sbi->s_groups_count) {
-		ext4_error (sb, "ext4_get_group_desc",
-			    "block_group >= groups_count - "
-			    "block_group = %lu, groups_count = %lu",
-			    block_group, sbi->s_groups_count);
+		ext4_error(sb, "ext4_get_group_desc",
+			   "block_group >= groups_count - "
+			   "block_group = %lu, groups_count = %lu",
+			   block_group, sbi->s_groups_count);
 
 		return NULL;
 	}
@@ -222,10 +223,10 @@
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error (sb, "ext4_get_group_desc",
-			    "Group descriptor not loaded - "
-			    "block_group = %lu, group_desc = %lu, desc = %lu",
-			     block_group, group_desc, offset);
+		ext4_error(sb, "ext4_get_group_desc",
+			   "Group descriptor not loaded - "
+			   "block_group = %lu, group_desc = %lu, desc = %lu",
+			   block_group, group_desc, offset);
 		return NULL;
 	}
 
@@ -302,8 +303,8 @@
 struct buffer_head *
 ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
-	struct ext4_group_desc * desc;
-	struct buffer_head * bh = NULL;
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh = NULL;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -318,9 +319,11 @@
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (bh_uptodate_or_lock(bh))
+	if (buffer_uptodate(bh) &&
+	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
 		return bh;
 
+	lock_buffer(bh);
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
@@ -345,301 +348,6 @@
 	 */
 	return bh;
 }
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root:		root of per-filesystem reservation rb tree
- * @verbose:		verbose mode
- * @fn:			function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end).	Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
-			      const char *fn)
-{
-	struct rb_node *n;
-	struct ext4_reserve_window_node *rsv, *prev;
-	int bad;
-
-restart:
-	n = rb_first(root);
-	bad = 0;
-	prev = NULL;
-
-	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
-	while (n) {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-		if (verbose)
-			printk("reservation window 0x%p "
-			       "start:  %llu, end:  %llu\n",
-			       rsv, rsv->rsv_start, rsv->rsv_end);
-		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
-			printk("Bad reservation %p (start >= end)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (prev && prev->rsv_end >= rsv->rsv_start) {
-			printk("Bad reservation %p (prev->end >= start)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (bad) {
-			if (!verbose) {
-				printk("Restarting reservation walk in verbose mode\n");
-				verbose = 1;
-				goto restart;
-			}
-		}
-		n = rb_next(n);
-		prev = rsv;
-	}
-	printk("Window map complete.\n");
-	BUG_ON(bad);
-}
-#define rsv_window_dump(root, verbose) \
-	__rsv_window_dump((root), (verbose), __func__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv:		inode's reservation window
- * @grp_goal:		given goal block relative to the allocation block group
- * @group:		the current allocation block group
- * @sb:			filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
-			ext4_group_t group, struct super_block *sb)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if ((rsv->_rsv_start > group_last_block) ||
-	    (rsv->_rsv_end < group_first_block))
-		return 0;
-	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
-		|| (grp_goal + group_first_block > rsv->_rsv_end)))
-		return 0;
-	return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root:		root of reservation tree
- * @goal:		target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext4_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
-{
-	struct rb_node *n = root->rb_node;
-	struct ext4_reserve_window_node *rsv;
-
-	if (!n)
-		return NULL;
-
-	do {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-
-		if (goal < rsv->rsv_start)
-			n = n->rb_left;
-		else if (goal > rsv->rsv_end)
-			n = n->rb_right;
-		else
-			return rsv;
-	} while (n);
-	/*
-	 * We've fallen off the end of the tree: the goal wasn't inside
-	 * any particular node.  OK, the previous node must be to one
-	 * side of the interval containing the goal.  If it's the RHS,
-	 * we need to back up one.
-	 */
-	if (rsv->rsv_start > goal) {
-		n = rb_prev(&rsv->rsv_node);
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-	}
-	return rsv;
-}
-
-/**
- * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb:			super block
- * @rsv:		reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext4_rsv_window_add(struct super_block *sb,
-		    struct ext4_reserve_window_node *rsv)
-{
-	struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
-	struct rb_node *node = &rsv->rsv_node;
-	ext4_fsblk_t start = rsv->rsv_start;
-
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
-	struct ext4_reserve_window_node *this;
-
-	while (*p)
-	{
-		parent = *p;
-		this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
-
-		if (start < this->rsv_start)
-			p = &(*p)->rb_left;
-		else if (start > this->rsv_end)
-			p = &(*p)->rb_right;
-		else {
-			rsv_window_dump(root, 1);
-			BUG();
-		}
-	}
-
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-}
-
-/**
- * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb:			super block
- * @rsv:		reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
-			      struct ext4_reserve_window_node *rsv)
-{
-	rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_alloc_hit = 0;
-	rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv:		given reservation window to check
- *
- * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
-{
-	/* a valid reservation end block could not be 0 */
-	return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext4_init_block_alloc_info()
- * @inode:		file inode structure
- *
- * Allocate and initialize the	reservation window structure, and
- * link the window to the ext4 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext4 inode the first time the open file
- * needs a new block. So, before every ext4_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext4_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs down_write(i_data_sem) protection prior to call this function.
- */
-void ext4_init_block_alloc_info(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct super_block *sb = inode->i_sb;
-
-	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
-	if (block_i) {
-		struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
-
-		rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-		rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-
-		/*
-		 * if filesystem is mounted with NORESERVATION, the goal
-		 * reservation window size is set to zero to indicate
-		 * block reservation is off
-		 */
-		if (!test_opt(sb, RESERVATION))
-			rsv->rsv_goal_size = 0;
-		else
-			rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
-		rsv->rsv_alloc_hit = 0;
-		block_i->last_alloc_logical_block = 0;
-		block_i->last_alloc_physical_block = 0;
-	}
-	ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext4_discard_reservation()
- * @inode:		inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- *	ext4_release_file(): last writer close the file
- *	ext4_clear_inode(): last iput(), when nobody link to this file.
- *	ext4_truncate(): when the block indirect map is about to change.
- *
- */
-void ext4_discard_reservation(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct ext4_reserve_window_node *rsv;
-	spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
-
-	ext4_mb_discard_inode_preallocations(inode);
-
-	if (!block_i)
-		return;
-
-	rsv = &block_i->rsv_window_node;
-	if (!rsv_is_empty(&rsv->rsv_window)) {
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&rsv->rsv_window))
-			rsv_window_remove(inode->i_sb, rsv);
-		spin_unlock(rsv_lock);
-	}
-}
 
 /**
  * ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -648,6 +356,13 @@
  * @block:			start physcial block to free
  * @count:			number of blocks to free
  * @pdquot_freed_blocks:	pointer to quota
+ *
+ * XXX This function is only used by the on-line resizing code, which
+ * should probably be fixed up to call the mballoc variant.  There
+ * this needs to be cleaned up later; in fact, I'm not convinced this
+ * is 100% correct in the face of the mballoc code.  The online resizing
+ * code needs to be fixed up to more tightly (and correctly) interlock
+ * with the mballoc code.
  */
 void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 			 ext4_fsblk_t block, unsigned long count,
@@ -659,8 +374,8 @@
 	ext4_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
-	struct ext4_group_desc * desc;
-	struct ext4_super_block * es;
+	struct ext4_group_desc *desc;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int err = 0, ret;
 	ext4_grpblk_t group_freed;
@@ -671,13 +386,13 @@
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
 	    block + count > ext4_blocks_count(es)) {
-		ext4_error (sb, "ext4_free_blocks",
-			    "Freeing blocks not in datazone - "
-			    "block = %llu, count = %lu", block, count);
+		ext4_error(sb, "ext4_free_blocks",
+			   "Freeing blocks not in datazone - "
+			   "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
 
-	ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
+	ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
 
 do_more:
 	overflow = 0;
@@ -694,7 +409,7 @@
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
-	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
+	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
 	if (!desc)
 		goto error_return;
 
@@ -703,10 +418,10 @@
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error (sb, "ext4_free_blocks",
-			    "Freeing blocks in system zones - "
-			    "Block = %llu, count = %lu",
-			    block, count);
+		ext4_error(sb, "ext4_free_blocks",
+			   "Freeing blocks in system zones - "
+			   "Block = %llu, count = %lu",
+			   block, count);
 		goto error_return;
 	}
 
@@ -848,7 +563,7 @@
 			ext4_fsblk_t block, unsigned long count,
 			int metadata)
 {
-	struct super_block * sb;
+	struct super_block *sb;
 	unsigned long dquot_freed_blocks;
 
 	/* this isn't the right place to decide whether block is metadata
@@ -859,748 +574,52 @@
 
 	sb = inode->i_sb;
 
-	if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
-		ext4_free_blocks_sb(handle, sb, block, count,
-						&dquot_freed_blocks);
-	else
-		ext4_mb_free_blocks(handle, inode, block, count,
-						metadata, &dquot_freed_blocks);
+	ext4_mb_free_blocks(handle, inode, block, count,
+			    metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
 	return;
 }
 
-/**
- * ext4_test_allocatable()
- * @nr:			given allocation block group
- * @bh:			bufferhead contains the bitmap of the given block group
- *
- * For ext4 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy.  This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+						s64 nblocks)
 {
-	int ret;
-	struct journal_head *jh = bh2jh(bh);
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	if (ext4_test_bit(nr, bh->b_data))
-		return 0;
+	free_blocks  = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
-	jbd_lock_bh_state(bh);
-	if (!jh->b_committed_data)
-		ret = 1;
-	else
-		ret = !ext4_test_bit(nr, jh->b_committed_data);
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
+	if (!capable(CAP_SYS_RESOURCE) &&
+		sbi->s_resuid != current->fsuid &&
+		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
 
-/**
- * bitmap_search_next_usable_block()
- * @start:		the starting block (group relative) of the search
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext4_grpblk_t
-bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-					ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t next;
-	struct journal_head *jh = bh2jh(bh);
-
-	while (start < maxblocks) {
-		next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
-		if (next >= maxblocks)
-			return -1;
-		if (ext4_test_allocatable(next, bh))
-			return next;
-		jbd_lock_bh_state(bh);
-		if (jh->b_committed_data)
-			start = ext4_find_next_zero_bit(jh->b_committed_data,
-							maxblocks, next);
-		jbd_unlock_bh_state(bh);
-	}
-	return -1;
-}
-
-/**
- * find_next_usable_block()
- * @start:		the starting block (group relative) to find next
- *			allocatable block in bitmap.
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap.  We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext4_grpblk_t
-find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-			ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t here, next;
-	char *p, *r;
-
-	if (start > 0) {
-		/*
-		 * The goal was occupied; search forward for a free
-		 * block within the next XX blocks.
-		 *
-		 * end_goal is more or less random, but it has to be
-		 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
-		 * next 64-bit boundary is simple..
-		 */
-		ext4_grpblk_t end_goal = (start + 63) & ~63;
-		if (end_goal > maxblocks)
-			end_goal = maxblocks;
-		here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext4_test_allocatable(here, bh))
-			return here;
-		ext4_debug("Bit not found near goal\n");
-	}
-
-	here = start;
-	if (here < 0)
-		here = 0;
-
-	p = ((char *)bh->b_data) + (here >> 3);
-	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - ((char *)bh->b_data)) << 3;
-
-	if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
-		return next;
-
-	/*
-	 * The bitmap search --- search forward alternately through the actual
-	 * bitmap and the last-committed copy until we find a bit free in
-	 * both
-	 */
-	here = bitmap_search_next_usable_block(here, bh, maxblocks);
-	return here;
-}
-
-/**
- * claim_block()
- * @block:		the free block (group relative) to allocate
- * @bh:			the bufferhead containts the block group bitmap
- *
- * We think we can allocate this block in this bitmap.  Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data.  If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
-{
-	struct journal_head *jh = bh2jh(bh);
-	int ret;
-
-	if (ext4_set_bit_atomic(lock, block, bh->b_data))
-		return 0;
-	jbd_lock_bh_state(bh);
-	if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
-		ext4_clear_bit_atomic(lock, block, bh->b_data);
-		ret = 0;
-	} else {
-		ret = 1;
-	}
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * ext4_try_to_allocate()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- *	if there is a reservation window, only try to allocate block(s) from the
- *	file's own reservation window;
- *	Otherwise, the allocation range starts from the give goal block, ends at
- *	the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap.  In that case we must release write access to the old one via
- * ext4_journal_release_buffer(), else we'll run out of credits.
- */
-static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal, unsigned long *count,
-			struct ext4_reserve_window *my_rsv)
-{
-	ext4_fsblk_t group_first_block;
-	ext4_grpblk_t start, end;
-	unsigned long num = 0;
-
-	/* we do allocation within the reservation window if we have a window */
-	if (my_rsv) {
-		group_first_block = ext4_group_first_block_no(sb, group);
-		if (my_rsv->_rsv_start >= group_first_block)
-			start = my_rsv->_rsv_start - group_first_block;
-		else
-			/* reservation window cross group boundary */
-			start = 0;
-		end = my_rsv->_rsv_end - group_first_block + 1;
-		if (end > EXT4_BLOCKS_PER_GROUP(sb))
-			/* reservation window crosses group boundary */
-			end = EXT4_BLOCKS_PER_GROUP(sb);
-		if ((start <= grp_goal) && (grp_goal < end))
-			start = grp_goal;
-		else
-			grp_goal = -1;
-	} else {
-		if (grp_goal > 0)
-			start = grp_goal;
-		else
-			start = 0;
-		end = EXT4_BLOCKS_PER_GROUP(sb);
-	}
-
-	BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
-
-repeat:
-	if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
-		grp_goal = find_next_usable_block(start, bitmap_bh, end);
-		if (grp_goal < 0)
-			goto fail_access;
-		if (!my_rsv) {
-			int i;
-
-			for (i = 0; i < 7 && grp_goal > start &&
-					ext4_test_allocatable(grp_goal - 1,
-								bitmap_bh);
-					i++, grp_goal--)
-				;
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks  = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+		if (dirty_blocks < 0) {
+			printk(KERN_CRIT "Dirty block accounting "
+					"went wrong %lld\n",
+					dirty_blocks);
 		}
 	}
-	start = grp_goal;
-
-	if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-		grp_goal, bitmap_bh)) {
-		/*
-		 * The block was allocated by another thread, or it was
-		 * allocated and then freed by another thread
-		 */
-		start++;
-		grp_goal++;
-		if (start >= end)
-			goto fail_access;
-		goto repeat;
-	}
-	num++;
-	grp_goal++;
-	while (num < *count && grp_goal < end
-		&& ext4_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-				grp_goal, bitmap_bh)) {
-		num++;
-		grp_goal++;
-	}
-	*count = num;
-	return grp_goal - num;
-fail_access:
-	*count = num;
-	return -1;
-}
-
-/**
- *	find_next_reservable_window():
- *		find a reservable space within the given range.
- *		It does not allocate the reservation window for now:
- *		alloc_new_reservation() will do the work later.
- *
- *	@search_head: the head of the searching list;
- *		This is not necessarily the list head of the whole filesystem
- *
- *		We have both head and start_block to assist the search
- *		for the reservable space. The list starts from head,
- *		but we will shift to the place where start_block is,
- *		then start from there, when looking for a reservable space.
- *
- *	@size: the target new reservation window size
- *
- *	@group_first_block: the first block we consider to start
- *			the real search from
- *
- *	@last_block:
- *		the maximum block number that our goal reservable space
- *		could start from. This is normally the last block in this
- *		group. The search will end when we found the start of next
- *		possible reservable space is out of this boundary.
- *		This could handle the cross boundary reservation window
- *		request.
- *
- *	basically we search from the given range, rather than the whole
- *	reservation double linked list, (start_block, last_block)
- *	to find a free region that is of my size and has not
- *	been reserved.
- *
- */
-static int find_next_reservable_window(
-				struct ext4_reserve_window_node *search_head,
-				struct ext4_reserve_window_node *my_rsv,
-				struct super_block * sb,
-				ext4_fsblk_t start_block,
-				ext4_fsblk_t last_block)
-{
-	struct rb_node *next;
-	struct ext4_reserve_window_node *rsv, *prev;
-	ext4_fsblk_t cur;
-	int size = my_rsv->rsv_goal_size;
-
-	/* TODO: make the start of the reservation window byte-aligned */
-	/* cur = *start_block & ~7;*/
-	cur = start_block;
-	rsv = search_head;
-	if (!rsv)
-		return -1;
-
-	while (1) {
-		if (cur <= rsv->rsv_end)
-			cur = rsv->rsv_end + 1;
-
-		/* TODO?
-		 * in the case we could not find a reservable space
-		 * that is what is expected, during the re-search, we could
-		 * remember what's the largest reservable space we could have
-		 * and return that one.
-		 *
-		 * For now it will fail if we could not find the reservable
-		 * space with expected-size (or more)...
-		 */
-		if (cur > last_block)
-			return -1;		/* fail */
-
-		prev = rsv;
-		next = rb_next(&rsv->rsv_node);
-		rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
-
-		/*
-		 * Reached the last reservation, we can just append to the
-		 * previous one.
-		 */
-		if (!next)
-			break;
-
-		if (cur + size <= rsv->rsv_start) {
-			/*
-			 * Found a reserveable space big enough.  We could
-			 * have a reservation across the group boundary here
-			 */
-			break;
-		}
-	}
-	/*
-	 * we come here either :
-	 * when we reach the end of the whole list,
-	 * and there is empty reservable space after last entry in the list.
-	 * append it to the end of the list.
-	 *
-	 * or we found one reservable space in the middle of the list,
-	 * return the reservation window that we could append to.
-	 * succeed.
+	/* Check whether we have space after
+	 * accounting for current dirty blocks
 	 */
+	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
+		/* we don't have free space */
+		return -ENOSPC;
 
-	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
-		rsv_window_remove(sb, my_rsv);
-
-	/*
-	 * Let's book the whole avaliable window for now.  We will check the
-	 * disk bitmap later and then, if there are free blocks then we adjust
-	 * the window size if it's larger than requested.
-	 * Otherwise, we will remove this node from the tree next time
-	 * call find_next_reservable_window.
-	 */
-	my_rsv->rsv_start = cur;
-	my_rsv->rsv_end = cur + size - 1;
-	my_rsv->rsv_alloc_hit = 0;
-
-	if (prev != my_rsv)
-		ext4_rsv_window_add(sb, my_rsv);
-
+	/* Add the blocks to nblocks */
+	percpu_counter_add(dbc, nblocks);
 	return 0;
 }
 
 /**
- *	alloc_new_reservation()--allocate a new reservation window
- *
- *		To make a new reservation, we search part of the filesystem
- *		reservation list (the list that inside the group). We try to
- *		allocate a new reservation window near the allocation goal,
- *		or the beginning of the group, if there is no goal.
- *
- *		We first find a reservable space after the goal, then from
- *		there, we check the bitmap for the first free block after
- *		it. If there is no free block until the end of group, then the
- *		whole group is full, we failed. Otherwise, check if the free
- *		block is inside the expected reservable space, if so, we
- *		succeed.
- *		If the first free block is outside the reservable space, then
- *		start from the first free block, we search for next available
- *		space, and go on.
- *
- *	on succeed, a new reservation will be found and inserted into the list
- *	It contains at least one free block, and it does not overlap with other
- *	reservation windows.
- *
- *	failed: we failed to find a reservation window in this group
- *
- *	@rsv: the reservation
- *
- *	@grp_goal: The goal (group-relative).  It is where the search for a
- *		free reservable space should start from.
- *		if we have a grp_goal(grp_goal >0 ), then start from there,
- *		no grp_goal(grp_goal = -1), we start from the first block
- *		of the group.
- *
- *	@sb: the super block
- *	@group: the group we are trying to allocate in
- *	@bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
-		ext4_grpblk_t grp_goal, struct super_block *sb,
-		ext4_group_t group, struct buffer_head *bitmap_bh)
-{
-	struct ext4_reserve_window_node *search_head;
-	ext4_fsblk_t group_first_block, group_end_block, start_block;
-	ext4_grpblk_t first_free_block;
-	struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
-	unsigned long size;
-	int ret;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if (grp_goal < 0)
-		start_block = group_first_block;
-	else
-		start_block = grp_goal + group_first_block;
-
-	size = my_rsv->rsv_goal_size;
-
-	if (!rsv_is_empty(&my_rsv->rsv_window)) {
-		/*
-		 * if the old reservation is cross group boundary
-		 * and if the goal is inside the old reservation window,
-		 * we will come here when we just failed to allocate from
-		 * the first part of the window. We still have another part
-		 * that belongs to the next group. In this case, there is no
-		 * point to discard our window and try to allocate a new one
-		 * in this group(which will fail). we should
-		 * keep the reservation window, just simply move on.
-		 *
-		 * Maybe we could shift the start block of the reservation
-		 * window to the first block of next group.
-		 */
-
-		if ((my_rsv->rsv_start <= group_end_block) &&
-				(my_rsv->rsv_end > group_end_block) &&
-				(start_block >= my_rsv->rsv_start))
-			return -1;
-
-		if ((my_rsv->rsv_alloc_hit >
-		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
-			/*
-			 * if the previously allocation hit ratio is
-			 * greater than 1/2, then we double the size of
-			 * the reservation window the next time,
-			 * otherwise we keep the same size window
-			 */
-			size = size * 2;
-			if (size > EXT4_MAX_RESERVE_BLOCKS)
-				size = EXT4_MAX_RESERVE_BLOCKS;
-			my_rsv->rsv_goal_size= size;
-		}
-	}
-
-	spin_lock(rsv_lock);
-	/*
-	 * shift the search start to the window near the goal block
-	 */
-	search_head = search_reserve_window(fs_rsv_root, start_block);
-
-	/*
-	 * find_next_reservable_window() simply finds a reservable window
-	 * inside the given range(start_block, group_end_block).
-	 *
-	 * To make sure the reservation window has a free bit inside it, we
-	 * need to check the bitmap after we found a reservable window.
-	 */
-retry:
-	ret = find_next_reservable_window(search_head, my_rsv, sb,
-						start_block, group_end_block);
-
-	if (ret == -1) {
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;
-	}
-
-	/*
-	 * On success, find_next_reservable_window() returns the
-	 * reservation window where there is a reservable space after it.
-	 * Before we reserve this reservable space, we need
-	 * to make sure there is at least a free block inside this region.
-	 *
-	 * searching the first free bit on the block bitmap and copy of
-	 * last committed bitmap alternatively, until we found a allocatable
-	 * block. Search start from the start block of the reservable space
-	 * we just found.
-	 */
-	spin_unlock(rsv_lock);
-	first_free_block = bitmap_search_next_usable_block(
-			my_rsv->rsv_start - group_first_block,
-			bitmap_bh, group_end_block - group_first_block + 1);
-
-	if (first_free_block < 0) {
-		/*
-		 * no free block left on the bitmap, no point
-		 * to reserve the space. return failed.
-		 */
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;		/* failed */
-	}
-
-	start_block = first_free_block + group_first_block;
-	/*
-	 * check if the first free block is within the
-	 * free space we just reserved
-	 */
-	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
-		return 0;		/* success */
-	/*
-	 * if the first free bit we found is out of the reservable space
-	 * continue search for next reservable space,
-	 * start from where the free block is,
-	 * we also shift the list head to where we stopped last time
-	 */
-	search_head = my_rsv;
-	spin_lock(rsv_lock);
-	goto retry;
-}
-
-/**
- * try_to_extend_reservation()
- * @my_rsv:		given reservation window
- * @sb:			super block
- * @size:		the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext4_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext4_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
-			struct super_block *sb, int size)
-{
-	struct ext4_reserve_window_node *next_rsv;
-	struct rb_node *next;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	if (!spin_trylock(rsv_lock))
-		return;
-
-	next = rb_next(&my_rsv->rsv_node);
-
-	if (!next)
-		my_rsv->rsv_end += size;
-	else {
-		next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
-
-		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
-			my_rsv->rsv_end += size;
-		else
-			my_rsv->rsv_end = next_rsv->rsv_start - 1;
-	}
-	spin_unlock(rsv_lock);
-}
-
-/**
- * ext4_try_to_allocate_with_rsv()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- * @errp:		pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation.  If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
- *
- */
-static ext4_grpblk_t
-ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal,
-			struct ext4_reserve_window_node * my_rsv,
-			unsigned long *count, int *errp)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-	ext4_grpblk_t ret = 0;
-	int fatal;
-	unsigned long num = *count;
-
-	*errp = 0;
-
-	/*
-	 * Make sure we use undo access for the bitmap, because it is critical
-	 * that we do the frozen_data COW on bitmap buffers in all cases even
-	 * if the buffer is in BJ_Forget state in the committing transaction.
-	 */
-	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
-	fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (fatal) {
-		*errp = fatal;
-		return -1;
-	}
-
-	/*
-	 * we don't deal with reservation when
-	 * filesystem is mounted without reservation
-	 * or the file is not a regular file
-	 * or last attempt to allocate a block with reservation turned on failed
-	 */
-	if (my_rsv == NULL ) {
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-						grp_goal, count, NULL);
-		goto out;
-	}
-	/*
-	 * grp_goal is a group relative block number (if there is a goal)
-	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
-	 * first block is a filesystem wide block number
-	 * first block is the block number of the first block in this group
-	 */
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	/*
-	 * Basically we will allocate a new block from inode's reservation
-	 * window.
-	 *
-	 * We need to allocate a new reservation window, if:
-	 * a) inode does not have a reservation window; or
-	 * b) last attempt to allocate a block from existing reservation
-	 *    failed; or
-	 * c) we come here with a goal and with a reservation window
-	 *
-	 * We do not need to allocate a new reservation window if we come here
-	 * at the beginning with a goal and the goal is inside the window, or
-	 * we don't have a goal but already have a reservation window.
-	 * then we could go to allocate from the reservation window directly.
-	 */
-	while (1) {
-		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window,
-						grp_goal, group, sb)) {
-			if (my_rsv->rsv_goal_size < *count)
-				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
-							group, bitmap_bh);
-			if (ret < 0)
-				break;			/* failed */
-
-			if (!goal_in_my_reservation(&my_rsv->rsv_window,
-							grp_goal, group, sb))
-				grp_goal = -1;
-		} else if (grp_goal >= 0) {
-			int curr = my_rsv->rsv_end -
-					(grp_goal + group_first_block) + 1;
-
-			if (curr < *count)
-				try_to_extend_reservation(my_rsv, sb,
-							*count - curr);
-		}
-
-		if ((my_rsv->rsv_start > group_last_block) ||
-				(my_rsv->rsv_end < group_first_block)) {
-			rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
-			BUG();
-		}
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-					   grp_goal, &num, &my_rsv->rsv_window);
-		if (ret >= 0) {
-			my_rsv->rsv_alloc_hit += num;
-			*count = num;
-			break;				/* succeed */
-		}
-		num = *count;
-	}
-out:
-	if (ret >= 0) {
-		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
-					"bitmap block");
-		fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
-		if (fatal) {
-			*errp = fatal;
-			return -1;
-		}
-		return ret;
-	}
-
-	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
-	ext4_journal_release_buffer(handle, bitmap_bh);
-	return ret;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of neeed blocks
@@ -1610,29 +629,34 @@
  * On success, return nblocks
  */
 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-						ext4_fsblk_t nblocks)
+						s64 nblocks)
 {
-	ext4_fsblk_t free_blocks;
-	ext4_fsblk_t root_blocks = 0;
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks  = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
 	if (!capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
 		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
-#ifdef CONFIG_SMP
-	if (free_blocks - root_blocks < FBC_BATCH)
-		free_blocks =
-			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
-#endif
-	if (free_blocks <= root_blocks)
+
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks  = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+	}
+	if (free_blocks <= (root_blocks + dirty_blocks))
 		/* we don't have free space */
 		return 0;
-	if (free_blocks - root_blocks < nblocks)
-		return free_blocks - root_blocks;
+
+	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
+		return free_blocks - (root_blocks + dirty_blocks);
 	return nblocks;
- }
+}
 
 
 /**
@@ -1657,303 +681,6 @@
 	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
 }
 
-/**
- * ext4_old_new_blocks() -- core block bitmap based block allocation function
- *
- * @handle:		handle to this transaction
- * @inode:		file inode
- * @goal:		given target block(filesystem wide)
- * @count:		target number of blocks to allocate
- * @errp:		error code
- *
- * ext4_old_new_blocks uses a goal block to assist allocation and look up
- * the block bitmap directly to do block allocation.  It tries to
- * allocate block(s) from the block group contains the goal block first. If
- * that fails, it will try to allocate block(s) from other block groups
- * without any specific goal block.
- *
- * This function is called when -o nomballoc mount option is enabled
- *
- */
-ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gdp_bh;
-	ext4_group_t group_no;
-	ext4_group_t goal_group;
-	ext4_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
-	ext4_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
-	ext4_fsblk_t ret_block;		/* filesyetem-wide allocated block */
-	ext4_group_t bgi;			/* blockgroup iteration index */
-	int fatal = 0, err;
-	int performed_allocation = 0;
-	ext4_grpblk_t free_blocks;	/* number of free blocks in a group */
-	struct super_block *sb;
-	struct ext4_group_desc *gdp;
-	struct ext4_super_block *es;
-	struct ext4_sb_info *sbi;
-	struct ext4_reserve_window_node *my_rsv = NULL;
-	struct ext4_block_alloc_info *block_i;
-	unsigned short windowsz = 0;
-	ext4_group_t ngroups;
-	unsigned long num = *count;
-
-	sb = inode->i_sb;
-	if (!sb) {
-		*errp = -ENODEV;
-		printk("ext4_new_block: nonexistent device");
-		return 0;
-	}
-
-	sbi = EXT4_SB(sb);
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
-		/*
-		 * With delalloc we already reserved the blocks
-		 */
-		*count = ext4_has_free_blocks(sbi, *count);
-	}
-	if (*count == 0) {
-		*errp = -ENOSPC;
-		return 0;	/*return with ENOSPC error */
-	}
-	num = *count;
-
-	/*
-	 * Check quota for allocation of this block.
-	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
-		*errp = -EDQUOT;
-		return 0;
-	}
-
-	sbi = EXT4_SB(sb);
-	es = EXT4_SB(sb)->s_es;
-	ext4_debug("goal=%llu.\n", goal);
-	/*
-	 * Allocate a block from reservation only when
-	 * filesystem is mounted with reservation(default,-o reservation), and
-	 * it's a regular file, and
-	 * the desired window size is greater than 0 (One could use ioctl
-	 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
-	 * reservation on that particular file)
-	 */
-	block_i = EXT4_I(inode)->i_block_alloc_info;
-	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
-		my_rsv = &block_i->rsv_window_node;
-
-	/*
-	 * First, test whether the goal block is free.
-	 */
-	if (goal < le32_to_cpu(es->s_first_data_block) ||
-	    goal >= ext4_blocks_count(es))
-		goal = le32_to_cpu(es->s_first_data_block);
-	ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
-	goal_group = group_no;
-retry_alloc:
-	gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-	if (!gdp)
-		goto io_error;
-
-	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-	/*
-	 * if there is not enough free blocks to make a new resevation
-	 * turn off reservation for this allocation
-	 */
-	if (my_rsv && (free_blocks < windowsz)
-		&& (rsv_is_empty(&my_rsv->rsv_window)))
-		my_rsv = NULL;
-
-	if (free_blocks > 0) {
-		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, grp_target_blk,
-					my_rsv,	&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
-
-	ngroups = EXT4_SB(sb)->s_groups_count;
-	smp_rmb();
-
-	/*
-	 * Now search the rest of the groups.  We assume that
-	 * group_no and gdp correctly point to the last group visited.
-	 */
-	for (bgi = 0; bgi < ngroups; bgi++) {
-		group_no++;
-		if (group_no >= ngroups)
-			group_no = 0;
-		gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-		if (!gdp)
-			goto io_error;
-		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-		/*
-		 * skip this group if the number of
-		 * free blocks is less than half of the reservation
-		 * window size.
-		 */
-		if (free_blocks <= (windowsz/2))
-			continue;
-
-		brelse(bitmap_bh);
-		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		/*
-		 * try to allocate block(s) from this group, without a goal(-1).
-		 */
-		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, -1, my_rsv,
-					&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
-	/*
-	 * We may end up a bogus ealier ENOSPC error due to
-	 * filesystem is "full" of reservations, but
-	 * there maybe indeed free blocks avaliable on disk
-	 * In this case, we just forget about the reservations
-	 * just do block allocation as without reservations.
-	 */
-	if (my_rsv) {
-		my_rsv = NULL;
-		windowsz = 0;
-		group_no = goal_group;
-		goto retry_alloc;
-	}
-	/* No space left on the device */
-	*errp = -ENOSPC;
-	goto out;
-
-allocated:
-
-	ext4_debug("using block group %lu(%d)\n",
-			group_no, gdp->bg_free_blocks_count);
-
-	BUFFER_TRACE(gdp_bh, "get_write_access");
-	fatal = ext4_journal_get_write_access(handle, gdp_bh);
-	if (fatal)
-		goto out;
-
-	ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
-
-	if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
-	    in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
-	    in_range(ret_block, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group) ||
-	    in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group)) {
-		ext4_error(sb, "ext4_new_block",
-			    "Allocating block in system zone - "
-			    "blocks from %llu, length %lu",
-			     ret_block, num);
-		/*
-		 * claim_block marked the blocks we allocated
-		 * as in use. So we may want to selectively
-		 * mark some of the blocks as free
-		 */
-		goto retry_alloc;
-	}
-
-	performed_allocation = 1;
-
-#ifdef CONFIG_JBD2_DEBUG
-	{
-		struct buffer_head *debug_bh;
-
-		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, ret_block);
-		if (debug_bh) {
-			BUFFER_TRACE(debug_bh, "state when allocated");
-			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
-			brelse(debug_bh);
-		}
-	}
-	jbd_lock_bh_state(bitmap_bh);
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
-		int i;
-
-		for (i = 0; i < num; i++) {
-			if (ext4_test_bit(grp_alloc_blk+i,
-					bh2jh(bitmap_bh)->b_committed_data)) {
-				printk("%s: block was unexpectedly set in "
-					"b_committed_data\n", __func__);
-			}
-		}
-	}
-	ext4_debug("found bit %d\n", grp_alloc_blk);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	jbd_unlock_bh_state(bitmap_bh);
-#endif
-
-	if (ret_block + num - 1 >= ext4_blocks_count(es)) {
-		ext4_error(sb, "ext4_new_block",
-			    "block(%llu) >= blocks count(%llu) - "
-			    "block_group = %lu, es == %p ", ret_block,
-			ext4_blocks_count(es), group_no, es);
-		goto out;
-	}
-
-	/*
-	 * It is up to the caller to add the new buffer to a journal
-	 * list of some description.  We don't know in advance whether
-	 * the caller wants to use it as metadata or data.
-	 */
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
-		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
-		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks -= num;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
-	}
-
-	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
-	err = ext4_journal_dirty_metadata(handle, gdp_bh);
-	if (!fatal)
-		fatal = err;
-
-	sb->s_dirt = 1;
-	if (fatal)
-		goto out;
-
-	*errp = 0;
-	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
-	*count = num;
-	return ret_block;
-
-io_error:
-	*errp = -EIO;
-out:
-	if (fatal) {
-		*errp = fatal;
-		ext4_std_error(sb, fatal);
-	}
-	/*
-	 * Undo the block allocation
-	 */
-	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
-	brelse(bitmap_bh);
-	return 0;
-}
-
 #define EXT4_META_BLOCK 0x1
 
 static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
@@ -1963,10 +690,6 @@
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
 
-	if (!test_opt(inode->i_sb, MBALLOC)) {
-		return ext4_old_new_blocks(handle, inode, goal, count, errp);
-	}
-
 	memset(&ar, 0, sizeof(ar));
 	/* Fill with neighbour allocated blocks */
 
@@ -2008,7 +731,7 @@
 	/*
 	 * Account for the allocated meta blocks
 	 */
-	if (!(*errp)) {
+	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += *count;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2093,10 +816,9 @@
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext4_count_free_blocks: stored = %llu"
-		", computed = %llu, %llu\n",
-		ext4_free_blocks_count(es),
-		desc_count, bitmap_count);
+	printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
+		", computed = %llu, %llu\n", ext4_free_blocks_count(es),
+	       desc_count, bitmap_count);
 	return bitmap_count;
 #else
 	desc_count = 0;
@@ -2183,8 +905,9 @@
 
 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
 			metagroup < first_meta_bg)
-		return ext4_bg_num_gdb_nometa(sb,group);
+		return ext4_bg_num_gdb_nometa(sb, group);
 
 	return ext4_bg_num_gdb_meta(sb,group);
 
 }
+
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea67..0a7a666 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
 
 static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
 
-unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
+unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
 {
 	unsigned int i;
 	unsigned long sum = 0;
 
 	if (!map)
-		return (0);
+		return 0;
 	for (i = 0; i < numchars; i++)
 		sum += nibblemap[map->b_data[i] & 0xf] +
 			nibblemap[(map->b_data[i] >> 4) & 0xf];
-	return (sum);
+	return sum;
 }
 
 #endif  /*  EXT4FS_DEBUG  */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ec8e33b..3ca6a2b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@
 };
 
 static int ext4_readdir(struct file *, void *, filldir_t);
-static int ext4_dx_readdir(struct file * filp,
-			   void * dirent, filldir_t filldir);
-static int ext4_release_dir (struct inode * inode,
-				struct file * filp);
+static int ext4_dx_readdir(struct file *filp,
+			   void *dirent, filldir_t filldir);
+static int ext4_release_dir(struct inode *inode,
+				struct file *filp);
 
 const struct file_operations ext4_dir_operations = {
 	.llseek		= generic_file_llseek,
@@ -61,12 +61,12 @@
 }
 
 
-int ext4_check_dir_entry (const char * function, struct inode * dir,
-			  struct ext4_dir_entry_2 * de,
-			  struct buffer_head * bh,
-			  unsigned long offset)
+int ext4_check_dir_entry(const char *function, struct inode *dir,
+			 struct ext4_dir_entry_2 *de,
+			 struct buffer_head *bh,
+			 unsigned long offset)
 {
-	const char * error_msg = NULL;
+	const char *error_msg = NULL;
 	const int rlen = ext4_rec_len_from_disk(de->rec_len);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		ext4_error (dir->i_sb, function,
+		ext4_error(dir->i_sb, function,
 			"bad entry in directory #%lu: %s - "
 			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
 			dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@
 	return error_msg == NULL ? 1 : 0;
 }
 
-static int ext4_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext4_readdir(struct file *filp,
+			 void *dirent, filldir_t filldir)
 {
 	int error = 0;
 	unsigned long offset;
@@ -102,6 +102,7 @@
 	int err;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
+	int dir_has_error = 0;
 
 	sb = inode->i_sb;
 
@@ -148,9 +149,13 @@
 		 * of recovering data when there's a bad sector
 		 */
 		if (!bh) {
-			ext4_error (sb, "ext4_readdir",
-				"directory #%lu contains a hole at offset %lu",
-				inode->i_ino, (unsigned long)filp->f_pos);
+			if (!dir_has_error) {
+				ext4_error(sb, __func__, "directory #%lu "
+					   "contains a hole at offset %Lu",
+					   inode->i_ino,
+					   (unsigned long long) filp->f_pos);
+				dir_has_error = 1;
+			}
 			/* corrupt size?  Maybe no more blocks to read */
 			if (filp->f_pos > inode->i_blocks << 9)
 				break;
@@ -187,14 +192,14 @@
 		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
-						   bh, offset)) {
+			if (!ext4_check_dir_entry("ext4_readdir", inode, de,
+						  bh, offset)) {
 				/*
 				 * On error, skip the f_pos to the next block
 				 */
 				filp->f_pos = (filp->f_pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse (bh);
+				brelse(bh);
 				ret = stored;
 				goto out;
 			}
@@ -218,12 +223,12 @@
 					break;
 				if (version != filp->f_version)
 					goto revalidate;
-				stored ++;
+				stored++;
 			}
 			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
-		brelse (bh);
+		brelse(bh);
 	}
 out:
 	return ret;
@@ -290,9 +295,9 @@
 		parent = rb_parent(n);
 		fname = rb_entry(n, struct fname, rb_hash);
 		while (fname) {
-			struct fname * old = fname;
+			struct fname *old = fname;
 			fname = fname->next;
-			kfree (old);
+			kfree(old);
 		}
 		if (!parent)
 			root->rb_node = NULL;
@@ -331,7 +336,7 @@
 			     struct ext4_dir_entry_2 *dirent)
 {
 	struct rb_node **p, *parent = NULL;
-	struct fname * fname, *new_fn;
+	struct fname *fname, *new_fn;
 	struct dir_private_info *info;
 	int len;
 
@@ -388,19 +393,20 @@
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file * filp, void * dirent,
+static int call_filldir(struct file *filp, void *dirent,
 			filldir_t filldir, struct fname *fname)
 {
 	struct dir_private_info *info = filp->private_data;
 	loff_t	curr_pos;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct super_block * sb;
+	struct super_block *sb;
 	int error;
 
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk("call_filldir: called with null fname?!?\n");
+		printk(KERN_ERR "ext4: call_filldir: called with "
+		       "null fname?!?\n");
 		return 0;
 	}
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
@@ -419,8 +425,8 @@
 	return 0;
 }
 
-static int ext4_dx_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *filp,
+			 void *dirent, filldir_t filldir)
 {
 	struct dir_private_info *info = filp->private_data;
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -511,7 +517,7 @@
 	return 0;
 }
 
-static int ext4_release_dir (struct inode * inode, struct file * filp)
+static int ext4_release_dir(struct inode *inode, struct file *filp)
 {
 	if (filp->private_data)
 		ext4_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2950032..f46a513 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -44,9 +44,9 @@
 #ifdef EXT4FS_DEBUG
 #define ext4_debug(f, a...)						\
 	do {								\
-		printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
+		printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
 			__FILE__, __LINE__, __func__);			\
-		printk (KERN_DEBUG f, ## a);				\
+		printk(KERN_DEBUG f, ## a);				\
 	} while (0)
 #else
 #define ext4_debug(f, a...)	do {} while (0)
@@ -128,7 +128,7 @@
 #else
 # define EXT4_BLOCK_SIZE(s)		(EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
 #endif
-#define	EXT4_ADDR_PER_BLOCK(s)		(EXT4_BLOCK_SIZE(s) / sizeof (__u32))
+#define	EXT4_ADDR_PER_BLOCK(s)		(EXT4_BLOCK_SIZE(s) / sizeof(__u32))
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
 #else
@@ -245,7 +245,7 @@
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x000B80FF /* User modifiable flags */
 
 /*
  * Inode dynamic state flags
@@ -291,8 +291,6 @@
 #define	EXT4_IOC_SETFLAGS		FS_IOC_SETFLAGS
 #define	EXT4_IOC_GETVERSION		_IOR('f', 3, long)
 #define	EXT4_IOC_SETVERSION		_IOW('f', 4, long)
-#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
-#define EXT4_IOC_GROUP_ADD		_IOW('f', 8,struct ext4_new_group_input)
 #define	EXT4_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
 #define	EXT4_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
 #ifdef CONFIG_JBD2_DEBUG
@@ -300,7 +298,10 @@
 #endif
 #define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
-#define EXT4_IOC_MIGRATE		_IO('f', 7)
+#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
+#define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
+#define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
 
 /*
  * ioctl commands in 32 bit emulation
@@ -538,7 +539,6 @@
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
-#define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -667,7 +667,7 @@
 };
 
 #ifdef __KERNEL__
-static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb)
+static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
@@ -725,11 +725,11 @@
  */
 
 #define EXT4_HAS_COMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
 #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
 #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
 #define EXT4_SET_COMPAT_FEATURE(sb,mask)			\
 	EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
 #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask)			\
@@ -789,6 +789,8 @@
 #define	EXT4_DEF_RESUID		0
 #define	EXT4_DEF_RESGID		0
 
+#define EXT4_DEF_INODE_READAHEAD_BLKS	32
+
 /*
  * Default mount options
  */
@@ -954,6 +956,24 @@
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
 
+extern struct proc_dir_entry *ext4_proc_root;
+
+#ifdef CONFIG_PROC_FS
+extern const struct file_operations ext4_ui_proc_fops;
+
+#define	EXT4_PROC_HANDLER(name, var)					\
+do {									\
+	proc = proc_create_data(name, mode, sbi->s_proc,		\
+				&ext4_ui_proc_fops, &sbi->s_##var);	\
+	if (proc == NULL) {						\
+		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
+		goto err_out;						\
+	}								\
+} while (0)
+#else
+#define EXT4_PROC_HANDLER(name, var)
+#endif
+
 /*
  * Function prototypes
  */
@@ -981,23 +1001,20 @@
 extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 					ext4_lblk_t iblock, ext4_fsblk_t goal,
 					unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-						ext4_fsblk_t nblocks);
-extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
+					 s64 nblocks);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
-				 ext4_fsblk_t block, unsigned long count,
+extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count,
 				unsigned long *pdquot_freed_blocks);
-extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
-extern void ext4_check_blocks_bitmap (struct super_block *);
+extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
+extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 						    ext4_group_t block_group,
 						    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-extern void ext4_init_block_alloc_info(struct inode *);
-extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
 
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1009,20 +1026,20 @@
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
-extern int ext4_sync_file (struct file *, struct dentry *, int);
+extern int ext4_sync_file(struct file *, struct dentry *, int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
 			  dx_hash_info *hinfo);
 
 /* ialloc.c */
-extern struct inode * ext4_new_inode (handle_t *, struct inode *, int);
-extern void ext4_free_inode (handle_t *, struct inode *);
-extern struct inode * ext4_orphan_get (struct super_block *, unsigned long);
-extern unsigned long ext4_count_free_inodes (struct super_block *);
-extern unsigned long ext4_count_dirs (struct super_block *);
-extern void ext4_check_inodes_bitmap (struct super_block *);
-extern unsigned long ext4_count_free (struct buffer_head *, unsigned);
+extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern void ext4_free_inode(handle_t *, struct inode *);
+extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+extern unsigned long ext4_count_free_inodes(struct super_block *);
+extern unsigned long ext4_count_dirs(struct super_block *);
+extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1032,7 +1049,7 @@
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_mb_discard_inode_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
@@ -1050,24 +1067,25 @@
 						ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create);
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 				ext4_lblk_t iblock, unsigned long maxblocks,
 				struct buffer_head *bh_result,
 				int create, int extend_disksize);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int  ext4_write_inode (struct inode *, int);
-extern int  ext4_setattr (struct dentry *, struct iattr *);
+extern int  ext4_write_inode(struct inode *, int);
+extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 				struct kstat *stat);
-extern void ext4_delete_inode (struct inode *);
-extern int  ext4_sync_inode (handle_t *, struct inode *);
-extern void ext4_discard_reservation (struct inode *);
+extern void ext4_delete_inode(struct inode *);
+extern int  ext4_sync_inode(handle_t *, struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
-extern void ext4_truncate (struct inode *);
+extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
@@ -1080,11 +1098,10 @@
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
-extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long);
+extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
-extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int,
-		       unsigned long);
+extern int ext4_ext_migrate(struct inode *);
 /* namei.c */
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
@@ -1099,14 +1116,14 @@
 				ext4_fsblk_t n_blocks_count);
 
 /* super.c */
-extern void ext4_error (struct super_block *, const char *, const char *, ...)
+extern void ext4_error(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void __ext4_std_error (struct super_block *, const char *, int);
-extern void ext4_abort (struct super_block *, const char *, const char *, ...)
+extern void __ext4_std_error(struct super_block *, const char *, int);
+extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning (struct super_block *, const char *, const char *, ...)
+extern void ext4_warning(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_update_dynamic_rev (struct super_block *sb);
+extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
 extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1179,7 +1196,7 @@
 
 static inline
 struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
-							ext4_group_t group)
+					    ext4_group_t group)
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
@@ -1207,6 +1224,28 @@
 		__ext4_std_error((sb), __func__, (errno));	\
 } while (0)
 
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#else
+#define EXT4_FREEBLOCKS_WATERMARK 0
+#endif
+
+static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+	/*
+	 * XXX: replace with spinlock if seen contended -bzzz
+	 */
+	down_write(&EXT4_I(inode)->i_data_sem);
+	if (newsize > EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = newsize;
+	up_write(&EXT4_I(inode)->i_data_sem);
+	return ;
+}
+
 /*
  * Inodes and files operations
  */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index d33dc56..bec7ce5 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@
 #define EXT4_EXT_CACHE_GAP	1
 #define EXT4_EXT_CACHE_EXTENT	2
 
+/*
+ * to be called by ext4_ext_walk_space()
+ * negative retcode - error
+ * positive retcode - signal for ext4_ext_walk_space(), see below
+ * callback must return valid extent (passed or newly created)
+ */
+typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
+					struct ext4_ext_cache *,
+					struct ext4_extent *, void *);
+
+#define EXT_CONTINUE   0
+#define EXT_BREAK      1
+#define EXT_REPEAT     2
 
 #define EXT_MAX_BLOCK	0xffffffff
 
@@ -224,6 +237,8 @@
 				 struct ext4_extent *);
 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
+							ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 							struct ext4_ext_path *);
 extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index ef7409f..5c124c0 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@
 /* data type for block group number */
 typedef unsigned long ext4_group_t;
 
-struct ext4_reserve_window {
-	ext4_fsblk_t	_rsv_start;	/* First byte reserved */
-	ext4_fsblk_t	_rsv_end;	/* Last byte reserved or 0 */
-};
-
-struct ext4_reserve_window_node {
-	struct rb_node		rsv_node;
-	__u32			rsv_goal_size;
-	__u32			rsv_alloc_hit;
-	struct ext4_reserve_window	rsv_window;
-};
-
-struct ext4_block_alloc_info {
-	/* information about reservation window */
-	struct ext4_reserve_window_node rsv_window_node;
-	/*
-	 * was i_next_alloc_block in ext4_inode_info
-	 * is the logical (file-relative) number of the
-	 * most-recently-allocated block in this file.
-	 * We use this for detecting linearly ascending allocation requests.
-	 */
-	ext4_lblk_t last_alloc_logical_block;
-	/*
-	 * Was i_next_alloc_goal in ext4_inode_info
-	 * is the *physical* companion to i_next_alloc_block.
-	 * it the physical block number of the block which was most-recentl
-	 * allocated to this file.  This give us the goal (target) for the next
-	 * allocation when we detect linearly ascending requests.
-	 */
-	ext4_fsblk_t last_alloc_physical_block;
-};
-
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
 
@@ -97,11 +65,8 @@
 	ext4_group_t	i_block_group;
 	__u32	i_state;		/* Dynamic state flags for ext4 */
 
-	/* block reservation info */
-	struct ext4_block_alloc_info *i_block_alloc_info;
-
 	ext4_lblk_t		i_dir_start_lookup;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
 	 * data. Taking i_mutex even when reading would cause contention
@@ -111,7 +76,7 @@
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	struct posix_acl	*i_acl;
 	struct posix_acl	*i_default_acl;
 #endif
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226..6a0b40d 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -40,8 +40,8 @@
 	unsigned long s_blocks_last;    /* Last seen block count */
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
-	struct ext4_super_block * s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head ** s_group_desc;
+	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
+	struct buffer_head **s_group_desc;
 	unsigned long  s_mount_opt;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
@@ -52,6 +52,7 @@
 	int s_desc_per_block_bits;
 	int s_inode_size;
 	int s_first_ino;
+	unsigned int s_inode_readahead_blks;
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
@@ -59,16 +60,17 @@
 	struct percpu_counter s_freeblocks_counter;
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
+	struct percpu_counter s_dirtyblocks_counter;
 	struct blockgroup_lock s_blockgroup_lock;
+	struct proc_dir_entry *s_proc;
 
 	/* root of the per fs reservation window tree */
 	spinlock_t s_rsv_window_lock;
 	struct rb_root s_rsv_window_root;
-	struct ext4_reserve_window_node s_rsv_window_head;
 
 	/* Journaling */
-	struct inode * s_journal_inode;
-	struct journal_s * s_journal;
+	struct inode *s_journal_inode;
+	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	unsigned long s_commit_interval;
 	struct block_device *journal_bdev;
@@ -106,12 +108,12 @@
 
 	/* tunables */
 	unsigned long s_stripe;
-	unsigned long s_mb_stream_request;
-	unsigned long s_mb_max_to_scan;
-	unsigned long s_mb_min_to_scan;
-	unsigned long s_mb_stats;
-	unsigned long s_mb_order2_reqs;
-	unsigned long s_mb_group_prealloc;
+	unsigned int s_mb_stream_request;
+	unsigned int s_mb_max_to_scan;
+	unsigned int s_mb_min_to_scan;
+	unsigned int s_mb_stats;
+	unsigned int s_mb_order2_reqs;
+	unsigned int s_mb_group_prealloc;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
@@ -121,7 +123,6 @@
 	int s_mb_history_cur;
 	int s_mb_history_max;
 	int s_mb_history_num;
-	struct proc_dir_entry *s_mb_proc;
 	spinlock_t s_mb_history_lock;
 	int s_mb_history_filter;
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b24d3c5..ea2ce3c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/falloc.h>
 #include <asm/uaccess.h>
+#include <linux/fiemap.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
@@ -383,8 +384,8 @@
 	ext_debug("\n");
 }
 #else
-#define ext4_ext_show_path(inode,path)
-#define ext4_ext_show_leaf(inode,path)
+#define ext4_ext_show_path(inode, path)
+#define ext4_ext_show_leaf(inode, path)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -440,9 +441,10 @@
 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
 		  if (k != 0 &&
 		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
-				printk("k=%d, ix=0x%p, first=0x%p\n", k,
-					ix, EXT_FIRST_INDEX(eh));
-				printk("%u <= %u\n",
+				printk(KERN_DEBUG "k=%d, ix=0x%p, "
+				       "first=0x%p\n", k,
+				       ix, EXT_FIRST_INDEX(eh));
+				printk(KERN_DEBUG "%u <= %u\n",
 				       le32_to_cpu(ix->ei_block),
 				       le32_to_cpu(ix[-1].ei_block));
 			}
@@ -1475,7 +1477,7 @@
 				struct ext4_ext_path *path,
 				struct ext4_extent *newext)
 {
-	struct ext4_extent_header * eh;
+	struct ext4_extent_header *eh;
 	struct ext4_extent *ex, *fex;
 	struct ext4_extent *nearex; /* nearest extent */
 	struct ext4_ext_path *npath = NULL;
@@ -1625,6 +1627,113 @@
 	return err;
 }
 
+int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+			ext4_lblk_t num, ext_prepare_callback func,
+			void *cbdata)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_ext_cache cbex;
+	struct ext4_extent *ex;
+	ext4_lblk_t next, start = 0, end = 0;
+	ext4_lblk_t last = block + num;
+	int depth, exists, err = 0;
+
+	BUG_ON(func == NULL);
+	BUG_ON(inode == NULL);
+
+	while (block < last && block != EXT_MAX_BLOCK) {
+		num = last - block;
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, block, path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			path = NULL;
+			break;
+		}
+
+		depth = ext_depth(inode);
+		BUG_ON(path[depth].p_hdr == NULL);
+		ex = path[depth].p_ext;
+		next = ext4_ext_next_allocated_block(path);
+
+		exists = 0;
+		if (!ex) {
+			/* there is no extent yet, so try to allocate
+			 * all requested space */
+			start = block;
+			end = block + num;
+		} else if (le32_to_cpu(ex->ee_block) > block) {
+			/* need to allocate space before found extent */
+			start = block;
+			end = le32_to_cpu(ex->ee_block);
+			if (block + num < end)
+				end = block + num;
+		} else if (block >= le32_to_cpu(ex->ee_block)
+					+ ext4_ext_get_actual_len(ex)) {
+			/* need to allocate space after found extent */
+			start = block;
+			end = block + num;
+			if (end >= next)
+				end = next;
+		} else if (block >= le32_to_cpu(ex->ee_block)) {
+			/*
+			 * some part of requested space is covered
+			 * by found extent
+			 */
+			start = block;
+			end = le32_to_cpu(ex->ee_block)
+				+ ext4_ext_get_actual_len(ex);
+			if (block + num < end)
+				end = block + num;
+			exists = 1;
+		} else {
+			BUG();
+		}
+		BUG_ON(end <= start);
+
+		if (!exists) {
+			cbex.ec_block = start;
+			cbex.ec_len = end - start;
+			cbex.ec_start = 0;
+			cbex.ec_type = EXT4_EXT_CACHE_GAP;
+		} else {
+			cbex.ec_block = le32_to_cpu(ex->ee_block);
+			cbex.ec_len = ext4_ext_get_actual_len(ex);
+			cbex.ec_start = ext_pblock(ex);
+			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+		}
+
+		BUG_ON(cbex.ec_len == 0);
+		err = func(inode, path, &cbex, ex, cbdata);
+		ext4_ext_drop_refs(path);
+
+		if (err < 0)
+			break;
+
+		if (err == EXT_REPEAT)
+			continue;
+		else if (err == EXT_BREAK) {
+			err = 0;
+			break;
+		}
+
+		if (ext_depth(inode) != depth) {
+			/* depth was changed. we have to realloc path */
+			kfree(path);
+			path = NULL;
+		}
+
+		block = cbex.ec_block + cbex.ec_len;
+	}
+
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+
+	return err;
+}
+
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
 			__u32 len, ext4_fsblk_t start, int type)
@@ -2142,7 +2251,7 @@
 	 */
 
 	if (test_opt(sb, EXTENTS)) {
-		printk("EXT4-fs: file extents enabled");
+		printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
 		printk(", aggressive tests");
 #endif
@@ -2696,11 +2805,8 @@
 		goto out2;
 	}
 	/*
-	 * Okay, we need to do block allocation.  Lazily initialize the block
-	 * allocation info here if necessary.
+	 * Okay, we need to do block allocation.
 	 */
-	if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
-		ext4_init_block_alloc_info(inode);
 
 	/* find neighbour allocated blocks */
 	ar.lleft = iblock;
@@ -2760,7 +2866,7 @@
 		/* free data blocks we just allocated */
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
-		ext4_mb_discard_inode_preallocations(inode);
+		ext4_discard_preallocations(inode);
 		ext4_free_blocks(handle, inode, ext_pblock(&newex),
 					ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
@@ -2824,7 +2930,7 @@
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
 
-	ext4_discard_reservation(inode);
+	ext4_discard_preallocations(inode);
 
 	/*
 	 * TODO: optimization is possible here.
@@ -2877,10 +2983,11 @@
 	 * Update only when preallocation was requested beyond
 	 * the file size.
 	 */
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-				new_size > i_size_read(inode)) {
-		i_size_write(inode, new_size);
-		EXT4_I(inode)->i_disksize = new_size;
+	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+		if (new_size > i_size_read(inode))
+			i_size_write(inode, new_size);
+		if (new_size > EXT4_I(inode)->i_disksize)
+			ext4_update_i_disksize(inode, new_size);
 	}
 
 }
@@ -2972,3 +3079,143 @@
 	mutex_unlock(&inode->i_mutex);
 	return ret > 0 ? ret2 : ret;
 }
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
+		       void *data)
+{
+	struct fiemap_extent_info *fieinfo = data;
+	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+	__u64	logical;
+	__u64	physical;
+	__u64	length;
+	__u32	flags = 0;
+	int	error;
+
+	logical =  (__u64)newex->ec_block << blksize_bits;
+
+	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+		pgoff_t offset;
+		struct page *page;
+		struct buffer_head *bh = NULL;
+
+		offset = logical >> PAGE_SHIFT;
+		page = find_get_page(inode->i_mapping, offset);
+		if (!page || !page_has_buffers(page))
+			return EXT_CONTINUE;
+
+		bh = page_buffers(page);
+
+		if (!bh)
+			return EXT_CONTINUE;
+
+		if (buffer_delay(bh)) {
+			flags |= FIEMAP_EXTENT_DELALLOC;
+			page_cache_release(page);
+		} else {
+			page_cache_release(page);
+			return EXT_CONTINUE;
+		}
+	}
+
+	physical = (__u64)newex->ec_start << blksize_bits;
+	length =   (__u64)newex->ec_len << blksize_bits;
+
+	if (ex && ext4_ext_is_uninitialized(ex))
+		flags |= FIEMAP_EXTENT_UNWRITTEN;
+
+	/*
+	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
+	 *
+	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
+	 * this also indicates no more allocated blocks.
+	 *
+	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
+	 */
+	if (logical + length - 1 == EXT_MAX_BLOCK ||
+	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+		flags |= FIEMAP_EXTENT_LAST;
+
+	error = fiemap_fill_next_extent(fieinfo, logical, physical,
+					length, flags);
+	if (error < 0)
+		return error;
+	if (error == 1)
+		return EXT_BREAK;
+
+	return EXT_CONTINUE;
+}
+
+/* fiemap flags we can handle specified here */
+#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+{
+	__u64 physical = 0;
+	__u64 length;
+	__u32 flags = FIEMAP_EXTENT_LAST;
+	int blockbits = inode->i_sb->s_blocksize_bits;
+	int error = 0;
+
+	/* in-inode? */
+	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+		struct ext4_iloc iloc;
+		int offset;	/* offset of xattr in inode */
+
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error)
+			return error;
+		physical = iloc.bh->b_blocknr << blockbits;
+		offset = EXT4_GOOD_OLD_INODE_SIZE +
+				EXT4_I(inode)->i_extra_isize;
+		physical += offset;
+		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+		flags |= FIEMAP_EXTENT_DATA_INLINE;
+	} else { /* external block */
+		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		length = inode->i_sb->s_blocksize;
+	}
+
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, 0, physical,
+						length, flags);
+	return (error < 0 ? error : 0);
+}
+
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	ext4_lblk_t start_blk;
+	ext4_lblk_t len_blks;
+	int error = 0;
+
+	/* fallback to generic here if not in extents fmt */
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return generic_block_fiemap(inode, fieinfo, start, len,
+			ext4_get_block);
+
+	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+		return -EBADR;
+
+	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+		error = ext4_xattr_fiemap(inode, fieinfo);
+	} else {
+		start_blk = start >> inode->i_sb->s_blocksize_bits;
+		len_blks = len >> inode->i_sb->s_blocksize_bits;
+
+		/*
+		 * Walk the extent tree gathering extent information.
+		 * ext4_ext_fiemap_cb will push extents back to user.
+		 */
+		down_write(&EXT4_I(inode)->i_data_sem);
+		error = ext4_ext_walk_space(inode, start_blk, len_blks,
+					  ext4_ext_fiemap_cb, fieinfo);
+		up_write(&EXT4_I(inode)->i_data_sem);
+	}
+
+	return error;
+}
+
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 430eb79..6bd11fb 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,14 +31,14 @@
  * from ext4_file_open: open gets called at every open, but release
  * gets called only when /all/ the files are closed.
  */
-static int ext4_release_file (struct inode * inode, struct file * filp)
+static int ext4_release_file(struct inode *inode, struct file *filp)
 {
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
 	{
 		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_reservation(inode);
+		ext4_discard_preallocations(inode);
 		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 	if (is_dx(inode) && filp->private_data)
@@ -140,6 +140,9 @@
 	return 0;
 }
 
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len);
+
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -162,7 +165,7 @@
 	.truncate	= ext4_truncate,
 	.setattr	= ext4_setattr,
 	.getattr	= ext4_getattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -170,5 +173,6 @@
 #endif
 	.permission	= ext4_permission,
 	.fallocate	= ext4_fallocate,
+	.fiemap		= ext4_fiemap,
 };
 
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a45c373..5afe437 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
 #include <linux/blkdev.h>
+#include <linux/marker.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -43,7 +44,7 @@
  * inode to disk.
  */
 
-int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
@@ -51,6 +52,10 @@
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
+	trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
+		   inode->i_sb->s_id, datasync, inode->i_ino,
+		   dentry->d_parent->d_inode->i_ino);
+
 	/*
 	 * data=writeback:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329d..556ca8e 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@
 		sum += DELTA;
 		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
 		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
-	} while(--n);
+	} while (--n);
 
 	buf[0] += b0;
 	buf[1] += b1;
@@ -35,7 +35,7 @@
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash(const char *name, int len)
 {
 	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
 	while (len--) {
@@ -59,7 +59,7 @@
 	val = pad;
 	if (len > num*4)
 		len = num * 4;
-	for (i=0; i < len; i++) {
+	for (i = 0; i < len; i++) {
 		if ((i % 4) == 0)
 			val = pad;
 		val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@
 
 	/* Check to see if the seed is all zero's */
 	if (hinfo->seed) {
-		for (i=0; i < 4; i++) {
+		for (i = 0; i < 4; i++) {
 			if (hinfo->seed[i])
 				break;
 		}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f344834..fe34d74 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,9 +115,11 @@
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (bh_uptodate_or_lock(bh))
+	if (buffer_uptodate(bh) &&
+	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
 		return bh;
 
+	lock_buffer(bh);
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -154,39 +156,40 @@
  * though), and then we'd have two inodes sharing the
  * same inode number and space on the harddisk.
  */
-void ext4_free_inode (handle_t *handle, struct inode * inode)
+void ext4_free_inode(handle_t *handle, struct inode *inode)
 {
-	struct super_block * sb = inode->i_sb;
+	struct super_block *sb = inode->i_sb;
 	int is_directory;
 	unsigned long ino;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
 	ext4_group_t block_group;
 	unsigned long bit;
-	struct ext4_group_desc * gdp;
-	struct ext4_super_block * es;
+	struct ext4_group_desc *gdp;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err;
 	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
-		printk ("ext4_free_inode: inode has count=%d\n",
-					atomic_read(&inode->i_count));
+		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
+		       atomic_read(&inode->i_count));
 		return;
 	}
 	if (inode->i_nlink) {
-		printk ("ext4_free_inode: inode has nlink=%d\n",
-			inode->i_nlink);
+		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
+		       inode->i_nlink);
 		return;
 	}
 	if (!sb) {
-		printk("ext4_free_inode: inode on nonexistent device\n");
+		printk(KERN_ERR "ext4_free_inode: inode on "
+		       "nonexistent device\n");
 		return;
 	}
 	sbi = EXT4_SB(sb);
 
 	ino = inode->i_ino;
-	ext4_debug ("freeing inode %lu\n", ino);
+	ext4_debug("freeing inode %lu\n", ino);
 
 	/*
 	 * Note: we must free any quota before locking the superblock,
@@ -200,12 +203,12 @@
 	is_directory = S_ISDIR(inode->i_mode);
 
 	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode (inode);
+	clear_inode(inode);
 
 	es = EXT4_SB(sb)->s_es;
 	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext4_error (sb, "ext4_free_inode",
-			    "reserved or nonexistent inode %lu", ino);
+		ext4_error(sb, "ext4_free_inode",
+			   "reserved or nonexistent inode %lu", ino);
 		goto error_return;
 	}
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -222,10 +225,10 @@
 	/* Ok, now we can actually update the inode bitmaps.. */
 	if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
 					bit, bitmap_bh->b_data))
-		ext4_error (sb, "ext4_free_inode",
-			      "bit already cleared for inode %lu", ino);
+		ext4_error(sb, "ext4_free_inode",
+			   "bit already cleared for inode %lu", ino);
 	else {
-		gdp = ext4_get_group_desc (sb, block_group, &bh2);
+		gdp = ext4_get_group_desc(sb, block_group, &bh2);
 
 		BUFFER_TRACE(bh2, "get_write_access");
 		fatal = ext4_journal_get_write_access(handle, bh2);
@@ -287,7 +290,7 @@
 	avefreei = freei / ngroups;
 
 	for (group = 0; group < ngroups; group++) {
-		desc = ext4_get_group_desc (sb, group, NULL);
+		desc = ext4_get_group_desc(sb, group, NULL);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
 		if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -576,16 +579,16 @@
  * For other inodes, search forward from the parent directory's block
  * group to find a free inode.
  */
-struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 {
 	struct super_block *sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
 	ext4_group_t group = 0;
 	unsigned long ino = 0;
-	struct inode * inode;
-	struct ext4_group_desc * gdp = NULL;
-	struct ext4_super_block * es;
+	struct inode *inode;
+	struct ext4_group_desc *gdp = NULL;
+	struct ext4_super_block *es;
 	struct ext4_inode_info *ei;
 	struct ext4_sb_info *sbi;
 	int ret2, err = 0;
@@ -613,7 +616,7 @@
 	}
 
 	if (S_ISDIR(mode)) {
-		if (test_opt (sb, OLDALLOC))
+		if (test_opt(sb, OLDALLOC))
 			ret2 = find_group_dir(sb, dir, &group);
 		else
 			ret2 = find_group_orlov(sb, dir, &group);
@@ -783,7 +786,7 @@
 	}
 
 	inode->i_uid = current->fsuid;
-	if (test_opt (sb, GRPID))
+	if (test_opt(sb, GRPID))
 		inode->i_gid = dir->i_gid;
 	else if (dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
@@ -816,7 +819,6 @@
 		ei->i_flags &= ~EXT4_DIRSYNC_FL;
 	ei->i_file_acl = 0;
 	ei->i_dtime = 0;
-	ei->i_block_alloc_info = NULL;
 	ei->i_block_group = group;
 
 	ext4_set_inode_flags(inode);
@@ -832,7 +834,7 @@
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
-	if(DQUOT_ALLOC_INODE(inode)) {
+	if (DQUOT_ALLOC_INODE(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -841,7 +843,7 @@
 	if (err)
 		goto fail_free_drop;
 
-	err = ext4_init_security(handle,inode, dir);
+	err = ext4_init_security(handle, inode, dir);
 	if (err)
 		goto fail_free_drop;
 
@@ -959,7 +961,7 @@
 	return ERR_PTR(err);
 }
 
-unsigned long ext4_count_free_inodes (struct super_block * sb)
+unsigned long ext4_count_free_inodes(struct super_block *sb)
 {
 	unsigned long desc_count;
 	struct ext4_group_desc *gdp;
@@ -974,7 +976,7 @@
 	bitmap_count = 0;
 	gdp = NULL;
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		gdp = ext4_get_group_desc (sb, i, NULL);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -989,13 +991,14 @@
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
-		le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+	printk(KERN_DEBUG "ext4_count_free_inodes: "
+	       "stored = %u, computed = %lu, %lu\n",
+	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
 	return desc_count;
 #else
 	desc_count = 0;
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		gdp = ext4_get_group_desc (sb, i, NULL);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -1006,13 +1009,13 @@
 }
 
 /* Called at mount-time, super-block is locked */
-unsigned long ext4_count_dirs (struct super_block * sb)
+unsigned long ext4_count_dirs(struct super_block * sb)
 {
 	unsigned long count = 0;
 	ext4_group_t i;
 
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
+		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7e91913e..9b4ec9d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -190,7 +190,7 @@
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext4_delete_inode (struct inode * inode)
+void ext4_delete_inode(struct inode *inode)
 {
 	handle_t *handle;
 	int err;
@@ -330,11 +330,11 @@
 	int final = 0;
 
 	if (i_block < 0) {
-		ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
+		ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
 	} else if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 		final = direct_blocks;
-	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
+	} else if ((i_block -= direct_blocks) < indirect_blocks) {
 		offsets[n++] = EXT4_IND_BLOCK;
 		offsets[n++] = i_block;
 		final = ptrs;
@@ -400,14 +400,14 @@
 
 	*err = 0;
 	/* i_data is not going away, no lock needed */
-	add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
+	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
 	if (!p->key)
 		goto no_block;
 	while (--depth) {
 		bh = sb_bread(sb, le32_to_cpu(p->key));
 		if (!bh)
 			goto failure;
-		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
+		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 		/* Reader: end */
 		if (!p->key)
 			goto no_block;
@@ -443,7 +443,7 @@
 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
+	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
 	__le32 *p;
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
@@ -486,18 +486,9 @@
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 		Indirect *partial)
 {
-	struct ext4_block_alloc_info *block_i;
-
-	block_i =  EXT4_I(inode)->i_block_alloc_info;
-
 	/*
-	 * try the heuristic for sequential allocation,
-	 * failing that at least try to get decent locality.
+	 * XXX need to get goal block from mballoc's data structures
 	 */
-	if (block_i && (block == block_i->last_alloc_logical_block + 1)
-		&& (block_i->last_alloc_physical_block != 0)) {
-		return block_i->last_alloc_physical_block + 1;
-	}
 
 	return ext4_find_near(inode, partial);
 }
@@ -630,7 +621,7 @@
 	*err = 0;
 	return ret;
 failed_out:
-	for (i = 0; i <index; i++)
+	for (i = 0; i < index; i++)
 		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 	return ret;
 }
@@ -703,7 +694,7 @@
 		branch[n].p = (__le32 *) bh->b_data + offsets[n];
 		branch[n].key = cpu_to_le32(new_blocks[n]);
 		*branch[n].p = branch[n].key;
-		if ( n == indirect_blks) {
+		if (n == indirect_blks) {
 			current_block = new_blocks[n];
 			/*
 			 * End of chain, update the last new metablock of
@@ -730,7 +721,7 @@
 		BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
 		ext4_journal_forget(handle, branch[i].bh);
 	}
-	for (i = 0; i <indirect_blks; i++)
+	for (i = 0; i < indirect_blks; i++)
 		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 
 	ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -757,10 +748,8 @@
 {
 	int i;
 	int err = 0;
-	struct ext4_block_alloc_info *block_i;
 	ext4_fsblk_t current_block;
 
-	block_i = EXT4_I(inode)->i_block_alloc_info;
 	/*
 	 * If we're splicing into a [td]indirect block (as opposed to the
 	 * inode) then we need to get write access to the [td]indirect block
@@ -783,18 +772,7 @@
 	if (num == 0 && blks > 1) {
 		current_block = le32_to_cpu(where->key) + 1;
 		for (i = 1; i < blks; i++)
-			*(where->p + i ) = cpu_to_le32(current_block++);
-	}
-
-	/*
-	 * update the most recently allocated logical & physical block
-	 * in i_block_alloc_info, to assist find the proper goal block for next
-	 * allocation
-	 */
-	if (block_i) {
-		block_i->last_alloc_logical_block = block + blks - 1;
-		block_i->last_alloc_physical_block =
-				le32_to_cpu(where[num].key) + blks - 1;
+			*(where->p + i) = cpu_to_le32(current_block++);
 	}
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
@@ -914,12 +892,8 @@
 		goto cleanup;
 
 	/*
-	 * Okay, we need to do block allocation.  Lazily initialize the block
-	 * allocation info here if necessary
+	 * Okay, we need to do block allocation.
 	*/
-	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
-		ext4_init_block_alloc_info(inode);
-
 	goal = ext4_find_goal(inode, iblock, partial);
 
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1030,19 +1004,20 @@
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
 
-	/* Account for allocated meta_blocks */
-	mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+	if (mdb_free) {
+		/* Account for allocated meta_blocks */
+		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
 
-	/* update fs free blocks counter for truncate case */
-	percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
+		/* update fs dirty blocks counter */
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+		EXT4_I(inode)->i_allocated_meta_blocks = 0;
+		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+	}
 
 	/* update per-inode reservations */
 	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
 	EXT4_I(inode)->i_reserved_data_blocks -= used;
 
-	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
-	EXT4_I(inode)->i_allocated_meta_blocks = 0;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 }
 
@@ -1160,8 +1135,8 @@
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
-static int ext4_get_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
+int ext4_get_block(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	int ret = 0, started = 0;
@@ -1241,7 +1216,7 @@
 			BUFFER_TRACE(bh, "call get_create_access");
 			fatal = ext4_journal_get_create_access(handle, bh);
 			if (!fatal && !buffer_uptodate(bh)) {
-				memset(bh->b_data,0,inode->i_sb->s_blocksize);
+				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 				set_buffer_uptodate(bh);
 			}
 			unlock_buffer(bh);
@@ -1266,7 +1241,7 @@
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 			       ext4_lblk_t block, int create, int *err)
 {
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 
 	bh = ext4_getblk(handle, inode, block, create, err);
 	if (!bh)
@@ -1282,13 +1257,13 @@
 	return NULL;
 }
 
-static int walk_page_buffers(	handle_t *handle,
-				struct buffer_head *head,
-				unsigned from,
-				unsigned to,
-				int *partial,
-				int (*fn)(	handle_t *handle,
-						struct buffer_head *bh))
+static int walk_page_buffers(handle_t *handle,
+			     struct buffer_head *head,
+			     unsigned from,
+			     unsigned to,
+			     int *partial,
+			     int (*fn)(handle_t *handle,
+				       struct buffer_head *bh))
 {
 	struct buffer_head *bh;
 	unsigned block_start, block_end;
@@ -1296,9 +1271,9 @@
 	int err, ret = 0;
 	struct buffer_head *next;
 
-	for (	bh = head, block_start = 0;
-		ret == 0 && (bh != head || !block_start);
-		block_start = block_end, bh = next)
+	for (bh = head, block_start = 0;
+	     ret == 0 && (bh != head || !block_start);
+	     block_start = block_end, bh = next)
 	{
 		next = bh->b_this_page;
 		block_end = block_start + blocksize;
@@ -1351,23 +1326,23 @@
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
 {
- 	struct inode *inode = mapping->host;
+	struct inode *inode = mapping->host;
 	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
 	handle_t *handle;
 	int retries = 0;
- 	struct page *page;
+	struct page *page;
  	pgoff_t index;
- 	unsigned from, to;
+	unsigned from, to;
 
  	index = pos >> PAGE_CACHE_SHIFT;
- 	from = pos & (PAGE_CACHE_SIZE - 1);
- 	to = from + len;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
 
 retry:
-  	handle = ext4_journal_start(inode, needed_blocks);
-  	if (IS_ERR(handle)) {
-  		ret = PTR_ERR(handle);
-  		goto out;
+	handle = ext4_journal_start(inode, needed_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
 	}
 
 	page = __grab_cache_page(mapping, index);
@@ -1387,9 +1362,16 @@
 	}
 
 	if (ret) {
- 		unlock_page(page);
+		unlock_page(page);
 		ext4_journal_stop(handle);
- 		page_cache_release(page);
+		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1426,16 +1408,18 @@
 	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
 		loff_t new_i_size;
 
 		new_i_size = pos + copied;
-		if (new_i_size > EXT4_I(inode)->i_disksize)
-			EXT4_I(inode)->i_disksize = new_i_size;
+		if (new_i_size > EXT4_I(inode)->i_disksize) {
+			ext4_update_i_disksize(inode, new_i_size);
+			/* We need to mark inode dirty even if
+			 * new_i_size is less that inode->i_size
+			 * bu greater than i_disksize.(hint delalloc)
+			 */
+			ext4_mark_inode_dirty(handle, inode);
+		}
+
 		ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
@@ -1460,8 +1444,14 @@
 	loff_t new_i_size;
 
 	new_i_size = pos + copied;
-	if (new_i_size > EXT4_I(inode)->i_disksize)
-		EXT4_I(inode)->i_disksize = new_i_size;
+	if (new_i_size > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, new_i_size);
+		/* We need to mark inode dirty even if
+		 * new_i_size is less that inode->i_size
+		 * bu greater than i_disksize.(hint delalloc)
+		 */
+		ext4_mark_inode_dirty(handle, inode);
+	}
 
 	ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
@@ -1486,6 +1476,7 @@
 	int ret = 0, ret2;
 	int partial = 0;
 	unsigned from, to;
+	loff_t new_i_size;
 
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
@@ -1500,11 +1491,12 @@
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
+	new_i_size = pos + copied;
+	if (new_i_size > inode->i_size)
 		i_size_write(inode, pos+copied);
 	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-	if (inode->i_size > EXT4_I(inode)->i_disksize) {
-		EXT4_I(inode)->i_disksize = inode->i_size;
+	if (new_i_size > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, new_i_size);
 		ret2 = ext4_mark_inode_dirty(handle, inode);
 		if (!ret)
 			ret = ret2;
@@ -1521,6 +1513,7 @@
 
 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
+	int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned long md_needed, mdblocks, total = 0;
 
@@ -1529,6 +1522,7 @@
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
+repeat:
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
 	mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1537,13 +1531,14 @@
 	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
 	total = md_needed + nrblocks;
 
-	if (ext4_has_free_blocks(sbi, total) < total) {
+	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+			yield();
+			goto repeat;
+		}
 		return -ENOSPC;
 	}
-	/* reduce fs free blocks counter */
-	percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
 	EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
 
@@ -1585,8 +1580,8 @@
 
 	release = to_free + mdb_free;
 
-	/* update fs free blocks counter for truncate case */
-	percpu_counter_add(&sbi->s_freeblocks_counter, release);
+	/* update fs dirty blocks counter for truncate case */
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
 
 	/* update per-inode reservations */
 	BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -1630,6 +1625,7 @@
 	struct writeback_control *wbc;
 	int io_done;
 	long pages_written;
+	int retval;
 };
 
 /*
@@ -1783,6 +1779,57 @@
 		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
 }
 
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
+					sector_t logical, long blk_cnt)
+{
+	int nr_pages, i;
+	pgoff_t index, end;
+	struct pagevec pvec;
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
+
+	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end   = (logical + blk_cnt - 1) >>
+				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	while (index <= end) {
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+			block_invalidatepage(page, 0);
+			ClearPageUptodate(page);
+			unlock_page(page);
+		}
+	}
+	return;
+}
+
+static void ext4_print_free_blocks(struct inode *inode)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	printk(KERN_EMERG "Total free blocks count %lld\n",
+			ext4_count_free_blocks(inode->i_sb));
+	printk(KERN_EMERG "Free/Dirty block details\n");
+	printk(KERN_EMERG "free_blocks=%lld\n",
+			percpu_counter_sum(&sbi->s_freeblocks_counter));
+	printk(KERN_EMERG "dirty_blocks=%lld\n",
+			percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+	printk(KERN_EMERG "Block reservation details\n");
+	printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+			EXT4_I(inode)->i_reserved_data_blocks);
+	printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+			EXT4_I(inode)->i_reserved_meta_blocks);
+	return;
+}
+
 /*
  * mpage_da_map_blocks - go through given space
  *
@@ -1792,32 +1839,69 @@
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err = 0;
-	struct buffer_head *lbh = &mpd->lbh;
-	sector_t next = lbh->b_blocknr;
 	struct buffer_head new;
+	struct buffer_head *lbh = &mpd->lbh;
+	sector_t next;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
 	 */
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
-		return;
-
+		return 0;
 	new.b_state = lbh->b_state;
 	new.b_blocknr = 0;
 	new.b_size = lbh->b_size;
-
+	next = lbh->b_blocknr;
 	/*
 	 * If we didn't accumulate anything
 	 * to write simply return
 	 */
 	if (!new.b_size)
-		return;
+		return 0;
 	err = mpd->get_block(mpd->inode, next, &new, 1);
-	if (err)
-		return;
+	if (err) {
+
+		/* If get block returns with error
+		 * we simply return. Later writepage
+		 * will redirty the page and writepages
+		 * will find the dirty page again
+		 */
+		if (err == -EAGAIN)
+			return 0;
+
+		if (err == -ENOSPC &&
+				ext4_count_free_blocks(mpd->inode->i_sb)) {
+			mpd->retval = err;
+			return 0;
+		}
+
+		/*
+		 * get block failure will cause us
+		 * to loop in writepages. Because
+		 * a_ops->writepage won't be able to
+		 * make progress. The page will be redirtied
+		 * by writepage and writepages will again
+		 * try to write the same.
+		 */
+		printk(KERN_EMERG "%s block allocation failed for inode %lu "
+				  "at logical offset %llu with max blocks "
+				  "%zd with error %d\n",
+				  __func__, mpd->inode->i_ino,
+				  (unsigned long long)next,
+				  lbh->b_size >> mpd->inode->i_blkbits, err);
+		printk(KERN_EMERG "This should not happen.!! "
+					"Data will be lost\n");
+		if (err == -ENOSPC) {
+			ext4_print_free_blocks(mpd->inode);
+		}
+		/* invlaidate all the pages */
+		ext4_da_block_invalidatepages(mpd, next,
+				lbh->b_size >> mpd->inode->i_blkbits);
+		return err;
+	}
 	BUG_ON(new.b_size == 0);
 
 	if (buffer_new(&new))
@@ -1830,7 +1914,7 @@
 	if (buffer_delay(lbh) || buffer_unwritten(lbh))
 		mpage_put_bnr_to_bhs(mpd, next, &new);
 
-	return;
+	return 0;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -1899,8 +1983,8 @@
 	 * We couldn't merge the block to our extent, so we
 	 * need to flush current  extent and start new one
 	 */
-	mpage_da_map_blocks(mpd);
-	mpage_da_submit_io(mpd);
+	if (mpage_da_map_blocks(mpd) == 0)
+		mpage_da_submit_io(mpd);
 	mpd->io_done = 1;
 	return;
 }
@@ -1942,8 +2026,8 @@
 		 * and start IO on them using writepage()
 		 */
 		if (mpd->next_page != mpd->first_page) {
-			mpage_da_map_blocks(mpd);
-			mpage_da_submit_io(mpd);
+			if (mpage_da_map_blocks(mpd) == 0)
+				mpage_da_submit_io(mpd);
 			/*
 			 * skip rest of the page in the page_vec
 			 */
@@ -2018,39 +2102,36 @@
  */
 static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
-			       get_block_t get_block)
+			       struct mpage_da_data *mpd)
 {
-	struct mpage_da_data mpd;
 	long to_write;
 	int ret;
 
-	if (!get_block)
+	if (!mpd->get_block)
 		return generic_writepages(mapping, wbc);
 
-	mpd.wbc = wbc;
-	mpd.inode = mapping->host;
-	mpd.lbh.b_size = 0;
-	mpd.lbh.b_state = 0;
-	mpd.lbh.b_blocknr = 0;
-	mpd.first_page = 0;
-	mpd.next_page = 0;
-	mpd.get_block = get_block;
-	mpd.io_done = 0;
-	mpd.pages_written = 0;
+	mpd->lbh.b_size = 0;
+	mpd->lbh.b_state = 0;
+	mpd->lbh.b_blocknr = 0;
+	mpd->first_page = 0;
+	mpd->next_page = 0;
+	mpd->io_done = 0;
+	mpd->pages_written = 0;
+	mpd->retval = 0;
 
 	to_write = wbc->nr_to_write;
 
-	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
 
 	/*
 	 * Handle last extent of pages
 	 */
-	if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-		mpage_da_map_blocks(&mpd);
-		mpage_da_submit_io(&mpd);
+	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
+		if (mpage_da_map_blocks(mpd) == 0)
+			mpage_da_submit_io(mpd);
 	}
 
-	wbc->nr_to_write = to_write - mpd.pages_written;
+	wbc->nr_to_write = to_write - mpd->pages_written;
 	return ret;
 }
 
@@ -2103,18 +2184,24 @@
 	handle_t *handle = NULL;
 
 	handle = ext4_journal_current_handle();
-	if (!handle) {
-		ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, 0, 0, 0);
-		BUG_ON(!ret);
-	} else {
-		ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
-	}
-
+	BUG_ON(!handle);
+	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+			bh_result, create, 0, EXT4_DELALLOC_RSVED);
 	if (ret > 0) {
+
 		bh_result->b_size = (ret << inode->i_blkbits);
 
+		if (ext4_should_order_data(inode)) {
+			int retval;
+			retval = ext4_jbd2_file_inode(handle, inode);
+			if (retval)
+				/*
+				 * Failed to add inode for ordered
+				 * mode. Don't update file size
+				 */
+				return retval;
+		}
+
 		/*
 		 * Update on-disk size along with block allocation
 		 * we don't use 'extend_disksize' as size may change
@@ -2124,18 +2211,9 @@
 		if (disksize > i_size_read(inode))
 			disksize = i_size_read(inode);
 		if (disksize > EXT4_I(inode)->i_disksize) {
-			/*
-			 * XXX: replace with spinlock if seen contended -bzzz
-			 */
-			down_write(&EXT4_I(inode)->i_data_sem);
-			if (disksize > EXT4_I(inode)->i_disksize)
-				EXT4_I(inode)->i_disksize = disksize;
-			up_write(&EXT4_I(inode)->i_data_sem);
-
-			if (EXT4_I(inode)->i_disksize == disksize) {
-				ret = ext4_mark_inode_dirty(handle, inode);
-				return ret;
-			}
+			ext4_update_i_disksize(inode, disksize);
+			ret = ext4_mark_inode_dirty(handle, inode);
+			return ret;
 		}
 		ret = 0;
 	}
@@ -2284,6 +2362,7 @@
 {
 	handle_t *handle = NULL;
 	loff_t range_start = 0;
+	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	long to_write, pages_skipped = 0;
@@ -2317,6 +2396,9 @@
 	range_start =  wbc->range_start;
 	pages_skipped = wbc->pages_skipped;
 
+	mpd.wbc = wbc;
+	mpd.inode = mapping->host;
+
 restart_loop:
 	to_write = wbc->nr_to_write;
 	while (!ret && to_write > 0) {
@@ -2340,23 +2422,17 @@
 			dump_stack();
 			goto out_writepages;
 		}
-		if (ext4_should_order_data(inode)) {
-			/*
-			 * With ordered mode we need to add
-			 * the inode to the journal handl
-			 * when we do block allocation.
-			 */
-			ret = ext4_jbd2_file_inode(handle, inode);
-			if (ret) {
-				ext4_journal_stop(handle);
-				goto out_writepages;
-			}
-		}
-
 		to_write -= wbc->nr_to_write;
-		ret = mpage_da_writepages(mapping, wbc,
-					  ext4_da_get_block_write);
+
+		mpd.get_block = ext4_da_get_block_write;
+		ret = mpage_da_writepages(mapping, wbc, &mpd);
+
 		ext4_journal_stop(handle);
+
+		if (mpd.retval == -ENOSPC)
+			jbd2_journal_force_commit_nested(sbi->s_journal);
+
+		/* reset the retry count */
 		if (ret == MPAGE_DA_EXTENT_TAIL) {
 			/*
 			 * got one extent now try with
@@ -2391,6 +2467,33 @@
 	return ret;
 }
 
+#define FALL_BACK_TO_NONDELALLOC 1
+static int ext4_nonda_switch(struct super_block *sb)
+{
+	s64 free_blocks, dirty_blocks;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/*
+	 * switch to non delalloc mode if we are running low
+	 * on free block. The free block accounting via percpu
+	 * counters can get slightly wrong with FBC_BATCH getting
+	 * accumulated on each CPU without updating global counters
+	 * Delalloc need an accurate free block accounting. So switch
+	 * to non delalloc when we are near to error range.
+	 */
+	free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
+	if (2 * free_blocks < 3 * dirty_blocks ||
+		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+		/*
+		 * free block count is less that 150% of dirty blocks
+		 * or free blocks is less that watermark
+		 */
+		return 1;
+	}
+	return 0;
+}
+
 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
@@ -2406,6 +2509,12 @@
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
+	if (ext4_nonda_switch(inode->i_sb)) {
+		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+		return ext4_write_begin(file, mapping, pos,
+					len, flags, pagep, fsdata);
+	}
+	*fsdata = (void *)0;
 retry:
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
@@ -2433,6 +2542,13 @@
 		unlock_page(page);
 		ext4_journal_stop(handle);
 		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2456,7 +2572,7 @@
 	bh = page_buffers(page);
 	idx = offset >> inode->i_blkbits;
 
-	for (i=0; i < idx; i++)
+	for (i = 0; i < idx; i++)
 		bh = bh->b_this_page;
 
 	if (!buffer_mapped(bh) || (buffer_delay(bh)))
@@ -2474,9 +2590,22 @@
 	handle_t *handle = ext4_journal_current_handle();
 	loff_t new_i_size;
 	unsigned long start, end;
+	int write_mode = (int)(unsigned long)fsdata;
+
+	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
+		if (ext4_should_order_data(inode)) {
+			return ext4_ordered_write_end(file, mapping, pos,
+					len, copied, page, fsdata);
+		} else if (ext4_should_writeback_data(inode)) {
+			return ext4_writeback_write_end(file, mapping, pos,
+					len, copied, page, fsdata);
+		} else {
+			BUG();
+		}
+	}
 
 	start = pos & (PAGE_CACHE_SIZE - 1);
-	end = start + copied -1;
+	end = start + copied - 1;
 
 	/*
 	 * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2500,6 +2629,11 @@
 				EXT4_I(inode)->i_disksize = new_i_size;
 			}
 			up_write(&EXT4_I(inode)->i_data_sem);
+			/* We need to mark inode dirty even if
+			 * new_i_size is less that inode->i_size
+			 * bu greater than i_disksize.(hint delalloc)
+			 */
+			ext4_mark_inode_dirty(handle, inode);
 		}
 	}
 	ret2 = generic_write_end(file, mapping, pos, len, copied,
@@ -2591,7 +2725,7 @@
 			return 0;
 	}
 
-	return generic_block_bmap(mapping,block,ext4_get_block);
+	return generic_block_bmap(mapping, block, ext4_get_block);
 }
 
 static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -3197,7 +3331,7 @@
 	if (!partial->key && *partial->p)
 		/* Writer: end */
 		goto no_top;
-	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
+	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
 		;
 	/*
 	 * OK, we've found the last block that must survive. The rest of our
@@ -3216,7 +3350,7 @@
 	}
 	/* Writer: end */
 
-	while(partial > p) {
+	while (partial > p) {
 		brelse(partial->bh);
 		partial--;
 	}
@@ -3408,9 +3542,9 @@
 			/* This zaps the entire block.  Bottom up. */
 			BUFFER_TRACE(bh, "free child branches");
 			ext4_free_branches(handle, inode, bh,
-					   (__le32*)bh->b_data,
-					   (__le32*)bh->b_data + addr_per_block,
-					   depth);
+					(__le32 *) bh->b_data,
+					(__le32 *) bh->b_data + addr_per_block,
+					depth);
 
 			/*
 			 * We've probably journalled the indirect block several
@@ -3578,7 +3712,7 @@
 	 */
 	down_write(&ei->i_data_sem);
 
-	ext4_discard_reservation(inode);
+	ext4_discard_preallocations(inode);
 
 	/*
 	 * The orphan list entry will now protect us from any crash which
@@ -3673,41 +3807,6 @@
 	ext4_journal_stop(handle);
 }
 
-static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
-		unsigned long ino, struct ext4_iloc *iloc)
-{
-	ext4_group_t block_group;
-	unsigned long offset;
-	ext4_fsblk_t block;
-	struct ext4_group_desc *gdp;
-
-	if (!ext4_valid_inum(sb, ino)) {
-		/*
-		 * This error is already checked for in namei.c unless we are
-		 * looking at an NFS filehandle, in which case no error
-		 * report is needed
-		 */
-		return 0;
-	}
-
-	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
-	gdp = ext4_get_group_desc(sb, block_group, NULL);
-	if (!gdp)
-		return 0;
-
-	/*
-	 * Figure out the offset within the block group inode table
-	 */
-	offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
-		EXT4_INODE_SIZE(sb);
-	block = ext4_inode_table(sb, gdp) +
-		(offset >> EXT4_BLOCK_SIZE_BITS(sb));
-
-	iloc->block_group = block_group;
-	iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
-	return block;
-}
-
 /*
  * ext4_get_inode_loc returns with an extra refcount against the inode's
  * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3717,19 +3816,35 @@
 static int __ext4_get_inode_loc(struct inode *inode,
 				struct ext4_iloc *iloc, int in_mem)
 {
-	ext4_fsblk_t block;
-	struct buffer_head *bh;
+	struct ext4_group_desc	*gdp;
+	struct buffer_head	*bh;
+	struct super_block	*sb = inode->i_sb;
+	ext4_fsblk_t		block;
+	int			inodes_per_block, inode_offset;
 
-	block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
-	if (!block)
+	iloc->bh = 0;
+	if (!ext4_valid_inum(sb, inode->i_ino))
 		return -EIO;
 
-	bh = sb_getblk(inode->i_sb, block);
+	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
+		return -EIO;
+
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+	inode_offset = ((inode->i_ino - 1) %
+			EXT4_INODES_PER_GROUP(sb));
+	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	bh = sb_getblk(sb, block);
 	if (!bh) {
-		ext4_error (inode->i_sb, "ext4_get_inode_loc",
-				"unable to read inode block - "
-				"inode=%lu, block=%llu",
-				 inode->i_ino, block);
+		ext4_error(sb, "ext4_get_inode_loc", "unable to read "
+			   "inode block - inode=%lu, block=%llu",
+			   inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -3757,28 +3872,12 @@
 		 */
 		if (in_mem) {
 			struct buffer_head *bitmap_bh;
-			struct ext4_group_desc *desc;
-			int inodes_per_buffer;
-			int inode_offset, i;
-			ext4_group_t block_group;
-			int start;
+			int i, start;
 
-			block_group = (inode->i_ino - 1) /
-					EXT4_INODES_PER_GROUP(inode->i_sb);
-			inodes_per_buffer = bh->b_size /
-				EXT4_INODE_SIZE(inode->i_sb);
-			inode_offset = ((inode->i_ino - 1) %
-					EXT4_INODES_PER_GROUP(inode->i_sb));
-			start = inode_offset & ~(inodes_per_buffer - 1);
+			start = inode_offset & ~(inodes_per_block - 1);
 
 			/* Is the inode bitmap in cache? */
-			desc = ext4_get_group_desc(inode->i_sb,
-						block_group, NULL);
-			if (!desc)
-				goto make_io;
-
-			bitmap_bh = sb_getblk(inode->i_sb,
-				ext4_inode_bitmap(inode->i_sb, desc));
+			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
 			if (!bitmap_bh)
 				goto make_io;
 
@@ -3791,14 +3890,14 @@
 				brelse(bitmap_bh);
 				goto make_io;
 			}
-			for (i = start; i < start + inodes_per_buffer; i++) {
+			for (i = start; i < start + inodes_per_block; i++) {
 				if (i == inode_offset)
 					continue;
 				if (ext4_test_bit(i, bitmap_bh->b_data))
 					break;
 			}
 			brelse(bitmap_bh);
-			if (i == start + inodes_per_buffer) {
+			if (i == start + inodes_per_block) {
 				/* all other inodes are free, so skip I/O */
 				memset(bh->b_data, 0, bh->b_size);
 				set_buffer_uptodate(bh);
@@ -3809,6 +3908,36 @@
 
 make_io:
 		/*
+		 * If we need to do any I/O, try to pre-readahead extra
+		 * blocks from the inode table.
+		 */
+		if (EXT4_SB(sb)->s_inode_readahead_blks) {
+			ext4_fsblk_t b, end, table;
+			unsigned num;
+
+			table = ext4_inode_table(sb, gdp);
+			/* Make sure s_inode_readahead_blks is a power of 2 */
+			while (EXT4_SB(sb)->s_inode_readahead_blks &
+			       (EXT4_SB(sb)->s_inode_readahead_blks-1))
+				EXT4_SB(sb)->s_inode_readahead_blks = 
+				   (EXT4_SB(sb)->s_inode_readahead_blks &
+				    (EXT4_SB(sb)->s_inode_readahead_blks-1));
+			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+			if (table > b)
+				b = table;
+			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+			num = EXT4_INODES_PER_GROUP(sb);
+			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+				num -= le16_to_cpu(gdp->bg_itable_unused);
+			table += num / inodes_per_block;
+			if (end > table)
+				end = table;
+			while (b <= end)
+				sb_breadahead(sb, b++);
+		}
+
+		/*
 		 * There are other valid inodes in the buffer, this inode
 		 * has in-inode xattrs, or we don't have this inode in memory.
 		 * Read the block from disk.
@@ -3818,10 +3947,9 @@
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			ext4_error(inode->i_sb, "ext4_get_inode_loc",
-					"unable to read inode block - "
-					"inode=%lu, block=%llu",
-					inode->i_ino, block);
+			ext4_error(sb, __func__,
+				   "unable to read inode block - inode=%lu, "
+				   "block=%llu", inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
 		}
@@ -3913,11 +4041,10 @@
 		return inode;
 
 	ei = EXT4_I(inode);
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
-	ei->i_block_alloc_info = NULL;
 
 	ret = __ext4_get_inode_loc(inode, &iloc, 0);
 	if (ret < 0)
@@ -3927,7 +4054,7 @@
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
 	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
-	if(!(test_opt (inode->i_sb, NO_UID32))) {
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
 		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
 	}
@@ -3945,7 +4072,7 @@
 		if (inode->i_mode == 0 ||
 		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
 			/* this inode is deleted */
-			brelse (bh);
+			brelse(bh);
 			ret = -ESTALE;
 			goto bad_inode;
 		}
@@ -3978,7 +4105,7 @@
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 		    EXT4_INODE_SIZE(inode->i_sb)) {
-			brelse (bh);
+			brelse(bh);
 			ret = -EIO;
 			goto bad_inode;
 		}
@@ -4031,7 +4158,7 @@
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	}
-	brelse (iloc.bh);
+	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
 	unlock_new_inode(inode);
 	return inode;
@@ -4113,14 +4240,14 @@
 
 	ext4_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
-	if(!(test_opt(inode->i_sb, NO_UID32))) {
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
 		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
 /*
  * Fix up interoperability with old kernels. Otherwise, old inodes get
  * re-used with the upper 16 bits of the uid/gid intact
  */
-		if(!ei->i_dtime) {
+		if (!ei->i_dtime) {
 			raw_inode->i_uid_high =
 				cpu_to_le16(high_16_bits(inode->i_uid));
 			raw_inode->i_gid_high =
@@ -4208,7 +4335,7 @@
 	ei->i_state &= ~EXT4_STATE_NEW;
 
 out_brelse:
-	brelse (bh);
+	brelse(bh);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
@@ -4811,6 +4938,7 @@
 	loff_t size;
 	unsigned long len;
 	int ret = -EINVAL;
+	void *fsdata;
 	struct file *file = vma->vm_file;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
@@ -4849,11 +4977,11 @@
 	 * on the same page though
 	 */
 	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
-			len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (ret < 0)
 		goto out_unlock;
 	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
-			len, len, page, NULL);
+			len, len, page, fsdata);
 	if (ret < 0)
 		goto out_unlock;
 	ret = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1..ea27eaa 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,9 +23,8 @@
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int flags;
-	unsigned short rsv_window_size;
 
-	ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
 
 	switch (cmd) {
 	case EXT4_IOC_GETFLAGS:
@@ -34,7 +33,7 @@
 		return put_user(flags, (int __user *) arg);
 	case EXT4_IOC_SETFLAGS: {
 		handle_t *handle = NULL;
-		int err;
+		int err, migrate = 0;
 		struct ext4_iloc iloc;
 		unsigned int oldflags;
 		unsigned int jflag;
@@ -82,6 +81,17 @@
 			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
 		}
+		if (oldflags & EXT4_EXTENTS_FL) {
+			/* We don't support clearning extent flags */
+			if (!(flags & EXT4_EXTENTS_FL)) {
+				err = -EOPNOTSUPP;
+				goto flags_out;
+			}
+		} else if (flags & EXT4_EXTENTS_FL) {
+			/* migrate the file */
+			migrate = 1;
+			flags &= ~EXT4_EXTENTS_FL;
+		}
 
 		handle = ext4_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
@@ -109,6 +119,10 @@
 
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
 			err = ext4_change_inode_journal_flag(inode, jflag);
+		if (err)
+			goto flags_out;
+		if (migrate)
+			err = ext4_ext_migrate(inode);
 flags_out:
 		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
@@ -175,49 +189,6 @@
 			return ret;
 		}
 #endif
-	case EXT4_IOC_GETRSVSZ:
-		if (test_opt(inode->i_sb, RESERVATION)
-			&& S_ISREG(inode->i_mode)
-			&& ei->i_block_alloc_info) {
-			rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
-			return put_user(rsv_window_size, (int __user *)arg);
-		}
-		return -ENOTTY;
-	case EXT4_IOC_SETRSVSZ: {
-		int err;
-
-		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
-			return -ENOTTY;
-
-		if (!is_owner_or_cap(inode))
-			return -EACCES;
-
-		if (get_user(rsv_window_size, (int __user *)arg))
-			return -EFAULT;
-
-		err = mnt_want_write(filp->f_path.mnt);
-		if (err)
-			return err;
-
-		if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
-			rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
-
-		/*
-		 * need to allocate reservation structure for this inode
-		 * before set the window size
-		 */
-		down_write(&ei->i_data_sem);
-		if (!ei->i_block_alloc_info)
-			ext4_init_block_alloc_info(inode);
-
-		if (ei->i_block_alloc_info){
-			struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
-			rsv->rsv_goal_size = rsv_window_size;
-		}
-		up_write(&ei->i_data_sem);
-		mnt_drop_write(filp->f_path.mnt);
-		return 0;
-	}
 	case EXT4_IOC_GROUP_EXTEND: {
 		ext4_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
@@ -267,7 +238,26 @@
 	}
 
 	case EXT4_IOC_MIGRATE:
-		return ext4_ext_migrate(inode, filp, cmd, arg);
+	{
+		int err;
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+		/*
+		 * inode_mutex prevent write and truncate on the file.
+		 * Read still goes through. We take i_data_sem in
+		 * ext4_ext_swap_inode_data before we switch the
+		 * inode format to prevent read.
+		 */
+		mutex_lock(&(inode->i_mutex));
+		err = ext4_ext_migrate(inode);
+		mutex_unlock(&(inode->i_mutex));
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
+	}
 
 	default:
 		return -ENOTTY;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e0e3a5e..b580714 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -477,9 +477,10 @@
 		b2 = (unsigned char *) bitmap;
 		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 			if (b1[i] != b2[i]) {
-				printk("corruption in group %lu at byte %u(%u):"
-				       " %x in copy != %x on disk/prealloc\n",
-					e4b->bd_group, i, i * 8, b1[i], b2[i]);
+				printk(KERN_ERR "corruption in group %lu "
+				       "at byte %u(%u): %x in copy != %x "
+				       "on disk/prealloc\n",
+				       e4b->bd_group, i, i * 8, b1[i], b2[i]);
 				BUG();
 			}
 		}
@@ -533,9 +534,6 @@
 	void *buddy;
 	void *buddy2;
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
 	{
 		static int mb_check_counter;
 		if (mb_check_counter++ % 100 != 0)
@@ -784,9 +782,11 @@
 		if (bh[i] == NULL)
 			goto out;
 
-		if (bh_uptodate_or_lock(bh[i]))
+		if (buffer_uptodate(bh[i]) &&
+		    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
 			continue;
 
+		lock_buffer(bh[i]);
 		spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			ext4_init_block_bitmap(sb, bh[i],
@@ -2169,9 +2169,10 @@
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	remove_proc_entry("mb_groups", sbi->s_mb_proc);
-	remove_proc_entry("mb_history", sbi->s_mb_proc);
-
+	if (sbi->s_proc != NULL) {
+		remove_proc_entry("mb_groups", sbi->s_proc);
+		remove_proc_entry("mb_history", sbi->s_proc);
+	}
 	kfree(sbi->s_mb_history);
 }
 
@@ -2180,10 +2181,10 @@
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int i;
 
-	if (sbi->s_mb_proc != NULL) {
-		proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc,
+	if (sbi->s_proc != NULL) {
+		proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_history_fops, sb);
-		proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc,
+		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
 	}
 
@@ -2485,19 +2486,14 @@
 	unsigned max;
 	int ret;
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
 	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		return -ENOMEM;
 	}
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_maxs);
 		return -ENOMEM;
 	}
@@ -2520,7 +2516,6 @@
 	/* init file for buddy data */
 	ret = ext4_mb_init_backend(sb);
 	if (ret != 0) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
 		return ret;
@@ -2540,17 +2535,15 @@
 	sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
 	sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
 
-	i = sizeof(struct ext4_locality_group) * nr_cpu_ids;
-	sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
 		return -ENOMEM;
 	}
-	for (i = 0; i < nr_cpu_ids; i++) {
+	for_each_possible_cpu(i) {
 		struct ext4_locality_group *lg;
-		lg = &sbi->s_locality_groups[i];
+		lg = per_cpu_ptr(sbi->s_locality_groups, i);
 		mutex_init(&lg->lg_mutex);
 		for (j = 0; j < PREALLOC_TB_SIZE; j++)
 			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
@@ -2560,7 +2553,7 @@
 	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_history_init(sb);
 
-	printk("EXT4-fs: mballoc enabled\n");
+	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
 	return 0;
 }
 
@@ -2589,9 +2582,6 @@
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
 	/* release freed, non-committed blocks */
 	spin_lock(&sbi->s_md_lock);
 	list_splice_init(&sbi->s_closed_transaction,
@@ -2647,8 +2637,7 @@
 				atomic_read(&sbi->s_mb_discarded));
 	}
 
-	kfree(sbi->s_locality_groups);
-
+	free_percpu(sbi->s_locality_groups);
 	ext4_mb_history_release(sb);
 	ext4_mb_destroy_per_dev_proc(sb);
 
@@ -2721,118 +2710,46 @@
 #define EXT4_MB_STREAM_REQ		"stream_req"
 #define EXT4_MB_GROUP_PREALLOC		"group_prealloc"
 
-
-
-#define MB_PROC_FOPS(name)					\
-static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v)	\
-{								\
-	struct ext4_sb_info *sbi = m->private;			\
-								\
-	seq_printf(m, "%ld\n", sbi->s_mb_##name);		\
-	return 0;						\
-}								\
-								\
-static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
-{								\
-	return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
-}								\
-								\
-static ssize_t ext4_mb_##name##_proc_write(struct file *file,	\
-		const char __user *buf, size_t cnt, loff_t *ppos)	\
-{								\
-	struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
-	char str[32];						\
-	long value;						\
-	if (cnt >= sizeof(str))					\
-		return -EINVAL;					\
-	if (copy_from_user(str, buf, cnt))			\
-		return -EFAULT;					\
-	value = simple_strtol(str, NULL, 0);			\
-	if (value <= 0)						\
-		return -ERANGE;					\
-	sbi->s_mb_##name = value;				\
-	return cnt;						\
-}								\
-								\
-static const struct file_operations ext4_mb_##name##_proc_fops = {	\
-	.owner		= THIS_MODULE,				\
-	.open		= ext4_mb_##name##_proc_open,		\
-	.read		= seq_read,				\
-	.llseek		= seq_lseek,				\
-	.release	= single_release,			\
-	.write		= ext4_mb_##name##_proc_write,		\
-};
-
-MB_PROC_FOPS(stats);
-MB_PROC_FOPS(max_to_scan);
-MB_PROC_FOPS(min_to_scan);
-MB_PROC_FOPS(order2_reqs);
-MB_PROC_FOPS(stream_request);
-MB_PROC_FOPS(group_prealloc);
-
-#define	MB_PROC_HANDLER(name, var)					\
-do {									\
-	proc = proc_create_data(name, mode, sbi->s_mb_proc,		\
-				&ext4_mb_##var##_proc_fops, sbi);	\
-	if (proc == NULL) {						\
-		printk(KERN_ERR "EXT4-fs: can't to create %s\n", name);	\
-		goto err_out;						\
-	}								\
-} while (0)
-
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 {
 	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct proc_dir_entry *proc;
-	char devname[64];
 
-	if (proc_root_ext4 == NULL) {
-		sbi->s_mb_proc = NULL;
+	if (sbi->s_proc == NULL)
 		return -EINVAL;
-	}
-	bdevname(sb->s_bdev, devname);
-	sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
 
-	MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
-	MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
-	MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
-	MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
-	MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
-	MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
-
+	EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
+	EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
+	EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
+	EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
+	EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
+	EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
 	return 0;
 
 err_out:
-	printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
-	remove_proc_entry(devname, proc_root_ext4);
-	sbi->s_mb_proc = NULL;
-
+	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
 	return -ENOMEM;
 }
 
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	char devname[64];
 
-	if (sbi->s_mb_proc == NULL)
+	if (sbi->s_proc == NULL)
 		return -EINVAL;
 
-	bdevname(sb->s_bdev, devname);
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
-	remove_proc_entry(devname, proc_root_ext4);
+	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
 
 	return 0;
 }
@@ -2854,11 +2771,6 @@
 		kmem_cache_destroy(ext4_pspace_cachep);
 		return -ENOMEM;
 	}
-#ifdef CONFIG_PROC_FS
-	proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
-	if (proc_root_ext4 == NULL)
-		printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n");
-#endif
 	return 0;
 }
 
@@ -2867,9 +2779,6 @@
 	/* XXX: synchronize_rcu(); */
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
-#ifdef CONFIG_PROC_FS
-	remove_proc_entry("fs/ext4", NULL);
-#endif
 }
 
 
@@ -2879,7 +2788,7 @@
  */
 static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
-				handle_t *handle)
+				handle_t *handle, unsigned long reserv_blks)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_super_block *es;
@@ -2968,15 +2877,16 @@
 	le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-
+	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
 	/*
-	 * free blocks account has already be reduced/reserved
-	 * at write_begin() time for delayed allocation
-	 * do not double accounting
+	 * Now reduce the dirty block count also. Should not go negative
 	 */
 	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
-		percpu_counter_sub(&sbi->s_freeblocks_counter,
-					ac->ac_b_ex.fe_len);
+		/* release all the reserved blocks if non delalloc */
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+	else
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+						ac->ac_b_ex.fe_len);
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3884,7 +3794,7 @@
  *
  * FIXME!! Make sure it is valid at all the call sites
  */
-void ext4_mb_discard_inode_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -3896,7 +3806,7 @@
 	struct ext4_buddy e4b;
 	int err;
 
-	if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
+	if (!S_ISREG(inode->i_mode)) {
 		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
 		return;
 	}
@@ -4094,8 +4004,7 @@
 	 * per cpu locality group is to reduce the contention between block
 	 * request from multiple CPUs.
 	 */
-	ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
-	put_cpu();
+	ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
 
 	/* we're going to use group allocation */
 	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4369,33 +4278,32 @@
 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 				 struct ext4_allocation_request *ar, int *errp)
 {
+	int freed;
 	struct ext4_allocation_context *ac = NULL;
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	ext4_fsblk_t block = 0;
-	int freed;
-	int inquota;
+	unsigned long inquota;
+	unsigned long reserv_blks = 0;
 
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
 
-	if (!test_opt(sb, MBALLOC)) {
-		block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
-					    &(ar->len), errp);
-		return block;
-	}
 	if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
 		/*
 		 * With delalloc we already reserved the blocks
 		 */
-		ar->len = ext4_has_free_blocks(sbi, ar->len);
+		while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+			/* let others to free the space */
+			yield();
+			ar->len = ar->len >> 1;
+		}
+		if (!ar->len) {
+			*errp = -ENOSPC;
+			return 0;
+		}
+		reserv_blks = ar->len;
 	}
-
-	if (ar->len == 0) {
-		*errp = -ENOSPC;
-		return 0;
-	}
-
 	while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
 		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
 		ar->len--;
@@ -4441,7 +4349,7 @@
 	}
 
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
-		*errp = ext4_mb_mark_diskspace_used(ac, handle);
+		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
 		if (*errp ==  -EAGAIN) {
 			ac->ac_b_ex.fe_group = 0;
 			ac->ac_b_ex.fe_start = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c7c9906..b3b4828 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -257,7 +257,6 @@
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
-static struct proc_dir_entry *proc_root_ext4;
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 46fc0b5..f2a9cf4 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -447,8 +447,7 @@
 
 }
 
-int ext4_ext_migrate(struct inode *inode, struct file *filp,
-				unsigned int cmd, unsigned long arg)
+int ext4_ext_migrate(struct inode *inode)
 {
 	handle_t *handle;
 	int retval = 0, i;
@@ -516,12 +515,6 @@
 	 * when we add extents we extent the journal
 	 */
 	/*
-	 * inode_mutex prevent write and truncate on the file. Read still goes
-	 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
-	 * switch the inode format to prevent read.
-	 */
-	mutex_lock(&(inode->i_mutex));
-	/*
 	 * Even though we take i_mutex we can still cause block allocation
 	 * via mmap write to holes. If we have allocated new blocks we fail
 	 * migrate.  New block allocation will clear EXT4_EXT_MIGRATE flag.
@@ -623,7 +616,6 @@
 	tmp_inode->i_nlink = 0;
 
 	ext4_journal_stop(handle);
-	mutex_unlock(&(inode->i_mutex));
 
 	if (tmp_inode)
 		iput(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 387ad98..92db9e9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,34 +151,36 @@
 
 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
 static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash (struct dx_entry *entry);
-static void dx_set_hash (struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count (struct dx_entry *entries);
-static unsigned dx_get_limit (struct dx_entry *entries);
-static void dx_set_count (struct dx_entry *entries, unsigned value);
-static void dx_set_limit (struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct dentry *dentry,
+static inline unsigned dx_get_hash(struct dx_entry *entry);
+static void dx_set_hash(struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count(struct dx_entry *entries);
+static unsigned dx_get_limit(struct dx_entry *entries);
+static void dx_set_count(struct dx_entry *entries, unsigned value);
+static void dx_set_limit(struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit(struct inode *dir);
+static struct dx_frame *dx_probe(const struct qstr *d_name,
 				 struct inode *dir,
 				 struct dx_hash_info *hinfo,
 				 struct dx_frame *frame,
 				 int *err);
-static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
-			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_release(struct dx_frame *frames);
+static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
+static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
 static void dx_insert_block(struct dx_frame *frame,
 					u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
 				 struct dx_frame *frames,
 				 __u32 *start_hash);
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
-		       struct ext4_dir_entry_2 **res_dir, int *err);
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+		const struct qstr *d_name,
+		struct ext4_dir_entry_2 **res_dir,
+		int *err);
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
@@ -207,44 +209,44 @@
 	entry->block = cpu_to_le32(value);
 }
 
-static inline unsigned dx_get_hash (struct dx_entry *entry)
+static inline unsigned dx_get_hash(struct dx_entry *entry)
 {
 	return le32_to_cpu(entry->hash);
 }
 
-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
 {
 	entry->hash = cpu_to_le32(value);
 }
 
-static inline unsigned dx_get_count (struct dx_entry *entries)
+static inline unsigned dx_get_count(struct dx_entry *entries)
 {
 	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
 }
 
-static inline unsigned dx_get_limit (struct dx_entry *entries)
+static inline unsigned dx_get_limit(struct dx_entry *entries)
 {
 	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
 }
 
-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
+static inline void dx_set_count(struct dx_entry *entries, unsigned value)
 {
 	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
 }
 
-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
 {
 	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
 }
 
-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
+static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
 		EXT4_DIR_REC_LEN(2) - infosize;
 	return entry_space / sizeof(struct dx_entry);
 }
 
-static inline unsigned dx_node_limit (struct inode *dir)
+static inline unsigned dx_node_limit(struct inode *dir)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
 	return entry_space / sizeof(struct dx_entry);
@@ -254,12 +256,12 @@
  * Debug
  */
 #ifdef DX_DEBUG
-static void dx_show_index (char * label, struct dx_entry *entries)
+static void dx_show_index(char * label, struct dx_entry *entries)
 {
 	int i, n = dx_get_count (entries);
-	printk("%s index ", label);
+	printk(KERN_DEBUG "%s index ", label);
 	for (i = 0; i < n; i++) {
-		printk("%x->%lu ", i? dx_get_hash(entries + i) :
+		printk("%x->%lu ", i ? dx_get_hash(entries + i) :
 				0, (unsigned long)dx_get_block(entries + i));
 	}
 	printk("\n");
@@ -306,7 +308,7 @@
 			     struct dx_entry *entries, int levels)
 {
 	unsigned blocksize = dir->i_sb->s_blocksize;
-	unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+	unsigned count = dx_get_count(entries), names = 0, space = 0, i;
 	unsigned bcount = 0;
 	struct buffer_head *bh;
 	int err;
@@ -325,11 +327,12 @@
 		names += stats.names;
 		space += stats.space;
 		bcount += stats.bcount;
-		brelse (bh);
+		brelse(bh);
 	}
 	if (bcount)
-		printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
-			names, space/bcount,(space/bcount)*100/blocksize);
+		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 
+		       levels ? "" : "   ", names, space/bcount,
+		       (space/bcount)*100/blocksize);
 	return (struct stats) { names, space, bcount};
 }
 #endif /* DX_DEBUG */
@@ -344,7 +347,7 @@
  * back to userspace.
  */
 static struct dx_frame *
-dx_probe(struct dentry *dentry, struct inode *dir,
+dx_probe(const struct qstr *d_name, struct inode *dir,
 	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
 {
 	unsigned count, indirect;
@@ -355,8 +358,6 @@
 	u32 hash;
 
 	frame->bh = NULL;
-	if (dentry)
-		dir = dentry->d_parent->d_inode;
 	if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
 		goto fail;
 	root = (struct dx_root *) bh->b_data;
@@ -372,8 +373,8 @@
 	}
 	hinfo->hash_version = root->info.hash_version;
 	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
-	if (dentry)
-		ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+	if (d_name)
+		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
@@ -406,7 +407,7 @@
 		goto fail;
 	}
 
-	dxtrace (printk("Look up %x", hash));
+	dxtrace(printk("Look up %x", hash));
 	while (1)
 	{
 		count = dx_get_count(entries);
@@ -555,7 +556,7 @@
 				      0, &err)))
 			return err; /* Failure */
 		p++;
-		brelse (p->bh);
+		brelse(p->bh);
 		p->bh = bh;
 		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
 	}
@@ -593,7 +594,7 @@
 			/* On error, skip the f_pos to the next block. */
 			dir_file->f_pos = (dir_file->f_pos |
 					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
+			brelse(bh);
 			return count;
 		}
 		ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -635,8 +636,8 @@
 	int ret, err;
 	__u32 hashval;
 
-	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
-		       start_minor_hash));
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
+		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
 	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -648,7 +649,7 @@
 	}
 	hinfo.hash = start_hash;
 	hinfo.minor_hash = 0;
-	frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 
@@ -694,8 +695,8 @@
 			break;
 	}
 	dx_release(frames);
-	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
-		       count, *next_hash));
+	dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
+		       "next hash: %x\n", count, *next_hash));
 	return count;
 errout:
 	dx_release(frames);
@@ -802,17 +803,17 @@
 /*
  * Returns 0 if not found, -1 on failure, and 1 on success
  */
-static inline int search_dirblock(struct buffer_head * bh,
+static inline int search_dirblock(struct buffer_head *bh,
 				  struct inode *dir,
-				  struct dentry *dentry,
+				  const struct qstr *d_name,
 				  unsigned long offset,
 				  struct ext4_dir_entry_2 ** res_dir)
 {
 	struct ext4_dir_entry_2 * de;
 	char * dlimit;
 	int de_len;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const char *name = d_name->name;
+	int namelen = d_name->len;
 
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -851,12 +852,13 @@
  * The returned buffer_head has ->b_count elevated.  The caller is expected
  * to brelse() it when appropriate.
  */
-static struct buffer_head * ext4_find_entry (struct dentry *dentry,
+static struct buffer_head * ext4_find_entry (struct inode *dir,
+					const struct qstr *d_name,
 					struct ext4_dir_entry_2 ** res_dir)
 {
-	struct super_block * sb;
-	struct buffer_head * bh_use[NAMEI_RA_SIZE];
-	struct buffer_head * bh, *ret = NULL;
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
 	ext4_lblk_t start, block, b;
 	int ra_max = 0;		/* Number of bh's in the readahead
 				   buffer, bh_use[] */
@@ -865,16 +867,15 @@
 	int num = 0;
 	ext4_lblk_t  nblocks;
 	int i, err;
-	struct inode *dir = dentry->d_parent->d_inode;
 	int namelen;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
-	namelen = dentry->d_name.len;
+	namelen = d_name->len;
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
 	if (is_dx(dir)) {
-		bh = ext4_dx_find_entry(dentry, res_dir, &err);
+		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
 		/*
 		 * On success, or if the error was file not found,
 		 * return.  Otherwise, fall back to doing a search the
@@ -882,7 +883,8 @@
 		 */
 		if (bh || (err != ERR_BAD_DX_DIR))
 			return bh;
-		dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
+		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+			       "falling back\n"));
 	}
 	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
 	start = EXT4_I(dir)->i_dir_start_lookup;
@@ -926,7 +928,7 @@
 			brelse(bh);
 			goto next;
 		}
-		i = search_dirblock(bh, dir, dentry,
+		i = search_dirblock(bh, dir, d_name,
 			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
 		if (i == 1) {
 			EXT4_I(dir)->i_dir_start_lookup = block;
@@ -956,11 +958,11 @@
 cleanup_and_exit:
 	/* Clean up the read-ahead blocks */
 	for (; ra_ptr < ra_max; ra_ptr++)
-		brelse (bh_use[ra_ptr]);
+		brelse(bh_use[ra_ptr]);
 	return ret;
 }
 
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
 		       struct ext4_dir_entry_2 **res_dir, int *err)
 {
 	struct super_block * sb;
@@ -971,14 +973,13 @@
 	struct buffer_head *bh;
 	ext4_lblk_t block;
 	int retval;
-	int namelen = dentry->d_name.len;
-	const u8 *name = dentry->d_name.name;
-	struct inode *dir = dentry->d_parent->d_inode;
+	int namelen = d_name->len;
+	const u8 *name = d_name->name;
 
 	sb = dir->i_sb;
 	/* NFS may look up ".." - look at dx_root directory block */
 	if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-		if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
+		if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
 			return NULL;
 	} else {
 		frame = frames;
@@ -1010,7 +1011,7 @@
 				return bh;
 			}
 		}
-		brelse (bh);
+		brelse(bh);
 		/* Check to see if we should continue to search */
 		retval = ext4_htree_next_block(dir, hash, frame,
 					       frames, NULL);
@@ -1025,25 +1026,25 @@
 
 	*err = -ENOENT;
 errout:
-	dxtrace(printk("%s not found\n", name));
+	dxtrace(printk(KERN_DEBUG "%s not found\n", name));
 	dx_release (frames);
 	return NULL;
 }
 
-static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode * inode;
-	struct ext4_dir_entry_2 * de;
-	struct buffer_head * bh;
+	struct inode *inode;
+	struct ext4_dir_entry_2 *de;
+	struct buffer_head *bh;
 
 	if (dentry->d_name.len > EXT4_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	bh = ext4_find_entry(dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	inode = NULL;
 	if (bh) {
 		unsigned long ino = le32_to_cpu(de->inode);
-		brelse (bh);
+		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
 			ext4_error(dir->i_sb, "ext4_lookup",
 				   "bad inode number: %lu", ino);
@@ -1062,15 +1063,14 @@
 	unsigned long ino;
 	struct dentry *parent;
 	struct inode *inode;
-	struct dentry dotdot;
+	static const struct qstr dotdot = {
+		.name = "..",
+		.len = 2,
+	};
 	struct ext4_dir_entry_2 * de;
 	struct buffer_head *bh;
 
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-	dotdot.d_parent = child; /* confusing, isn't it! */
-
-	bh = ext4_find_entry(&dotdot, &de);
+	bh = ext4_find_entry(child->d_inode, &dotdot, &de);
 	inode = NULL;
 	if (!bh)
 		return ERR_PTR(-ENOENT);
@@ -1201,10 +1201,10 @@
 
 	/* create map in the end of data2 block */
 	map = (struct dx_map_entry *) (data2 + blocksize);
-	count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
+	count = dx_make_map((struct ext4_dir_entry_2 *) data1,
 			     blocksize, hinfo, map);
 	map -= count;
-	dx_sort_map (map, count);
+	dx_sort_map(map, count);
 	/* Split the existing block in the middle, size-wise */
 	size = 0;
 	move = 0;
@@ -1225,7 +1225,7 @@
 
 	/* Fancy dance to stay within two buffers */
 	de2 = dx_move_dirents(data1, data2, map + split, count - split);
-	de = dx_pack_dirents(data1,blocksize);
+	de = dx_pack_dirents(data1, blocksize);
 	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
 	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1237,15 +1237,15 @@
 		swap(*bh, bh2);
 		de = de2;
 	}
-	dx_insert_block (frame, hash2 + continued, newblock);
-	err = ext4_journal_dirty_metadata (handle, bh2);
+	dx_insert_block(frame, hash2 + continued, newblock);
+	err = ext4_journal_dirty_metadata(handle, bh2);
 	if (err)
 		goto journal_error;
-	err = ext4_journal_dirty_metadata (handle, frame->bh);
+	err = ext4_journal_dirty_metadata(handle, frame->bh);
 	if (err)
 		goto journal_error;
-	brelse (bh2);
-	dxtrace(dx_show_index ("frame", frame->entries));
+	brelse(bh2);
+	dxtrace(dx_show_index("frame", frame->entries));
 	return de;
 
 journal_error:
@@ -1271,7 +1271,7 @@
  */
 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode, struct ext4_dir_entry_2 *de,
-			     struct buffer_head * bh)
+			     struct buffer_head *bh)
 {
 	struct inode	*dir = dentry->d_parent->d_inode;
 	const char	*name = dentry->d_name.name;
@@ -1288,11 +1288,11 @@
 		while ((char *) de <= top) {
 			if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
 						  bh, offset)) {
-				brelse (bh);
+				brelse(bh);
 				return -EIO;
 			}
-			if (ext4_match (namelen, name, de)) {
-				brelse (bh);
+			if (ext4_match(namelen, name, de)) {
+				brelse(bh);
 				return -EEXIST;
 			}
 			nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1329,7 +1329,7 @@
 	} else
 		de->inode = 0;
 	de->name_len = namelen;
-	memcpy (de->name, name, namelen);
+	memcpy(de->name, name, namelen);
 	/*
 	 * XXX shouldn't update any times until successful
 	 * completion of syscall, but too many callers depend
@@ -1377,7 +1377,7 @@
 	struct fake_dirent *fde;
 
 	blocksize =  dir->i_sb->s_blocksize;
-	dxtrace(printk("Creating index\n"));
+	dxtrace(printk(KERN_DEBUG "Creating index\n"));
 	retval = ext4_journal_get_write_access(handle, bh);
 	if (retval) {
 		ext4_std_error(dir->i_sb, retval);
@@ -1386,7 +1386,7 @@
 	}
 	root = (struct dx_root *) bh->b_data;
 
-	bh2 = ext4_append (handle, dir, &block, &retval);
+	bh2 = ext4_append(handle, dir, &block, &retval);
 	if (!(bh2)) {
 		brelse(bh);
 		return retval;
@@ -1412,9 +1412,9 @@
 	root->info.info_length = sizeof(root->info);
 	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 	entries = root->entries;
-	dx_set_block (entries, 1);
-	dx_set_count (entries, 1);
-	dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
+	dx_set_block(entries, 1);
+	dx_set_count(entries, 1);
+	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
 
 	/* Initialize as for dx_probe */
 	hinfo.hash_version = root->info.hash_version;
@@ -1443,14 +1443,14 @@
  * may not sleep between calling this and putting something into
  * the entry, as someone else might have used it while you slept.
  */
-static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
-	struct inode *inode)
+static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+			  struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	unsigned long offset;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
-	struct super_block * sb;
+	struct super_block *sb;
 	int	retval;
 	int	dx_fallback=0;
 	unsigned blocksize;
@@ -1500,13 +1500,13 @@
 	struct dx_frame frames[2], *frame;
 	struct dx_entry *entries, *at;
 	struct dx_hash_info hinfo;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct inode *dir = dentry->d_parent->d_inode;
-	struct super_block * sb = dir->i_sb;
+	struct super_block *sb = dir->i_sb;
 	struct ext4_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 	entries = frame->entries;
@@ -1527,7 +1527,7 @@
 	}
 
 	/* Block full, should compress but for now just split */
-	dxtrace(printk("using %u of %u node entries\n",
+	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
 		       dx_get_count(entries), dx_get_limit(entries)));
 	/* Need to split index? */
 	if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1559,7 +1559,8 @@
 		if (levels) {
 			unsigned icount1 = icount/2, icount2 = icount - icount1;
 			unsigned hash2 = dx_get_hash(entries + icount1);
-			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+				       icount1, icount2));
 
 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
 			err = ext4_journal_get_write_access(handle,
@@ -1567,11 +1568,11 @@
 			if (err)
 				goto journal_error;
 
-			memcpy ((char *) entries2, (char *) (entries + icount1),
-				icount2 * sizeof(struct dx_entry));
-			dx_set_count (entries, icount1);
-			dx_set_count (entries2, icount2);
-			dx_set_limit (entries2, dx_node_limit(dir));
+			memcpy((char *) entries2, (char *) (entries + icount1),
+			       icount2 * sizeof(struct dx_entry));
+			dx_set_count(entries, icount1);
+			dx_set_count(entries2, icount2);
+			dx_set_limit(entries2, dx_node_limit(dir));
 
 			/* Which index block gets the new entry? */
 			if (at - entries >= icount1) {
@@ -1579,16 +1580,17 @@
 				frame->entries = entries = entries2;
 				swap(frame->bh, bh2);
 			}
-			dx_insert_block (frames + 0, hash2, newblock);
-			dxtrace(dx_show_index ("node", frames[1].entries));
-			dxtrace(dx_show_index ("node",
+			dx_insert_block(frames + 0, hash2, newblock);
+			dxtrace(dx_show_index("node", frames[1].entries));
+			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
 			err = ext4_journal_dirty_metadata(handle, bh2);
 			if (err)
 				goto journal_error;
 			brelse (bh2);
 		} else {
-			dxtrace(printk("Creating second level index...\n"));
+			dxtrace(printk(KERN_DEBUG
+				       "Creating second level index...\n"));
 			memcpy((char *) entries2, (char *) entries,
 			       icount * sizeof(struct dx_entry));
 			dx_set_limit(entries2, dx_node_limit(dir));
@@ -1630,12 +1632,12 @@
  * ext4_delete_entry deletes a directory entry by merging it with the
  * previous entry
  */
-static int ext4_delete_entry (handle_t *handle,
-			      struct inode * dir,
-			      struct ext4_dir_entry_2 * de_del,
-			      struct buffer_head * bh)
+static int ext4_delete_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh)
 {
-	struct ext4_dir_entry_2 * de, * pde;
+	struct ext4_dir_entry_2 *de, *pde;
 	int i;
 
 	i = 0;
@@ -1716,11 +1718,11 @@
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
-		struct nameidata *nd)
+static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int err, retries = 0;
 
 retry:
@@ -1747,8 +1749,8 @@
 	return err;
 }
 
-static int ext4_mknod (struct inode * dir, struct dentry *dentry,
-			int mode, dev_t rdev)
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
+		      int mode, dev_t rdev)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -1767,11 +1769,11 @@
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		inode->i_op = &ext4_special_inode_operations;
 #endif
 		err = ext4_add_nondir(handle, dentry, inode);
@@ -1782,12 +1784,12 @@
 	return err;
 }
 
-static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	handle_t *handle;
-	struct inode * inode;
-	struct buffer_head * dir_block;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *dir_block;
+	struct ext4_dir_entry_2 *de;
 	int err, retries = 0;
 
 	if (EXT4_DIR_LINK_MAX(dir))
@@ -1803,7 +1805,7 @@
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -1811,7 +1813,7 @@
 	inode->i_op = &ext4_dir_inode_operations;
 	inode->i_fop = &ext4_dir_operations;
 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-	dir_block = ext4_bread (handle, inode, 0, 1, &err);
+	dir_block = ext4_bread(handle, inode, 0, 1, &err);
 	if (!dir_block)
 		goto out_clear_inode;
 	BUFFER_TRACE(dir_block, "get_write_access");
@@ -1820,26 +1822,26 @@
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
 	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
-	strcpy (de->name, ".");
+	strcpy(de->name, ".");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	de = ext4_next_entry(de);
 	de->inode = cpu_to_le32(dir->i_ino);
 	de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
 						EXT4_DIR_REC_LEN(1));
 	de->name_len = 2;
-	strcpy (de->name, "..");
+	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
 	BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
 	ext4_journal_dirty_metadata(handle, dir_block);
-	brelse (dir_block);
+	brelse(dir_block);
 	ext4_mark_inode_dirty(handle, inode);
-	err = ext4_add_entry (handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode);
 	if (err) {
 out_clear_inode:
 		clear_nlink(inode);
 		ext4_mark_inode_dirty(handle, inode);
-		iput (inode);
+		iput(inode);
 		goto out_stop;
 	}
 	ext4_inc_count(handle, dir);
@@ -1856,17 +1858,17 @@
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-static int empty_dir (struct inode * inode)
+static int empty_dir(struct inode *inode)
 {
 	unsigned long offset;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de, * de1;
-	struct super_block * sb;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *de1;
+	struct super_block *sb;
 	int err = 0;
 
 	sb = inode->i_sb;
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
-	    !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
+	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
 			ext4_error(inode->i_sb, __func__,
 				   "error %d reading directory #%lu offset 0",
@@ -1881,23 +1883,23 @@
 	de1 = ext4_next_entry(de);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
-			strcmp (".", de->name) ||
-			strcmp ("..", de1->name)) {
-		ext4_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no `.' or `..'",
-			      inode->i_ino);
-		brelse (bh);
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+		ext4_warning(inode->i_sb, "empty_dir",
+			     "bad directory (dir #%lu) - no `.' or `..'",
+			     inode->i_ino);
+		brelse(bh);
 		return 1;
 	}
 	offset = ext4_rec_len_from_disk(de->rec_len) +
 		 ext4_rec_len_from_disk(de1->rec_len);
 	de = ext4_next_entry(de1);
-	while (offset < inode->i_size ) {
+	while (offset < inode->i_size) {
 		if (!bh ||
 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
 			err = 0;
-			brelse (bh);
-			bh = ext4_bread (NULL, inode,
+			brelse(bh);
+			bh = ext4_bread(NULL, inode,
 				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
 			if (!bh) {
 				if (err)
@@ -1917,13 +1919,13 @@
 			continue;
 		}
 		if (le32_to_cpu(de->inode)) {
-			brelse (bh);
+			brelse(bh);
 			return 0;
 		}
 		offset += ext4_rec_len_from_disk(de->rec_len);
 		de = ext4_next_entry(de);
 	}
-	brelse (bh);
+	brelse(bh);
 	return 1;
 }
 
@@ -1954,8 +1956,8 @@
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
 	 */
-	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2069,12 +2071,12 @@
 	goto out_err;
 }
 
-static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go in
@@ -2085,7 +2087,7 @@
 		return PTR_ERR(handle);
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_rmdir;
 
@@ -2099,16 +2101,16 @@
 		goto end_rmdir;
 
 	retval = -ENOTEMPTY;
-	if (!empty_dir (inode))
+	if (!empty_dir(inode))
 		goto end_rmdir;
 
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_rmdir;
 	if (!EXT4_DIR_LINK_EMPTY(inode))
-		ext4_warning (inode->i_sb, "ext4_rmdir",
-			      "empty directory has too many links (%d)",
-			      inode->i_nlink);
+		ext4_warning(inode->i_sb, "ext4_rmdir",
+			     "empty directory has too many links (%d)",
+			     inode->i_nlink);
 	inode->i_version++;
 	clear_nlink(inode);
 	/* There's no need to set i_disksize: the fact that i_nlink is
@@ -2124,16 +2126,16 @@
 
 end_rmdir:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go
@@ -2147,7 +2149,7 @@
 		handle->h_sync = 1;
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_unlink;
 
@@ -2158,9 +2160,9 @@
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
-		ext4_warning (inode->i_sb, "ext4_unlink",
-			      "Deleting nonexistent file (%lu), %d",
-			      inode->i_ino, inode->i_nlink);
+		ext4_warning(inode->i_sb, "ext4_unlink",
+			     "Deleting nonexistent file (%lu), %d",
+			     inode->i_ino, inode->i_nlink);
 		inode->i_nlink = 1;
 	}
 	retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2178,15 +2180,15 @@
 
 end_unlink:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_symlink (struct inode * dir,
-		struct dentry *dentry, const char * symname)
+static int ext4_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int l, err, retries = 0;
 
 	l = strlen(symname)+1;
@@ -2203,12 +2205,12 @@
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
 
-	if (l > sizeof (EXT4_I(inode)->i_data)) {
+	if (l > sizeof(EXT4_I(inode)->i_data)) {
 		inode->i_op = &ext4_symlink_inode_operations;
 		ext4_set_aops(inode);
 		/*
@@ -2221,14 +2223,14 @@
 		if (err) {
 			clear_nlink(inode);
 			ext4_mark_inode_dirty(handle, inode);
-			iput (inode);
+			iput(inode);
 			goto out_stop;
 		}
 	} else {
 		/* clear the extent format for fast symlink */
 		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
 		inode->i_op = &ext4_fast_symlink_inode_operations;
-		memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
 	}
 	EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2240,8 +2242,8 @@
 	return err;
 }
 
-static int ext4_link (struct dentry * old_dentry,
-		struct inode * dir, struct dentry *dentry)
+static int ext4_link(struct dentry *old_dentry,
+		     struct inode *dir, struct dentry *dentry)
 {
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
@@ -2284,13 +2286,13 @@
  * Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
-			   struct inode * new_dir,struct dentry *new_dentry)
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	handle_t *handle;
-	struct inode * old_inode, * new_inode;
-	struct buffer_head * old_bh, * new_bh, * dir_bh;
-	struct ext4_dir_entry_2 * old_de, * new_de;
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh, *dir_bh;
+	struct ext4_dir_entry_2 *old_de, *new_de;
 	int retval;
 
 	old_bh = new_bh = dir_bh = NULL;
@@ -2308,7 +2310,7 @@
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		handle->h_sync = 1;
 
-	old_bh = ext4_find_entry (old_dentry, &old_de);
+	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
 	 *  We might rmdir the source, keep it as pwd of some process
@@ -2321,32 +2323,32 @@
 		goto end_rename;
 
 	new_inode = new_dentry->d_inode;
-	new_bh = ext4_find_entry (new_dentry, &new_de);
+	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
 	if (new_bh) {
 		if (!new_inode) {
-			brelse (new_bh);
+			brelse(new_bh);
 			new_bh = NULL;
 		}
 	}
 	if (S_ISDIR(old_inode->i_mode)) {
 		if (new_inode) {
 			retval = -ENOTEMPTY;
-			if (!empty_dir (new_inode))
+			if (!empty_dir(new_inode))
 				goto end_rename;
 		}
 		retval = -EIO;
-		dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
+		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
 		if (!dir_bh)
 			goto end_rename;
 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
 			goto end_rename;
 		retval = -EMLINK;
-		if (!new_inode && new_dir!=old_dir &&
+		if (!new_inode && new_dir != old_dir &&
 				new_dir->i_nlink >= EXT4_LINK_MAX)
 			goto end_rename;
 	}
 	if (!new_bh) {
-		retval = ext4_add_entry (handle, new_dentry, old_inode);
+		retval = ext4_add_entry(handle, new_dentry, old_inode);
 		if (retval)
 			goto end_rename;
 	} else {
@@ -2388,7 +2390,7 @@
 		struct buffer_head *old_bh2;
 		struct ext4_dir_entry_2 *old_de2;
 
-		old_bh2 = ext4_find_entry(old_dentry, &old_de2);
+		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
 		if (old_bh2) {
 			retval = ext4_delete_entry(handle, old_dir,
 						   old_de2, old_bh2);
@@ -2433,9 +2435,9 @@
 	retval = 0;
 
 end_rename:
-	brelse (dir_bh);
-	brelse (old_bh);
-	brelse (new_bh);
+	brelse(dir_bh);
+	brelse(old_bh);
+	brelse(new_bh);
 	ext4_journal_stop(handle);
 	return retval;
 }
@@ -2454,7 +2456,7 @@
 	.mknod		= ext4_mknod,
 	.rename		= ext4_rename,
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -2465,7 +2467,7 @@
 
 const struct inode_operations ext4_special_inode_operations = {
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b3d3560..b6ec184 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -416,8 +416,8 @@
 		       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
 		       gdb_num);
 
-        /*
-         * If we are not using the primary superblock/GDT copy don't resize,
+	/*
+	 * If we are not using the primary superblock/GDT copy don't resize,
          * because the user tools have no way of handling this.  Probably a
          * bad time to do it anyways.
          */
@@ -870,11 +870,10 @@
 	 * We can allocate memory for mb_alloc based on the new group
 	 * descriptor
 	 */
-	if (test_opt(sb, MBALLOC)) {
-		err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
-		if (err)
-			goto exit_journal;
-	}
+	err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+	if (err)
+		goto exit_journal;
+
 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
 	 * increasing the group count so that once the group is enabled,
@@ -929,6 +928,15 @@
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, input->group);
+		sbi->s_flex_groups[flex_group].free_blocks +=
+			input->free_blocks_count;
+		sbi->s_flex_groups[flex_group].free_inodes +=
+			EXT4_INODES_PER_GROUP(sb);
+	}
+
 	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
 	sb->s_dirt = 1;
 
@@ -964,7 +972,7 @@
 	ext4_group_t o_groups_count;
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	handle_t *handle;
 	int err;
 	unsigned long freed_blocks;
@@ -1077,8 +1085,15 @@
 	/*
 	 * Mark mballoc pages as not up to date so that they will be updated
 	 * next time they are loaded by ext4_mb_load_buddy.
+	 *
+	 * XXX Bad, Bad, BAD!!!  We should not be overloading the
+	 * Uptodate flag, particularly on thte bitmap bh, as way of
+	 * hinting to ext4_mb_load_buddy() that it needs to be
+	 * overloaded.  A user could take a LVM snapshot, then do an
+	 * on-line fsck, and clear the uptodate flag, and this would
+	 * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
 	 */
-	if (test_opt(sb, MBALLOC)) {
+	{
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		struct inode *inode = sbi->s_buddy_cache;
 		int blocks_per_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 566344b..0e661c5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,8 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -45,6 +47,8 @@
 #include "namei.h"
 #include "group.h"
 
+struct proc_dir_entry *ext4_proc_root;
+
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
 static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
@@ -508,10 +512,12 @@
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
-		mark_buffer_dirty(sbi->s_sbh);
 		ext4_commit_super(sb, es, 1);
 	}
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -520,6 +526,7 @@
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -562,11 +569,10 @@
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
-	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	ei->vfs_inode.i_data.writeback_index = 0;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
@@ -599,7 +605,7 @@
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
 	INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	init_rwsem(&ei->xattr_sem);
 #endif
 	init_rwsem(&ei->i_data_sem);
@@ -625,8 +631,7 @@
 
 static void ext4_clear_inode(struct inode *inode)
 {
-	struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (EXT4_I(inode)->i_acl &&
 			EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
 		posix_acl_release(EXT4_I(inode)->i_acl);
@@ -638,10 +643,7 @@
 		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
 	}
 #endif
-	ext4_discard_reservation(inode);
-	EXT4_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
+	ext4_discard_preallocations(inode);
 	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
 				       &EXT4_I(inode)->jinode);
 }
@@ -654,7 +656,7 @@
 
 	if (sbi->s_jquota_fmt)
 		seq_printf(seq, ",jqfmt=%s",
-		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
 
 	if (sbi->s_qf_names[USRQUOTA])
 		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -718,7 +720,7 @@
 		seq_puts(seq, ",debug");
 	if (test_opt(sb, OLDALLOC))
 		seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (test_opt(sb, XATTR_USER) &&
 		!(def_mount_opts & EXT4_DEFM_XATTR_USER))
 		seq_puts(seq, ",user_xattr");
@@ -727,7 +729,7 @@
 		seq_puts(seq, ",nouser_xattr");
 	}
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",acl");
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -752,8 +754,6 @@
 		seq_puts(seq, ",nobh");
 	if (!test_opt(sb, EXTENTS))
 		seq_puts(seq, ",noextents");
-	if (!test_opt(sb, MBALLOC))
-		seq_puts(seq, ",nomballoc");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 	if (!test_opt(sb, DELALLOC))
@@ -773,6 +773,10 @@
 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		seq_printf(seq, ",inode_readahead_blks=%u",
+			   sbi->s_inode_readahead_blks);
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -822,7 +826,7 @@
 }
 
 #ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group")
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
 static int ext4_dquot_initialize(struct inode *inode, int type);
@@ -907,6 +911,7 @@
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_inode_readahead_blks
 };
 
 static match_table_t tokens = {
@@ -967,6 +972,7 @@
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_err, NULL},
 };
 
@@ -981,7 +987,7 @@
 	/*todo: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
-		printk("EXT4-fs: Invalid sb specification: %s\n",
+		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
 		       (char *) *data);
 		return 1;
 	}
@@ -1072,7 +1078,7 @@
 		case Opt_orlov:
 			clear_opt(sbi->s_mount_opt, OLDALLOC);
 			break;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		case Opt_user_xattr:
 			set_opt(sbi->s_mount_opt, XATTR_USER);
 			break;
@@ -1082,10 +1088,11 @@
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk("EXT4 (no)user_xattr options not supported\n");
+			printk(KERN_ERR "EXT4 (no)user_xattr options "
+			       "not supported\n");
 			break;
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 		case Opt_acl:
 			set_opt(sbi->s_mount_opt, POSIX_ACL);
 			break;
@@ -1095,7 +1102,8 @@
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk("EXT4 (no)acl options not supported\n");
+			printk(KERN_ERR "EXT4 (no)acl options "
+			       "not supported\n");
 			break;
 #endif
 		case Opt_reservation:
@@ -1189,8 +1197,8 @@
 			     sb_any_quota_suspended(sb)) &&
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
-					"EXT4-fs: Cannot change journaled "
-					"quota options when quota turned on.\n");
+				       "EXT4-fs: Cannot change journaled "
+				       "quota options when quota turned on.\n");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
@@ -1357,12 +1365,6 @@
 		case Opt_nodelalloc:
 			clear_opt(sbi->s_mount_opt, DELALLOC);
 			break;
-		case Opt_mballoc:
-			set_opt(sbi->s_mount_opt, MBALLOC);
-			break;
-		case Opt_nomballoc:
-			clear_opt(sbi->s_mount_opt, MBALLOC);
-			break;
 		case Opt_stripe:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -1373,6 +1375,13 @@
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_inode_readahead_blks:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > (1 << 30))
+				return 0;
+			sbi->s_inode_readahead_blks = option;
+			break;
 		default:
 			printk(KERN_ERR
 			       "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1473,15 +1482,9 @@
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
-	if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
-		char b[BDEVNAME_SIZE];
-
-		printk("external journal on %s\n",
-			bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
-	} else {
-		printk("internal journal\n");
-	}
+	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+	       "external", EXT4_SB(sb)->s_journal->j_devname);
 	return res;
 }
 
@@ -1504,8 +1507,11 @@
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
 	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
 
-	flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
-		groups_per_flex;
+	/* We allocate both existing and potentially added groups */
+	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
+			    ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
+			      EXT4_DESC_PER_BLOCK_BITS(sb))) /
+			   groups_per_flex;
 	sbi->s_flex_groups = kzalloc(flex_group_count *
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
@@ -1584,7 +1590,7 @@
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
 
-	ext4_debug ("Checking group descriptors");
+	ext4_debug("Checking group descriptors");
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1623,8 +1629,10 @@
 			       "Checksum for group %lu failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
-			if (!(sb->s_flags & MS_RDONLY))
+			if (!(sb->s_flags & MS_RDONLY)) {
+				spin_unlock(sb_bgl_lock(sbi, i));
 				return 0;
+			}
 		}
 		spin_unlock(sb_bgl_lock(sbi, i));
 		if (!flexbg_flag)
@@ -1714,9 +1722,9 @@
 		DQUOT_INIT(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
-				"%s: truncating inode %lu to %Ld bytes\n",
+				"%s: truncating inode %lu to %lld bytes\n",
 				__func__, inode->i_ino, inode->i_size);
-			jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext4_truncate(inode);
 			nr_truncates++;
@@ -1914,6 +1922,7 @@
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
+	char *cp;
 	int ret = -EINVAL;
 	int blocksize;
 	int db_count;
@@ -1930,10 +1939,15 @@
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT4_DEF_RESUID;
 	sbi->s_resgid = EXT4_DEF_RESGID;
+	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
 
 	unlock_kernel();
 
+	/* Cleanup superblock name */
+	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+		*cp = '!';
+
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
@@ -1973,11 +1987,11 @@
 		set_opt(sbi->s_mount_opt, GRPID);
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
 		set_opt(sbi->s_mount_opt, XATTR_USER);
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (def_mount_opts & EXT4_DEFM_ACL)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
@@ -2012,11 +2026,6 @@
 		ext4_warning(sb, __func__,
 			"extents feature not enabled on this filesystem, "
 			"use tune2fs.\n");
-	/*
-	 * turn on mballoc code by default in ext4 filesystem
-	 * Use -o nomballoc to turn it off
-	 */
-	set_opt(sbi->s_mount_opt, MBALLOC);
 
 	/*
 	 * enable delayed allocation by default
@@ -2041,16 +2050,6 @@
 		       "running e2fsck is recommended\n");
 
 	/*
-	 * Since ext4 is still considered development code, we require
-	 * that the TEST_FILESYS flag in s->flags be set.
-	 */
-	if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
-		printk(KERN_WARNING "EXT4-fs: %s: not marked "
-		       "OK to use with test code.\n", sb->s_id);
-		goto failed_mount;
-	}
-
-	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
@@ -2219,6 +2218,14 @@
 		goto failed_mount;
 	}
 
+	if (ext4_proc_root)
+		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+	if (sbi->s_proc)
+		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
+				 &ext4_ui_proc_fops,
+				 &sbi->s_inode_readahead_blks);
+
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -2257,24 +2264,14 @@
 		err = percpu_counter_init(&sbi->s_dirs_counter,
 				ext4_count_dirs(sb));
 	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	}
 	if (err) {
 		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
 		goto failed_mount3;
 	}
 
-	/* per fileystem reservation list head & lock */
-	spin_lock_init(&sbi->s_rsv_window_lock);
-	sbi->s_rsv_window_root = RB_ROOT;
-	/* Add a single, static dummy reservation to the start of the
-	 * reservation window list --- it gives us a placeholder for
-	 * append-at-start-of-list which makes the allocation logic
-	 * _much_ simpler. */
-	sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
-	sbi->s_rsv_window_head.rsv_goal_size = 0;
-	ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 
 	/*
@@ -2471,7 +2468,12 @@
 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
 
 	ext4_ext_init(sb);
-	ext4_mb_init(sb, needs_recovery);
+	err = ext4_mb_init(sb, needs_recovery);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+		       err);
+		goto failed_mount4;
+	}
 
 	lock_kernel();
 	return 0;
@@ -2489,11 +2491,16 @@
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
 failed_mount:
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -2552,7 +2559,7 @@
 		return NULL;
 	}
 
-	jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	if (!S_ISREG(journal_inode->i_mode)) {
 		printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -2715,6 +2722,11 @@
 			return -EINVAL;
 	}
 
+	if (journal->j_flags & JBD2_BARRIER)
+		printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+	else
+		printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = jbd2_journal_update_format(journal);
 		if (err)  {
@@ -2799,13 +2811,34 @@
 
 	if (!sbh)
 		return;
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "ext4: previous I/O error to "
+		       "superblock detected for %s.\n", sb->s_id);
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
 	es->s_wtime = cpu_to_le32(get_seconds());
 	ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
-	if (sync)
+	if (sync) {
 		sync_dirty_buffer(sbh);
+		if (buffer_write_io_error(sbh)) {
+			printk(KERN_ERR "ext4: I/O error while writing "
+			       "superblock for %s.\n", sb->s_id);
+			clear_buffer_write_io_error(sbh);
+			set_buffer_uptodate(sbh);
+		}
+	}
 }
 
 
@@ -2907,6 +2940,7 @@
 {
 	tid_t target;
 
+	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
 	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
 		if (wait)
@@ -3162,7 +3196,8 @@
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
 	ext4_free_blocks_count_set(es, buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
@@ -3432,7 +3467,7 @@
 	handle_t *handle = journal_current_handle();
 
 	if (!handle) {
-		printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)"
+		printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
 			" cancelled because transaction is not started.\n",
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
@@ -3493,18 +3528,82 @@
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
 }
 
-static struct file_system_type ext4dev_fs_type = {
+#ifdef CONFIG_PROC_FS
+static int ext4_ui_proc_show(struct seq_file *m, void *v)
+{
+	unsigned int *p = m->private;
+
+	seq_printf(m, "%u\n", *p);
+	return 0;
+}
+
+static int ext4_ui_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
+}
+
+static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
+			       size_t cnt, loff_t *ppos)
+{
+	unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+	char str[32];
+	unsigned long value;
+
+	if (cnt >= sizeof(str))
+		return -EINVAL;
+	if (copy_from_user(str, buf, cnt))
+		return -EFAULT;
+	value = simple_strtol(str, NULL, 0);
+	if (value < 0)
+		return -ERANGE;
+	*p = value;
+	return cnt;
+}
+
+const struct file_operations ext4_ui_proc_fops = {
 	.owner		= THIS_MODULE,
-	.name		= "ext4dev",
+	.open		= ext4_ui_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= ext4_ui_proc_write,
+};
+#endif
+
+static struct file_system_type ext4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext4",
 	.get_sb		= ext4_get_sb,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EXT4DEV_COMPAT
+static int ext4dev_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
+	       "to mount using ext4\n");
+	printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
+	       "will go away by 2.6.31\n");
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+}
+
+static struct file_system_type ext4dev_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext4dev",
+	.get_sb		= ext4dev_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS("ext4dev");
+#endif
+
 static int __init init_ext4_fs(void)
 {
 	int err;
 
+	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 	err = init_ext4_mballoc();
 	if (err)
 		return err;
@@ -3515,9 +3614,16 @@
 	err = init_inodecache();
 	if (err)
 		goto out1;
-	err = register_filesystem(&ext4dev_fs_type);
+	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
+#ifdef CONFIG_EXT4DEV_COMPAT
+	err = register_filesystem(&ext4dev_fs_type);
+	if (err) {
+		unregister_filesystem(&ext4_fs_type);
+		goto out;
+	}
+#endif
 	return 0;
 out:
 	destroy_inodecache();
@@ -3530,10 +3636,14 @@
 
 static void __exit exit_ext4_fs(void)
 {
+	unregister_filesystem(&ext4_fs_type);
+#ifdef CONFIG_EXT4DEV_COMPAT
 	unregister_filesystem(&ext4dev_fs_type);
+#endif
 	destroy_inodecache();
 	exit_ext4_xattr();
 	exit_ext4_mballoc();
+	remove_proc_entry("fs/ext4", NULL);
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e917864..00740cb 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
-	nd_set_link(nd, (char*)ei->i_data);
+	nd_set_link(nd, (char *) ei->i_data);
 	return NULL;
 }
 
@@ -34,7 +34,7 @@
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -45,7 +45,7 @@
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8954208..80626d5 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@
 
 static struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler,
 	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
 #endif
 	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 	[EXT4_XATTR_INDEX_SECURITY]	     = &ext4_xattr_security_handler,
 #endif
 };
@@ -112,11 +112,11 @@
 struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	&ext4_xattr_acl_access_handler,
 	&ext4_xattr_acl_default_handler,
 #endif
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 	&ext4_xattr_security_handler,
 #endif
 	NULL
@@ -959,6 +959,7 @@
 	struct ext4_xattr_block_find bs = {
 		.s = { .not_found = -ENODATA, },
 	};
+	unsigned long no_expand;
 	int error;
 
 	if (!name)
@@ -966,6 +967,9 @@
 	if (strlen(name) > 255)
 		return -ERANGE;
 	down_write(&EXT4_I(inode)->xattr_sem);
+	no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
+	EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+
 	error = ext4_get_inode_loc(inode, &is.iloc);
 	if (error)
 		goto cleanup;
@@ -1042,6 +1046,8 @@
 cleanup:
 	brelse(is.iloc.bh);
 	brelse(bs.bh);
+	if (no_expand == 0)
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
 	up_write(&EXT4_I(inode)->xattr_sem);
 	return error;
 }
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe9..8ede88b 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@
 	(((name_len) + EXT4_XATTR_ROUND + \
 	sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
 #define EXT4_XATTR_NEXT(entry) \
-	( (struct ext4_xattr_entry *)( \
-	  (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
+	((struct ext4_xattr_entry *)( \
+	 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
 #define EXT4_XATTR_SIZE(size) \
 	(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
 
@@ -63,7 +63,7 @@
 		EXT4_I(inode)->i_extra_isize))
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
-# ifdef CONFIG_EXT4DEV_FS_XATTR
+# ifdef CONFIG_EXT4_FS_XATTR
 
 extern struct xattr_handler ext4_xattr_user_handler;
 extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@
 
 extern struct xattr_handler *ext4_xattr_handlers[];
 
-# else  /* CONFIG_EXT4DEV_FS_XATTR */
+# else  /* CONFIG_EXT4_FS_XATTR */
 
 static inline int
 ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@
 
 #define ext4_xattr_handlers	NULL
 
-# endif  /* CONFIG_EXT4DEV_FS_XATTR */
+# endif  /* CONFIG_EXT4_FS_XATTR */
 
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
 				struct inode *dir);
 #else
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3..33a6b7e 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,9 +13,14 @@
 #include <linux/security.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
 
 #include <asm/ioctls.h>
 
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS	(UINT_MAX / sizeof(struct fiemap_extent))
+
 /**
  * vfs_ioctl - call filesystem specific ioctl methods
  * @filp:	open file to invoke ioctl method on
@@ -71,6 +76,272 @@
 	return put_user(res, p);
 }
 
+/**
+ * fiemap_fill_next_extent - Fiemap helper function
+ * @fieinfo:	Fiemap context passed into ->fiemap
+ * @logical:	Extent logical start offset, in bytes
+ * @phys:	Extent physical start offset, in bytes
+ * @len:	Extent length, in bytes
+ * @flags:	FIEMAP_EXTENT flags that describe this extent
+ *
+ * Called from file system ->fiemap callback. Will populate extent
+ * info as passed in via arguments and copy to user memory. On
+ * success, extent count on fieinfo is incremented.
+ *
+ * Returns 0 on success, -errno on error, 1 if this was the last
+ * extent that will fit in user array.
+ */
+#define SET_UNKNOWN_FLAGS	(FIEMAP_EXTENT_DELALLOC)
+#define SET_NO_UNMOUNTED_IO_FLAGS	(FIEMAP_EXTENT_DATA_ENCRYPTED)
+#define SET_NOT_ALIGNED_FLAGS	(FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
+int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
+			    u64 phys, u64 len, u32 flags)
+{
+	struct fiemap_extent extent;
+	struct fiemap_extent *dest = fieinfo->fi_extents_start;
+
+	/* only count the extents */
+	if (fieinfo->fi_extents_max == 0) {
+		fieinfo->fi_extents_mapped++;
+		return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+	}
+
+	if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
+		return 1;
+
+	if (flags & SET_UNKNOWN_FLAGS)
+		flags |= FIEMAP_EXTENT_UNKNOWN;
+	if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
+		flags |= FIEMAP_EXTENT_ENCODED;
+	if (flags & SET_NOT_ALIGNED_FLAGS)
+		flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+	memset(&extent, 0, sizeof(extent));
+	extent.fe_logical = logical;
+	extent.fe_physical = phys;
+	extent.fe_length = len;
+	extent.fe_flags = flags;
+
+	dest += fieinfo->fi_extents_mapped;
+	if (copy_to_user(dest, &extent, sizeof(extent)))
+		return -EFAULT;
+
+	fieinfo->fi_extents_mapped++;
+	if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
+		return 1;
+	return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+}
+EXPORT_SYMBOL(fiemap_fill_next_extent);
+
+/**
+ * fiemap_check_flags - check validity of requested flags for fiemap
+ * @fieinfo:	Fiemap context passed into ->fiemap
+ * @fs_flags:	Set of fiemap flags that the file system understands
+ *
+ * Called from file system ->fiemap callback. This will compute the
+ * intersection of valid fiemap flags and those that the fs supports. That
+ * value is then compared against the user supplied flags. In case of bad user
+ * flags, the invalid values will be written into the fieinfo structure, and
+ * -EBADR is returned, which tells ioctl_fiemap() to return those values to
+ * userspace. For this reason, a return code of -EBADR should be preserved.
+ *
+ * Returns 0 on success, -EBADR on bad flags.
+ */
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
+{
+	u32 incompat_flags;
+
+	incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
+	if (incompat_flags) {
+		fieinfo->fi_flags = incompat_flags;
+		return -EBADR;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(fiemap_check_flags);
+
+static int fiemap_check_ranges(struct super_block *sb,
+			       u64 start, u64 len, u64 *new_len)
+{
+	*new_len = len;
+
+	if (len == 0)
+		return -EINVAL;
+
+	if (start > sb->s_maxbytes)
+		return -EFBIG;
+
+	/*
+	 * Shrink request scope to what the fs can actually handle.
+	 */
+	if ((len > sb->s_maxbytes) ||
+	    (sb->s_maxbytes - len) < start)
+		*new_len = sb->s_maxbytes - start;
+
+	return 0;
+}
+
+static int ioctl_fiemap(struct file *filp, unsigned long arg)
+{
+	struct fiemap fiemap;
+	struct fiemap_extent_info fieinfo = { 0, };
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	u64 len;
+	int error;
+
+	if (!inode->i_op->fiemap)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
+			   sizeof(struct fiemap)))
+		return -EFAULT;
+
+	if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+		return -EINVAL;
+
+	error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+				    &len);
+	if (error)
+		return error;
+
+	fieinfo.fi_flags = fiemap.fm_flags;
+	fieinfo.fi_extents_max = fiemap.fm_extent_count;
+	fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
+
+	if (fiemap.fm_extent_count != 0 &&
+	    !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+		return -EFAULT;
+
+	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(inode->i_mapping);
+
+	error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
+	fiemap.fm_flags = fieinfo.fi_flags;
+	fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+	if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
+		error = -EFAULT;
+
+	return error;
+}
+
+#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
+#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
+
+/*
+ * @inode - the inode to map
+ * @arg - the pointer to userspace where we copy everything to
+ * @get_block - the fs's get_block function
+ *
+ * This does FIEMAP for block based inodes.  Basically it will just loop
+ * through get_block until we hit the number of extents we want to map, or we
+ * go past the end of the file and hit a hole.
+ *
+ * If it is possible to have data blocks beyond a hole past @inode->i_size, then
+ * please do not use this function, it will stop at the first unmapped block
+ * beyond i_size
+ */
+int generic_block_fiemap(struct inode *inode,
+			 struct fiemap_extent_info *fieinfo, u64 start,
+			 u64 len, get_block_t *get_block)
+{
+	struct buffer_head tmp;
+	unsigned int start_blk;
+	long long length = 0, map_len = 0;
+	u64 logical = 0, phys = 0, size = 0;
+	u32 flags = FIEMAP_EXTENT_MERGED;
+	int ret = 0;
+
+	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
+		return ret;
+
+	start_blk = logical_to_blk(inode, start);
+
+	/* guard against change */
+	mutex_lock(&inode->i_mutex);
+
+	length = (long long)min_t(u64, len, i_size_read(inode));
+	map_len = length;
+
+	do {
+		/*
+		 * we set b_size to the total size we want so it will map as
+		 * many contiguous blocks as possible at once
+		 */
+		memset(&tmp, 0, sizeof(struct buffer_head));
+		tmp.b_size = map_len;
+
+		ret = get_block(inode, start_blk, &tmp, 0);
+		if (ret)
+			break;
+
+		/* HOLE */
+		if (!buffer_mapped(&tmp)) {
+			/*
+			 * first hole after going past the EOF, this is our
+			 * last extent
+			 */
+			if (length <= 0) {
+				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				break;
+			}
+
+			length -= blk_to_logical(inode, 1);
+
+			/* if we have holes up to/past EOF then we're done */
+			if (length <= 0)
+				break;
+
+			start_blk++;
+		} else {
+			if (length <= 0 && size) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				if (ret)
+					break;
+			}
+
+			logical = blk_to_logical(inode, start_blk);
+			phys = blk_to_logical(inode, tmp.b_blocknr);
+			size = tmp.b_size;
+			flags = FIEMAP_EXTENT_MERGED;
+
+			length -= tmp.b_size;
+			start_blk += logical_to_blk(inode, size);
+
+			/*
+			 * if we are past the EOF we need to loop again to see
+			 * if there is a hole so we can mark this extent as the
+			 * last one, and if not keep mapping things until we
+			 * find a hole, or we run out of slots in the extent
+			 * array
+			 */
+			if (length <= 0)
+				continue;
+
+			ret = fiemap_fill_next_extent(fieinfo, logical, phys,
+						      size, flags);
+			if (ret)
+				break;
+		}
+		cond_resched();
+	} while (1);
+
+	mutex_unlock(&inode->i_mutex);
+
+	/* if ret is 1 then we just hit the end of the extent array */
+	if (ret == 1)
+		ret = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL(generic_block_fiemap);
+
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
@@ -80,6 +351,8 @@
 	switch (cmd) {
 	case FIBMAP:
 		return ioctl_fibmap(filp, p);
+	case FS_IOC_FIEMAP:
+		return ioctl_fiemap(filp, arg);
 	case FIGETBSZ:
 		return put_user(inode->i_sb->s_blocksize, p);
 	case FIONREAD:
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 91389c8..42895d36 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 
@@ -126,14 +127,29 @@
 
 		/*
 		 * Test again, another process may have checkpointed while we
-		 * were waiting for the checkpoint lock
+		 * were waiting for the checkpoint lock. If there are no
+		 * outstanding transactions there is nothing to checkpoint and
+		 * we can't make progress. Abort the journal in this case.
 		 */
 		spin_lock(&journal->j_state_lock);
+		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
 		if (__jbd2_log_space_left(journal) < nblocks) {
+			int chkpt = journal->j_checkpoint_transactions != NULL;
+
+			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
-			jbd2_log_do_checkpoint(journal);
+			if (chkpt) {
+				jbd2_log_do_checkpoint(journal);
+			} else {
+				printk(KERN_ERR "%s: no transactions\n",
+				       __func__);
+				jbd2_journal_abort(journal, 0);
+			}
+
 			spin_lock(&journal->j_state_lock);
+		} else {
+			spin_unlock(&journal->j_list_lock);
 		}
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
@@ -313,6 +329,8 @@
 	 * journal straight away.
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
+	trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
+		   journal->j_devname, result);
 	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f2ad061..0d3814a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -126,8 +127,7 @@
 
 	JBUFFER_TRACE(descriptor, "submit commit block");
 	lock_buffer(bh);
-	get_bh(bh);
-	set_buffer_dirty(bh);
+	clear_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
 	bh->b_end_io = journal_end_buffer_io_sync;
 
@@ -147,12 +147,9 @@
 	 * to remember if we sent a barrier request
 	 */
 	if (ret == -EOPNOTSUPP && barrier_done) {
-		char b[BDEVNAME_SIZE];
-
 		printk(KERN_WARNING
-			"JBD: barrier-based sync failed on %s - "
-			"disabling barriers\n",
-			bdevname(journal->j_dev, b));
+		       "JBD: barrier-based sync failed on %s - "
+		       "disabling barriers\n", journal->j_devname);
 		spin_lock(&journal->j_state_lock);
 		journal->j_flags &= ~JBD2_BARRIER;
 		spin_unlock(&journal->j_state_lock);
@@ -160,7 +157,7 @@
 		/* And try again, without the barrier */
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
+		clear_buffer_dirty(bh);
 		ret = submit_bh(WRITE, bh);
 	}
 	*cbh = bh;
@@ -371,6 +368,8 @@
 	commit_transaction = journal->j_running_transaction;
 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
+	trace_mark(jbd2_start_commit, "dev %s transaction %d",
+		   journal->j_devname, commit_transaction->t_tid);
 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
@@ -681,11 +680,9 @@
 	 */
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
 	if (err) {
-		char b[BDEVNAME_SIZE];
-
 		printk(KERN_WARNING
 			"JBD2: Detected IO errors while flushing file data "
-			"on %s\n", bdevname(journal->j_fs_dev, b));
+		       "on %s\n", journal->j_devname);
 		err = 0;
 	}
 
@@ -990,6 +987,9 @@
 	}
 	spin_unlock(&journal->j_list_lock);
 
+	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
+		   journal->j_devname, commit_transaction->t_tid,
+		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8207a01..01c3901 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -597,13 +597,9 @@
 		if (ret)
 			*retp = ret;
 		else {
-			char b[BDEVNAME_SIZE];
-
 			printk(KERN_ALERT "%s: journal block not found "
 					"at offset %lu on %s\n",
-				__func__,
-				blocknr,
-				bdevname(journal->j_dev, b));
+			       __func__, blocknr, journal->j_devname);
 			err = -EIO;
 			__journal_abort_soft(journal, err);
 		}
@@ -901,10 +897,7 @@
 
 static void jbd2_stats_proc_init(journal_t *journal)
 {
-	char name[BDEVNAME_SIZE];
-
-	bdevname(journal->j_dev, name);
-	journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
+	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
 	if (journal->j_proc_entry) {
 		proc_create_data("history", S_IRUGO, journal->j_proc_entry,
 				 &jbd2_seq_history_fops, journal);
@@ -915,12 +908,9 @@
 
 static void jbd2_stats_proc_exit(journal_t *journal)
 {
-	char name[BDEVNAME_SIZE];
-
-	bdevname(journal->j_dev, name);
 	remove_proc_entry("info", journal->j_proc_entry);
 	remove_proc_entry("history", journal->j_proc_entry);
-	remove_proc_entry(name, proc_jbd2_stats);
+	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
 }
 
 static void journal_init_stats(journal_t *journal)
@@ -1018,6 +1008,7 @@
 {
 	journal_t *journal = journal_init_common();
 	struct buffer_head *bh;
+	char *p;
 	int n;
 
 	if (!journal)
@@ -1039,6 +1030,10 @@
 	journal->j_fs_dev = fs_dev;
 	journal->j_blk_offset = start;
 	journal->j_maxlen = len;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
 	jbd2_stats_proc_init(journal);
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1061,6 +1056,7 @@
 {
 	struct buffer_head *bh;
 	journal_t *journal = journal_init_common();
+	char *p;
 	int err;
 	int n;
 	unsigned long long blocknr;
@@ -1070,6 +1066,12 @@
 
 	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
 	journal->j_inode = inode;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
+	p = journal->j_devname + strlen(journal->j_devname);
+	sprintf(p, ":%lu", journal->j_inode->i_ino);
 	jbd_debug(1,
 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
 		  journal, inode->i_sb->s_id, inode->i_ino,
@@ -1253,6 +1255,22 @@
 		goto out;
 	}
 
+	if (buffer_write_io_error(bh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the journal
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "JBD2: previous I/O error detected "
+		       "for journal superblock update for %s.\n",
+		       journal->j_devname);
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+	}
+
 	spin_lock(&journal->j_state_lock);
 	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1264,9 +1282,16 @@
 
 	BUFFER_TRACE(bh, "marking dirty");
 	mark_buffer_dirty(bh);
-	if (wait)
+	if (wait) {
 		sync_dirty_buffer(bh);
-	else
+		if (buffer_write_io_error(bh)) {
+			printk(KERN_ERR "JBD2: I/O error detected "
+			       "when updating journal superblock for %s.\n",
+			       journal->j_devname);
+			clear_buffer_write_io_error(bh);
+			set_buffer_uptodate(bh);
+		}
+	} else
 		ll_rw_block(SWRITE, 1, &bh);
 
 out:
@@ -1761,23 +1786,6 @@
 }
 
 /*
- * journal_dev_name: format a character string to describe on what
- * device this journal is present.
- */
-
-static const char *journal_dev_name(journal_t *journal, char *buffer)
-{
-	struct block_device *bdev;
-
-	if (journal->j_inode)
-		bdev = journal->j_inode->i_sb->s_bdev;
-	else
-		bdev = journal->j_dev;
-
-	return bdevname(bdev, buffer);
-}
-
-/*
  * Journal abort has very specific semantics, which we describe
  * for journal abort.
  *
@@ -1793,13 +1801,12 @@
 void __jbd2_journal_abort_hard(journal_t *journal)
 {
 	transaction_t *transaction;
-	char b[BDEVNAME_SIZE];
 
 	if (journal->j_flags & JBD2_ABORT)
 		return;
 
 	printk(KERN_ERR "Aborting journal on device %s.\n",
-		journal_dev_name(journal, b));
+	       journal->j_devname);
 
 	spin_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_ABORT;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb46..29ff57e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -990,15 +990,6 @@
 }
 
 /*
- * This is only valid for leaf nodes, which are the only ones that can
- * have empty extents anyway.
- */
-static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
-{
-	return !rec->e_leaf_clusters;
-}
-
-/*
  * This function will discard the rightmost extent record.
  */
 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94b..60cd3d5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -146,4 +146,13 @@
 		return le16_to_cpu(rec->e_leaf_clusters);
 }
 
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+	return !rec->e_leaf_clusters;
+}
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a..aed268e 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/fiemap.h>
 
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
@@ -32,6 +33,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
@@ -282,6 +284,51 @@
 		kfree(new_emi);
 }
 
+static int ocfs2_last_eb_is_empty(struct inode *inode,
+				  struct ocfs2_dinode *di)
+{
+	int ret, next_free;
+	u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
+			       &eb_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+	el = &eb->h_list;
+
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ret = -EROFS;
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		goto out;
+	}
+
+	if (el->l_tree_depth) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %lu has non zero tree depth in "
+			    "leaf block %llu\n", inode->i_ino,
+			    (unsigned long long)eb_bh->b_blocknr);
+		ret = -EROFS;
+		goto out;
+	}
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
+		ret = 1;
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
 /*
  * Return the 1st index within el which contains an extent start
  * larger than v_cluster.
@@ -373,42 +420,28 @@
 	return ret;
 }
 
-int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
-		       u32 *p_cluster, u32 *num_clusters,
-		       unsigned int *extent_flags)
+static int ocfs2_get_clusters_nocache(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 v_cluster, unsigned int *hole_len,
+				      struct ocfs2_extent_rec *ret_rec,
+				      unsigned int *is_last)
 {
-	int ret, i;
-	unsigned int flags = 0;
-	struct buffer_head *di_bh = NULL;
-	struct buffer_head *eb_bh = NULL;
+	int i, ret, tree_height, len;
 	struct ocfs2_dinode *di;
-	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *uninitialized_var(eb);
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
-	u32 coff;
+	struct buffer_head *eb_bh = NULL;
 
-	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = -ERANGE;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
-				      num_clusters, extent_flags);
-	if (ret == 0)
-		goto out;
-
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	memset(ret_rec, 0, sizeof(*ret_rec));
+	if (is_last)
+		*is_last = 0;
 
 	di = (struct ocfs2_dinode *) di_bh->b_data;
 	el = &di->id2.i_list;
+	tree_height = le16_to_cpu(el->l_tree_depth);
 
-	if (el->l_tree_depth) {
+	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
 		if (ret) {
 			mlog_errno(ret);
@@ -431,46 +464,143 @@
 	i = ocfs2_search_extent_list(el, v_cluster);
 	if (i == -1) {
 		/*
+		 * Holes can be larger than the maximum size of an
+		 * extent, so we return their lengths in a seperate
+		 * field.
+		 */
+		if (hole_len) {
+			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+							 v_cluster, &len);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			*hole_len = len;
+		}
+		goto out_hole;
+	}
+
+	rec = &el->l_recs[i];
+
+	BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+	if (!rec->e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0)", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*ret_rec = *rec;
+
+	/*
+	 * Checking for last extent is potentially expensive - we
+	 * might have to look at the next leaf over to see if it's
+	 * empty.
+	 *
+	 * The first two checks are to see whether the caller even
+	 * cares for this information, and if the extent is at least
+	 * the last in it's list.
+	 *
+	 * If those hold true, then the extent is last if any of the
+	 * additional conditions hold true:
+	 *  - Extent list is in-inode
+	 *  - Extent list is right-most
+	 *  - Extent list is 2nd to rightmost, with empty right-most
+	 */
+	if (is_last) {
+		if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
+			if (tree_height == 0)
+				*is_last = 1;
+			else if (eb->h_blkno == di->i_last_eb_blk)
+				*is_last = 1;
+			else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
+				ret = ocfs2_last_eb_is_empty(inode, di);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+				if (ret == 1)
+					*is_last = 1;
+			}
+		}
+	}
+
+out_hole:
+	ret = 0;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+static void ocfs2_relative_extent_offsets(struct super_block *sb,
+					  u32 v_cluster,
+					  struct ocfs2_extent_rec *rec,
+					  u32 *p_cluster, u32 *num_clusters)
+
+{
+	u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
+
+	*p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
+	*p_cluster = *p_cluster + coff;
+
+	if (num_clusters)
+		*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
+}
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+		       u32 *p_cluster, u32 *num_clusters,
+		       unsigned int *extent_flags)
+{
+	int ret;
+	unsigned int uninitialized_var(hole_len), flags = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = -ERANGE;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+				      num_clusters, extent_flags);
+	if (ret == 0)
+		goto out;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
+					 &rec, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (rec.e_blkno == 0ULL) {
+		/*
 		 * A hole was found. Return some canned values that
 		 * callers can key on. If asked for, num_clusters will
 		 * be populated with the size of the hole.
 		 */
 		*p_cluster = 0;
 		if (num_clusters) {
-			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
-							 v_cluster,
-							 num_clusters);
-			if (ret) {
-				mlog_errno(ret);
-				goto out;
-			}
+			*num_clusters = hole_len;
 		}
 	} else {
-		rec = &el->l_recs[i];
+		ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
+					      p_cluster, num_clusters);
+		flags = rec.e_flags;
 
-		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
-
-		if (!rec->e_blkno) {
-			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-				    "record (%u, %u, 0)", inode->i_ino,
-				    le32_to_cpu(rec->e_cpos),
-				    ocfs2_rec_clusters(el, rec));
-			ret = -EROFS;
-			goto out;
-		}
-
-		coff = v_cluster - le32_to_cpu(rec->e_cpos);
-
-		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
-						    le64_to_cpu(rec->e_blkno));
-		*p_cluster = *p_cluster + coff;
-
-		if (num_clusters)
-			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
-
-		flags = rec->e_flags;
-
-		ocfs2_extent_map_insert_rec(inode, rec);
+		ocfs2_extent_map_insert_rec(inode, &rec);
 	}
 
 	if (extent_flags)
@@ -478,7 +608,6 @@
 
 out:
 	brelse(di_bh);
-	brelse(eb_bh);
 	return ret;
 }
 
@@ -521,3 +650,114 @@
 out:
 	return ret;
 }
+
+static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
+			       struct fiemap_extent_info *fieinfo,
+			       u64 map_start)
+{
+	int ret;
+	unsigned int id_count;
+	struct ocfs2_dinode *di;
+	u64 phys;
+	u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	id_count = le16_to_cpu(di->id2.i_data.id_count);
+
+	if (map_start < id_count) {
+		phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
+		phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+
+		ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
+					      flags);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+#define OCFS2_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len)
+{
+	int ret, is_last;
+	u32 mapping_end, cpos;
+	unsigned int hole_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u64 len_bytes, phys_bytes, virt_bytes;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * Handle inline-data separately.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+		goto out_unlock;
+	}
+
+	cpos = map_start >> osb->s_clustersize_bits;
+	mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+					       map_start + map_len);
+	mapping_end -= cpos;
+	is_last = 0;
+	while (cpos < mapping_end && !is_last) {
+		u32 fe_flags;
+
+		ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+						 &hole_size, &rec, &is_last);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (rec.e_blkno == 0ULL) {
+			cpos += hole_size;
+			continue;
+		}
+
+		fe_flags = 0;
+		if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
+			fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+		if (is_last)
+			fe_flags |= FIEMAP_EXTENT_LAST;
+		len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
+		phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
+		virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
+
+		ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
+					      len_bytes, fe_flags);
+		if (ret)
+			break;
+
+		cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
+	}
+
+	if (ret > 0)
+		ret = 0;
+
+out_unlock:
+	brelse(di_bh);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_inode_unlock(inode, 0);
+out:
+
+	return ret;
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e..1b97490 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,7 @@
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 				u64 *ret_count, unsigned int *extent_flags);
 
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len);
+
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15..ed38796 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2228,6 +2228,7 @@
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
 	.fallocate	= ocfs2_fallocate,
+	.fiemap		= ocfs2_fiemap,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index cb752ba..7440a0d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -385,6 +385,7 @@
 	. = ALIGN(align);						\
 	VMLINUX_SYMBOL(__per_cpu_start) = .;				\
 	.data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {		\
+		*(.data.percpu.page_aligned)				\
 		*(.data.percpu)						\
 		*(.data.percpu.shared_aligned)				\
 	}								\
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index 65590c9..d76a083 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -9,6 +9,8 @@
 #include <asm/apicdef.h>
 #include <asm/processor.h>
 #include <asm/system.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
 
 #define ARCH_APICTIMER_STOPS_ON_C3	1
 
@@ -47,8 +49,6 @@
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
-#define apic_write native_apic_write
-#define apic_read native_apic_read
 #define setup_boot_clock setup_boot_APIC_clock
 #define setup_secondary_clock setup_secondary_APIC_clock
 #endif
@@ -60,7 +60,7 @@
 extern void xapic_icr_write(u32, u32);
 extern int setup_profiling_timer(unsigned int);
 
-static inline void native_apic_write(unsigned long reg, u32 v)
+static inline void native_apic_mem_write(u32 reg, u32 v)
 {
 	volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
 
@@ -69,15 +69,68 @@
 		       ASM_OUTPUT2("0" (v), "m" (*addr)));
 }
 
-static inline u32 native_apic_read(unsigned long reg)
+static inline u32 native_apic_mem_read(u32 reg)
 {
 	return *((volatile u32 *)(APIC_BASE + reg));
 }
 
-extern void apic_wait_icr_idle(void);
-extern u32 safe_apic_wait_icr_idle(void);
+static inline void native_apic_msr_write(u32 reg, u32 v)
+{
+	if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
+	    reg == APIC_LVR)
+		return;
+
+	wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
+}
+
+static inline u32 native_apic_msr_read(u32 reg)
+{
+	u32 low, high;
+
+	if (reg == APIC_DFR)
+		return -1;
+
+	rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
+	return low;
+}
+
+#ifndef CONFIG_X86_32
+extern int x2apic, x2apic_preenabled;
+extern void check_x2apic(void);
+extern void enable_x2apic(void);
+extern void enable_IR_x2apic(void);
+extern void x2apic_icr_write(u32 low, u32 id);
+#endif
+
+struct apic_ops {
+	u32 (*read)(u32 reg);
+	void (*write)(u32 reg, u32 v);
+	u64 (*icr_read)(void);
+	void (*icr_write)(u32 low, u32 high);
+	void (*wait_icr_idle)(void);
+	u32 (*safe_wait_icr_idle)(void);
+};
+
+extern struct apic_ops *apic_ops;
+
+#define apic_read (apic_ops->read)
+#define apic_write (apic_ops->write)
+#define apic_icr_read (apic_ops->icr_read)
+#define apic_icr_write (apic_ops->icr_write)
+#define apic_wait_icr_idle (apic_ops->wait_icr_idle)
+#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle)
+
 extern int get_physical_broadcast(void);
 
+#ifdef CONFIG_X86_64
+static inline void ack_x2APIC_irq(void)
+{
+	/* Docs say use 0 for future compatibility */
+	native_apic_msr_write(APIC_EOI, 0);
+}
+#endif
+
+
 static inline void ack_APIC_irq(void)
 {
 	/*
diff --git a/include/asm-x86/apicdef.h b/include/asm-x86/apicdef.h
index c40687d..b922c85 100644
--- a/include/asm-x86/apicdef.h
+++ b/include/asm-x86/apicdef.h
@@ -105,6 +105,7 @@
 #define	APIC_TMICT	0x380
 #define	APIC_TMCCT	0x390
 #define	APIC_TDCR	0x3E0
+#define APIC_SELF_IPI	0x3F0
 #define		APIC_TDR_DIV_TMBASE	(1 << 2)
 #define		APIC_TDR_DIV_1		0xB
 #define		APIC_TDR_DIV_2		0x0
@@ -128,6 +129,8 @@
 #define	APIC_EILVT3     0x530
 
 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
+#define APIC_BASE_MSR	0x800
+#define X2APIC_ENABLE	(1UL << 10)
 
 #ifdef CONFIG_X86_32
 # define MAX_IO_APICS 64
diff --git a/include/asm-x86/arch_hooks.h b/include/asm-x86/arch_hooks.h
index 72adc3a..de4596b 100644
--- a/include/asm-x86/arch_hooks.h
+++ b/include/asm-x86/arch_hooks.h
@@ -12,8 +12,6 @@
 /* these aren't arch hooks, they are generic routines
  * that can be used by the hooks */
 extern void init_ISA_irqs(void);
-extern void apic_intr_init(void);
-extern void smp_intr_init(void);
 extern irqreturn_t timer_interrupt(int irq, void *dev_id);
 
 /* these are the defined hooks */
diff --git a/include/asm-x86/mach-bigsmp/mach_apic.h b/include/asm-x86/bigsmp/apic.h
similarity index 94%
rename from include/asm-x86/mach-bigsmp/mach_apic.h
rename to include/asm-x86/bigsmp/apic.h
index 05362d4..0a9cd7c 100644
--- a/include/asm-x86/mach-bigsmp/mach_apic.h
+++ b/include/asm-x86/bigsmp/apic.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_BIGSMP__MACH_APIC_H
-#define ASM_X86__MACH_BIGSMP__MACH_APIC_H
+#ifndef __ASM_MACH_APIC_H
+#define __ASM_MACH_APIC_H
 
 #define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
 #define esr_disable (1)
@@ -11,7 +11,7 @@
 
 /* Round robin the irqs amoung the online cpus */
 static inline cpumask_t target_cpus(void)
-{ 
+{
 	static unsigned long cpu = NR_CPUS;
 	do {
 		if (cpu >= NR_CPUS)
@@ -23,7 +23,7 @@
 }
 
 #undef APIC_DEST_LOGICAL
-#define APIC_DEST_LOGICAL 	0
+#define APIC_DEST_LOGICAL	0
 #define TARGET_CPUS		(target_cpus())
 #define APIC_DFR_VALUE		(APIC_DFR_FLAT)
 #define INT_DELIVERY_MODE	(dest_Fixed)
@@ -141,4 +141,4 @@
 	return cpuid_apic >> index_msb;
 }
 
-#endif /* ASM_X86__MACH_BIGSMP__MACH_APIC_H */
+#endif /* __ASM_MACH_APIC_H */
diff --git a/include/asm-x86/bigsmp/apicdef.h b/include/asm-x86/bigsmp/apicdef.h
new file mode 100644
index 0000000..392c3f5
--- /dev/null
+++ b/include/asm-x86/bigsmp/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_MACH_APICDEF_H
+#define __ASM_MACH_APICDEF_H
+
+#define		APIC_ID_MASK		(0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+	return (((x)>>24)&0xFF);
+}
+
+#define		GET_APIC_ID(x)	get_apic_id(x)
+
+#endif
diff --git a/include/asm-x86/mach-bigsmp/mach_ipi.h b/include/asm-x86/bigsmp/ipi.h
similarity index 77%
rename from include/asm-x86/mach-bigsmp/mach_ipi.h
rename to include/asm-x86/bigsmp/ipi.h
index b1b0f96..9404c53 100644
--- a/include/asm-x86/mach-bigsmp/mach_ipi.h
+++ b/include/asm-x86/bigsmp/ipi.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_BIGSMP__MACH_IPI_H
-#define ASM_X86__MACH_BIGSMP__MACH_IPI_H
+#ifndef __ASM_MACH_IPI_H
+#define __ASM_MACH_IPI_H
 
 void send_IPI_mask_sequence(cpumask_t mask, int vector);
 
@@ -22,4 +22,4 @@
 	send_IPI_mask(cpu_online_map, vector);
 }
 
-#endif /* ASM_X86__MACH_BIGSMP__MACH_IPI_H */
+#endif /* __ASM_MACH_IPI_H */
diff --git a/include/asm-x86/bugs.h b/include/asm-x86/bugs.h
index 4761c46..dc60498 100644
--- a/include/asm-x86/bugs.h
+++ b/include/asm-x86/bugs.h
@@ -2,6 +2,11 @@
 #define ASM_X86__BUGS_H
 
 extern void check_bugs(void);
+
+#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
 int ppro_with_ram_bug(void);
+#else
+static inline int ppro_with_ram_bug(void) { return 0; }
+#endif
 
 #endif /* ASM_X86__BUGS_H */
diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h
index 250fa0c..adfeae6 100644
--- a/include/asm-x86/cpufeature.h
+++ b/include/asm-x86/cpufeature.h
@@ -6,7 +6,13 @@
 
 #include <asm/required-features.h>
 
-#define NCAPINTS	8	/* N 32-bit words worth of info */
+#define NCAPINTS	9	/* N 32-bit words worth of info */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name.  If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+ */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
 #define X86_FEATURE_FPU		(0*32+ 0) /* Onboard FPU */
@@ -14,7 +20,7 @@
 #define X86_FEATURE_DE		(0*32+ 2) /* Debugging Extensions */
 #define X86_FEATURE_PSE		(0*32+ 3) /* Page Size Extensions */
 #define X86_FEATURE_TSC		(0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR		(0*32+ 5) /* Model-Specific Registers, RDMSR, WRMSR */
+#define X86_FEATURE_MSR		(0*32+ 5) /* Model-Specific Registers */
 #define X86_FEATURE_PAE		(0*32+ 6) /* Physical Address Extensions */
 #define X86_FEATURE_MCE		(0*32+ 7) /* Machine Check Architecture */
 #define X86_FEATURE_CX8		(0*32+ 8) /* CMPXCHG8 instruction */
@@ -23,22 +29,23 @@
 #define X86_FEATURE_MTRR	(0*32+12) /* Memory Type Range Registers */
 #define X86_FEATURE_PGE		(0*32+13) /* Page Global Enable */
 #define X86_FEATURE_MCA		(0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV	(0*32+15) /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
+#define X86_FEATURE_CMOV	(0*32+15) /* CMOV instructions */
+					  /* (plus FCMOVcc, FCOMI with FPU) */
 #define X86_FEATURE_PAT		(0*32+16) /* Page Attribute Table */
 #define X86_FEATURE_PSE36	(0*32+17) /* 36-bit PSEs */
 #define X86_FEATURE_PN		(0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLSH	(0*32+19) /* Supports the CLFLUSH instruction */
-#define X86_FEATURE_DS		(0*32+21) /* Debug Store */
+#define X86_FEATURE_CLFLSH	(0*32+19) /* "clflush" CLFLUSH instruction */
+#define X86_FEATURE_DS		(0*32+21) /* "dts" Debug Store */
 #define X86_FEATURE_ACPI	(0*32+22) /* ACPI via MSR */
 #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
-					  /* of FPU context), and CR4.OSFXSR available */
-#define X86_FEATURE_XMM		(0*32+25) /* Streaming SIMD Extensions */
-#define X86_FEATURE_XMM2	(0*32+26) /* Streaming SIMD Extensions-2 */
-#define X86_FEATURE_SELFSNOOP	(0*32+27) /* CPU self snoop */
+#define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM		(0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2	(0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP	(0*32+27) /* "ss" CPU self snoop */
 #define X86_FEATURE_HT		(0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC		(0*32+29) /* Automatic clock control */
+#define X86_FEATURE_ACC		(0*32+29) /* "tm" Automatic clock control */
 #define X86_FEATURE_IA64	(0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE		(0*32+31) /* Pending Break Enable */
 
 /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
 /* Don't duplicate feature flags which are redundant with Intel! */
@@ -46,7 +53,8 @@
 #define X86_FEATURE_MP		(1*32+19) /* MP Capable. */
 #define X86_FEATURE_NX		(1*32+20) /* Execute Disable */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_GBPAGES	(1*32+26) /* GB pages */
+#define X86_FEATURE_FXSR_OPT	(1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES	(1*32+26) /* "pdpe1gb" GB pages */
 #define X86_FEATURE_RDTSCP	(1*32+27) /* RDTSCP */
 #define X86_FEATURE_LM		(1*32+29) /* Long Mode (x86-64) */
 #define X86_FEATURE_3DNOWEXT	(1*32+30) /* AMD 3DNow! extensions */
@@ -64,53 +72,79 @@
 #define X86_FEATURE_CYRIX_ARR	(3*32+ 2) /* Cyrix ARRs (= MTRRs) */
 #define X86_FEATURE_CENTAUR_MCR	(3*32+ 3) /* Centaur MCRs (= MTRRs) */
 /* cpu types for specific tunings: */
-#define X86_FEATURE_K8		(3*32+ 4) /* Opteron, Athlon64 */
-#define X86_FEATURE_K7		(3*32+ 5) /* Athlon */
-#define X86_FEATURE_P3		(3*32+ 6) /* P3 */
-#define X86_FEATURE_P4		(3*32+ 7) /* P4 */
+#define X86_FEATURE_K8		(3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7		(3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3		(3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4		(3*32+ 7) /* "" P4 */
 #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
 #define X86_FEATURE_UP		(3*32+ 9) /* smp kernel running on up */
-#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_NOPL	(3*32+20) /* The NOPL (0F 1F) instructions */
 #define X86_FEATURE_PEBS	(3*32+12) /* Precise-Event Based Sampling */
 #define X86_FEATURE_BTS		(3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32	(3*32+14) /* syscall in ia32 userspace */
-#define X86_FEATURE_SYSENTER32	(3*32+15) /* sysenter in ia32 userspace */
-#define X86_FEATURE_REP_GOOD	(3*32+16) /* rep microcode works well on this CPU */
-#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
-#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
-#define X86_FEATURE_11AP	(3*32+19) /* Bad local APIC aka 11AP */
+#define X86_FEATURE_SYSCALL32	(3*32+14) /* "" syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32	(3*32+15) /* "" sysenter in ia32 userspace */
+#define X86_FEATURE_REP_GOOD	(3*32+16) /* rep microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
+#define X86_FEATURE_11AP	(3*32+19) /* "" Bad local APIC aka 11AP */
 #define X86_FEATURE_NOPL	(3*32+20) /* The NOPL (0F 1F) instructions */
 #define X86_FEATURE_AMDC1E	(3*32+21) /* AMD C1E detected */
+#define X86_FEATURE_XTOPOLOGY	(3*32+21) /* cpu topology enum extensions */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
-#define X86_FEATURE_MWAIT	(4*32+ 3) /* Monitor/Mwait support */
-#define X86_FEATURE_DSCPL	(4*32+ 4) /* CPL Qualified Debug Store */
+#define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ	(4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64	(4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT	(4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL	(4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+#define X86_FEATURE_VMX		(4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX		(4*32+ 6) /* Safer mode */
 #define X86_FEATURE_EST		(4*32+ 7) /* Enhanced SpeedStep */
 #define X86_FEATURE_TM2		(4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
 #define X86_FEATURE_CID		(4*32+10) /* Context ID */
+#define X86_FEATURE_FMA		(4*32+12) /* Fused multiply-add */
 #define X86_FEATURE_CX16	(4*32+13) /* CMPXCHG16B */
 #define X86_FEATURE_XTPR	(4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM	(4*32+15) /* Performance Capabilities */
 #define X86_FEATURE_DCA		(4*32+18) /* Direct Cache Access */
-#define X86_FEATURE_XMM4_2	(4*32+20) /* Streaming SIMD Extensions-4.2 */
+#define X86_FEATURE_XMM4_1	(4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2	(4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
+#define X86_FEATURE_AES		(4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX		(4*32+28) /* Advanced Vector Extensions */
 
 /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE	(5*32+ 2) /* on-CPU RNG present (xstore insn) */
-#define X86_FEATURE_XSTORE_EN	(5*32+ 3) /* on-CPU RNG enabled */
-#define X86_FEATURE_XCRYPT	(5*32+ 6) /* on-CPU crypto (xcrypt insn) */
-#define X86_FEATURE_XCRYPT_EN	(5*32+ 7) /* on-CPU crypto enabled */
+#define X86_FEATURE_XSTORE	(5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN	(5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT	(5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN	(5*32+ 7) /* "ace_en" on-CPU crypto enabled */
 #define X86_FEATURE_ACE2	(5*32+ 8) /* Advanced Cryptography Engine v2 */
 #define X86_FEATURE_ACE2_EN	(5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE		(5*32+ 10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN	(5*32+ 11) /* PHE enabled */
-#define X86_FEATURE_PMM		(5*32+ 12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN	(5*32+ 13) /* PMM enabled */
+#define X86_FEATURE_PHE		(5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN	(5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM		(5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN	(5*32+13) /* PMM enabled */
 
 /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
 #define X86_FEATURE_LAHF_LM	(6*32+ 0) /* LAHF/SAHF in long mode */
 #define X86_FEATURE_CMP_LEGACY	(6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_IBS		(6*32+ 10) /* Instruction Based Sampling */
+#define X86_FEATURE_SVM		(6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC	(6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY	(6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM		(6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A	(6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW	(6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS		(6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_SSE5	(6*32+11) /* SSE-5 */
+#define X86_FEATURE_SKINIT	(6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT		(6*32+13) /* Watchdog timer */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -118,6 +152,13 @@
  */
 #define X86_FEATURE_IDA		(7*32+ 0) /* Intel Dynamic Acceleration */
 
+/* Virtualization flags: Linux defined */
+#define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI        (8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT         (8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID        (8*32+ 4) /* Intel Virtual Processor ID */
+
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
 #include <linux/bitops.h>
@@ -151,7 +192,7 @@
 } while (0)
 #define setup_force_cpu_cap(bit) do { \
 	set_cpu_cap(&boot_cpu_data, bit);	\
-	clear_bit(bit, (unsigned long *)cleared_cpu_caps); 	\
+	clear_bit(bit, (unsigned long *)cleared_cpu_caps);	\
 } while (0)
 
 #define cpu_has_fpu		boot_cpu_has(X86_FEATURE_FPU)
@@ -192,7 +233,10 @@
 #define cpu_has_gbpages		boot_cpu_has(X86_FEATURE_GBPAGES)
 #define cpu_has_arch_perfmon	boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
 #define cpu_has_pat		boot_cpu_has(X86_FEATURE_PAT)
+#define cpu_has_xmm4_1		boot_cpu_has(X86_FEATURE_XMM4_1)
 #define cpu_has_xmm4_2		boot_cpu_has(X86_FEATURE_XMM4_2)
+#define cpu_has_x2apic		boot_cpu_has(X86_FEATURE_X2APIC)
+#define cpu_has_xsave		boot_cpu_has(X86_FEATURE_XSAVE)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index f52daf1..5abbdec 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -43,6 +43,7 @@
 #define E820_RESERVED	2
 #define E820_ACPI	3
 #define E820_NVS	4
+#define E820_UNUSABLE	5
 
 /* reserved RAM used by kernel itself */
 #define E820_RESERVED_KERN        128
@@ -121,6 +122,7 @@
 extern u64 e820_hole_size(u64 start, u64 end);
 extern void finish_e820_parsing(void);
 extern void e820_reserve_resources(void);
+extern void e820_reserve_resources_late(void);
 extern void setup_memory_map(void);
 extern char *default_machine_specific_memory_setup(void);
 extern char *machine_specific_memory_setup(void);
diff --git a/include/asm-x86/mach-es7000/mach_apic.h b/include/asm-x86/es7000/apic.h
similarity index 90%
rename from include/asm-x86/mach-es7000/mach_apic.h
rename to include/asm-x86/es7000/apic.h
index c1f6f68..bd2c44d 100644
--- a/include/asm-x86/mach-es7000/mach_apic.h
+++ b/include/asm-x86/es7000/apic.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_ES7000__MACH_APIC_H
-#define ASM_X86__MACH_ES7000__MACH_APIC_H
+#ifndef __ASM_ES7000_APIC_H
+#define __ASM_ES7000_APIC_H
 
 #define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
 #define esr_disable (1)
@@ -10,7 +10,7 @@
 }
 
 static inline cpumask_t target_cpus(void)
-{ 
+{
 #if defined CONFIG_ES7000_CLUSTERED_APIC
 	return CPU_MASK_ALL;
 #else
@@ -23,24 +23,24 @@
 #define APIC_DFR_VALUE		(APIC_DFR_CLUSTER)
 #define INT_DELIVERY_MODE	(dest_LowestPrio)
 #define INT_DEST_MODE		(1)    /* logical delivery broadcast to all procs */
-#define NO_BALANCE_IRQ 		(1)
+#define NO_BALANCE_IRQ		(1)
 #undef  WAKE_SECONDARY_VIA_INIT
 #define WAKE_SECONDARY_VIA_MIP
 #else
 #define APIC_DFR_VALUE		(APIC_DFR_FLAT)
 #define INT_DELIVERY_MODE	(dest_Fixed)
 #define INT_DEST_MODE		(0)    /* phys delivery to target procs */
-#define NO_BALANCE_IRQ 		(0)
+#define NO_BALANCE_IRQ		(0)
 #undef  APIC_DEST_LOGICAL
 #define APIC_DEST_LOGICAL	0x0
 #define WAKE_SECONDARY_VIA_INIT
 #endif
 
 static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
-{ 
+{
 	return 0;
-} 
-static inline unsigned long check_apicid_present(int bit) 
+}
+static inline unsigned long check_apicid_present(int bit)
 {
 	return physid_isset(bit, phys_cpu_present_map);
 }
@@ -80,7 +80,7 @@
 {
 	int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs, target cpus %lx\n",
-		(apic_version[apic] == 0x14) ? 
+		(apic_version[apic] == 0x14) ?
 		"Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(TARGET_CPUS)[0]);
 }
 
@@ -141,7 +141,7 @@
 extern unsigned int boot_cpu_physical_apicid;
 static inline int check_phys_apicid_present(int cpu_physical_apicid)
 {
-	boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+	boot_cpu_physical_apicid = read_apic_id();
 	return (1);
 }
 
@@ -150,7 +150,7 @@
 	int num_bits_set;
 	int cpus_found = 0;
 	int cpu;
-	int apicid;	
+	int apicid;
 
 	num_bits_set = cpus_weight(cpumask);
 	/* Return id to all */
@@ -160,16 +160,16 @@
 #else
 		return cpu_to_logical_apicid(0);
 #endif
-	/* 
-	 * The cpus in the mask must all be on the apic cluster.  If are not 
-	 * on the same apicid cluster return default value of TARGET_CPUS. 
+	/*
+	 * The cpus in the mask must all be on the apic cluster.  If are not
+	 * on the same apicid cluster return default value of TARGET_CPUS.
 	 */
 	cpu = first_cpu(cpumask);
 	apicid = cpu_to_logical_apicid(cpu);
 	while (cpus_found < num_bits_set) {
 		if (cpu_isset(cpu, cpumask)) {
 			int new_apicid = cpu_to_logical_apicid(cpu);
-			if (apicid_cluster(apicid) != 
+			if (apicid_cluster(apicid) !=
 					apicid_cluster(new_apicid)){
 				printk ("%s: Not a valid mask!\n",__FUNCTION__);
 #if defined CONFIG_ES7000_CLUSTERED_APIC
@@ -191,4 +191,4 @@
 	return cpuid_apic >> index_msb;
 }
 
-#endif /* ASM_X86__MACH_ES7000__MACH_APIC_H */
+#endif /* __ASM_ES7000_APIC_H */
diff --git a/include/asm-x86/es7000/apicdef.h b/include/asm-x86/es7000/apicdef.h
new file mode 100644
index 0000000..8b234a3
--- /dev/null
+++ b/include/asm-x86/es7000/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_ES7000_APICDEF_H
+#define __ASM_ES7000_APICDEF_H
+
+#define		APIC_ID_MASK		(0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+	return (((x)>>24)&0xFF);
+}
+
+#define		GET_APIC_ID(x)	get_apic_id(x)
+
+#endif
diff --git a/include/asm-x86/mach-bigsmp/mach_ipi.h b/include/asm-x86/es7000/ipi.h
similarity index 76%
copy from include/asm-x86/mach-bigsmp/mach_ipi.h
copy to include/asm-x86/es7000/ipi.h
index b1b0f96..632a955 100644
--- a/include/asm-x86/mach-bigsmp/mach_ipi.h
+++ b/include/asm-x86/es7000/ipi.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_BIGSMP__MACH_IPI_H
-#define ASM_X86__MACH_BIGSMP__MACH_IPI_H
+#ifndef __ASM_ES7000_IPI_H
+#define __ASM_ES7000_IPI_H
 
 void send_IPI_mask_sequence(cpumask_t mask, int vector);
 
@@ -12,7 +12,6 @@
 {
 	cpumask_t mask = cpu_online_map;
 	cpu_clear(smp_processor_id(), mask);
-
 	if (!cpus_empty(mask))
 		send_IPI_mask(mask, vector);
 }
@@ -22,4 +21,4 @@
 	send_IPI_mask(cpu_online_map, vector);
 }
 
-#endif /* ASM_X86__MACH_BIGSMP__MACH_IPI_H */
+#endif /* __ASM_ES7000_IPI_H */
diff --git a/include/asm-x86/mach-es7000/mach_mpparse.h b/include/asm-x86/es7000/mpparse.h
similarity index 80%
rename from include/asm-x86/mach-es7000/mach_mpparse.h
rename to include/asm-x86/es7000/mpparse.h
index befde24..7b5c889 100644
--- a/include/asm-x86/mach-es7000/mach_mpparse.h
+++ b/include/asm-x86/es7000/mpparse.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_ES7000__MACH_MPPARSE_H
-#define ASM_X86__MACH_ES7000__MACH_MPPARSE_H
+#ifndef __ASM_ES7000_MPPARSE_H
+#define __ASM_ES7000_MPPARSE_H
 
 #include <linux/acpi.h>
 
@@ -26,4 +26,4 @@
 }
 #endif
 
-#endif /* ASM_X86__MACH_ES7000__MACH_MPPARSE_H */
+#endif /* __ASM_MACH_MPPARSE_H */
diff --git a/include/asm-x86/mach-es7000/mach_wakecpu.h b/include/asm-x86/es7000/wakecpu.h
similarity index 89%
rename from include/asm-x86/mach-es7000/mach_wakecpu.h
rename to include/asm-x86/es7000/wakecpu.h
index 97c776c..3ffc5a7 100644
--- a/include/asm-x86/mach-es7000/mach_wakecpu.h
+++ b/include/asm-x86/es7000/wakecpu.h
@@ -1,7 +1,7 @@
-#ifndef ASM_X86__MACH_ES7000__MACH_WAKECPU_H
-#define ASM_X86__MACH_ES7000__MACH_WAKECPU_H
+#ifndef __ASM_ES7000_WAKECPU_H
+#define __ASM_ES7000_WAKECPU_H
 
-/* 
+/*
  * This file copes with machines that wakeup secondary CPUs by the
  * INIT, INIT, STARTUP sequence.
  */
@@ -56,4 +56,4 @@
  #define inquire_remote_apic(apicid) {}
 #endif
 
-#endif /* ASM_X86__MACH_ES7000__MACH_WAKECPU_H */
+#endif /* __ASM_MACH_WAKECPU_H */
diff --git a/include/asm-x86/genapic_64.h b/include/asm-x86/genapic_64.h
index 25097a8..ed6a488 100644
--- a/include/asm-x86/genapic_64.h
+++ b/include/asm-x86/genapic_64.h
@@ -14,6 +14,7 @@
 
 struct genapic {
 	char *name;
+	int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
 	u32 int_delivery_mode;
 	u32 int_dest_mode;
 	int (*apic_id_registered)(void);
@@ -24,17 +25,24 @@
 	void (*send_IPI_mask)(cpumask_t mask, int vector);
 	void (*send_IPI_allbutself)(int vector);
 	void (*send_IPI_all)(int vector);
+	void (*send_IPI_self)(int vector);
 	/* */
 	unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
 	unsigned int (*phys_pkg_id)(int index_msb);
+	unsigned int (*get_apic_id)(unsigned long x);
+	unsigned long (*set_apic_id)(unsigned int id);
+	unsigned long apic_id_mask;
 };
 
 extern struct genapic *genapic;
 
 extern struct genapic apic_flat;
 extern struct genapic apic_physflat;
+extern struct genapic apic_x2apic_cluster;
+extern struct genapic apic_x2apic_phys;
 extern int acpi_madt_oem_check(char *, char *);
 
+extern void apic_send_IPI_self(int vector);
 enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
 extern enum uv_system_type get_uv_system_type(void);
 extern int is_uv_system(void);
diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h
index 65997b1..50f6e03 100644
--- a/include/asm-x86/hw_irq.h
+++ b/include/asm-x86/hw_irq.h
@@ -64,7 +64,6 @@
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
 extern void disable_IO_APIC(void);
-extern void print_IO_APIC(void);
 extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
 extern void setup_ioapic_dest(void);
 
@@ -73,7 +72,9 @@
 #endif
 
 /* IPI functions */
+#ifdef CONFIG_X86_32
 extern void send_IPI_self(int vector);
+#endif
 extern void send_IPI(int dest, int vector);
 
 /* Statistics */
diff --git a/include/asm-x86/i387.h b/include/asm-x86/i387.h
index 1ecdc3e..9ba862a 100644
--- a/include/asm-x86/i387.h
+++ b/include/asm-x86/i387.h
@@ -19,7 +19,9 @@
 #include <asm/sigcontext.h>
 #include <asm/user.h>
 #include <asm/uaccess.h>
+#include <asm/xsave.h>
 
+extern unsigned int sig_xstate_size;
 extern void fpu_init(void);
 extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
@@ -31,12 +33,18 @@
 extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
 extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
 
+extern struct _fpx_sw_bytes fx_sw_reserved;
 #ifdef CONFIG_IA32_EMULATION
+extern unsigned int sig_xstate_ia32_size;
+extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
 struct _fpstate_ia32;
-extern int save_i387_ia32(struct _fpstate_ia32 __user *buf);
-extern int restore_i387_ia32(struct _fpstate_ia32 __user *buf);
+struct _xstate_ia32;
+extern int save_i387_xstate_ia32(void __user *buf);
+extern int restore_i387_xstate_ia32(void __user *buf);
 #endif
 
+#define X87_FSW_ES (1 << 7)	/* Exception Summary */
+
 #ifdef CONFIG_X86_64
 
 /* Ignore delayed exceptions from user space */
@@ -47,7 +55,7 @@
 		     _ASM_EXTABLE(1b, 2b));
 }
 
-static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 {
 	int err;
 
@@ -67,15 +75,31 @@
 	return err;
 }
 
-#define X87_FSW_ES (1 << 7)	/* Exception Summary */
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+	if (task_thread_info(tsk)->status & TS_XSAVE)
+		return xrstor_checking(&tsk->thread.xstate->xsave);
+	else
+		return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
 
 /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
    is pending. Clear the x87 state here by setting it to fixed
    values. The kernel data segment can be sometimes 0 and sometimes
    new user value. Both should be ok.
    Use the PDA as safe address because it should be already in L1. */
-static inline void clear_fpu_state(struct i387_fxsave_struct *fx)
+static inline void clear_fpu_state(struct task_struct *tsk)
 {
+	struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
+	struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+
+	/*
+	 * xsave header may indicate the init state of the FP.
+	 */
+	if ((task_thread_info(tsk)->status & TS_XSAVE) &&
+	    !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
+		return;
+
 	if (unlikely(fx->swd & X87_FSW_ES))
 		asm volatile("fnclex");
 	alternative_input(ASM_NOP8 ASM_NOP2,
@@ -84,7 +108,7 @@
 			  X86_FEATURE_FXSAVE_LEAK);
 }
 
-static inline int save_i387_checking(struct i387_fxsave_struct __user *fx)
+static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 {
 	int err;
 
@@ -108,7 +132,7 @@
 	return err;
 }
 
-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline void fxsave(struct task_struct *tsk)
 {
 	/* Using "rex64; fxsave %0" is broken because, if the memory operand
 	   uses any extended registers for addressing, a second REX prefix
@@ -133,7 +157,16 @@
 			     : "=m" (tsk->thread.xstate->fxsave)
 			     : "cdaSDb" (&tsk->thread.xstate->fxsave));
 #endif
-	clear_fpu_state(&tsk->thread.xstate->fxsave);
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+	if (task_thread_info(tsk)->status & TS_XSAVE)
+		xsave(tsk);
+	else
+		fxsave(tsk);
+
+	clear_fpu_state(tsk);
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
@@ -148,6 +181,10 @@
 
 static inline void restore_fpu(struct task_struct *tsk)
 {
+	if (task_thread_info(tsk)->status & TS_XSAVE) {
+		xrstor_checking(&tsk->thread.xstate->xsave);
+		return;
+	}
 	/*
 	 * The "nop" is needed to make the instructions the same
 	 * length.
@@ -173,6 +210,27 @@
  */
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
+	if (task_thread_info(tsk)->status & TS_XSAVE) {
+		struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
+		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+
+		xsave(tsk);
+
+		/*
+		 * xsave header may indicate the init state of the FP.
+		 */
+		if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
+			goto end;
+
+		if (unlikely(fx->swd & X87_FSW_ES))
+			asm volatile("fnclex");
+
+		/*
+		 * we can do a simple return here or be paranoid :)
+		 */
+		goto clear_state;
+	}
+
 	/* Use more nops than strictly needed in case the compiler
 	   varies code */
 	alternative_input(
@@ -182,6 +240,7 @@
 		X86_FEATURE_FXSR,
 		[fx] "m" (tsk->thread.xstate->fxsave),
 		[fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
+clear_state:
 	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
 	   is pending.  Clear the x87 state here by setting it to fixed
 	   values. safe_address is a random variable that should be in L1 */
@@ -191,16 +250,17 @@
 		"fildl %[addr]", 	/* set F?P to defined value */
 		X86_FEATURE_FXSAVE_LEAK,
 		[addr] "m" (safe_address));
+end:
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
+#endif	/* CONFIG_X86_64 */
+
 /*
  * Signal frame handlers...
  */
-extern int save_i387(struct _fpstate __user *buf);
-extern int restore_i387(struct _fpstate __user *buf);
-
-#endif	/* CONFIG_X86_64 */
+extern int save_i387_xstate(void __user *buf);
+extern int restore_i387_xstate(void __user *buf);
 
 static inline void __unlazy_fpu(struct task_struct *tsk)
 {
diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h
index c586559..23c1b3b 100644
--- a/include/asm-x86/i8259.h
+++ b/include/asm-x86/i8259.h
@@ -57,4 +57,7 @@
 
 extern struct irq_chip i8259A_chip;
 
+extern void mask_8259A(void);
+extern void unmask_8259A(void);
+
 #endif /* ASM_X86__I8259_H */
diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h
index be62847..8ec68a5 100644
--- a/include/asm-x86/io_apic.h
+++ b/include/asm-x86/io_apic.h
@@ -107,6 +107,20 @@
 
 } __attribute__ ((packed));
 
+struct IR_IO_APIC_route_entry {
+	__u64	vector		: 8,
+		zero		: 3,
+		index2		: 1,
+		delivery_status : 1,
+		polarity	: 1,
+		irr		: 1,
+		trigger		: 1,
+		mask		: 1,
+		reserved	: 31,
+		format		: 1,
+		index		: 15;
+} __attribute__ ((packed));
+
 #ifdef CONFIG_X86_IO_APIC
 
 /*
@@ -183,6 +197,12 @@
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
+#ifdef CONFIG_X86_64
+extern int save_mask_IO_APIC_setup(void);
+extern void restore_IO_APIC_setup(void);
+extern void reinit_intr_remapped_IO_APIC(int);
+#endif
+
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
 static const int timer_through_8259 = 0;
diff --git a/include/asm-x86/ipi.h b/include/asm-x86/ipi.h
index c1b2267..30a692c 100644
--- a/include/asm-x86/ipi.h
+++ b/include/asm-x86/ipi.h
@@ -49,6 +49,12 @@
 	return SET_APIC_DEST_FIELD(mask);
 }
 
+static inline void __xapic_wait_icr_idle(void)
+{
+	while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
+		cpu_relax();
+}
+
 static inline void __send_IPI_shortcut(unsigned int shortcut, int vector,
 				       unsigned int dest)
 {
@@ -64,7 +70,7 @@
 	/*
 	 * Wait for idle.
 	 */
-	apic_wait_icr_idle();
+	__xapic_wait_icr_idle();
 
 	/*
 	 * No need to touch the target chip field
@@ -74,7 +80,7 @@
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
-	apic_write(APIC_ICR, cfg);
+	native_apic_mem_write(APIC_ICR, cfg);
 }
 
 /*
@@ -92,13 +98,13 @@
 	if (unlikely(vector == NMI_VECTOR))
 		safe_apic_wait_icr_idle();
 	else
-		apic_wait_icr_idle();
+		__xapic_wait_icr_idle();
 
 	/*
 	 * prepare target chip field
 	 */
 	cfg = __prepare_ICR2(mask);
-	apic_write(APIC_ICR2, cfg);
+	native_apic_mem_write(APIC_ICR2, cfg);
 
 	/*
 	 * program the ICR
@@ -108,7 +114,7 @@
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
-	apic_write(APIC_ICR, cfg);
+	native_apic_mem_write(APIC_ICR, cfg);
 }
 
 static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
diff --git a/include/asm-x86/irq_remapping.h b/include/asm-x86/irq_remapping.h
new file mode 100644
index 0000000..78242c6
--- /dev/null
+++ b/include/asm-x86/irq_remapping.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_IRQ_REMAPPING_H
+#define _ASM_IRQ_REMAPPING_H
+
+extern int x2apic;
+
+#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+
+#endif
diff --git a/include/asm-x86/mach-bigsmp/mach_apicdef.h b/include/asm-x86/mach-bigsmp/mach_apicdef.h
deleted file mode 100644
index 811935d..0000000
--- a/include/asm-x86/mach-bigsmp/mach_apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef ASM_X86__MACH_BIGSMP__MACH_APICDEF_H
-#define ASM_X86__MACH_BIGSMP__MACH_APICDEF_H
-
-#define		APIC_ID_MASK		(0xFF<<24)
-
-static inline unsigned get_apic_id(unsigned long x) 
-{ 
-	return (((x)>>24)&0xFF);
-} 
-
-#define		GET_APIC_ID(x)	get_apic_id(x)
-
-#endif /* ASM_X86__MACH_BIGSMP__MACH_APICDEF_H */
diff --git a/include/asm-x86/mach-default/mach_apic.h b/include/asm-x86/mach-default/mach_apic.h
index b615f40..2a330a4 100644
--- a/include/asm-x86/mach-default/mach_apic.h
+++ b/include/asm-x86/mach-default/mach_apic.h
@@ -30,6 +30,8 @@
 #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
 #define phys_pkg_id	(genapic->phys_pkg_id)
 #define vector_allocation_domain    (genapic->vector_allocation_domain)
+#define read_apic_id()  (GET_APIC_ID(apic_read(APIC_ID)))
+#define send_IPI_self (genapic->send_IPI_self)
 extern void setup_apic_routing(void);
 #else
 #define INT_DELIVERY_MODE dest_LowestPrio
@@ -54,7 +56,7 @@
 
 static inline int apic_id_registered(void)
 {
-	return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map);
+	return physid_isset(read_apic_id(), phys_cpu_present_map);
 }
 
 static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
diff --git a/include/asm-x86/mach-default/mach_apicdef.h b/include/asm-x86/mach-default/mach_apicdef.h
index 936704f..0c2d41c 100644
--- a/include/asm-x86/mach-default/mach_apicdef.h
+++ b/include/asm-x86/mach-default/mach_apicdef.h
@@ -4,9 +4,9 @@
 #include <asm/apic.h>
 
 #ifdef CONFIG_X86_64
-#define	APIC_ID_MASK		(0xFFu<<24)
-#define GET_APIC_ID(x)          (((x)>>24)&0xFFu)
-#define	SET_APIC_ID(x)		(((x)<<24))
+#define	APIC_ID_MASK		(genapic->apic_id_mask)
+#define GET_APIC_ID(x)		(genapic->get_apic_id(x))
+#define	SET_APIC_ID(x)		(genapic->set_apic_id(x))
 #else
 #define		APIC_ID_MASK		(0xF<<24)
 static inline unsigned get_apic_id(unsigned long x) 
diff --git a/include/asm-x86/mach-es7000/mach_apicdef.h b/include/asm-x86/mach-es7000/mach_apicdef.h
deleted file mode 100644
index a07e567..0000000
--- a/include/asm-x86/mach-es7000/mach_apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef ASM_X86__MACH_ES7000__MACH_APICDEF_H
-#define ASM_X86__MACH_ES7000__MACH_APICDEF_H
-
-#define		APIC_ID_MASK		(0xFF<<24)
-
-static inline unsigned get_apic_id(unsigned long x) 
-{ 
-	return (((x)>>24)&0xFF);
-} 
-
-#define		GET_APIC_ID(x)	get_apic_id(x)
-
-#endif /* ASM_X86__MACH_ES7000__MACH_APICDEF_H */
diff --git a/include/asm-x86/mach-es7000/mach_ipi.h b/include/asm-x86/mach-es7000/mach_ipi.h
deleted file mode 100644
index 3a21240..0000000
--- a/include/asm-x86/mach-es7000/mach_ipi.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef ASM_X86__MACH_ES7000__MACH_IPI_H
-#define ASM_X86__MACH_ES7000__MACH_IPI_H
-
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
-
-static inline void send_IPI_mask(cpumask_t mask, int vector)
-{
-	send_IPI_mask_sequence(mask, vector);
-}
-
-static inline void send_IPI_allbutself(int vector)
-{
-	cpumask_t mask = cpu_online_map;
-	cpu_clear(smp_processor_id(), mask);
-	if (!cpus_empty(mask))
-		send_IPI_mask(mask, vector);
-}
-
-static inline void send_IPI_all(int vector)
-{
-	send_IPI_mask(cpu_online_map, vector);
-}
-
-#endif /* ASM_X86__MACH_ES7000__MACH_IPI_H */
diff --git a/include/asm-x86/mach-numaq/mach_apicdef.h b/include/asm-x86/mach-numaq/mach_apicdef.h
deleted file mode 100644
index f870ec5..0000000
--- a/include/asm-x86/mach-numaq/mach_apicdef.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef ASM_X86__MACH_NUMAQ__MACH_APICDEF_H
-#define ASM_X86__MACH_NUMAQ__MACH_APICDEF_H
-
-
-#define APIC_ID_MASK (0xF<<24)
-
-static inline unsigned get_apic_id(unsigned long x)
-{
-	        return (((x)>>24)&0x0F);
-}
-
-#define         GET_APIC_ID(x)  get_apic_id(x)
-
-#endif /* ASM_X86__MACH_NUMAQ__MACH_APICDEF_H */
diff --git a/include/asm-x86/mach-numaq/mach_mpparse.h b/include/asm-x86/mach-numaq/mach_mpparse.h
deleted file mode 100644
index 74ade18..0000000
--- a/include/asm-x86/mach-numaq/mach_mpparse.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef ASM_X86__MACH_NUMAQ__MACH_MPPARSE_H
-#define ASM_X86__MACH_NUMAQ__MACH_MPPARSE_H
-
-extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
-				char *productid);
-
-#endif /* ASM_X86__MACH_NUMAQ__MACH_MPPARSE_H */
diff --git a/include/asm-x86/mach-summit/mach_apicdef.h b/include/asm-x86/mach-summit/mach_apicdef.h
deleted file mode 100644
index d4bc859..0000000
--- a/include/asm-x86/mach-summit/mach_apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef ASM_X86__MACH_SUMMIT__MACH_APICDEF_H
-#define ASM_X86__MACH_SUMMIT__MACH_APICDEF_H
-
-#define		APIC_ID_MASK		(0xFF<<24)
-
-static inline unsigned get_apic_id(unsigned long x) 
-{ 
-	return (((x)>>24)&0xFF);
-} 
-
-#define		GET_APIC_ID(x)	get_apic_id(x)
-
-#endif /* ASM_X86__MACH_SUMMIT__MACH_APICDEF_H */
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index 118da36..be2241a 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -5,11 +5,12 @@
 
 #include <asm/mpspec_def.h>
 
+extern int apic_version[MAX_APICS];
+
 #ifdef CONFIG_X86_32
 #include <mach_mpspec.h>
 
 extern unsigned int def_to_bigsmp;
-extern int apic_version[MAX_APICS];
 extern u8 apicid_2_node[];
 extern int pic_mode;
 
diff --git a/include/asm-x86/msidef.h b/include/asm-x86/msidef.h
index 3139666..ed91902 100644
--- a/include/asm-x86/msidef.h
+++ b/include/asm-x86/msidef.h
@@ -48,4 +48,8 @@
 #define  MSI_ADDR_DEST_ID(dest)		(((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
 					 MSI_ADDR_DEST_ID_MASK)
 
+#define MSI_ADDR_IR_EXT_INT		(1 << 4)
+#define MSI_ADDR_IR_SHV			(1 << 3)
+#define MSI_ADDR_IR_INDEX1(index)	((index & 0x8000) >> 13)
+#define MSI_ADDR_IR_INDEX2(index)	((index & 0x7fff) << 5)
 #endif /* ASM_X86__MSIDEF_H */
diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h
index 3052f05..0bb4330 100644
--- a/include/asm-x86/msr-index.h
+++ b/include/asm-x86/msr-index.h
@@ -176,6 +176,7 @@
 #define MSR_IA32_TSC			0x00000010
 #define MSR_IA32_PLATFORM_ID		0x00000017
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
+#define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
 #define MSR_IA32_APICBASE		0x0000001b
 #define MSR_IA32_APICBASE_BSP		(1<<8)
@@ -310,4 +311,19 @@
 /* Geode defined MSRs */
 #define MSR_GEODE_BUSCONT_CONF0		0x00001900
 
+/* Intel VT MSRs */
+#define MSR_IA32_VMX_BASIC              0x00000480
+#define MSR_IA32_VMX_PINBASED_CTLS      0x00000481
+#define MSR_IA32_VMX_PROCBASED_CTLS     0x00000482
+#define MSR_IA32_VMX_EXIT_CTLS          0x00000483
+#define MSR_IA32_VMX_ENTRY_CTLS         0x00000484
+#define MSR_IA32_VMX_MISC               0x00000485
+#define MSR_IA32_VMX_CR0_FIXED0         0x00000486
+#define MSR_IA32_VMX_CR0_FIXED1         0x00000487
+#define MSR_IA32_VMX_CR4_FIXED0         0x00000488
+#define MSR_IA32_VMX_CR4_FIXED1         0x00000489
+#define MSR_IA32_VMX_VMCS_ENUM          0x0000048a
+#define MSR_IA32_VMX_PROCBASED_CTLS2    0x0000048b
+#define MSR_IA32_VMX_EPT_VPID_CAP       0x0000048c
+
 #endif /* ASM_X86__MSR_INDEX_H */
diff --git a/include/asm-x86/mach-numaq/mach_apic.h b/include/asm-x86/numaq/apic.h
similarity index 95%
rename from include/asm-x86/mach-numaq/mach_apic.h
rename to include/asm-x86/numaq/apic.h
index 7a0d39e..a8344ba 100644
--- a/include/asm-x86/mach-numaq/mach_apic.h
+++ b/include/asm-x86/numaq/apic.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_NUMAQ__MACH_APIC_H
-#define ASM_X86__MACH_NUMAQ__MACH_APIC_H
+#ifndef __ASM_NUMAQ_APIC_H
+#define __ASM_NUMAQ_APIC_H
 
 #include <asm/io.h>
 #include <linux/mmzone.h>
@@ -135,4 +135,4 @@
 	return cpuid_apic >> index_msb;
 }
 
-#endif /* ASM_X86__MACH_NUMAQ__MACH_APIC_H */
+#endif /* __ASM_NUMAQ_APIC_H */
diff --git a/include/asm-x86/numaq/apicdef.h b/include/asm-x86/numaq/apicdef.h
new file mode 100644
index 0000000..e012a46
--- /dev/null
+++ b/include/asm-x86/numaq/apicdef.h
@@ -0,0 +1,14 @@
+#ifndef __ASM_NUMAQ_APICDEF_H
+#define __ASM_NUMAQ_APICDEF_H
+
+
+#define APIC_ID_MASK (0xF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+	        return (((x)>>24)&0x0F);
+}
+
+#define         GET_APIC_ID(x)  get_apic_id(x)
+
+#endif
diff --git a/include/asm-x86/mach-numaq/mach_ipi.h b/include/asm-x86/numaq/ipi.h
similarity index 77%
rename from include/asm-x86/mach-numaq/mach_ipi.h
rename to include/asm-x86/numaq/ipi.h
index 1e83582..935588d2 100644
--- a/include/asm-x86/mach-numaq/mach_ipi.h
+++ b/include/asm-x86/numaq/ipi.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_NUMAQ__MACH_IPI_H
-#define ASM_X86__MACH_NUMAQ__MACH_IPI_H
+#ifndef __ASM_NUMAQ_IPI_H
+#define __ASM_NUMAQ_IPI_H
 
 void send_IPI_mask_sequence(cpumask_t, int vector);
 
@@ -22,4 +22,4 @@
 	send_IPI_mask(cpu_online_map, vector);
 }
 
-#endif /* ASM_X86__MACH_NUMAQ__MACH_IPI_H */
+#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/include/asm-x86/numaq/mpparse.h b/include/asm-x86/numaq/mpparse.h
new file mode 100644
index 0000000..252292e
--- /dev/null
+++ b/include/asm-x86/numaq/mpparse.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_NUMAQ_MPPARSE_H
+#define __ASM_NUMAQ_MPPARSE_H
+
+extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+				char *productid);
+
+#endif /* __ASM_NUMAQ_MPPARSE_H */
diff --git a/include/asm-x86/mach-numaq/mach_wakecpu.h b/include/asm-x86/numaq/wakecpu.h
similarity index 88%
rename from include/asm-x86/mach-numaq/mach_wakecpu.h
rename to include/asm-x86/numaq/wakecpu.h
index 0db8cea..c577bda 100644
--- a/include/asm-x86/mach-numaq/mach_wakecpu.h
+++ b/include/asm-x86/numaq/wakecpu.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_NUMAQ__MACH_WAKECPU_H
-#define ASM_X86__MACH_NUMAQ__MACH_WAKECPU_H
+#ifndef __ASM_NUMAQ_WAKECPU_H
+#define __ASM_NUMAQ_WAKECPU_H
 
 /* This file copes with machines that wakeup secondary CPUs by NMIs */
 
@@ -40,4 +40,4 @@
 
 #define inquire_remote_apic(apicid) {}
 
-#endif /* ASM_X86__MACH_NUMAQ__MACH_WAKECPU_H */
+#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 891971f..d7d358a 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -201,12 +201,6 @@
 
 struct pv_apic_ops {
 #ifdef CONFIG_X86_LOCAL_APIC
-	/*
-	 * Direct APIC operations, principally for VMI.  Ideally
-	 * these shouldn't be in this interface.
-	 */
-	void (*apic_write)(unsigned long reg, u32 v);
-	u32 (*apic_read)(unsigned long reg);
 	void (*setup_boot_clock)(void);
 	void (*setup_secondary_clock)(void);
 
@@ -910,19 +904,6 @@
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Basic functions accessing APICs.
- */
-static inline void apic_write(unsigned long reg, u32 v)
-{
-	PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
-}
-
-static inline u32 apic_read(unsigned long reg)
-{
-	return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
-}
-
 static inline void setup_boot_clock(void)
 {
 	PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
diff --git a/include/asm-x86/processor-cyrix.h b/include/asm-x86/processor-cyrix.h
index 97568ad..1198f2a 100644
--- a/include/asm-x86/processor-cyrix.h
+++ b/include/asm-x86/processor-cyrix.h
@@ -28,3 +28,11 @@
 	outb(reg, 0x22);
 	outb(data, 0x23);
 }
+
+#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); })
+
+#define setCx86_old(reg, data) do { \
+	outb((reg), 0x22); \
+	outb((data), 0x23); \
+} while (0)
+
diff --git a/include/asm-x86/processor-flags.h b/include/asm-x86/processor-flags.h
index 5dd7977..dc5f071 100644
--- a/include/asm-x86/processor-flags.h
+++ b/include/asm-x86/processor-flags.h
@@ -59,6 +59,7 @@
 #define X86_CR4_OSFXSR	0x00000200 /* enable fast FPU save and restore */
 #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
 #define X86_CR4_VMXE	0x00002000 /* enable VMX virtualization */
+#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
 
 /*
  * x86-64 Task Priority Register, CR8
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index 5eaf9bf..c7d3546 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -76,11 +76,11 @@
 	int			 x86_tlbsize;
 	__u8			x86_virt_bits;
 	__u8			x86_phys_bits;
+#endif
 	/* CPUID returned core id bits: */
 	__u8			x86_coreid_bits;
 	/* Max extended CPUID function supported: */
 	__u32			extended_cpuid_level;
-#endif
 	/* Maximum supported CPUID level, -1=no CPUID: */
 	int			cpuid_level;
 	__u32			x86_capability[NCAPINTS];
@@ -166,11 +166,8 @@
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern unsigned short num_cache_leaves;
 
-#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
+extern void detect_extended_topology(struct cpuinfo_x86 *c);
 extern void detect_ht(struct cpuinfo_x86 *c);
-#else
-static inline void detect_ht(struct cpuinfo_x86 *c) {}
-#endif
 
 static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
 				unsigned int *ecx, unsigned int *edx)
@@ -327,7 +324,12 @@
 	/* 16*16 bytes for each XMM-reg = 256 bytes:			*/
 	u32			xmm_space[64];
 
-	u32			padding[24];
+	u32			padding[12];
+
+	union {
+		u32		padding1[12];
+		u32		sw_reserved[12];
+	};
 
 } __attribute__((aligned(16)));
 
@@ -351,10 +353,23 @@
 	u32			entry_eip;
 };
 
+struct xsave_hdr_struct {
+	u64 xstate_bv;
+	u64 reserved1[2];
+	u64 reserved2[5];
+} __attribute__((packed));
+
+struct xsave_struct {
+	struct i387_fxsave_struct i387;
+	struct xsave_hdr_struct xsave_hdr;
+	/* new processor state extensions will go here */
+} __attribute__ ((packed, aligned (64)));
+
 union thread_xstate {
 	struct i387_fsave_struct	fsave;
 	struct i387_fxsave_struct	fxsave;
 	struct i387_soft_struct		soft;
+	struct xsave_struct		xsave;
 };
 
 #ifdef CONFIG_X86_64
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 9030cb7..11b6cc1 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -38,6 +38,7 @@
 	void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
 	void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
                                     unsigned short oemsize);
+	int (*setup_ioapic_ids)(void);
 };
 
 extern struct x86_quirks *x86_quirks;
diff --git a/include/asm-x86/sigcontext.h b/include/asm-x86/sigcontext.h
index 24879c8..ee813f4 100644
--- a/include/asm-x86/sigcontext.h
+++ b/include/asm-x86/sigcontext.h
@@ -4,6 +4,40 @@
 #include <linux/compiler.h>
 #include <asm/types.h>
 
+#define FP_XSTATE_MAGIC1	0x46505853U
+#define FP_XSTATE_MAGIC2	0x46505845U
+#define FP_XSTATE_MAGIC2_SIZE	sizeof(FP_XSTATE_MAGIC2)
+
+/*
+ * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame
+ * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes
+ * are used to extended the fpstate pointer in the sigcontext, which now
+ * includes the extended state information along with fpstate information.
+ *
+ * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved
+ * area and FP_XSTATE_MAGIC2 at the end of memory layout
+ * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the
+ * extended state information in the memory layout pointed by the fpstate
+ * pointer in sigcontext.
+ */
+struct _fpx_sw_bytes {
+	__u32 magic1;		/* FP_XSTATE_MAGIC1 */
+	__u32 extended_size;	/* total size of the layout referred by
+				 * fpstate pointer in the sigcontext.
+				 */
+	__u64 xstate_bv;
+				/* feature bit mask (including fp/sse/extended
+				 * state) that is present in the memory
+				 * layout.
+				 */
+	__u32 xstate_size;	/* actual xsave state size, based on the
+				 * features saved in the layout.
+				 * 'extended_size' will be greater than
+				 * 'xstate_size'.
+				 */
+	__u32 padding[7];	/*  for future use. */
+};
+
 #ifdef __i386__
 /*
  * As documented in the iBCS2 standard..
@@ -53,7 +87,13 @@
 	unsigned long	reserved;
 	struct _fpxreg	_fxsr_st[8];	/* FXSR FPU reg data is ignored */
 	struct _xmmreg	_xmm[8];
-	unsigned long	padding[56];
+	unsigned long	padding1[44];
+
+	union {
+		unsigned long	padding2[12];
+		struct _fpx_sw_bytes sw_reserved; /* represents the extended
+						   * state info */
+	};
 };
 
 #define X86_FXSR_MAGIC		0x0000
@@ -79,7 +119,15 @@
 	unsigned long flags;
 	unsigned long sp_at_signal;
 	unsigned short ss, __ssh;
-	struct _fpstate __user *fpstate;
+
+	/*
+	 * fpstate is really (struct _fpstate *) or (struct _xstate *)
+	 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+	 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+	 * of extended memory layout. See comments at the defintion of
+	 * (struct _fpx_sw_bytes)
+	 */
+	void __user *fpstate;		/* zero when no FPU/extended context */
 	unsigned long oldmask;
 	unsigned long cr2;
 };
@@ -130,7 +178,12 @@
 	__u32	mxcsr_mask;
 	__u32	st_space[32];	/* 8*16 bytes for each FP-reg */
 	__u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg  */
-	__u32	reserved2[24];
+	__u32	reserved2[12];
+	union {
+		__u32	reserved3[12];
+		struct _fpx_sw_bytes sw_reserved; /* represents the extended
+						   * state information */
+	};
 };
 
 #ifdef __KERNEL__
@@ -161,7 +214,15 @@
 	unsigned long trapno;
 	unsigned long oldmask;
 	unsigned long cr2;
-	struct _fpstate __user *fpstate;	/* zero when no FPU context */
+
+	/*
+	 * fpstate is really (struct _fpstate *) or (struct _xstate *)
+	 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+	 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+	 * of extended memory layout. See comments at the defintion of
+	 * (struct _fpx_sw_bytes)
+	 */
+	void __user *fpstate;		/* zero when no FPU/extended context */
 	unsigned long reserved1[8];
 };
 #else /* __KERNEL__ */
@@ -202,4 +263,22 @@
 
 #endif /* !__i386__ */
 
+struct _xsave_hdr {
+	__u64 xstate_bv;
+	__u64 reserved1[2];
+	__u64 reserved2[5];
+};
+
+/*
+ * Extended state pointed by the fpstate pointer in the sigcontext.
+ * In addition to the fpstate, information encoded in the xstate_hdr
+ * indicates the presence of other extended state information
+ * supported by the processor and OS.
+ */
+struct _xstate {
+	struct _fpstate fpstate;
+	struct _xsave_hdr xstate_hdr;
+	/* new processor state extensions go here */
+};
+
 #endif /* ASM_X86__SIGCONTEXT_H */
diff --git a/include/asm-x86/sigcontext32.h b/include/asm-x86/sigcontext32.h
index 4e2ec73..8c34703 100644
--- a/include/asm-x86/sigcontext32.h
+++ b/include/asm-x86/sigcontext32.h
@@ -40,7 +40,11 @@
 	__u32	reserved;
 	struct _fpxreg	_fxsr_st[8];
 	struct _xmmreg	_xmm[8];	/* It's actually 16 */
-	__u32	padding[56];
+	__u32	padding[44];
+	union {
+		__u32 padding2[12];
+		struct _fpx_sw_bytes sw_reserved;
+	};
 };
 
 struct sigcontext_ia32 {
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index 04f84f4..29324c1 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -167,30 +167,33 @@
 
 #ifdef CONFIG_X86_LOCAL_APIC
 
+#ifndef CONFIG_X86_64
 static inline int logical_smp_processor_id(void)
 {
 	/* we don't want to mark this access volatile - bad code generation */
 	return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
 }
 
-#ifndef CONFIG_X86_64
+#include <mach_apicdef.h>
 static inline unsigned int read_apic_id(void)
 {
-	return *(u32 *)(APIC_BASE + APIC_ID);
+	unsigned int reg;
+
+	reg = *(u32 *)(APIC_BASE + APIC_ID);
+
+	return GET_APIC_ID(reg);
 }
-#else
-extern unsigned int read_apic_id(void);
 #endif
 
 
-# ifdef APIC_DEFINITION
+# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
 extern int hard_smp_processor_id(void);
 # else
-#  include <mach_apicdef.h>
+#include <mach_apicdef.h>
 static inline int hard_smp_processor_id(void)
 {
 	/* we don't want to mark this access volatile - bad code generation */
-	return GET_APIC_ID(read_apic_id());
+	return read_apic_id();
 }
 # endif /* APIC_DEFINITION */
 
diff --git a/include/asm-x86/mach-summit/mach_apic.h b/include/asm-x86/summit/apic.h
similarity index 92%
rename from include/asm-x86/mach-summit/mach_apic.h
rename to include/asm-x86/summit/apic.h
index 7a66758..c5b2e4b 100644
--- a/include/asm-x86/mach-summit/mach_apic.h
+++ b/include/asm-x86/summit/apic.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_SUMMIT__MACH_APIC_H
-#define ASM_X86__MACH_SUMMIT__MACH_APIC_H
+#ifndef __ASM_SUMMIT_APIC_H
+#define __ASM_SUMMIT_APIC_H
 
 #include <asm/smp.h>
 
@@ -21,7 +21,7 @@
 	 * Just start on cpu 0.  IRQ balancing will spread load
 	 */
 	return cpumask_of_cpu(0);
-} 
+}
 #define TARGET_CPUS	(target_cpus())
 
 #define INT_DELIVERY_MODE (dest_LowestPrio)
@@ -30,10 +30,10 @@
 static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
 {
 	return 0;
-} 
+}
 
 /* we don't use the phys_cpu_present_map to indicate apicid presence */
-static inline unsigned long check_apicid_present(int bit) 
+static inline unsigned long check_apicid_present(int bit)
 {
 	return 1;
 }
@@ -122,7 +122,7 @@
 
 static inline physid_mask_t apicid_to_cpu_present(int apicid)
 {
-	return physid_mask_of_physid(apicid);
+	return physid_mask_of_physid(0);
 }
 
 static inline void setup_portio_remap(void)
@@ -143,22 +143,22 @@
 	int num_bits_set;
 	int cpus_found = 0;
 	int cpu;
-	int apicid;	
+	int apicid;
 
 	num_bits_set = cpus_weight(cpumask);
 	/* Return id to all */
 	if (num_bits_set == NR_CPUS)
 		return (int) 0xFF;
-	/* 
-	 * The cpus in the mask must all be on the apic cluster.  If are not 
-	 * on the same apicid cluster return default value of TARGET_CPUS. 
+	/*
+	 * The cpus in the mask must all be on the apic cluster.  If are not
+	 * on the same apicid cluster return default value of TARGET_CPUS.
 	 */
 	cpu = first_cpu(cpumask);
 	apicid = cpu_to_logical_apicid(cpu);
 	while (cpus_found < num_bits_set) {
 		if (cpu_isset(cpu, cpumask)) {
 			int new_apicid = cpu_to_logical_apicid(cpu);
-			if (apicid_cluster(apicid) != 
+			if (apicid_cluster(apicid) !=
 					apicid_cluster(new_apicid)){
 				printk ("%s: Not a valid mask!\n",__FUNCTION__);
 				return 0xFF;
@@ -182,4 +182,4 @@
 	return hard_smp_processor_id() >> index_msb;
 }
 
-#endif /* ASM_X86__MACH_SUMMIT__MACH_APIC_H */
+#endif /* __ASM_SUMMIT_APIC_H */
diff --git a/include/asm-x86/summit/apicdef.h b/include/asm-x86/summit/apicdef.h
new file mode 100644
index 0000000..f3fbca1
--- /dev/null
+++ b/include/asm-x86/summit/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_SUMMIT_APICDEF_H
+#define __ASM_SUMMIT_APICDEF_H
+
+#define		APIC_ID_MASK		(0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+	return (x>>24)&0xFF;
+}
+
+#define		GET_APIC_ID(x)	get_apic_id(x)
+
+#endif
diff --git a/include/asm-x86/mach-summit/mach_ipi.h b/include/asm-x86/summit/ipi.h
similarity index 77%
rename from include/asm-x86/mach-summit/mach_ipi.h
rename to include/asm-x86/summit/ipi.h
index a3b31c5..53bd1e7 100644
--- a/include/asm-x86/mach-summit/mach_ipi.h
+++ b/include/asm-x86/summit/ipi.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_SUMMIT__MACH_IPI_H
-#define ASM_X86__MACH_SUMMIT__MACH_IPI_H
+#ifndef __ASM_SUMMIT_IPI_H
+#define __ASM_SUMMIT_IPI_H
 
 void send_IPI_mask_sequence(cpumask_t mask, int vector);
 
@@ -22,4 +22,4 @@
 	send_IPI_mask(cpu_online_map, vector);
 }
 
-#endif /* ASM_X86__MACH_SUMMIT__MACH_IPI_H */
+#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/include/asm-x86/mach-summit/irq_vectors_limits.h b/include/asm-x86/summit/irq_vectors_limits.h
similarity index 65%
rename from include/asm-x86/mach-summit/irq_vectors_limits.h
rename to include/asm-x86/summit/irq_vectors_limits.h
index 22f376a..890ce3f 100644
--- a/include/asm-x86/mach-summit/irq_vectors_limits.h
+++ b/include/asm-x86/summit/irq_vectors_limits.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86__MACH_SUMMIT__IRQ_VECTORS_LIMITS_H
-#define ASM_X86__MACH_SUMMIT__IRQ_VECTORS_LIMITS_H
+#ifndef _ASM_IRQ_VECTORS_LIMITS_H
+#define _ASM_IRQ_VECTORS_LIMITS_H
 
 /*
  * For Summit or generic (i.e. installer) kernels, we have lots of I/O APICs,
@@ -11,4 +11,4 @@
 #define NR_IRQS	224
 #define NR_IRQ_VECTORS	1024
 
-#endif /* ASM_X86__MACH_SUMMIT__IRQ_VECTORS_LIMITS_H */
+#endif /* _ASM_IRQ_VECTORS_LIMITS_H */
diff --git a/include/asm-x86/mach-summit/mach_mpparse.h b/include/asm-x86/summit/mpparse.h
similarity index 94%
rename from include/asm-x86/mach-summit/mach_mpparse.h
rename to include/asm-x86/summit/mpparse.h
index 92396f2..013ce6f 100644
--- a/include/asm-x86/mach-summit/mach_mpparse.h
+++ b/include/asm-x86/summit/mpparse.h
@@ -1,7 +1,6 @@
-#ifndef ASM_X86__MACH_SUMMIT__MACH_MPPARSE_H
-#define ASM_X86__MACH_SUMMIT__MACH_MPPARSE_H
+#ifndef __ASM_SUMMIT_MPPARSE_H
+#define __ASM_SUMMIT_MPPARSE_H
 
-#include <mach_apic.h>
 #include <asm/tsc.h>
 
 extern int use_cyclone;
@@ -12,11 +11,11 @@
 #define setup_summit()	{}
 #endif
 
-static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, 
+static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
 		char *productid)
 {
-	if (!strncmp(oem, "IBM ENSW", 8) && 
-			(!strncmp(productid, "VIGIL SMP", 9) 
+	if (!strncmp(oem, "IBM ENSW", 8) &&
+			(!strncmp(productid, "VIGIL SMP", 9)
 			 || !strncmp(productid, "EXA", 3)
 			 || !strncmp(productid, "RUTHLESS SMP", 12))){
 		mark_tsc_unstable("Summit based system");
@@ -107,4 +106,4 @@
 		rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
 }
 
-#endif /* ASM_X86__MACH_SUMMIT__MACH_MPPARSE_H */
+#endif /* __ASM_SUMMIT_MPPARSE_H */
diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h
index 4db0066..3f4e52b 100644
--- a/include/asm-x86/thread_info.h
+++ b/include/asm-x86/thread_info.h
@@ -241,6 +241,7 @@
 #define TS_POLLING		0x0004	/* true if in idle loop
 					   and not sleeping */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
+#define TS_XSAVE		0x0010	/* Use xsave/xrstor */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
diff --git a/include/asm-x86/ucontext.h b/include/asm-x86/ucontext.h
index 9948dd3..89eaa54 100644
--- a/include/asm-x86/ucontext.h
+++ b/include/asm-x86/ucontext.h
@@ -1,6 +1,12 @@
 #ifndef ASM_X86__UCONTEXT_H
 #define ASM_X86__UCONTEXT_H
 
+#define UC_FP_XSTATE	0x1	/* indicates the presence of extended state
+				 * information in the memory layout pointed
+				 * by the fpstate pointer in the ucontext's
+				 * sigcontext struct (uc_mcontext).
+				 */
+
 struct ucontext {
 	unsigned long	  uc_flags;
 	struct ucontext  *uc_link;
diff --git a/include/asm-x86/xcr.h b/include/asm-x86/xcr.h
new file mode 100644
index 0000000..f2cba4e7
--- /dev/null
+++ b/include/asm-x86/xcr.h
@@ -0,0 +1,49 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2008 rPath, Inc. - All Rights Reserved
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * asm-x86/xcr.h
+ *
+ * Definitions for the eXtended Control Register instructions
+ */
+
+#ifndef _ASM_X86_XCR_H
+#define _ASM_X86_XCR_H
+
+#define XCR_XFEATURE_ENABLED_MASK	0x00000000
+
+#ifdef __KERNEL__
+# ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+static inline u64 xgetbv(u32 index)
+{
+	u32 eax, edx;
+
+	asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+		     : "=a" (eax), "=d" (edx)
+		     : "c" (index));
+	return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+	u32 eax = value;
+	u32 edx = value >> 32;
+
+	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
+		     : : "a" (eax), "d" (edx), "c" (index));
+}
+
+# endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_XCR_H */
diff --git a/include/asm-x86/xsave.h b/include/asm-x86/xsave.h
new file mode 100644
index 0000000..08e9a1a
--- /dev/null
+++ b/include/asm-x86/xsave.h
@@ -0,0 +1,118 @@
+#ifndef __ASM_X86_XSAVE_H
+#define __ASM_X86_XSAVE_H
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+
+#define XSTATE_FP	0x1
+#define XSTATE_SSE	0x2
+
+#define XSTATE_FPSSE	(XSTATE_FP | XSTATE_SSE)
+
+#define FXSAVE_SIZE	512
+
+/*
+ * These are the features that the OS can handle currently.
+ */
+#define XCNTXT_MASK	(XSTATE_FP | XSTATE_SSE)
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX	"0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+extern unsigned int xstate_size;
+extern u64 pcntxt_mask;
+extern struct xsave_struct *init_xstate_buf;
+
+extern void xsave_cntxt_init(void);
+extern void xsave_init(void);
+extern int init_fpu(struct task_struct *child);
+extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
+			    void __user *fpstate,
+			    struct _fpx_sw_bytes *sw);
+
+static inline int xrstor_checking(struct xsave_struct *fx)
+{
+	int err;
+
+	asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err)
+		     : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+		     : "memory");
+
+	return err;
+}
+
+static inline int xsave_user(struct xsave_struct __user *buf)
+{
+	int err;
+	__asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
+			     "2:\n"
+			     ".section .fixup,\"ax\"\n"
+			     "3:  movl $-1,%[err]\n"
+			     "    jmp  2b\n"
+			     ".previous\n"
+			     ".section __ex_table,\"a\"\n"
+			     _ASM_ALIGN "\n"
+			     _ASM_PTR "1b,3b\n"
+			     ".previous"
+			     : [err] "=r" (err)
+			     : "D" (buf), "a" (-1), "d" (-1), "0" (0)
+			     : "memory");
+	if (unlikely(err) && __clear_user(buf, xstate_size))
+		err = -EFAULT;
+	/* No need to clear here because the caller clears USED_MATH */
+	return err;
+}
+
+static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
+{
+	int err;
+	struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+
+	__asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
+			     "2:\n"
+			     ".section .fixup,\"ax\"\n"
+			     "3:  movl $-1,%[err]\n"
+			     "    jmp  2b\n"
+			     ".previous\n"
+			     ".section __ex_table,\"a\"\n"
+			     _ASM_ALIGN "\n"
+			     _ASM_PTR "1b,3b\n"
+			     ".previous"
+			     : [err] "=r" (err)
+			     : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
+			     : "memory");	/* memory required? */
+	return err;
+}
+
+static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
+{
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+
+	asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
+		     : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+		     :   "memory");
+}
+
+static inline void xsave(struct task_struct *tsk)
+{
+	/* This, however, we can work around by forcing the compiler to select
+	   an addressing mode that doesn't require extended registers. */
+	__asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
+			     : : "D" (&(tsk->thread.xstate->xsave)),
+				 "a" (-1), "d"(-1) : "memory");
+}
+#endif
diff --git a/include/linux/ata.h b/include/linux/ata.h
index be00973d..a53318b 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -30,6 +30,7 @@
 #define __LINUX_ATA_H__
 
 #include <linux/types.h>
+#include <asm/byteorder.h>
 
 /* defines only for the constants which don't work well as enums */
 #define ATA_DMA_BOUNDARY	0xffffUL
@@ -558,6 +559,15 @@
 	return id[ATA_ID_COMMAND_SET_2] & (1 << 12);
 }
 
+static inline int ata_id_flush_enabled(const u16 *id)
+{
+	if (ata_id_has_flush(id) == 0)
+		return 0;
+	if ((id[ATA_ID_CSF_DEFAULT] & 0xC000) != 0x4000)
+		return 0;
+	return id[ATA_ID_CFS_ENABLE_2] & (1 << 12);
+}
+
 static inline int ata_id_has_flush_ext(const u16 *id)
 {
 	if ((id[ATA_ID_COMMAND_SET_2] & 0xC000) != 0x4000)
@@ -565,6 +575,19 @@
 	return id[ATA_ID_COMMAND_SET_2] & (1 << 13);
 }
 
+static inline int ata_id_flush_ext_enabled(const u16 *id)
+{
+	if (ata_id_has_flush_ext(id) == 0)
+		return 0;
+	if ((id[ATA_ID_CSF_DEFAULT] & 0xC000) != 0x4000)
+		return 0;
+	/*
+	 * some Maxtor disks have bit 13 defined incorrectly
+	 * so check bit 10 too
+	 */
+	return (id[ATA_ID_CFS_ENABLE_2] & 0x2400) == 0x2400;
+}
+
 static inline int ata_id_has_lba48(const u16 *id)
 {
 	if ((id[ATA_ID_COMMAND_SET_2] & 0xC000) != 0x4000)
@@ -574,6 +597,15 @@
 	return id[ATA_ID_COMMAND_SET_2] & (1 << 10);
 }
 
+static inline int ata_id_lba48_enabled(const u16 *id)
+{
+	if (ata_id_has_lba48(id) == 0)
+		return 0;
+	if ((id[ATA_ID_CSF_DEFAULT] & 0xC000) != 0x4000)
+		return 0;
+	return id[ATA_ID_CFS_ENABLE_2] & (1 << 10);
+}
+
 static inline int ata_id_hpa_enabled(const u16 *id)
 {
 	/* Yes children, word 83 valid bits cover word 82 data */
@@ -645,7 +677,15 @@
 
 static inline int ata_id_is_sata(const u16 *id)
 {
-	return ata_id_major_version(id) >= 5 && id[ATA_ID_HW_CONFIG] == 0;
+	/*
+	 * See if word 93 is 0 AND drive is at least ATA-5 compatible
+	 * verifying that word 80 by casting it to a signed type --
+	 * this trick allows us to filter out the reserved values of
+	 * 0x0000 and 0xffff along with the earlier ATA revisions...
+	 */
+	if (id[ATA_ID_HW_CONFIG] == 0 && (short)id[ATA_ID_MAJOR_VER] >= 0x0020)
+		return 1;
+	return 0;
 }
 
 static inline int ata_id_has_tpm(const u16 *id)
@@ -742,6 +782,76 @@
 	return ata_id_major_version(dev_id) >= 7 && (dev_id[62] & 0x8000);
 }
 
+/*
+ * ata_id_is_lba_capacity_ok() performs a sanity check on
+ * the claimed LBA capacity value for the device.
+ *
+ * Returns 1 if LBA capacity looks sensible, 0 otherwise.
+ *
+ * It is called only once for each device.
+ */
+static inline int ata_id_is_lba_capacity_ok(u16 *id)
+{
+	unsigned long lba_sects, chs_sects, head, tail;
+
+	/* No non-LBA info .. so valid! */
+	if (id[ATA_ID_CYLS] == 0)
+		return 1;
+
+	lba_sects = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
+
+	/*
+	 * The ATA spec tells large drives to return
+	 * C/H/S = 16383/16/63 independent of their size.
+	 * Some drives can be jumpered to use 15 heads instead of 16.
+	 * Some drives can be jumpered to use 4092 cyls instead of 16383.
+	 */
+	if ((id[ATA_ID_CYLS] == 16383 ||
+	     (id[ATA_ID_CYLS] == 4092 && id[ATA_ID_CUR_CYLS] == 16383)) &&
+	    id[ATA_ID_SECTORS] == 63 &&
+	    (id[ATA_ID_HEADS] == 15 || id[ATA_ID_HEADS] == 16) &&
+	    (lba_sects >= 16383 * 63 * id[ATA_ID_HEADS]))
+		return 1;
+
+	chs_sects = id[ATA_ID_CYLS] * id[ATA_ID_HEADS] * id[ATA_ID_SECTORS];
+
+	/* perform a rough sanity check on lba_sects: within 10% is OK */
+	if (lba_sects - chs_sects < chs_sects/10)
+		return 1;
+
+	/* some drives have the word order reversed */
+	head = (lba_sects >> 16) & 0xffff;
+	tail = lba_sects & 0xffff;
+	lba_sects = head | (tail << 16);
+
+	if (lba_sects - chs_sects < chs_sects/10) {
+		*(__le32 *)&id[ATA_ID_LBA_CAPACITY] = __cpu_to_le32(lba_sects);
+		return 1;	/* LBA capacity is (now) good */
+	}
+
+	return 0;	/* LBA capacity value may be bad */
+}
+
+static inline void ata_id_to_hd_driveid(u16 *id)
+{
+#ifdef __BIG_ENDIAN
+	/* accessed in struct hd_driveid as 8-bit values */
+	id[ATA_ID_MAX_MULTSECT]	 = __cpu_to_le16(id[ATA_ID_MAX_MULTSECT]);
+	id[ATA_ID_CAPABILITY]	 = __cpu_to_le16(id[ATA_ID_CAPABILITY]);
+	id[ATA_ID_OLD_PIO_MODES] = __cpu_to_le16(id[ATA_ID_OLD_PIO_MODES]);
+	id[ATA_ID_OLD_DMA_MODES] = __cpu_to_le16(id[ATA_ID_OLD_DMA_MODES]);
+	id[ATA_ID_MULTSECT]	 = __cpu_to_le16(id[ATA_ID_MULTSECT]);
+
+	/* as 32-bit values */
+	*(u32 *)&id[ATA_ID_LBA_CAPACITY] = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
+	*(u32 *)&id[ATA_ID_SPG]		 = ata_id_u32(id, ATA_ID_SPG);
+
+	/* as 64-bit value */
+	*(u64 *)&id[ATA_ID_LBA_CAPACITY_2] =
+		ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
+#endif
+}
+
 static inline int is_multi_taskfile(struct ata_taskfile *tf)
 {
 	return (tf->command == ATA_CMD_READ_MULTI) ||
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 56c73b8..c360c55 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -25,9 +25,99 @@
 #include <linux/types.h>
 #include <linux/msi.h>
 
-#ifdef CONFIG_DMAR
+#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP)
 struct intel_iommu;
 
+struct dmar_drhd_unit {
+	struct list_head list;		/* list of drhd units	*/
+	struct  acpi_dmar_header *hdr;	/* ACPI header		*/
+	u64	reg_base_addr;		/* register base address*/
+	struct	pci_dev **devices; 	/* target device array	*/
+	int	devices_cnt;		/* target device count	*/
+	u8	ignored:1; 		/* ignore drhd		*/
+	u8	include_all:1;
+	struct intel_iommu *iommu;
+};
+
+extern struct list_head dmar_drhd_units;
+
+#define for_each_drhd_unit(drhd) \
+	list_for_each_entry(drhd, &dmar_drhd_units, list)
+
+extern int dmar_table_init(void);
+extern int early_dmar_detect(void);
+extern int dmar_dev_scope_init(void);
+
+/* Intel IOMMU detection */
+extern void detect_intel_iommu(void);
+
+
+extern int parse_ioapics_under_ir(void);
+extern int alloc_iommu(struct dmar_drhd_unit *);
+#else
+static inline void detect_intel_iommu(void)
+{
+	return;
+}
+
+static inline int dmar_table_init(void)
+{
+	return -ENODEV;
+}
+#endif /* !CONFIG_DMAR && !CONFIG_INTR_REMAP */
+
+#ifdef CONFIG_INTR_REMAP
+extern int intr_remapping_enabled;
+extern int enable_intr_remapping(int);
+
+struct irte {
+	union {
+		struct {
+			__u64	present 	: 1,
+				fpd		: 1,
+				dst_mode	: 1,
+				redir_hint	: 1,
+				trigger_mode	: 1,
+				dlvry_mode	: 3,
+				avail		: 4,
+				__reserved_1	: 4,
+				vector		: 8,
+				__reserved_2	: 8,
+				dest_id		: 32;
+		};
+		__u64 low;
+	};
+
+	union {
+		struct {
+			__u64	sid		: 16,
+				sq		: 2,
+				svt		: 2,
+				__reserved_3	: 44;
+		};
+		__u64 high;
+	};
+};
+extern int get_irte(int irq, struct irte *entry);
+extern int modify_irte(int irq, struct irte *irte_modified);
+extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
+extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
+   			u16 sub_handle);
+extern int map_irq_to_irte_handle(int irq, u16 *sub_handle);
+extern int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index);
+extern int flush_irte(int irq);
+extern int free_irte(int irq);
+
+extern int irq_remapped(int irq);
+extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
+extern struct intel_iommu *map_ioapic_to_ir(int apic);
+#else
+#define irq_remapped(irq)		(0)
+#define enable_intr_remapping(mode)	(-1)
+#define intr_remapping_enabled		(0)
+#endif
+
+#ifdef CONFIG_DMAR
 extern const char *dmar_get_fault_reason(u8 fault_reason);
 
 /* Can't use the common MSI interrupt functions
@@ -40,47 +130,30 @@
 extern int dmar_set_interrupt(struct intel_iommu *iommu);
 extern int arch_setup_dmar_msi(unsigned int irq);
 
-/* Intel IOMMU detection and initialization functions */
-extern void detect_intel_iommu(void);
-extern int intel_iommu_init(void);
-
-extern int dmar_table_init(void);
-extern int early_dmar_detect(void);
-
-extern struct list_head dmar_drhd_units;
+extern int iommu_detected, no_iommu;
 extern struct list_head dmar_rmrr_units;
-
-struct dmar_drhd_unit {
-	struct list_head list;		/* list of drhd units	*/
-	u64	reg_base_addr;		/* register base address*/
-	struct	pci_dev **devices; 	/* target device array	*/
-	int	devices_cnt;		/* target device count	*/
-	u8	ignored:1; 		/* ignore drhd		*/
-	u8	include_all:1;
-	struct intel_iommu *iommu;
-};
-
 struct dmar_rmrr_unit {
 	struct list_head list;		/* list of rmrr units	*/
+	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 	u64	base_address;		/* reserved base address*/
 	u64	end_address;		/* reserved end address */
 	struct pci_dev **devices;	/* target devices */
 	int	devices_cnt;		/* target device count */
 };
 
-#define for_each_drhd_unit(drhd) \
-	list_for_each_entry(drhd, &dmar_drhd_units, list)
 #define for_each_rmrr_units(rmrr) \
 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
+/* Intel DMAR  initialization functions */
+extern int intel_iommu_init(void);
+extern int dmar_disabled;
 #else
-static inline void detect_intel_iommu(void)
-{
-	return;
-}
 static inline int intel_iommu_init(void)
 {
+#ifdef CONFIG_INTR_REMAP
+	return dmar_dev_scope_init();
+#else
 	return -ENODEV;
+#endif
 }
-
 #endif /* !CONFIG_DMAR */
 #endif /* __DMAR_H__ */
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 80171ee..8120fa1 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -837,6 +837,8 @@
 extern void ext3_set_inode_flags(struct inode *);
 extern void ext3_get_inode_flags(struct ext3_inode_info *);
 extern void ext3_set_aops(struct inode *inode);
+extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len);
 
 /* ioctl.c */
 extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h
new file mode 100644
index 0000000..671decb
--- /dev/null
+++ b/include/linux/fiemap.h
@@ -0,0 +1,64 @@
+/*
+ * FS_IOC_FIEMAP ioctl infrastructure.
+ *
+ * Some portions copyright (C) 2007 Cluster File Systems, Inc
+ *
+ * Authors: Mark Fasheh <mfasheh@suse.com>
+ *          Kalpak Shah <kalpak.shah@sun.com>
+ *          Andreas Dilger <adilger@sun.com>
+ */
+
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+struct fiemap_extent {
+	__u64 fe_logical;  /* logical offset in bytes for the start of
+			    * the extent from the beginning of the file */
+	__u64 fe_physical; /* physical offset in bytes for the start
+			    * of the extent from the beginning of the disk */
+	__u64 fe_length;   /* length in bytes for this extent */
+	__u64 fe_reserved64[2];
+	__u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+	__u32 fe_reserved[3];
+};
+
+struct fiemap {
+	__u64 fm_start;		/* logical offset (inclusive) at
+				 * which to start mapping (in) */
+	__u64 fm_length;	/* logical length of mapping which
+				 * userspace wants (in) */
+	__u32 fm_flags;		/* FIEMAP_FLAG_* flags for request (in/out) */
+	__u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+	__u32 fm_extent_count;  /* size of fm_extents array (in) */
+	__u32 fm_reserved;
+	struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET	(~0ULL)
+
+#define FIEMAP_FLAG_SYNC	0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR	0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_FLAGS_COMPAT	(FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
+
+#define FIEMAP_EXTENT_LAST		0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN		0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC		0x00000004 /* Location still pending.
+						    * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED		0x00000008 /* Data can not be read
+						    * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED	0x00000080 /* Data is encrypted by fs.
+						    * Sets EXTENT_NO_BYPASS. */
+#define FIEMAP_EXTENT_NOT_ALIGNED	0x00000100 /* Extent offsets may not be
+						    * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE	0x00000200 /* Data mixed with metadata.
+						    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL		0x00000400 /* Multiple files in block.
+						    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN		0x00000800 /* Space allocated, but
+						    * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED		0x00001000 /* File does not natively
+						    * support extents. Result
+						    * merged for efficiency. */
+
+#endif /* _LINUX_FIEMAP_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 32477e8..44e3cb2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -234,6 +234,7 @@
 #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
 #define	FS_IOC_GETVERSION		_IOR('v', 1, long)
 #define	FS_IOC_SETVERSION		_IOW('v', 2, long)
+#define FS_IOC_FIEMAP			_IOWR('f', 11, struct fiemap)
 #define FS_IOC32_GETFLAGS		_IOR('f', 1, int)
 #define FS_IOC32_SETFLAGS		_IOW('f', 2, int)
 #define FS_IOC32_GETVERSION		_IOR('v', 1, int)
@@ -294,6 +295,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/semaphore.h>
+#include <linux/fiemap.h>
 
 #include <asm/atomic.h>
 #include <asm/byteorder.h>
@@ -1182,6 +1184,20 @@
 extern int file_permission(struct file *, int);
 
 /*
+ * VFS FS_IOC_FIEMAP helper definitions.
+ */
+struct fiemap_extent_info {
+	unsigned int fi_flags;		/* Flags as passed from user */
+	unsigned int fi_extents_mapped;	/* Number of mapped extents */
+	unsigned int fi_extents_max;	/* Size of fiemap_extent array */
+	struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent
+						 * array */
+};
+int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
+			    u64 phys, u64 len, u32 flags);
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
+
+/*
  * File types
  *
  * NOTE! These match bits 12..15 of stat.st_mode
@@ -1290,6 +1306,8 @@
 	void (*truncate_range)(struct inode *, loff_t, loff_t);
 	long (*fallocate)(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
+	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
+		      u64 len);
 };
 
 struct seq_file;
@@ -1987,6 +2005,9 @@
 
 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
+extern int generic_block_fiemap(struct inode *inode,
+				struct fiemap_extent_info *fieinfo, u64 start,
+				u64 len, get_block_t *get_block);
 
 extern void get_filesystem(struct file_system_type *fs);
 extern void put_filesystem(struct file_system_type *fs);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 6514db8..a9d82d6 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -8,7 +8,7 @@
 
 #include <linux/init.h>
 #include <linux/ioport.h>
-#include <linux/hdreg.h>
+#include <linux/ata.h>
 #include <linux/blkdev.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
@@ -17,6 +17,7 @@
 #include <linux/device.h>
 #include <linux/pci.h>
 #include <linux/completion.h>
+#include <linux/pm.h>
 #ifdef CONFIG_BLK_DEV_IDEACPI
 #include <acpi/acpi.h>
 #endif
@@ -87,12 +88,13 @@
 };
 
 #define OK_STAT(stat,good,bad)	(((stat)&((good)|(bad)))==(good))
-#define BAD_R_STAT		(BUSY_STAT   | ERR_STAT)
-#define BAD_W_STAT		(BAD_R_STAT  | WRERR_STAT)
-#define BAD_STAT		(BAD_R_STAT  | DRQ_STAT)
-#define DRIVE_READY		(READY_STAT  | SEEK_STAT)
 
-#define BAD_CRC			(ABRT_ERR    | ICRC_ERR)
+#define BAD_R_STAT	(ATA_BUSY | ATA_ERR)
+#define BAD_W_STAT	(BAD_R_STAT | ATA_DF)
+#define BAD_STAT	(BAD_R_STAT | ATA_DRQ)
+#define DRIVE_READY	(ATA_DRDY | ATA_DSC)
+
+#define BAD_CRC		(ATA_ABORTED | ATA_ICRC)
 
 #define SATA_NR_PORTS		(3)	/* 16 possible ?? */
 
@@ -125,24 +127,41 @@
 #define PARTN_BITS	6	/* number of minor dev bits for partitions */
 #define MAX_DRIVES	2	/* per interface; 2 assumed by lots of code */
 #define SECTOR_SIZE	512
-#define SECTOR_WORDS	(SECTOR_SIZE / 4)	/* number of 32bit words per sector */
+
 #define IDE_LARGE_SEEK(b1,b2,t)	(((b1) > (b2) + (t)) || ((b2) > (b1) + (t)))
 
 /*
  * Timeouts for various operations:
  */
-#define WAIT_DRQ	(HZ/10)		/* 100msec - spec allows up to 20ms */
-#define WAIT_READY	(5*HZ)		/* 5sec - some laptops are very slow */
-#define WAIT_PIDENTIFY	(10*HZ)	/* 10sec  - should be less than 3ms (?), if all ATAPI CD is closed at boot */
-#define WAIT_WORSTCASE	(30*HZ)	/* 30sec  - worst case when spinning up */
-#define WAIT_CMD	(10*HZ)	/* 10sec  - maximum wait for an IRQ to happen */
-#define WAIT_MIN_SLEEP	(2*HZ/100)	/* 20msec - minimum sleep time */
+enum {
+	/* spec allows up to 20ms */
+	WAIT_DRQ	= HZ / 10,	/* 100ms */
+	/* some laptops are very slow */
+	WAIT_READY	= 5 * HZ,	/* 5s */
+	/* should be less than 3ms (?), if all ATAPI CD is closed at boot */
+	WAIT_PIDENTIFY	= 10 * HZ,	/* 10s */
+	/* worst case when spinning up */
+	WAIT_WORSTCASE	= 30 * HZ,	/* 30s */
+	/* maximum wait for an IRQ to happen */
+	WAIT_CMD	= 10 * HZ,	/* 10s */
+	/* Some drives require a longer IRQ timeout. */
+	WAIT_FLOPPY_CMD	= 50 * HZ,	/* 50s */
+	/*
+	 * Some drives (for example, Seagate STT3401A Travan) require a very
+	 * long timeout, because they don't return an interrupt or clear their
+	 * BSY bit until after the command completes (even retension commands).
+	 */
+	WAIT_TAPE_CMD	= 900 * HZ,	/* 900s */
+	/* minimum sleep time */
+	WAIT_MIN_SLEEP	= HZ / 50,	/* 20ms */
+};
 
 /*
  * Op codes for special requests to be handled by ide_special_rq().
  * Values should be in the range of 0x20 to 0x3f.
  */
 #define REQ_DRIVE_RESET		0x20
+#define REQ_DEVSET_EXEC		0x21
 
 /*
  * Check for an interrupt and acknowledge the interrupt status
@@ -303,8 +322,8 @@
 	ide_started,	/* a drive operation was started, handler was set */
 } ide_startstop_t;
 
+struct ide_devset;
 struct ide_driver_s;
-struct ide_settings_s;
 
 #ifdef CONFIG_BLK_DEV_IDEACPI
 struct ide_acpi_drive_link;
@@ -315,10 +334,10 @@
 enum {
 	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
 	IDE_AFLAG_MEDIA_CHANGED		= (1 << 1),
-
-	/* ide-cd */
 	/* Drive cannot lock the door. */
 	IDE_AFLAG_NO_DOORLOCK		= (1 << 2),
+
+	/* ide-cd */
 	/* Drive cannot eject the disc. */
 	IDE_AFLAG_NO_EJECT		= (1 << 3),
 	/* Drive is a pre ATAPI 1.2 drive. */
@@ -354,21 +373,25 @@
 	IDE_AFLAG_CLIK_DRIVE		= (1 << 19),
 	/* Requires BH algorithm for packets */
 	IDE_AFLAG_ZIP_DRIVE		= (1 << 20),
+	/* Write protect */
+	IDE_AFLAG_WP			= (1 << 21),
+	/* Supports format progress report */
+	IDE_AFLAG_SRFP			= (1 << 22),
 
 	/* ide-tape */
-	IDE_AFLAG_IGNORE_DSC		= (1 << 21),
+	IDE_AFLAG_IGNORE_DSC		= (1 << 23),
 	/* 0 When the tape position is unknown */
-	IDE_AFLAG_ADDRESS_VALID		= (1 <<	22),
+	IDE_AFLAG_ADDRESS_VALID		= (1 <<	24),
 	/* Device already opened */
-	IDE_AFLAG_BUSY			= (1 << 23),
+	IDE_AFLAG_BUSY			= (1 << 25),
 	/* Attempt to auto-detect the current user block size */
-	IDE_AFLAG_DETECT_BS		= (1 << 24),
+	IDE_AFLAG_DETECT_BS		= (1 << 26),
 	/* Currently on a filemark */
-	IDE_AFLAG_FILEMARK		= (1 << 25),
+	IDE_AFLAG_FILEMARK		= (1 << 27),
 	/* 0 = no tape is loaded, so we don't rewind after ejecting */
-	IDE_AFLAG_MEDIUM_PRESENT	= (1 << 26),
+	IDE_AFLAG_MEDIUM_PRESENT	= (1 << 28),
 
-	IDE_AFLAG_NO_AUTOCLOSE		= (1 << 27),
+	IDE_AFLAG_NO_AUTOCLOSE		= (1 << 29),
 };
 
 struct ide_drive_s {
@@ -380,10 +403,10 @@
 	struct request		*rq;	/* current request */
 	struct ide_drive_s 	*next;	/* circular list of hwgroup drives */
 	void		*driver_data;	/* extra driver data */
-	struct hd_driveid	*id;	/* drive model identification info */
+	u16			*id;	/* identification info */
 #ifdef CONFIG_IDE_PROC_FS
 	struct proc_dir_entry *proc;	/* /proc/ide/ directory entry */
-	struct ide_settings_s *settings;/* /proc/ide/ drive settings */
+	const struct ide_proc_devset *settings; /* /proc/ide/ drive settings */
 #endif
 	struct hwif_s		*hwif;	/* actually (ide_hwif_t *) */
 
@@ -395,16 +418,16 @@
 	special_t	special;	/* special action flags */
 	select_t	select;		/* basic drive/head select reg value */
 
-	u8	keep_settings;		/* restore settings after drive reset */
-	u8	using_dma;		/* disk is using dma for read/write */
 	u8	retry_pio;		/* retrying dma capable host in pio */
 	u8	state;			/* retry state */
 	u8	waiting_for_dma;	/* dma currently in progress */
-	u8	unmask;			/* okay to unmask other irqs */
-	u8	noflush;		/* don't attempt flushes */
-	u8	dsc_overlap;		/* DSC overlap */
-	u8	nice1;			/* give potential excess bandwidth */
 
+	unsigned keep_settings	: 1;	/* restore settings after drive reset */
+	unsigned using_dma	: 1;	/* disk is using dma for read/write */
+	unsigned unmask		: 1;	/* okay to unmask other irqs */
+	unsigned noflush	: 1;	/* don't attempt flushes */
+	unsigned dsc_overlap	: 1;	/* DSC overlap */
+	unsigned nice1		: 1;	/* give potential excess bandwidth */
 	unsigned present	: 1;	/* drive is physically present */
 	unsigned dead		: 1;	/* device ejected hint */
 	unsigned id_read	: 1;	/* 1=id read from disk 0 = synthetic */
@@ -414,23 +437,22 @@
 	unsigned forced_geom	: 1;	/* 1 if hdx=c,h,s was given at boot */
 	unsigned no_unmask	: 1;	/* disallow setting unmask bit */
 	unsigned no_io_32bit	: 1;	/* disallow enabling 32bit I/O */
-	unsigned atapi_overlap	: 1;	/* ATAPI overlap (not supported) */
 	unsigned doorlocking	: 1;	/* for removable only: door lock/unlock works */
 	unsigned nodma		: 1;	/* disallow DMA */
-	unsigned remap_0_to_1	: 1;	/* 0=noremap, 1=remap 0->1 (for EZDrive) */
 	unsigned blocked        : 1;	/* 1=powermanagment told us not to do anything, so sleep nicely */
 	unsigned scsi		: 1;	/* 0=default, 1=ide-scsi emulation */
 	unsigned sleeping	: 1;	/* 1=sleeping & sleep field valid */
 	unsigned post_reset	: 1;
 	unsigned udma33_warned	: 1;
+	unsigned addressing	: 2;	/* 0=28-bit, 1=48-bit, 2=48-bit doing 28-bit */
+	unsigned wcache		: 1;	/* status of write cache */
+	unsigned nowerr		: 1;	/* used for ignoring ATA_DF */
 
-	u8	addressing;	/* 0=28-bit, 1=48-bit, 2=48-bit doing 28-bit */
         u8	quirk_list;	/* considered quirky, set for a specific host */
         u8	init_speed;	/* transfer rate set at boot */
         u8	current_speed;	/* current transfer rate set */
 	u8	desired_speed;	/* desired transfer rate set */
         u8	dn;		/* now wide spread use */
-        u8	wcache;		/* status of write cache */
 	u8	acoustic;	/* acoustic management */
 	u8	media;		/* disk, cdrom, tape, floppy, ... */
 	u8	ready_stat;	/* min status value for drive ready */
@@ -438,9 +460,7 @@
 	u8	mult_req;	/* requested multiple sector setting */
 	u8	tune_req;	/* requested drive tuning setting */
 	u8	io_32bit;	/* 0=16-bit, 1=32-bit, 2/3=32bit+sync */
-	u8	bad_wstat;	/* used for ignoring WRERR_STAT */
-	u8	nowerr;		/* used for ignoring WRERR_STAT */
-	u8	sect0;		/* offset of first sector for DM6:DDO */
+	u8	bad_wstat;	/* used for ignoring ATA_DF */
 	u8	head;		/* "real" number of heads */
 	u8	sect;		/* "real" sectors per track */
 	u8	bios_head;	/* BIOS/fdisk/LILO number of heads */
@@ -474,10 +494,6 @@
 
 #define to_ide_device(dev)container_of(dev, ide_drive_t, gendev)
 
-#define IDE_CHIPSET_PCI_MASK	\
-    ((1<<ide_pci)|(1<<ide_cmd646)|(1<<ide_ali14xx))
-#define IDE_CHIPSET_IS_PCI(c)	((IDE_CHIPSET_PCI_MASK >> (c)) & 1)
-
 struct ide_task_s;
 struct ide_port_info;
 
@@ -567,7 +583,6 @@
 	u8 major;	/* our major number */
 	u8 index;	/* 0 for ide0; 1 for ide1; ... */
 	u8 channel;	/* for dual-port chips: 0=primary, 1=secondary */
-	u8 bus_state;	/* power state of the IDE bus */
 
 	u32 host_flags;
 
@@ -645,6 +660,7 @@
 	ide_hwif_t	*ports[MAX_HWIFS];
 	unsigned int	n_ports;
 	struct device	*dev[2];
+	unsigned int	(*init_chipset)(struct pci_dev *);
 	unsigned long	host_flags;
 	void		*host_priv;
 };
@@ -692,9 +708,61 @@
 
 extern struct mutex ide_setting_mtx;
 
-int set_io_32bit(ide_drive_t *, int);
-int set_pio_mode(ide_drive_t *, int);
-int set_using_dma(ide_drive_t *, int);
+/*
+ * configurable drive settings
+ */
+
+#define DS_SYNC	(1 << 0)
+
+struct ide_devset {
+	int		(*get)(ide_drive_t *);
+	int		(*set)(ide_drive_t *, int);
+	unsigned int	flags;
+};
+
+#define __DEVSET(_flags, _get, _set) { \
+	.flags	= _flags, \
+	.get	= _get,	\
+	.set	= _set,	\
+}
+
+#define ide_devset_get(name, field) \
+static int get_##name(ide_drive_t *drive) \
+{ \
+	return drive->field; \
+}
+
+#define ide_devset_set(name, field) \
+static int set_##name(ide_drive_t *drive, int arg) \
+{ \
+	drive->field = arg; \
+	return 0; \
+}
+
+#define __IDE_DEVSET(_name, _flags, _get, _set) \
+const struct ide_devset ide_devset_##_name = \
+	__DEVSET(_flags, _get, _set)
+
+#define IDE_DEVSET(_name, _flags, _get, _set) \
+static __IDE_DEVSET(_name, _flags, _get, _set)
+
+#define ide_devset_rw(_name, _func) \
+IDE_DEVSET(_name, 0, get_##_func, set_##_func)
+
+#define ide_devset_w(_name, _func) \
+IDE_DEVSET(_name, 0, NULL, set_##_func)
+
+#define ide_devset_rw_sync(_name, _func) \
+IDE_DEVSET(_name, DS_SYNC, get_##_func, set_##_func)
+
+#define ide_decl_devset(_name) \
+extern const struct ide_devset ide_devset_##_name
+
+ide_decl_devset(io_32bit);
+ide_decl_devset(keepsettings);
+ide_decl_devset(pio_mode);
+ide_decl_devset(unmaskirq);
+ide_decl_devset(using_dma);
 
 /* ATAPI packet command flags */
 enum {
@@ -710,6 +778,12 @@
 	PC_FLAG_TIMEDOUT		= (1 << 7),
 };
 
+/*
+ * With each packet command, we allocate a buffer of IDE_PC_BUFFER_SIZE bytes.
+ * This is used for several packet commands (not for READ/WRITE commands).
+ */
+#define IDE_PC_BUFFER_SIZE	256
+
 struct ide_atapi_pc {
 	/* actual packet bytes */
 	u8 c[12];
@@ -739,7 +813,7 @@
 	 * those are more or less driver-specific and some of them are subject
 	 * to change/removal later.
 	 */
-	u8 pc_buf[256];
+	u8 pc_buf[IDE_PC_BUFFER_SIZE];
 
 	/* idetape only */
 	struct idetape_bh *bh;
@@ -757,37 +831,34 @@
 
 #ifdef CONFIG_IDE_PROC_FS
 /*
- * configurable drive settings
- */
-
-#define TYPE_INT	0
-#define TYPE_BYTE	1
-#define TYPE_SHORT	2
-
-#define SETTING_READ	(1 << 0)
-#define SETTING_WRITE	(1 << 1)
-#define SETTING_RW	(SETTING_READ | SETTING_WRITE)
-
-typedef int (ide_procset_t)(ide_drive_t *, int);
-typedef struct ide_settings_s {
-	char			*name;
-	int			rw;
-	int			data_type;
-	int			min;
-	int			max;
-	int			mul_factor;
-	int			div_factor;
-	void			*data;
-	ide_procset_t		*set;
-	int			auto_remove;
-	struct ide_settings_s	*next;
-} ide_settings_t;
-
-int ide_add_setting(ide_drive_t *, const char *, int, int, int, int, int, int, void *, ide_procset_t *set);
-
-/*
  * /proc/ide interface
  */
+
+#define ide_devset_rw_field(_name, _field) \
+ide_devset_get(_name, _field); \
+ide_devset_set(_name, _field); \
+IDE_DEVSET(_name, DS_SYNC, get_##_name, set_##_name)
+
+struct ide_proc_devset {
+	const char		*name;
+	const struct ide_devset	*setting;
+	int			min, max;
+	int			(*mulf)(ide_drive_t *);
+	int			(*divf)(ide_drive_t *);
+};
+
+#define __IDE_PROC_DEVSET(_name, _min, _max, _mulf, _divf) { \
+	.name = __stringify(_name), \
+	.setting = &ide_devset_##_name, \
+	.min = _min, \
+	.max = _max, \
+	.mulf = _mulf, \
+	.divf = _divf, \
+}
+
+#define IDE_PROC_DEVSET(_name, _min, _max) \
+__IDE_PROC_DEVSET(_name, _min, _max, NULL, NULL)
+
 typedef struct {
 	const char	*name;
 	mode_t		mode;
@@ -804,8 +875,6 @@
 void ide_proc_register_driver(ide_drive_t *, ide_driver_t *);
 void ide_proc_unregister_driver(ide_drive_t *, ide_driver_t *);
 
-void ide_add_generic_settings(ide_drive_t *);
-
 read_proc_t proc_ide_read_capacity;
 read_proc_t proc_ide_read_geometry;
 
@@ -833,7 +902,6 @@
 static inline void ide_proc_unregister_port(ide_hwif_t *hwif) { ; }
 static inline void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver) { ; }
 static inline void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) { ; }
-static inline void ide_add_generic_settings(ide_drive_t *drive) { ; }
 #define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) return 0;
 #endif
 
@@ -879,7 +947,6 @@
 struct ide_driver_s {
 	const char			*version;
 	u8				media;
-	unsigned supports_dsc_overlap	: 1;
 	ide_startstop_t	(*do_request)(ide_drive_t *, struct request *, sector_t);
 	int		(*end_request)(ide_drive_t *, int, int);
 	ide_startstop_t	(*error)(ide_drive_t *, struct request *rq, u8, u8);
@@ -889,7 +956,8 @@
 	void		(*resume)(ide_drive_t *);
 	void		(*shutdown)(ide_drive_t *);
 #ifdef CONFIG_IDE_PROC_FS
-	ide_proc_entry_t	*proc;
+	ide_proc_entry_t		*proc;
+	const struct ide_proc_devset	*settings;
 #endif
 };
 
@@ -898,7 +966,17 @@
 int ide_device_get(ide_drive_t *);
 void ide_device_put(ide_drive_t *);
 
-int generic_ide_ioctl(ide_drive_t *, struct file *, struct block_device *, unsigned, unsigned long);
+struct ide_ioctl_devset {
+	unsigned int	get_ioctl;
+	unsigned int	set_ioctl;
+	const struct ide_devset *setting;
+};
+
+int ide_setting_ioctl(ide_drive_t *, struct block_device *, unsigned int,
+		      unsigned long, const struct ide_ioctl_devset *);
+
+int generic_ide_ioctl(ide_drive_t *, struct file *, struct block_device *,
+		      unsigned, unsigned long);
 
 extern int ide_vlb_clk;
 extern int ide_pci_clk;
@@ -920,14 +998,19 @@
 
 ide_startstop_t ide_error (ide_drive_t *drive, const char *msg, byte stat);
 
-extern void ide_fix_driveid(struct hd_driveid *);
+void ide_fix_driveid(u16 *);
 
 extern void ide_fixstring(u8 *, const int, const int);
 
+int ide_busy_sleep(ide_hwif_t *, unsigned long, int);
+
 int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long);
 
 extern ide_startstop_t ide_do_reset (ide_drive_t *);
 
+extern int ide_devset_execute(ide_drive_t *drive,
+			      const struct ide_devset *setting, int arg);
+
 extern void ide_do_drive_cmd(ide_drive_t *, struct request *);
 
 extern void ide_end_drive_cmd(ide_drive_t *, u8, u8);
@@ -1051,6 +1134,8 @@
 void ide_input_data(ide_drive_t *, struct request *, void *, unsigned int);
 void ide_output_data(ide_drive_t *, struct request *, void *, unsigned int);
 
+int ide_io_buffers(ide_drive_t *, struct ide_atapi_pc *, unsigned int, int);
+
 extern void SELECT_DRIVE(ide_drive_t *);
 void SELECT_MASK(ide_drive_t *, int);
 
@@ -1061,11 +1146,36 @@
 
 void ide_pktcmd_tf_load(ide_drive_t *, u32, u16, u8);
 
+int ide_check_atapi_device(ide_drive_t *, const char *);
+
+void ide_init_pc(struct ide_atapi_pc *);
+
+/*
+ * Special requests for ide-tape block device strategy routine.
+ *
+ * In order to service a character device command, we add special requests to
+ * the tail of our block device request queue and wait for their completion.
+ */
+enum {
+	REQ_IDETAPE_PC1		= (1 << 0), /* packet command (first stage) */
+	REQ_IDETAPE_PC2		= (1 << 1), /* packet command (second stage) */
+	REQ_IDETAPE_READ	= (1 << 2),
+	REQ_IDETAPE_WRITE	= (1 << 3),
+};
+
+void ide_queue_pc_head(ide_drive_t *, struct gendisk *, struct ide_atapi_pc *,
+		       struct request *);
+int ide_queue_pc_tail(ide_drive_t *, struct gendisk *, struct ide_atapi_pc *);
+
+int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *);
+int ide_do_start_stop(ide_drive_t *, struct gendisk *, int);
+int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
+
 ide_startstop_t ide_pc_intr(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	ide_handler_t *handler, unsigned int timeout, ide_expiry_t *expiry,
 	void (*update_buffers)(ide_drive_t *, struct ide_atapi_pc *),
 	void (*retry_pc)(ide_drive_t *), void (*dsc_handle)(ide_drive_t *),
-	void (*io_buffers)(ide_drive_t *, struct ide_atapi_pc *, unsigned int,
+	int (*io_buffers)(ide_drive_t *, struct ide_atapi_pc *, unsigned int,
 			   int));
 ide_startstop_t ide_transfer_pc(ide_drive_t *, struct ide_atapi_pc *,
 				ide_handler_t *, unsigned int, ide_expiry_t *);
@@ -1080,8 +1190,6 @@
 int ide_no_data_taskfile(ide_drive_t *, ide_task_t *);
 
 int ide_taskfile_ioctl(ide_drive_t *, unsigned int, unsigned long);
-int ide_cmd_ioctl(ide_drive_t *, unsigned int, unsigned long);
-int ide_task_ioctl(ide_drive_t *, unsigned int, unsigned long);
 
 extern int ide_driveid_update(ide_drive_t *);
 extern int ide_config_drive_speed(ide_drive_t *, u8);
@@ -1092,7 +1200,6 @@
 
 extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout);
 
-extern int ide_spin_wait_hwgroup(ide_drive_t *);
 extern void ide_timer_expiry(unsigned long);
 extern irqreturn_t ide_intr(int irq, void *dev_id);
 extern void do_ide_request(struct request_queue *);
@@ -1229,6 +1336,14 @@
 		     const struct ide_port_info *, void *);
 void ide_pci_remove(struct pci_dev *);
 
+#ifdef CONFIG_PM
+int ide_pci_suspend(struct pci_dev *, pm_message_t);
+int ide_pci_resume(struct pci_dev *);
+#else
+#define ide_pci_suspend NULL
+#define ide_pci_resume NULL
+#endif
+
 void ide_map_sg(ide_drive_t *, struct request *);
 void ide_init_sg_cmd(ide_drive_t *, struct request *);
 
@@ -1240,7 +1355,7 @@
 	const char *id_firmware;
 };
 
-int ide_in_drive_list(struct hd_driveid *, const struct drive_list_entry *);
+int ide_in_drive_list(u16 *, const struct drive_list_entry *);
 
 #ifdef CONFIG_BLK_DEV_IDEDMA
 int __ide_dma_bad_drive(ide_drive_t *);
@@ -1347,24 +1462,6 @@
 extern void ide_toggle_bounce(ide_drive_t *drive, int on);
 extern int ide_set_xfer_rate(ide_drive_t *drive, u8 rate);
 
-static inline int ide_dev_has_iordy(struct hd_driveid *id)
-{
-	return ((id->field_valid & 2) && (id->capability & 8)) ? 1 : 0;
-}
-
-static inline int ide_dev_is_sata(struct hd_driveid *id)
-{
-	/*
-	 * See if word 93 is 0 AND drive is at least ATA-5 compatible
-	 * verifying that word 80 by casting it to a signed type --
-	 * this trick allows us to filter out the reserved values of
-	 * 0x0000 and 0xffff along with the earlier ATA revisions...
-	 */
-	if (id->hw_config == 0 && (short)id->major_rev_num >= 0x0020)
-		return 1;
-	return 0;
-}
-
 u64 ide_get_lba_addr(struct ide_taskfile *, int);
 u8 ide_dump_status(ide_drive_t *, const char *, u8);
 
@@ -1436,13 +1533,6 @@
 extern struct bus_type ide_bus_type;
 extern struct class *ide_port_class;
 
-/* check if CACHE FLUSH (EXT) command is supported (bits defined in ATA-6) */
-#define ide_id_has_flush_cache(id)	((id)->cfs_enable_2 & 0x3000)
-
-/* some Maxtor disks have bit 13 defined incorrectly so check bit 10 too */
-#define ide_id_has_flush_cache_ext(id)	\
-	(((id)->cfs_enable_2 & 0x2400) == 0x2400)
-
 static inline void ide_dump_identify(u8 *id)
 {
 	print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 2, id, 512, 0);
@@ -1453,10 +1543,10 @@
 	return hwif->dev ? dev_to_node(hwif->dev) : -1;
 }
 
-static inline ide_drive_t *ide_get_paired_drive(ide_drive_t *drive)
+static inline ide_drive_t *ide_get_pair_dev(ide_drive_t *drive)
 {
-	ide_hwif_t *hwif	= HWIF(drive);
+	ide_drive_t *peer = &drive->hwif->drives[(drive->dn ^ 1) & 1];
 
-	return &hwif->drives[(drive->dn ^ 1) & 1];
+	return peer->present ? peer : NULL;
 }
 #endif /* _IDE_H */
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 350033e..ee9bcc6 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -108,6 +108,9 @@
 
 extern int request_resource(struct resource *root, struct resource *new);
 extern int release_resource(struct resource *new);
+extern void reserve_region_with_split(struct resource *root,
+			     resource_size_t start, resource_size_t end,
+			     const char *name);
 extern int insert_resource(struct resource *parent, struct resource *new);
 extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new);
 extern int allocate_resource(struct resource *root, struct resource *new,
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8ccb462..8d9411b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -62,6 +62,7 @@
 #define IRQ_MOVE_PENDING	0x00200000	/* need to re-target IRQ destination */
 #define IRQ_NO_BALANCING	0x00400000	/* IRQ is excluded from balancing */
 #define IRQ_SPURIOUS_DISABLED	0x00800000	/* IRQ was disabled by the spurious trap */
+#define IRQ_MOVE_PCNTXT	0x01000000	/* IRQ migration from process context */
 
 #ifdef CONFIG_IRQ_PER_CPU
 # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 3dd2090..66c3499 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -850,7 +850,8 @@
 	 */
 	struct block_device	*j_dev;
 	int			j_blocksize;
-	unsigned long long		j_blk_offset;
+	unsigned long long	j_blk_offset;
+	char			j_devname[BDEVNAME_SIZE+24];
 
 	/*
 	 * Device which holds the client fs.  For internal journal this will be
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index fac3337..9f2a375 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -23,12 +23,19 @@
 	__attribute__((__section__(SHARED_ALIGNED_SECTION)))		\
 	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name		\
 	____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)			\
+	__attribute__((__section__(".data.percpu.page_aligned")))	\
+	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
 #else
 #define DEFINE_PER_CPU(type, name)					\
 	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
 
 #define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		      \
 	DEFINE_PER_CPU(type, name)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)		      \
+	DEFINE_PER_CPU(type, name)
 #endif
 
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 2083888..9007ccd 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
+s64 __percpu_counter_sum(struct percpu_counter *fbc);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -44,19 +44,13 @@
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
-	s64 ret = __percpu_counter_sum(fbc, 0);
+	s64 ret = __percpu_counter_sum(fbc);
 	return ret < 0 ? 0 : ret;
 }
 
-static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
-{
-	return __percpu_counter_sum(fbc, 1);
-}
-
-
 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum(fbc, 0);
+	return __percpu_counter_sum(fbc);
 }
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0314074..60c49e3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -89,7 +89,14 @@
 	set_balance_irq_affinity(irq, cpumask);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	set_pending_irq(irq, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		desc->chip->set_affinity(irq, cpumask);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	} else
+		set_pending_irq(irq, cpumask);
 #else
 	desc->affinity = cpumask;
 	desc->chip->set_affinity(irq, cpumask);
diff --git a/kernel/resource.c b/kernel/resource.c
index 03d796c..414d6fc 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -516,6 +516,74 @@
 	return result;
 }
 
+static void __init __reserve_region_with_split(struct resource *root,
+		resource_size_t start, resource_size_t end,
+		const char *name)
+{
+	struct resource *parent = root;
+	struct resource *conflict;
+	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+
+	if (!res)
+		return;
+
+	res->name = name;
+	res->start = start;
+	res->end = end;
+	res->flags = IORESOURCE_BUSY;
+
+	for (;;) {
+		conflict = __request_resource(parent, res);
+		if (!conflict)
+			break;
+		if (conflict != parent) {
+			parent = conflict;
+			if (!(conflict->flags & IORESOURCE_BUSY))
+				continue;
+		}
+
+		/* Uhhuh, that didn't work out.. */
+		kfree(res);
+		res = NULL;
+		break;
+	}
+
+	if (!res) {
+		printk(KERN_DEBUG "    __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n",
+			 conflict->name, conflict->start, conflict->end,
+			 name, start, end);
+
+		/* failed, split and try again */
+
+		/* conflict coverred whole area */
+		if (conflict->start <= start && conflict->end >= end)
+			return;
+
+		if (conflict->start > start)
+			__reserve_region_with_split(root, start, conflict->start-1, name);
+		if (!(conflict->flags & IORESOURCE_BUSY)) {
+			resource_size_t common_start, common_end;
+
+			common_start = max(conflict->start, start);
+			common_end = min(conflict->end, end);
+			if (common_start < common_end)
+				__reserve_region_with_split(root, common_start, common_end, name);
+		}
+		if (conflict->end < end)
+			__reserve_region_with_split(root, conflict->end+1, end, name);
+	}
+
+}
+
+void reserve_region_with_split(struct resource *root,
+		resource_size_t start, resource_size_t end,
+		const char *name)
+{
+	write_lock(&resource_lock);
+	__reserve_region_with_split(root, start, end, name);
+	write_unlock(&resource_lock);
+}
+
 EXPORT_SYMBOL(adjust_resource);
 
 /**
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 4a8ba4b..a866389 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
@@ -62,11 +62,9 @@
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
-		if (set)
-			*pcount = 0;
+		*pcount = 0;
 	}
-	if (set)
-		fbc->count = ret;
+	fbc->count = ret;
 
 	spin_unlock(&fbc->lock);
 	return ret;