Merge branch 'merge' into next

We want to bring in the latest IRQ fixes
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index e8461cb..c802a90 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -62,26 +62,45 @@
 $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o): \
 	$(addprefix $(obj)/,$(libfdtheader))
 
-src-wlib := string.S crt0.S crtsavres.S stdio.c main.c \
+src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
 		$(libfdt) libfdt-wrapper.c \
 		ns16550.c serial.c simple_alloc.c div64.S util.S \
-		gunzip_util.c elf_util.c $(zlib) devtree.c oflib.c ofconsole.c \
-		4xx.c ebony.c mv64x60.c mpsc.c mv64x60_i2c.c cuboot.c bamboo.c \
-		cpm-serial.c stdlib.c mpc52xx-psc.c planetcore.c uartlite.c \
-		fsl-soc.c mpc8xx.c pq2.c ugecon.c
-src-plat := of.c cuboot-52xx.c cuboot-824x.c cuboot-83xx.c cuboot-85xx.c holly.c \
-		cuboot-ebony.c cuboot-hotfoot.c epapr.c treeboot-ebony.c \
-		prpmc2800.c \
-		ps3-head.S ps3-hvcall.S ps3.c treeboot-bamboo.c cuboot-8xx.c \
-		cuboot-pq2.c cuboot-sequoia.c treeboot-walnut.c \
-		cuboot-bamboo.c cuboot-mpc7448hpc2.c cuboot-taishan.c \
-		fixed-head.S ep88xc.c ep405.c cuboot-c2k.c \
-		cuboot-katmai.c cuboot-rainier.c redboot-8xx.c ep8248e.c \
-		cuboot-warp.c cuboot-85xx-cpm2.c cuboot-yosemite.c simpleboot.c \
-		virtex405-head.S virtex.c redboot-83xx.c cuboot-sam440ep.c \
-		cuboot-acadia.c cuboot-amigaone.c cuboot-kilauea.c \
-		gamecube-head.S gamecube.c wii-head.S wii.c treeboot-iss4xx.c \
-		treeboot-currituck.c
+		gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
+		oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
+		uartlite.c mpc52xx-psc.c
+src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
+src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c
+src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c
+src-wlib-$(CONFIG_PPC_82xx) += pq2.c fsl-soc.c planetcore.c
+src-wlib-$(CONFIG_EMBEDDED6xx) += mv64x60.c mv64x60_i2c.c ugecon.c
+
+src-plat-y := of.c
+src-plat-$(CONFIG_40x) += fixed-head.S ep405.c cuboot-hotfoot.c \
+				treeboot-walnut.c cuboot-acadia.c \
+				cuboot-kilauea.c simpleboot.c \
+				virtex405-head.S virtex.c
+src-plat-$(CONFIG_44x) += treeboot-ebony.c cuboot-ebony.c treeboot-bamboo.c \
+				cuboot-bamboo.c cuboot-sam440ep.c \
+				cuboot-sequoia.c cuboot-rainier.c \
+				cuboot-taishan.c cuboot-katmai.c \
+				cuboot-warp.c cuboot-yosemite.c \
+				treeboot-iss4xx.c treeboot-currituck.c \
+				simpleboot.c fixed-head.S virtex.c
+src-plat-$(CONFIG_8xx) += cuboot-8xx.c fixed-head.S ep88xc.c redboot-8xx.c
+src-plat-$(CONFIG_PPC_MPC52xx) += cuboot-52xx.c
+src-plat-$(CONFIG_PPC_82xx) += cuboot-pq2.c fixed-head.S ep8248e.c cuboot-824x.c
+src-plat-$(CONFIG_PPC_83xx) += cuboot-83xx.c fixed-head.S redboot-83xx.c
+src-plat-$(CONFIG_FSL_SOC_BOOKE) += cuboot-85xx.c cuboot-85xx-cpm2.c
+src-plat-$(CONFIG_EMBEDDED6xx) += cuboot-pq2.c cuboot-mpc7448hpc2.c \
+					cuboot-c2k.c gamecube-head.S \
+					gamecube.c wii-head.S wii.c holly.c \
+					prpmc2800.c
+src-plat-$(CONFIG_AMIGAONE) += cuboot-amigaone.c
+src-plat-$(CONFIG_PPC_PS3) += ps3-head.S ps3-hvcall.S ps3.c
+src-plat-$(CONFIG_EPAPR_BOOT) += epapr.c
+
+src-wlib := $(sort $(src-wlib-y))
+src-plat := $(sort $(src-plat-y))
 src-boot := $(src-wlib) $(src-plat) empty.c
 
 src-boot := $(addprefix $(obj)/, $(src-boot))
@@ -412,4 +431,3 @@
 	$(call cmd,install_wrapper)
 
 $(obj)/bootwrapper_install: $(all-installed)
-
diff --git a/arch/powerpc/boot/flatdevtree_env.h b/arch/powerpc/boot/flatdevtree_env.h
deleted file mode 100644
index 66e0ebb..0000000
--- a/arch/powerpc/boot/flatdevtree_env.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * This file adds the header file glue so that the shared files
- * flatdevicetree.[ch] can compile and work in the powerpc bootwrapper.
- *
- * strncmp & strchr copied from <file:lib/string.c>
- * Copyright (C) 1991, 1992  Linus Torvalds
- *
- * Maintained by: Mark A. Greer <mgreer@mvista.com>
- */
-#ifndef _PPC_BOOT_FLATDEVTREE_ENV_H_
-#define _PPC_BOOT_FLATDEVTREE_ENV_H_
-
-#include <stdarg.h>
-#include <stddef.h>
-#include "types.h"
-#include "string.h"
-#include "stdio.h"
-#include "ops.h"
-
-#define be16_to_cpu(x)		(x)
-#define cpu_to_be16(x)		(x)
-#define be32_to_cpu(x)		(x)
-#define cpu_to_be32(x)		(x)
-#define be64_to_cpu(x)		(x)
-#define cpu_to_be64(x)		(x)
-
-#endif /* _PPC_BOOT_FLATDEVTREE_ENV_H_ */
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index c1442a3..273e2bd 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -16,6 +16,7 @@
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
 CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODVERSIONS=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 6608232..187fb8d 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -24,6 +24,7 @@
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
 CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODVERSIONS=y
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 37c32aba..a6f8c7a 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -26,8 +26,8 @@
 			   unsigned long target, int flags);
 unsigned int create_cond_branch(const unsigned int *addr,
 				unsigned long target, int flags);
-void patch_branch(unsigned int *addr, unsigned long target, int flags);
-void patch_instruction(unsigned int *addr, unsigned int instr);
+int patch_branch(unsigned int *addr, unsigned long target, int flags);
+int patch_instruction(unsigned int *addr, unsigned int instr);
 
 int instr_is_relative_branch(unsigned int instr);
 int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr);
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 957a83f..cbfe678 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -53,6 +53,16 @@
  */
 #define IOMAP_MAX_ORDER		13
 
+#define IOMMU_POOL_HASHBITS	2
+#define IOMMU_NR_POOLS		(1 << IOMMU_POOL_HASHBITS)
+
+struct iommu_pool {
+	unsigned long start;
+	unsigned long end;
+	unsigned long hint;
+	spinlock_t lock;
+} ____cacheline_aligned_in_smp;
+
 struct iommu_table {
 	unsigned long  it_busno;     /* Bus number this table belongs to */
 	unsigned long  it_size;      /* Size of iommu table in entries */
@@ -61,10 +71,10 @@
 	unsigned long  it_index;     /* which iommu table this is */
 	unsigned long  it_type;      /* type: PCI or Virtual Bus */
 	unsigned long  it_blocksize; /* Entries in each block (cacheline) */
-	unsigned long  it_hint;      /* Hint for next alloc */
-	unsigned long  it_largehint; /* Hint for large allocs */
-	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
-	spinlock_t     it_lock;      /* Protects it_map */
+	unsigned long  poolsize;
+	unsigned long  nr_pools;
+	struct iommu_pool large_pool;
+	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
 };
 
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index f014552..e8a26db 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -163,12 +163,7 @@
  * to think about, feedback welcome. --BenH.
  */
 
-/* There are #define as they have to be used in assembly
- *
- * WARNING: If you change this list, make sure to update the array of
- * names currently in arch/powerpc/mm/hugetlbpage.c or bad things will
- * happen
- */
+/* These are #defines as they have to be used in assembly */
 #define MMU_PAGE_4K	0
 #define MMU_PAGE_16K	1
 #define MMU_PAGE_64K	2
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 413a5ea..53b6dfa 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -389,10 +389,8 @@
 
 #ifdef CONFIG_PSERIES_IDLE
 extern void update_smt_snooze_delay(int snooze);
-extern int pseries_notify_cpuidle_add_cpu(int cpu);
 #else
 static inline void update_smt_snooze_delay(int snooze) {}
-static inline int pseries_notify_cpuidle_add_cpu(int cpu) { return 0; }
 #endif
 
 extern void flush_instruction_cache(void);
diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index cbe2297..5712f06 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -8,7 +8,7 @@
 
 struct pt_regs;
 
-TRACE_EVENT(irq_entry,
+DECLARE_EVENT_CLASS(ppc64_interrupt_class,
 
 	TP_PROTO(struct pt_regs *regs),
 
@@ -25,55 +25,32 @@
 	TP_printk("pt_regs=%p", __entry->regs)
 );
 
-TRACE_EVENT(irq_exit,
+DEFINE_EVENT(ppc64_interrupt_class, irq_entry,
 
 	TP_PROTO(struct pt_regs *regs),
 
-	TP_ARGS(regs),
-
-	TP_STRUCT__entry(
-		__field(struct pt_regs *, regs)
-	),
-
-	TP_fast_assign(
-		__entry->regs = regs;
-	),
-
-	TP_printk("pt_regs=%p", __entry->regs)
+	TP_ARGS(regs)
 );
 
-TRACE_EVENT(timer_interrupt_entry,
+DEFINE_EVENT(ppc64_interrupt_class, irq_exit,
 
 	TP_PROTO(struct pt_regs *regs),
 
-	TP_ARGS(regs),
-
-	TP_STRUCT__entry(
-		__field(struct pt_regs *, regs)
-	),
-
-	TP_fast_assign(
-		__entry->regs = regs;
-	),
-
-	TP_printk("pt_regs=%p", __entry->regs)
+	TP_ARGS(regs)
 );
 
-TRACE_EVENT(timer_interrupt_exit,
+DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_entry,
 
 	TP_PROTO(struct pt_regs *regs),
 
-	TP_ARGS(regs),
+	TP_ARGS(regs)
+);
 
-	TP_STRUCT__entry(
-		__field(struct pt_regs *, regs)
-	),
+DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit,
 
-	TP_fast_assign(
-		__entry->regs = regs;
-	),
+	TP_PROTO(struct pt_regs *regs),
 
-	TP_printk("pt_regs=%p", __entry->regs)
+	TP_ARGS(regs)
 );
 
 #ifdef CONFIG_PPC_PSERIES
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 5971c85..cf38a17 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -197,7 +197,16 @@
 	wrteei	0
 #else
 	ld	r10,PACAKMSR(r13)
-	mtmsrd	r10,1
+	/*
+	 * For performance reasons we clear RI at the same time that we
+	 * clear EE. We only need to clear RI just before we restore r13
+	 * below, but batching it with EE saves us one expensive mtmsrd call.
+	 * We have to be careful to restore RI if we branch anywhere from
+	 * here (eg syscall_exit_work).
+	 */
+	li	r9,MSR_RI
+	andc	r11,r10,r9
+	mtmsrd	r11,1
 #endif /* CONFIG_PPC_BOOK3E */
 
 	ld	r9,TI_FLAGS(r12)
@@ -214,17 +223,6 @@
 END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
 	andi.	r6,r8,MSR_PR
 	ld	r4,_LINK(r1)
-	/*
-	 * Clear RI before restoring r13.  If we are returning to
-	 * userspace and we take an exception after restoring r13,
-	 * we end up corrupting the userspace r13 value.
-	 */
-#ifdef CONFIG_PPC_BOOK3S
-	/* No MSR:RI on BookE */
-	li	r12,MSR_RI
-	andc	r11,r10,r12
-	mtmsrd	r11,1			/* clear MSR.RI */
-#endif /* CONFIG_PPC_BOOK3S */
 
 	beq-	1f
 	ACCOUNT_CPU_USER_EXIT(r11, r12)
@@ -271,6 +269,9 @@
 	b	syscall_exit
 	
 syscall_exit_work:
+#ifdef CONFIG_PPC_BOOK3S
+	mtmsrd	r10,1		/* Restore RI */
+#endif
 	/* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr.
 	 If TIF_NOERROR is set, just save r3 as it is. */
 
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index bf99cfa..6f33296 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -63,11 +63,9 @@
 		return -EINVAL;
 
 	/* replace the text with the new text */
-	if (probe_kernel_write((void *)ip, &new, MCOUNT_INSN_SIZE))
+	if (patch_instruction((unsigned int *)ip, new))
 		return -EPERM;
 
-	flush_icache_range(ip, ip + 8);
-
 	return 0;
 }
 
@@ -212,12 +210,9 @@
 	 */
 	op = 0x48000008;	/* b +8 */
 
-	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+	if (patch_instruction((unsigned int *)ip, op))
 		return -EPERM;
 
-
-	flush_icache_range(ip, ip + 8);
-
 	return 0;
 }
 
@@ -286,11 +281,9 @@
 
 	op = PPC_INST_NOP;
 
-	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+	if (patch_instruction((unsigned int *)ip, op))
 		return -EPERM;
 
-	flush_icache_range(ip, ip + 8);
-
 	return 0;
 }
 #endif /* PPC64 */
@@ -426,11 +419,9 @@
 
 	pr_devel("write to %lx\n", rec->ip);
 
-	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+	if (patch_instruction((unsigned int *)ip, op))
 		return -EPERM;
 
-	flush_icache_range(ip, ip + 8);
-
 	return 0;
 }
 #endif /* CONFIG_PPC64 */
@@ -484,6 +475,58 @@
 	return ret;
 }
 
+static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+{
+	unsigned long ftrace_addr = (unsigned long)FTRACE_ADDR;
+	int ret;
+
+	ret = ftrace_update_record(rec, enable);
+
+	switch (ret) {
+	case FTRACE_UPDATE_IGNORE:
+		return 0;
+	case FTRACE_UPDATE_MAKE_CALL:
+		return ftrace_make_call(rec, ftrace_addr);
+	case FTRACE_UPDATE_MAKE_NOP:
+		return ftrace_make_nop(NULL, rec, ftrace_addr);
+	}
+
+	return 0;
+}
+
+void ftrace_replace_code(int enable)
+{
+	struct ftrace_rec_iter *iter;
+	struct dyn_ftrace *rec;
+	int ret;
+
+	for (iter = ftrace_rec_iter_start(); iter;
+	     iter = ftrace_rec_iter_next(iter)) {
+		rec = ftrace_rec_iter_record(iter);
+		ret = __ftrace_replace_code(rec, enable);
+		if (ret) {
+			ftrace_bug(ret, rec->ip);
+			return;
+		}
+	}
+}
+
+void arch_ftrace_update_code(int command)
+{
+	if (command & FTRACE_UPDATE_CALLS)
+		ftrace_replace_code(1);
+	else if (command & FTRACE_DISABLE_CALLS)
+		ftrace_replace_code(0);
+
+	if (command & FTRACE_UPDATE_TRACE_FUNC)
+		ftrace_update_ftrace_func(ftrace_trace_function);
+
+	if (command & FTRACE_START_FUNC_RET)
+		ftrace_enable_ftrace_graph_caller();
+	else if (command & FTRACE_STOP_FUNC_RET)
+		ftrace_disable_ftrace_graph_caller();
+}
+
 int __init ftrace_dyn_arch_init(void *data)
 {
 	/* caller expects data to be zero */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 359f078..7bc94da 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -33,6 +33,7 @@
 #include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
 #include <linux/crash_dump.h>
+#include <linux/hash.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -58,6 +59,26 @@
 
 __setup("iommu=", setup_iommu);
 
+static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
+
+/*
+ * We precalculate the hash to avoid doing it on every allocation.
+ *
+ * The hash is important to spread CPUs across all the pools. For example,
+ * on a POWER7 with 4 way SMT we want interrupts on the primary threads;
+ * without the hash, with 4 pools all primary threads would map to the same pool.
+ */
+static int __init setup_iommu_pool_hash(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
+
+	return 0;
+}
+subsys_initcall(setup_iommu_pool_hash);
+
 static unsigned long iommu_range_alloc(struct device *dev,
 				       struct iommu_table *tbl,
                                        unsigned long npages,
@@ -71,6 +92,9 @@
 	int pass = 0;
 	unsigned long align_mask;
 	unsigned long boundary_size;
+	unsigned long flags;
+	unsigned int pool_nr;
+	struct iommu_pool *pool;
 
 	align_mask = 0xffffffffffffffffl >> (64 - align_order);
 
@@ -83,36 +107,46 @@
 		return DMA_ERROR_CODE;
 	}
 
-	if (handle && *handle)
+	/*
+	 * We don't need to disable preemption here because any CPU can
+	 * safely use any IOMMU pool.
+	 */
+	pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1);
+
+	if (largealloc)
+		pool = &(tbl->large_pool);
+	else
+		pool = &(tbl->pools[pool_nr]);
+
+	spin_lock_irqsave(&(pool->lock), flags);
+
+again:
+	if ((pass == 0) && handle && *handle)
 		start = *handle;
 	else
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+		start = pool->hint;
 
-	/* Use only half of the table for small allocs (15 pages or less) */
-	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
-
-	if (largealloc && start < tbl->it_halfpoint)
-		start = tbl->it_halfpoint;
+	limit = pool->end;
 
 	/* The case below can happen if we have a small segment appended
 	 * to a large, or when the previous alloc was at the very end of
 	 * the available space. If so, go back to the initial start.
 	 */
 	if (start >= limit)
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
+		start = pool->start;
 
 	if (limit + tbl->it_offset > mask) {
 		limit = mask - tbl->it_offset + 1;
 		/* If we're constrained on address range, first try
 		 * at the masked hint to avoid O(n) search complexity,
-		 * but on second pass, start at 0.
+		 * but on second pass, start at 0 in pool 0.
 		 */
-		if ((start & mask) >= limit || pass > 0)
-			start = 0;
-		else
+		if ((start & mask) >= limit || pass > 0) {
+			pool = &(tbl->pools[0]);
+			start = pool->start;
+		} else {
 			start &= mask;
+		}
 	}
 
 	if (dev)
@@ -126,16 +160,25 @@
 			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
 			     align_mask);
 	if (n == -1) {
-		if (likely(pass < 2)) {
-			/* First failure, just rescan the half of the table.
-			 * Second failure, rescan the other half of the table.
-			 */
-			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
-			limit = pass ? tbl->it_size : limit;
+		if (likely(pass == 0)) {
+			/* First try the pool from the start */
+			pool->hint = pool->start;
 			pass++;
 			goto again;
+
+		} else if (pass <= tbl->nr_pools) {
+			/* Now try scanning all the other pools */
+			spin_unlock(&(pool->lock));
+			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
+			pool = &tbl->pools[pool_nr];
+			spin_lock(&(pool->lock));
+			pool->hint = pool->start;
+			pass++;
+			goto again;
+
 		} else {
-			/* Third failure, give up */
+			/* Give up */
+			spin_unlock_irqrestore(&(pool->lock), flags);
 			return DMA_ERROR_CODE;
 		}
 	}
@@ -145,10 +188,10 @@
 	/* Bump the hint to a new block for small allocs. */
 	if (largealloc) {
 		/* Don't bump to new block to avoid fragmentation */
-		tbl->it_largehint = end;
+		pool->hint = end;
 	} else {
 		/* Overflow will be taken care of at the next allocation */
-		tbl->it_hint = (end + tbl->it_blocksize - 1) &
+		pool->hint = (end + tbl->it_blocksize - 1) &
 		                ~(tbl->it_blocksize - 1);
 	}
 
@@ -156,6 +199,8 @@
 	if (handle)
 		*handle = end;
 
+	spin_unlock_irqrestore(&(pool->lock), flags);
+
 	return n;
 }
 
@@ -165,18 +210,14 @@
 			      unsigned long mask, unsigned int align_order,
 			      struct dma_attrs *attrs)
 {
-	unsigned long entry, flags;
+	unsigned long entry;
 	dma_addr_t ret = DMA_ERROR_CODE;
 	int build_fail;
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
 
-	if (unlikely(entry == DMA_ERROR_CODE)) {
-		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	if (unlikely(entry == DMA_ERROR_CODE))
 		return DMA_ERROR_CODE;
-	}
 
 	entry += tbl->it_offset;	/* Offset into real TCE table */
 	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
@@ -193,8 +234,6 @@
 	 */
 	if (unlikely(build_fail)) {
 		__iommu_free(tbl, ret, npages);
-
-		spin_unlock_irqrestore(&(tbl->it_lock), flags);
 		return DMA_ERROR_CODE;
 	}
 
@@ -202,16 +241,14 @@
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 	/* Make sure updates are seen by hardware */
 	mb();
 
 	return ret;
 }
 
-static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 
-			 unsigned int npages)
+static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
+			     unsigned int npages)
 {
 	unsigned long entry, free_entry;
 
@@ -231,20 +268,57 @@
 			printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
 			WARN_ON(1);
 		}
-		return;
+
+		return false;
 	}
 
+	return true;
+}
+
+static struct iommu_pool *get_pool(struct iommu_table *tbl,
+				   unsigned long entry)
+{
+	struct iommu_pool *p;
+	unsigned long largepool_start = tbl->large_pool.start;
+
+	/* The large pool is the last pool at the top of the table */
+	if (entry >= largepool_start) {
+		p = &tbl->large_pool;
+	} else {
+		unsigned int pool_nr = entry / tbl->poolsize;
+
+		BUG_ON(pool_nr > tbl->nr_pools);
+		p = &tbl->pools[pool_nr];
+	}
+
+	return p;
+}
+
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+			 unsigned int npages)
+{
+	unsigned long entry, free_entry;
+	unsigned long flags;
+	struct iommu_pool *pool;
+
+	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	free_entry = entry - tbl->it_offset;
+
+	pool = get_pool(tbl, free_entry);
+
+	if (!iommu_free_check(tbl, dma_addr, npages))
+		return;
+
 	ppc_md.tce_free(tbl, entry, npages);
+
+	spin_lock_irqsave(&(pool->lock), flags);
 	bitmap_clear(tbl->it_map, free_entry, npages);
+	spin_unlock_irqrestore(&(pool->lock), flags);
 }
 
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 		unsigned int npages)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	__iommu_free(tbl, dma_addr, npages);
 
 	/* Make sure TLB cache is flushed if the HW needs it. We do
@@ -253,8 +327,6 @@
 	 */
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
-
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
 int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
@@ -263,7 +335,6 @@
 		 struct dma_attrs *attrs)
 {
 	dma_addr_t dma_next = 0, dma_addr;
-	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
 	int outcount, incount, i, build_fail = 0;
 	unsigned int align;
@@ -285,8 +356,6 @@
 
 	DBG("sg mapping %d elements:\n", nelems);
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	max_seg_size = dma_get_max_seg_size(dev);
 	for_each_sg(sglist, s, nelems, i) {
 		unsigned long vaddr, npages, entry, slen;
@@ -369,8 +438,6 @@
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 	DBG("mapped %d elements:\n", outcount);
 
 	/* For the sake of iommu_unmap_sg, we clear out the length in the
@@ -402,7 +469,6 @@
 		if (s == outs)
 			break;
 	}
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 	return 0;
 }
 
@@ -412,15 +478,12 @@
 		struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
-	unsigned long flags;
 
 	BUG_ON(direction == DMA_NONE);
 
 	if (!tbl)
 		return;
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	sg = sglist;
 	while (nelems--) {
 		unsigned int npages;
@@ -440,8 +503,6 @@
 	 */
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
-
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
 static void iommu_table_clear(struct iommu_table *tbl)
@@ -494,9 +555,8 @@
 	unsigned long sz;
 	static int welcomed = 0;
 	struct page *page;
-
-	/* Set aside 1/4 of the table for large allocations. */
-	tbl->it_halfpoint = tbl->it_size * 3 / 4;
+	unsigned int i;
+	struct iommu_pool *p;
 
 	/* number of bytes needed for the bitmap */
 	sz = (tbl->it_size + 7) >> 3;
@@ -515,9 +575,28 @@
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
-	tbl->it_hint = 0;
-	tbl->it_largehint = tbl->it_halfpoint;
-	spin_lock_init(&tbl->it_lock);
+	/* We only split the IOMMU table if we have 1GB or more of space */
+	if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024))
+		tbl->nr_pools = IOMMU_NR_POOLS;
+	else
+		tbl->nr_pools = 1;
+
+	/* We reserve the top 1/4 of the table for large allocations */
+	tbl->poolsize = (tbl->it_size * 3 / 4) / IOMMU_NR_POOLS;
+
+	for (i = 0; i < IOMMU_NR_POOLS; i++) {
+		p = &tbl->pools[i];
+		spin_lock_init(&(p->lock));
+		p->start = tbl->poolsize * i;
+		p->hint = p->start;
+		p->end = p->start + tbl->poolsize;
+	}
+
+	p = &tbl->large_pool;
+	spin_lock_init(&(p->lock));
+	p->start = tbl->poolsize * i;
+	p->hint = p->start;
+	p->end = tbl->it_size;
 
 	iommu_table_clear(tbl);
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 8e78e93..0f75bd5 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1646,7 +1646,6 @@
 		pci_free_resource_list(&resources);
 		return;
 	}
-	bus->secondary = hose->first_busno;
 	hose->bus = bus;
 
 	/* Get probe mode and perform scan */
diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
index 89dde17..d7dd42b 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -198,7 +198,6 @@
 
 /**
  * of_scan_pci_bridge - Set up a PCI bridge and scan for child nodes
- * @node: device tree node of bridge
  * @dev: pci_dev structure for the bridge
  *
  * of_scan_bus() calls this routine for each PCI bridge that it finds, and
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index e4cb343..e1417c4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -571,7 +571,6 @@
 	if (system_state == SYSTEM_RUNNING)
 		vdso_data->processorCount++;
 #endif
-	ipi_call_lock();
 	notify_cpu_starting(cpu);
 	set_cpu_online(cpu, true);
 	/* Update sibling maps */
@@ -601,7 +600,6 @@
 		of_node_put(np);
 	}
 	of_node_put(l2_cache);
-	ipi_call_unlock();
 
 	local_irq_enable();
 
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index cb87301..06cbc30 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -625,7 +625,7 @@
  * vio_cmo_set_dev_desired - Set desired entitlement for a device
  *
  * @viodev: struct vio_dev for device to alter
- * @new_desired: new desired entitlement level in bytes
+ * @desired: new desired entitlement level in bytes
  *
  * For use by devices to request a change to their entitlement at runtime or
  * through sysfs.  The desired entitlement level is changed and a balancing
@@ -1262,7 +1262,7 @@
 
 /**
  * vio_register_driver: - Register a new vio driver
- * @drv:	The vio_driver structure to be registered.
+ * @viodrv:	The vio_driver structure to be registered.
  */
 int __vio_register_driver(struct vio_driver *viodrv, struct module *owner,
 			  const char *mod_name)
@@ -1282,7 +1282,7 @@
 
 /**
  * vio_unregister_driver - Remove registration of vio driver.
- * @driver:	The vio_driver struct to be removed form registration
+ * @viodrv:	The vio_driver struct to be removed from registration
  */
 void vio_unregister_driver(struct vio_driver *viodrv)
 {
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 7735a2c..746e0c8 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -17,14 +17,15 @@
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   memcpy_64.o usercopy_64.o mem_64.o string.o \
 			   checksum_wrappers_64.o hweight_64.o \
-			   copyuser_power7.o
+			   copyuser_power7.o string_64.o copypage_power7.o \
+			   memcpy_power7.o
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
 
 ifeq ($(CONFIG_PPC64),y)
 obj-$(CONFIG_SMP)	+= locks.o
-obj-$(CONFIG_ALTIVEC)	+= copyuser_power7_vmx.o
+obj-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 endif
 
 obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 7c975d4..dd223b3 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -13,17 +13,23 @@
 #include <linux/mm.h>
 #include <asm/page.h>
 #include <asm/code-patching.h>
+#include <asm/uaccess.h>
 
 
-void patch_instruction(unsigned int *addr, unsigned int instr)
+int patch_instruction(unsigned int *addr, unsigned int instr)
 {
-	*addr = instr;
+	int err;
+
+	err = __put_user(instr, addr);
+	if (err)
+		return err;
 	asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr));
+	return 0;
 }
 
-void patch_branch(unsigned int *addr, unsigned long target, int flags)
+int patch_branch(unsigned int *addr, unsigned long target, int flags)
 {
-	patch_instruction(addr, create_branch(addr, target, flags));
+	return patch_instruction(addr, create_branch(addr, target, flags));
 }
 
 unsigned int create_branch(const unsigned int *addr,
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 53dcb6b..9f9434a 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -17,7 +17,11 @@
         .section        ".text"
 
 _GLOBAL(copy_page)
+BEGIN_FTR_SECTION
 	lis	r5,PAGE_SIZE@h
+FTR_SECTION_ELSE
+	b	.copypage_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	ori	r5,r5,PAGE_SIZE@l
 BEGIN_FTR_SECTION
 	ld      r10,PPC64_CACHES@toc(r2)
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
new file mode 100644
index 0000000..01e2b5d
--- /dev/null
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -0,0 +1,168 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+
+#define STACKFRAMESIZE	256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+_GLOBAL(copypage_power7)
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side. Since source and destination are page
+	 * aligned we don't need to clear the bottom 7 bits of either
+	 * address.
+	 */
+	ori	r9,r3,1		/* stream=1 */
+
+#ifdef CONFIG_PPC_64K_PAGES
+	lis	r7,0x0E01	/* depth=7, units=512 */
+#else
+	lis	r7,0x0E00	/* depth=7 */
+	ori	r7,r7,0x1000	/* units=32 */
+#endif
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r4,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+#ifdef CONFIG_ALTIVEC
+	mflr	r0
+	std	r3,48(r1)
+	std	r4,56(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	.enter_vmx_copy
+	cmpwi	r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STACKFRAMESIZE+48(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	mtlr	r0
+
+	li	r0,(PAGE_SIZE/128)
+	mtctr	r0
+
+	beq	.Lnonvmx_copy
+
+	addi	r1,r1,STACKFRAMESIZE
+
+	li	r6,16
+	li	r7,32
+	li	r8,48
+	li	r9,64
+	li	r10,80
+	li	r11,96
+	li	r12,112
+
+	.align	5
+1:	lvx	vr7,r0,r4
+	lvx	vr6,r4,r6
+	lvx	vr5,r4,r7
+	lvx	vr4,r4,r8
+	lvx	vr3,r4,r9
+	lvx	vr2,r4,r10
+	lvx	vr1,r4,r11
+	lvx	vr0,r4,r12
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r6
+	stvx	vr5,r3,r7
+	stvx	vr4,r3,r8
+	stvx	vr3,r3,r9
+	stvx	vr2,r3,r10
+	stvx	vr1,r3,r11
+	stvx	vr0,r3,r12
+	addi	r3,r3,128
+	bdnz	1b
+
+	b	.exit_vmx_copy		/* tail call optimise */
+
+#else
+	li	r0,(PAGE_SIZE/128)
+	mtctr	r0
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+#endif
+
+.Lnonvmx_copy:
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+	std	r17,STK_REG(r17)(r1)
+	std	r18,STK_REG(r18)(r1)
+	std	r19,STK_REG(r19)(r1)
+	std	r20,STK_REG(r20)(r1)
+
+1:	ld	r0,0(r4)
+	ld	r5,8(r4)
+	ld	r6,16(r4)
+	ld	r7,24(r4)
+	ld	r8,32(r4)
+	ld	r9,40(r4)
+	ld	r10,48(r4)
+	ld	r11,56(r4)
+	ld	r12,64(r4)
+	ld	r14,72(r4)
+	ld	r15,80(r4)
+	ld	r16,88(r4)
+	ld	r17,96(r4)
+	ld	r18,104(r4)
+	ld	r19,112(r4)
+	ld	r20,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r5,8(r3)
+	std	r6,16(r3)
+	std	r7,24(r3)
+	std	r8,32(r3)
+	std	r9,40(r3)
+	std	r10,48(r3)
+	std	r11,56(r3)
+	std	r12,64(r3)
+	std	r14,72(r3)
+	std	r15,80(r3)
+	std	r16,88(r3)
+	std	r17,96(r3)
+	std	r18,104(r3)
+	std	r19,112(r3)
+	std	r20,120(r3)
+	addi	r3,r3,128
+	bdnz	1b
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	ld	r17,STK_REG(r17)(r1)
+	ld	r18,STK_REG(r18)(r1)
+	ld	r19,STK_REG(r19)(r1)
+	ld	r20,STK_REG(r20)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	blr
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index 497db7b..48e3f8c 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -61,7 +61,7 @@
 	ld	r15,STK_REG(r15)(r1)
 	ld	r14,STK_REG(r14)(r1)
 .Ldo_err3:
-	bl	.exit_vmx_copy
+	bl	.exit_vmx_usercopy
 	ld	r0,STACKFRAMESIZE+16(r1)
 	mtlr	r0
 	b	.Lexit
@@ -290,7 +290,7 @@
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	.enter_vmx_copy
+	bl	.enter_vmx_usercopy
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STACKFRAMESIZE+48(r1)
@@ -298,6 +298,37 @@
 	ld	r5,STACKFRAMESIZE+64(r1)
 	mtlr	r0
 
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	cr1,r7,0x3FF
+	ble	cr1,1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
 	beq	.Lunwind_stack_nonvmx_copy
 
 	/*
@@ -476,7 +538,7 @@
 err3;	stb	r0,0(r3)
 
 15:	addi	r1,r1,STACKFRAMESIZE
-	b	.exit_vmx_copy		/* tail call optimise */
+	b	.exit_vmx_usercopy	/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -679,5 +741,5 @@
 err3;	stb	r0,0(r3)
 
 15:	addi	r1,r1,STACKFRAMESIZE
-	b	.exit_vmx_copy		/* tail call optimise */
+	b	.exit_vmx_usercopy	/* tail call optimise */
 #endif /* CONFiG_ALTIVEC */
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 82fea39..d2bbbc8 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -11,7 +11,11 @@
 
 	.align	7
 _GLOBAL(memcpy)
+BEGIN_FTR_SECTION
 	std	r3,48(r1)	/* save destination pointer for return value */
+FTR_SECTION_ELSE
+	b	memcpy_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	PPC_MTOCRF(0x01,r5)
 	cmpldi	cr1,r5,16
 	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
new file mode 100644
index 0000000..84674d8
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -0,0 +1,650 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+
+#define STACKFRAMESIZE	256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+_GLOBAL(memcpy_power7)
+#ifdef CONFIG_ALTIVEC
+	cmpldi	r5,16
+	cmpldi	cr1,r5,4096
+
+	std	r3,48(r1)
+
+	blt	.Lshort_copy
+	bgt	cr1,.Lvmx_copy
+#else
+	cmpldi	r5,16
+
+	std	r3,48(r1)
+
+	blt	.Lshort_copy
+#endif
+
+.Lnonvmx_copy:
+	/* Get the source 8B aligned */
+	neg	r6,r4
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r5,r5,r6
+	cmpldi	r5,128
+	blt	5f
+
+	mflr	r0
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+	std	r17,STK_REG(r17)(r1)
+	std	r18,STK_REG(r18)(r1)
+	std	r19,STK_REG(r19)(r1)
+	std	r20,STK_REG(r20)(r1)
+	std	r21,STK_REG(r21)(r1)
+	std	r22,STK_REG(r22)(r1)
+	std	r0,STACKFRAMESIZE+16(r1)
+
+	srdi	r6,r5,7
+	mtctr	r6
+
+	/* Now do cacheline (128B) sized loads and stores. */
+	.align	5
+4:
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	ld	r14,64(r4)
+	ld	r15,72(r4)
+	ld	r16,80(r4)
+	ld	r17,88(r4)
+	ld	r18,96(r4)
+	ld	r19,104(r4)
+	ld	r20,112(r4)
+	ld	r21,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	std	r14,64(r3)
+	std	r15,72(r3)
+	std	r16,80(r3)
+	std	r17,88(r3)
+	std	r18,96(r3)
+	std	r19,104(r3)
+	std	r20,112(r3)
+	std	r21,120(r3)
+	addi	r3,r3,128
+	bdnz	4b
+
+	clrldi	r5,r5,(64-7)
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	ld	r17,STK_REG(r17)(r1)
+	ld	r18,STK_REG(r18)(r1)
+	ld	r19,STK_REG(r19)(r1)
+	ld	r20,STK_REG(r20)(r1)
+	ld	r21,STK_REG(r21)(r1)
+	ld	r22,STK_REG(r22)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	/* Up to 127B to go */
+5:	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+6:	bf	cr7*4+1,7f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	addi	r4,r4,64
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	addi	r3,r3,64
+
+	/* Up to 63B to go */
+7:	bf	cr7*4+2,8f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	addi	r4,r4,32
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	addi	r3,r3,32
+
+	/* Up to 31B to go */
+8:	bf	cr7*4+3,9f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	addi	r4,r4,16
+	std	r0,0(r3)
+	std	r6,8(r3)
+	addi	r3,r3,16
+
+9:	clrldi	r5,r5,(64-4)
+
+	/* Up to 15B to go */
+.Lshort_copy:
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	ld	r3,48(r1)
+	blr
+
+.Lunwind_stack_nonvmx_copy:
+	addi	r1,r1,STACKFRAMESIZE
+	b	.Lnonvmx_copy
+
+#ifdef CONFIG_ALTIVEC
+.Lvmx_copy:
+	mflr	r0
+	std	r4,56(r1)
+	std	r5,64(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	.enter_vmx_copy
+	cmpwi	r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STACKFRAMESIZE+48(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	ld	r5,STACKFRAMESIZE+64(r1)
+	mtlr	r0
+
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	cr1,r7,0x3FF
+	ble	cr1,1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+	beq	.Lunwind_stack_nonvmx_copy
+
+	/*
+	 * If source and destination are not relatively aligned we use a
+	 * slower permute loop.
+	 */
+	xor	r6,r4,r3
+	rldicl.	r6,r6,0,(64-4)
+	bne	.Lvmx_unaligned_copy
+
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the destination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+	lvx	vr7,r0,r4
+	lvx	vr6,r4,r9
+	lvx	vr5,r4,r10
+	lvx	vr4,r4,r11
+	lvx	vr3,r4,r12
+	lvx	vr2,r4,r14
+	lvx	vr1,r4,r15
+	lvx	vr0,r4,r16
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r9
+	stvx	vr5,r3,r10
+	stvx	vr4,r3,r11
+	stvx	vr3,r3,r12
+	stvx	vr2,r3,r14
+	stvx	vr1,r3,r15
+	stvx	vr0,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,48(r1)
+	b	.exit_vmx_copy		/* tail call optimise */
+
+.Lvmx_unaligned_copy:
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r7,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r7,4(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the destination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	lvsl	vr16,0,r4	/* Setup permute control vector */
+	lvx	vr0,0,r4
+	addi	r4,r4,16
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+	vor	vr0,vr1,vr1
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+	lvx	vr7,r0,r4
+	vperm	vr8,vr0,vr7,vr16
+	lvx	vr6,r4,r9
+	vperm	vr9,vr7,vr6,vr16
+	lvx	vr5,r4,r10
+	vperm	vr10,vr6,vr5,vr16
+	lvx	vr4,r4,r11
+	vperm	vr11,vr5,vr4,vr16
+	lvx	vr3,r4,r12
+	vperm	vr12,vr4,vr3,vr16
+	lvx	vr2,r4,r14
+	vperm	vr13,vr3,vr2,vr16
+	lvx	vr1,r4,r15
+	vperm	vr14,vr2,vr1,vr16
+	lvx	vr0,r4,r16
+	vperm	vr15,vr1,vr0,vr16
+	addi	r4,r4,128
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	stvx	vr12,r3,r12
+	stvx	vr13,r3,r14
+	stvx	vr14,r3,r15
+	stvx	vr15,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	addi	r4,r4,-16	/* Unwind the +16 load offset */
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,48(r1)
+	b	.exit_vmx_copy		/* tail call optimise */
+#endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 093d631..1b5a0a0 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -119,6 +119,7 @@
 2:	li	r3,0
 	blr
 
+#ifdef CONFIG_PPC32
 _GLOBAL(__clear_user)
 	addi	r6,r3,-4
 	li	r3,0
@@ -160,3 +161,4 @@
 	PPC_LONG	1b,91b
 	PPC_LONG	8b,92b
 	.text
+#endif
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S
new file mode 100644
index 0000000..3b1e480
--- /dev/null
+++ b/arch/powerpc/lib/string_64.S
@@ -0,0 +1,202 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+	.section	".toc","aw"
+PPC64_CACHES:
+	.tc		ppc64_caches[TC],ppc64_caches
+	.section	".text"
+
+/**
+ * __clear_user: - Zero a block of memory in user space, with less checking.
+ * @to:   Destination address, in user space.
+ * @n:    Number of bytes to zero.
+ *
+ * Zero a block of memory in user space.  Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+
+	.macro err1
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Ldo_err1
+	.previous
+	.endm
+
+	.macro err2
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldo_err2
+	.previous
+	.endm
+
+	.macro err3
+300:
+	.section __ex_table,"a"
+	.align 3
+	.llong 300b,.Ldo_err3
+	.previous
+	.endm
+
+.Ldo_err1:
+	mr	r3,r8
+
+.Ldo_err2:
+	mtctr	r4
+1:
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+	addi	r4,r4,-1
+	bdnz	1b
+
+.Ldo_err3:
+	mr	r3,r4
+	blr
+
+_GLOBAL(__clear_user)
+	cmpdi	r4,32
+	neg	r6,r3
+	li	r0,0
+	blt	.Lshort_clear
+	mr	r8,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	/* Get the destination 8 byte aligned */
+	bf	cr7*4+3,1f
+err1;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r4,r4,r6
+
+	cmpdi	r4,32
+	cmpdi	cr1,r4,512
+	blt	.Lshort_clear
+	bgt	cr1,.Llong_clear
+
+.Lmedium_clear:
+	srdi	r6,r4,5
+	mtctr	r6
+
+	/* Do 32 byte chunks */
+4:
+err2;	std	r0,0(r3)
+err2;	std	r0,8(r3)
+err2;	std	r0,16(r3)
+err2;	std	r0,24(r3)
+	addi	r3,r3,32
+	addi	r4,r4,-32
+	bdnz	4b
+
+.Lshort_clear:
+	/* up to 31 bytes to go */
+	cmpdi	r4,16
+	blt	6f
+err2;	std	r0,0(r3)
+err2;	std	r0,8(r3)
+	addi	r3,r3,16
+	addi	r4,r4,-16
+
+	/* Up to 15 bytes to go */
+6:	mr	r8,r3
+	clrldi	r4,r4,(64-4)
+	mtocrf	0x01,r4
+	bf	cr7*4+0,7f
+err1;	std	r0,0(r3)
+	addi	r3,r3,8
+
+7:	bf	cr7*4+1,8f
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+8:	bf	cr7*4+2,9f
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+9:	bf	cr7*4+3,10f
+err1;	stb	r0,0(r3)
+
+10:	li	r3,0
+	blr
+
+.Llong_clear:
+	ld	r5,PPC64_CACHES@toc(r2)
+
+	bf	cr7*4+0,11f
+err2;	std	r0,0(r3)
+	addi	r3,r3,8
+	addi	r4,r4,-8
+
+	/* Destination is 16 byte aligned, need to get it cacheline aligned */
+11:	lwz	r7,DCACHEL1LOGLINESIZE(r5)
+	lwz	r9,DCACHEL1LINESIZE(r5)
+
+	/*
+	 * With worst case alignment the long clear loop takes a minimum
+	 * of 1 byte less than 2 cachelines.
+	 */
+	sldi	r10,r9,2
+	cmpd	r4,r10
+	blt	.Lmedium_clear
+
+	neg	r6,r3
+	addi	r10,r9,-1
+	and.	r5,r6,r10
+	beq	13f
+
+	srdi	r6,r5,4
+	mtctr	r6
+	mr	r8,r3
+12:
+err1;	std	r0,0(r3)
+err1;	std	r0,8(r3)
+	addi	r3,r3,16
+	bdnz	12b
+
+	sub	r4,r4,r5
+
+13:	srd	r6,r4,r7
+	mtctr	r6
+	mr	r8,r3
+14:
+err1;	dcbz	r0,r3
+	add	r3,r3,r9
+	bdnz	14b
+
+	and	r4,r4,r10
+
+	cmpdi	r4,32
+	blt	.Lshort_clear
+	b	.Lmedium_clear
diff --git a/arch/powerpc/lib/copyuser_power7_vmx.c b/arch/powerpc/lib/vmx-helper.c
similarity index 79%
rename from arch/powerpc/lib/copyuser_power7_vmx.c
rename to arch/powerpc/lib/vmx-helper.c
index bf2654f2..3cf529c 100644
--- a/arch/powerpc/lib/copyuser_power7_vmx.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -22,7 +22,7 @@
 #include <linux/hardirq.h>
 #include <asm/switch_to.h>
 
-int enter_vmx_copy(void)
+int enter_vmx_usercopy(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -44,8 +44,31 @@
  * This function must return 0 because we tail call optimise when calling
  * from __copy_tofrom_user_power7 which returns 0 on success.
  */
-int exit_vmx_copy(void)
+int exit_vmx_usercopy(void)
 {
 	pagefault_enable();
 	return 0;
 }
+
+int enter_vmx_copy(void)
+{
+	if (in_interrupt())
+		return 0;
+
+	preempt_disable();
+
+	enable_kernel_altivec();
+
+	return 1;
+}
+
+/*
+ * All calls to this function will be optimised into tail calls. We are
+ * passed a pointer to the destination which we return as required by a
+ * memcpy implementation.
+ */
+void *exit_vmx_copy(void *dest)
+{
+	preempt_enable();
+	return dest;
+}
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 1e95556..39b1597 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -340,6 +340,8 @@
 				dbg("Using form 1 affinity\n");
 				form1_affinity = 1;
 			}
+
+			of_node_put(chosen);
 		}
 	}
 
diff --git a/arch/powerpc/platforms/44x/currituck.c b/arch/powerpc/platforms/44x/currituck.c
index 583e67f..9f6c33d6 100644
--- a/arch/powerpc/platforms/44x/currituck.c
+++ b/arch/powerpc/platforms/44x/currituck.c
@@ -160,7 +160,7 @@
 	/* No need to check the DMA config as we /know/ our windows are all of
  	 * RAM.  Lets hope that doesn't change */
 #ifdef CONFIG_SWIOTLB
-	if (memblock_end_of_DRAM() > 0xffffffff) {
+	if ((memblock_end_of_DRAM() - 1) > 0xffffffff) {
 		ppc_swiotlb_enable = 1;
 		set_pci_dma_ops(&swiotlb_dma_ops);
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
diff --git a/arch/powerpc/platforms/85xx/corenet_ds.c b/arch/powerpc/platforms/85xx/corenet_ds.c
index dd3617c..925b028 100644
--- a/arch/powerpc/platforms/85xx/corenet_ds.c
+++ b/arch/powerpc/platforms/85xx/corenet_ds.c
@@ -77,7 +77,7 @@
 #endif
 
 #ifdef CONFIG_SWIOTLB
-	if (memblock_end_of_DRAM() > max) {
+	if ((memblock_end_of_DRAM() - 1) > max) {
 		ppc_swiotlb_enable = 1;
 		set_pci_dma_ops(&swiotlb_dma_ops);
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
diff --git a/arch/powerpc/platforms/85xx/ge_imp3a.c b/arch/powerpc/platforms/85xx/ge_imp3a.c
index 1801462..b6a728b 100644
--- a/arch/powerpc/platforms/85xx/ge_imp3a.c
+++ b/arch/powerpc/platforms/85xx/ge_imp3a.c
@@ -125,7 +125,7 @@
 	mpc85xx_smp_init();
 
 #ifdef CONFIG_SWIOTLB
-	if (memblock_end_of_DRAM() > max) {
+	if ((memblock_end_of_DRAM() - 1) > max) {
 		ppc_swiotlb_enable = 1;
 		set_pci_dma_ops(&swiotlb_dma_ops);
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c
index 585bd22b..767c7cf 100644
--- a/arch/powerpc/platforms/85xx/mpc8536_ds.c
+++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c
@@ -75,7 +75,7 @@
 #endif
 
 #ifdef CONFIG_SWIOTLB
-	if (memblock_end_of_DRAM() > max) {
+	if ((memblock_end_of_DRAM() - 1) > max) {
 		ppc_swiotlb_enable = 1;
 		set_pci_dma_ops(&swiotlb_dma_ops);
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c
index 1fd91e9..d30f6c4 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c
@@ -173,7 +173,7 @@
 	mpc85xx_smp_init();
 
 #ifdef CONFIG_SWIOTLB
-	if (memblock_end_of_DRAM() > max) {
+	if ((memblock_end_of_DRAM() - 1) > max) {
 		ppc_swiotlb_enable = 1;
 		set_pci_dma_ops(&swiotlb_dma_ops);
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c
index d208ebc..8e4b094 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c
@@ -359,7 +359,7 @@
 	mpc85xx_mds_qe_init();
 
 #ifdef CONFIG_SWIOTLB
-	if (memblock_end_of_DRAM() > max) {
+	if ((memblock_end_of_DRAM() - 1) > max) {
 		ppc_swiotlb_enable = 1;
 		set_pci_dma_ops(&swiotlb_dma_ops);
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c
index f700c81..74e310b 100644
--- a/arch/powerpc/platforms/85xx/p1022_ds.c
+++ b/arch/powerpc/platforms/85xx/p1022_ds.c
@@ -450,7 +450,7 @@
 	mpc85xx_smp_init();
 
 #ifdef CONFIG_SWIOTLB
-	if (memblock_end_of_DRAM() > max) {
+	if ((memblock_end_of_DRAM() - 1) > max) {
 		ppc_swiotlb_enable = 1;
 		set_pci_dma_ops(&swiotlb_dma_ops);
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c
index 3755e61..817245b 100644
--- a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c
+++ b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c
@@ -102,7 +102,7 @@
 #endif
 
 #ifdef CONFIG_SWIOTLB
-	if (memblock_end_of_DRAM() > max) {
+	if ((memblock_end_of_DRAM() - 1) > max) {
 		ppc_swiotlb_enable = 1;
 		set_pci_dma_ops(&swiotlb_dma_ops);
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index b9f509a..c264969 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -518,7 +518,6 @@
 	__set_bit(0, window->table.it_map);
 	tce_build_cell(&window->table, window->table.it_offset, 1,
 		       (unsigned long)iommu->pad_page, DMA_TO_DEVICE, NULL);
-	window->table.it_hint = window->table.it_blocksize;
 
 	return window;
 }
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 2d311c0..07c09cb 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -192,12 +192,15 @@
 	long l, limit;
 	long tcenum_start = tcenum, npages_start = npages;
 	int ret = 0;
+	unsigned long flags;
 
 	if (npages == 1) {
 		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 		                           direction, attrs);
 	}
 
+	local_irq_save(flags);	/* to protect tcep and the page behind it */
+
 	tcep = __get_cpu_var(tce_page);
 
 	/* This is safe to do since interrupts are off when we're called
@@ -207,6 +210,7 @@
 		tcep = (u64 *)__get_free_page(GFP_ATOMIC);
 		/* If allocation fails, fall back to the loop implementation */
 		if (!tcep) {
+			local_irq_restore(flags);
 			return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 					    direction, attrs);
 		}
@@ -240,6 +244,8 @@
 		tcenum += limit;
 	} while (npages > 0 && !rc);
 
+	local_irq_restore(flags);
+
 	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
 		ret = (int)rc;
 		tce_freemulti_pSeriesLP(tbl, tcenum_start,
@@ -707,6 +713,21 @@
 
 early_param("disable_ddw", disable_ddw_setup);
 
+static inline void __remove_ddw(struct device_node *np, const u32 *ddw_avail, u64 liobn)
+{
+	int ret;
+
+	ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+	if (ret)
+		pr_warning("%s: failed to remove DMA window: rtas returned "
+			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
+			np->full_name, ret, ddw_avail[2], liobn);
+	else
+		pr_debug("%s: successfully removed DMA window: rtas returned "
+			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
+			np->full_name, ret, ddw_avail[2], liobn);
+}
+
 static void remove_ddw(struct device_node *np)
 {
 	struct dynamic_dma_window_prop *dwp;
@@ -736,15 +757,7 @@
 		pr_debug("%s successfully cleared tces in window.\n",
 			 np->full_name);
 
-	ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
-	if (ret)
-		pr_warning("%s: failed to remove direct window: rtas returned "
-			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-			np->full_name, ret, ddw_avail[2], liobn);
-	else
-		pr_debug("%s: successfully removed direct window: rtas returned "
-			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-			np->full_name, ret, ddw_avail[2], liobn);
+	__remove_ddw(np, ddw_avail, liobn);
 
 delprop:
 	ret = prom_remove_property(np, win64);
@@ -869,6 +882,35 @@
 	return ret;
 }
 
+static void restore_default_window(struct pci_dev *dev,
+				u32 ddw_restore_token, unsigned long liobn)
+{
+	struct eeh_dev *edev;
+	u32 cfg_addr;
+	u64 buid;
+	int ret;
+
+	/*
+	 * Get the config address and phb buid of the PE window.
+	 * Rely on eeh to retrieve this for us.
+	 * Retrieve them from the pci device, not the node with the
+	 * dma-window property
+	 */
+	edev = pci_dev_to_eeh_dev(dev);
+	cfg_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		cfg_addr = edev->pe_config_addr;
+	buid = edev->phb->buid;
+
+	do {
+		ret = rtas_call(ddw_restore_token, 3, 1, NULL, cfg_addr,
+					BUID_HI(buid), BUID_LO(buid));
+	} while (rtas_busy_delay(ret));
+	dev_info(&dev->dev,
+		"ibm,reset-pe-dma-windows(%x) %x %x %x returned %d\n",
+		 ddw_restore_token, cfg_addr, BUID_HI(buid), BUID_LO(buid), ret);
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -889,9 +931,13 @@
 	u64 dma_addr, max_addr;
 	struct device_node *dn;
 	const u32 *uninitialized_var(ddw_avail);
+	const u32 *uninitialized_var(ddw_extensions);
+	u32 ddw_restore_token = 0;
 	struct direct_window *window;
 	struct property *win64;
 	struct dynamic_dma_window_prop *ddwprop;
+	const void *dma_window = NULL;
+	unsigned long liobn, offset, size;
 
 	mutex_lock(&direct_window_init_mutex);
 
@@ -911,7 +957,40 @@
 	if (!ddw_avail || len < 3 * sizeof(u32))
 		goto out_unlock;
 
-       /*
+	/*
+	 * The ibm,ddw-extensions property is only required to exist at
+	 * certain firmware levels and later.  It is a list whose first
+	 * element contains the number of extensions present; each
+	 * subsequent entry is a value corresponding to that
+	 * extension.
+	 */
+	ddw_extensions = of_get_property(pdn, "ibm,ddw-extensions", &len);
+	if (ddw_extensions) {
+		/*
+		 * Each newly defined extension must be added at the top
+		 * of this switch so that the "earlier" entries are also
+		 * picked up.
+		 */
+		switch (ddw_extensions[0]) {
+			/* ibm,reset-pe-dma-windows */
+			case 1:
+				ddw_restore_token = ddw_extensions[1];
+				break;
+		}
+	}
+
+	/*
+	 * Only remove the existing DMA window if we can restore back to
+	 * the default state. Removing the existing window maximizes the
+	 * resources available to firmware for dynamic window creation.
+	 */
+	if (ddw_restore_token) {
+		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+		of_parse_dma_window(pdn, dma_window, &liobn, &offset, &size);
+		__remove_ddw(pdn, ddw_avail, liobn);
+	}
+
+	/*
 	 * Query if there is a second window of size to map the
 	 * whole partition.  Query returns number of windows, largest
 	 * block assigned to PE (partition endpoint), and two bitmasks
@@ -920,7 +999,7 @@
 	dn = pci_device_to_OF_node(dev);
 	ret = query_ddw(dev, ddw_avail, &query);
 	if (ret != 0)
-		goto out_unlock;
+		goto out_restore_window;
 
 	if (query.windows_available == 0) {
 		/*
@@ -929,7 +1008,7 @@
 		 * trading in for a larger page size.
 		 */
 		dev_dbg(&dev->dev, "no free dynamic windows");
-		goto out_unlock;
+		goto out_restore_window;
 	}
 	if (query.page_size & 4) {
 		page_shift = 24; /* 16MB */
@@ -940,7 +1019,7 @@
 	} else {
 		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
 			  query.page_size);
-		goto out_unlock;
+		goto out_restore_window;
 	}
 	/* verify the window * number of ptes will map the partition */
 	/* check largest block * page size > max memory hotplug addr */
@@ -949,14 +1028,14 @@
 		dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "
 			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
 			  1ULL << page_shift);
-		goto out_unlock;
+		goto out_restore_window;
 	}
 	len = order_base_2(max_addr);
 	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
 	if (!win64) {
 		dev_info(&dev->dev,
 			"couldn't allocate property for 64bit dma window\n");
-		goto out_unlock;
+		goto out_restore_window;
 	}
 	win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
 	win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
@@ -1018,6 +1097,10 @@
 	kfree(win64->value);
 	kfree(win64);
 
+out_restore_window:
+	if (ddw_restore_token)
+		restore_default_window(dev, ddw_restore_token, liobn);
+
 out_unlock:
 	mutex_unlock(&direct_window_init_mutex);
 	return dma_addr;
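
The enable_ddw() changes above consult the optional ibm,ddw-extensions property, whose first cell is the number of extensions and whose following cells are per-extension values; extension 1 carries the RTAS token for ibm,reset-pe-dma-windows. Only when that token is present is the default DMA window removed up front (to give firmware the most room for a dynamic window), and every failure path then goes through out_restore_window so the default window is put back. A small standalone sketch of the property parsing, with a made-up token value (the real code reads the cells via of_get_property()):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Pretend contents of ibm,ddw-extensions: 1 extension, token 0x2b. */
	uint32_t ddw_extensions[] = { 1, 0x2b };
	uint32_t ddw_restore_token = 0;

	switch (ddw_extensions[0]) {
	case 1:		/* extension 1: ibm,reset-pe-dma-windows RTAS token */
		ddw_restore_token = ddw_extensions[1];
		break;
	}

	if (ddw_restore_token)
		printf("reset token %#x found: safe to remove the default window first\n",
		       (unsigned int)ddw_restore_token);
	else
		printf("no reset token: leave the default window alone\n");
	return 0;
}
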
diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c
index c71be66..7f5668b 100644
--- a/arch/powerpc/platforms/pseries/processor_idle.c
+++ b/arch/powerpc/platforms/pseries/processor_idle.c
@@ -11,6 +11,7 @@
 #include <linux/moduleparam.h>
 #include <linux/cpuidle.h>
 #include <linux/cpu.h>
+#include <linux/notifier.h>
 
 #include <asm/paca.h>
 #include <asm/reg.h>
@@ -189,17 +190,28 @@
 		.enter = &shared_cede_loop },
 };
 
-int pseries_notify_cpuidle_add_cpu(int cpu)
+static int pseries_cpuidle_add_cpu_notifier(struct notifier_block *n,
+			unsigned long action, void *hcpu)
 {
+	int hotcpu = (unsigned long)hcpu;
 	struct cpuidle_device *dev =
-			per_cpu_ptr(pseries_cpuidle_devices, cpu);
-	if (dev && cpuidle_get_driver()) {
-		cpuidle_disable_device(dev);
-		cpuidle_enable_device(dev);
+			per_cpu_ptr(pseries_cpuidle_devices, hotcpu);
+
+	switch (action & 0xf) {
+	case CPU_ONLINE:
+		if (dev && cpuidle_get_driver()) {
+			cpuidle_disable_device(dev);
+			cpuidle_enable_device(dev);
+		}
+		break;
 	}
-	return 0;
+	return NOTIFY_OK;
 }
 
+static struct notifier_block setup_hotplug_notifier = {
+	.notifier_call = pseries_cpuidle_add_cpu_notifier,
+};
+
 /*
  * pseries_cpuidle_driver_init()
  */
@@ -324,6 +336,7 @@
 		return retval;
 	}
 
+	register_cpu_notifier(&setup_hotplug_notifier);
 	printk(KERN_DEBUG "pseries_idle_driver registered\n");
 
 	return 0;
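
The processor_idle.c change above drops the exported pseries_notify_cpuidle_add_cpu() helper in favour of a CPU hotplug notifier that disables and re-enables a CPU's cpuidle device when it comes online; the explicit call from smp.c is removed in the next hunk. A userspace toy model of that notifier flow, where every name, the single-entry chain and the action value are purely illustrative (the kernel uses register_cpu_notifier() and returns NOTIFY_OK):

#include <stdio.h>

#define DEMO_CPU_ONLINE 0x2			/* stand-in for the kernel's CPU_ONLINE action */

struct demo_notifier_block {
	int (*notifier_call)(struct demo_notifier_block *nb,
			     unsigned long action, void *hcpu);
};

static struct demo_notifier_block *demo_chain;	/* one-entry "chain" for the demo */

static void demo_register_cpu_notifier(struct demo_notifier_block *nb)
{
	demo_chain = nb;
}

static void demo_cpu_up(unsigned long cpu)
{
	if (demo_chain)
		demo_chain->notifier_call(demo_chain, DEMO_CPU_ONLINE, (void *)cpu);
}

/* Mirrors the shape of pseries_cpuidle_add_cpu_notifier(). */
static int demo_cpuidle_notifier(struct demo_notifier_block *nb,
				 unsigned long action, void *hcpu)
{
	unsigned long hotcpu = (unsigned long)hcpu;

	switch (action & 0xf) {
	case DEMO_CPU_ONLINE:
		printf("cpu %lu online: disable + re-enable its cpuidle device\n",
		       hotcpu);
		break;
	}
	return 0;				/* NOTIFY_OK in the kernel */
}

static struct demo_notifier_block demo_nb = {
	.notifier_call = demo_cpuidle_notifier,
};

int main(void)
{
	demo_register_cpu_notifier(&demo_nb);	/* done once at driver init time */
	demo_cpu_up(3);				/* simulate a hotplug-online event */
	return 0;
}
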
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index e16bb8d..71706bc 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -147,7 +147,6 @@
 	set_cpu_current_state(cpu, CPU_STATE_ONLINE);
 	set_default_offline_state(cpu);
 #endif
-	pseries_notify_cpuidle_add_cpu(cpu);
 }
 
 static int __devinit smp_pSeries_kick_cpu(int nr)