Merge branches 'release', 'idle', 'redhat-bugzilla-436589', 'sbs' and 'video' into release

commit: 86d9fc1293aa9456677eab94e9fd2d3a10920548 [log] [tgz]
author: Len Brown <len.brown@intel.com> Wed Mar 26 22:50:09 2008 -0400
committer: Len Brown <len.brown@intel.com> Wed Mar 26 22:50:09 2008 -0400
tree: 104309ac106d4c9328d88c6e1360e2060412c328
parent: 08dcf29e01dcb786c13dc80045bd65f804117efb [diff]
parent: 8e92b6605da989c0aa8ff7e33306f36f0efd957c [diff]
parent: 33fd7afd66ffdc6addf1b085fe6403b6af532f8e [diff]
parent: 7642d2113098f1270e9f9f0120f44d0035091636 [diff]
diff --git a/Documentation/i386/IO-APIC.txt b/Documentation/i386/IO-APIC.txt
index f951666..30b4c71 100644
--- a/Documentation/i386/IO-APIC.txt
+++ b/Documentation/i386/IO-APIC.txt

@@ -70,7 +70,7 @@
 
 These INTA-D PCI IRQs are always 'local to the card', their real meaning
 depends on which slot they are in. If you look at the daisy chaining diagram,
-a card in slot4, issuing INTA IRQ, it will end up as a signal on PIRQ2 of
+a card in slot4, issuing INTA IRQ, it will end up as a signal on PIRQ4 of
 the PCI chipset. Most cards issue INTA, this creates optimal distribution
 between the PIRQ lines. (distributing IRQ sources properly is not a
 necessity, PCI IRQs can be shared at will, but it's a good for performance

diff --git a/MAINTAINERS b/MAINTAINERS
index 73883b8..2f70e5c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS

@@ -2322,6 +2322,8 @@
 M:	anil.s.keshavamurthy@intel.com
 P:	David S. Miller
 M:	davem@davemloft.net
+P:	Masami Hiramatsu
+M:	mhiramat@redhat.com
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 

diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index f2b5a621d..8a85c93 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c

@@ -63,7 +63,7 @@
  */
 static int speedstep_smi_ownership (void)
 {
-	u32 command, result, magic;
+	u32 command, result, magic, dummy;
 	u32 function = GET_SPEEDSTEP_OWNER;
 	unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
 
@@ -73,8 +73,11 @@
 	dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port);
 
 	__asm__ __volatile__(
+		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
-		: "=D" (result)
+		"pop %%ebp\n"
+		: "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
+			"=S" (dummy)
 		: "a" (command), "b" (function), "c" (0), "d" (smi_port),
 			"D" (0), "S" (magic)
 		: "memory"
@@ -96,7 +99,7 @@
  */
 static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
 {
-	u32 command, result = 0, edi, high_mhz, low_mhz;
+	u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
 	u32 state=0;
 	u32 function = GET_SPEEDSTEP_FREQS;
 
@@ -109,10 +112,12 @@
 
 	dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port);
 
-	__asm__ __volatile__("movl $0, %%edi\n"
+	__asm__ __volatile__(
+		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
-		: "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi)
-		: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
+		"pop %%ebp"
+		: "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy)
+		: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0)
 	);
 
 	dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz);
@@ -135,16 +140,18 @@
 static int speedstep_get_state (void)
 {
 	u32 function=GET_SPEEDSTEP_STATE;
-	u32 result, state, edi, command;
+	u32 result, state, edi, command, dummy;
 
 	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
 
 	dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port);
 
-	__asm__ __volatile__("movl $0, %%edi\n"
+	__asm__ __volatile__(
+		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
-		: "=a" (result), "=b" (state), "=D" (edi)
-		: "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0)
+		"pop %%ebp\n"
+		: "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy)
+		: "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0)
 	);
 
 	dprintk("state is %x, result is %x\n", state, result);
@@ -160,7 +167,7 @@
  */
 static void speedstep_set_state (unsigned int state)
 {
-	unsigned int result = 0, command, new_state;
+	unsigned int result = 0, command, new_state, dummy;
 	unsigned long flags;
 	unsigned int function=SET_SPEEDSTEP_STATE;
 	unsigned int retry = 0;
@@ -182,10 +189,12 @@
 		}
 		retry++;
 		__asm__ __volatile__(
-			"movl $0, %%edi\n"
+			"push %%ebp\n"
 			"out %%al, (%%dx)\n"
-			: "=b" (new_state), "=D" (result)
-			: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
+			"pop %%ebp"
+			: "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy),
+				"=d" (dummy), "=S" (dummy)
+			: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0)
 			);
 	} while ((new_state != state) && (retry <= SMI_TRIES));
 
@@ -195,7 +204,7 @@
 	if (new_state == state) {
 		dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result);
 	} else {
-		printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result);
+		printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result);
 	}
 
 	return;

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 103d61a..3e18db4 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c

@@ -176,12 +176,13 @@
 }
 
 /**
- * Checks and updates an fixed-range MTRR if it differs from the value it
- * should have. If K8 extentions are wanted, update the K8 SYSCFG MSR also.
- * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information
- * \param msr MSR address of the MTTR which should be checked and updated
- * \param changed pointer which indicates whether the MTRR needed to be changed
- * \param msrwords pointer to the MSR values which the MSR should have
+ * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
+ * @msr: MSR address of the MTTR which should be checked and updated
+ * @changed: pointer which indicates whether the MTRR needed to be changed
+ * @msrwords: pointer to the MSR values which the MSR should have
+ *
+ * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
+ * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
  */
 static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 {
@@ -199,12 +200,15 @@
 	}
 }
 
+/**
+ * generic_get_free_region - Get a free MTRR.
+ * @base: The starting (base) address of the region.
+ * @size: The size (in bytes) of the region.
+ * @replace_reg: mtrr index to be replaced; set to invalid value if none.
+ *
+ * Returns: The index of the region on success, else negative on error.
+ */
 int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
-/*  [SUMMARY] Get a free MTRR.
-    <base> The starting (base) address of the region.
-    <size> The size (in bytes) of the region.
-    [RETURNS] The index of the region on success, else -1 on error.
-*/
 {
 	int i, max;
 	mtrr_type ltype;
@@ -249,8 +253,8 @@
 }
 
 /**
- * Checks and updates the fixed-range MTRRs if they differ from the saved set
- * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
+ * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set
+ * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
  */
 static int set_fixed_ranges(mtrr_type * frs)
 {
@@ -294,13 +298,13 @@
 
 static u32 deftype_lo, deftype_hi;
 
+/**
+ * set_mtrr_state - Set the MTRR state for this CPU.
+ *
+ * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * RETURNS: 0 if no changes made, else a mask indicating what was changed.
+ */
 static unsigned long set_mtrr_state(void)
-/*  [SUMMARY] Set the MTRR state for this CPU.
-    <state> The MTRR state information to read.
-    <ctxt> Some relevant CPU context.
-    [NOTE] The CPU must already be in a safe state for MTRR changes.
-    [RETURNS] 0 if no changes made, else a mask indication what was changed.
-*/
 {
 	unsigned int i;
 	unsigned long change_mask = 0;

diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index c706a30..5921e5f 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c

@@ -78,6 +78,14 @@
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
+		.ident		= "HP Pavilion dv6000",
+		.matches	= {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME, "30B8")
+		}
+	},
+	{
+		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "HP Pavilion tx1000",
 		.matches	= {
 			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),

diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 027fc06..b402c0f 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c

@@ -30,6 +30,7 @@
 
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/module.h>
 #include <asm/geode.h>
 
 static struct mfgpt_timer_t {

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index a1d7071..2b3e5d4 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c

@@ -406,8 +406,6 @@
 	 */
 	min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	find_max_pfn();
-
 	max_low_pfn = find_max_low_pfn();
 
 #ifdef CONFIG_HIGHMEM
@@ -764,12 +762,13 @@
 	if (efi_enabled)
 		efi_init();
 
-	max_low_pfn = setup_memory();
-
 	/* update e820 for memory not covered by WB MTRRs */
+	find_max_pfn();
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(max_pfn))
-		max_low_pfn = setup_memory();
+		find_max_pfn();
+
+	max_low_pfn = setup_memory();
 
 #ifdef CONFIG_VMI
 	/*

diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 7637dc9..f4f7ecf 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c

@@ -801,7 +801,7 @@
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
 	node = apicid_to_node[apicid];
-	if (node == NUMA_NO_NODE)
+	if (node == NUMA_NO_NODE || !node_online(node))
 		node = first_node(node_online_map);
 	numa_set_node(cpu, node);
 

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index c394ca0..8e25e06 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c

@@ -324,7 +324,6 @@
 	 * this space and use it to adjust the boundary between ZONE_NORMAL
 	 * and ZONE_HIGHMEM.
 	 */
-	find_max_pfn();
 	get_memcfg_numa();
 
 	kva_pages = calculate_numa_remap_pages();

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4afaba0..794895c 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c

@@ -137,7 +137,11 @@
 	switch (mode) {
 	case IOR_MODE_UNCACHED:
 	default:
-		prot = PAGE_KERNEL_NOCACHE;
+		/*
+		 * FIXME: we will use UC MINUS for now, as video fb drivers
+		 * depend on it. Upcoming ioremap_wc() will fix this behavior.
+		 */
+		prot = PAGE_KERNEL_UC_MINUS;
 		break;
 	case IOR_MODE_CACHED:
 		prot = PAGE_KERNEL;

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 14e48b5..7b79f6b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c

@@ -771,7 +771,7 @@
 int set_memory_uc(unsigned long addr, int numpages)
 {
 	return change_page_attr_set(addr, numpages,
-				    __pgprot(_PAGE_PCD | _PAGE_PWT));
+				    __pgprot(_PAGE_PCD));
 }
 EXPORT_SYMBOL(set_memory_uc);
 

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index e8e2d88..788da97 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c

@@ -1487,7 +1487,6 @@
 		return 0;
 	}
 
-	acpi_unlazy_tlb(smp_processor_id());
 	/*
 	 * Must be done before busmaster disable as we might need to
 	 * access HPET !
@@ -1577,6 +1576,8 @@
 		return 0;
 	}
 
+	acpi_unlazy_tlb(smp_processor_id());
+
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	/*
@@ -1692,7 +1693,9 @@
 		switch (cx->type) {
 			case ACPI_STATE_C1:
 			state->flags |= CPUIDLE_FLAG_SHALLOW;
-			state->flags |= CPUIDLE_FLAG_TIME_VALID;
+			if (cx->entry_method == ACPI_CSTATE_FFH)
+				state->flags |= CPUIDLE_FLAG_TIME_VALID;
+
 			state->enter = acpi_idle_enter_c1;
 			dev->safe_state = state;
 			break;

diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c
index a2cf300..bcf2c70 100644
--- a/drivers/acpi/sbshc.c
+++ b/drivers/acpi/sbshc.c

@@ -130,7 +130,6 @@
 		goto end;
 	}
 	smb_hc_write(hc, ACPI_SMB_COMMAND, command);
-	smb_hc_write(hc, ACPI_SMB_COMMAND, command);
 	if (!(protocol & 0x01)) {
 		smb_hc_write(hc, ACPI_SMB_BLOCK_COUNT, length);
 		for (i = 0; i < length; ++i)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 57570ac..e6ce262 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c

@@ -39,20 +39,26 @@
 			   int size)
 {
 	int len;
+	int count;
 
-	if (!acpi_dev->flags.hardware_id)
+	if (!acpi_dev->flags.hardware_id && !acpi_dev->flags.compatible_ids)
 		return -ENODEV;
 
-	len = snprintf(modalias, size, "acpi:%s:",
-		       acpi_dev->pnp.hardware_id);
-	if (len < 0 || len >= size)
-		return -EINVAL;
+	len = snprintf(modalias, size, "acpi:");
 	size -= len;
 
+	if (acpi_dev->flags.hardware_id) {
+		count = snprintf(&modalias[len], size, "%s:",
+				 acpi_dev->pnp.hardware_id);
+		if (count < 0 || count >= size)
+			return -EINVAL;
+		len += count;
+		size -= count;
+	}
+
 	if (acpi_dev->flags.compatible_ids) {
 		struct acpi_compatible_id_list *cid_list;
 		int i;
-		int count;
 
 		cid_list = acpi_dev->pnp.cid_list;
 		for (i = 0; i < cid_list->count; i++) {

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index d73663a..fc555a9 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c

@@ -67,7 +67,7 @@
 	/* enter the state and update stats */
 	dev->last_residency = target_state->enter(dev, target_state);
 	dev->last_state = target_state;
-	target_state->time += dev->last_residency;
+	target_state->time += (unsigned long long)dev->last_residency;
 	target_state->usage++;
 
 	/* give the governor an opportunity to reflect on the outcome */
@@ -224,7 +224,7 @@
 	state->exit_latency = 0;
 	state->target_residency = 0;
 	state->power_usage = -1;
-	state->flags = CPUIDLE_FLAG_POLL | CPUIDLE_FLAG_TIME_VALID;
+	state->flags = CPUIDLE_FLAG_POLL;
 	state->enter = poll_idle;
 }
 #else

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 69102ca..e949618 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c

@@ -218,6 +218,12 @@
 	return sprintf(buf, "%u\n", state->_name);\
 }
 
+#define define_show_state_ull_function(_name) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
+{ \
+	return sprintf(buf, "%llu\n", state->_name);\
+}
+
 #define define_show_state_str_function(_name) \
 static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
 { \
@@ -228,8 +234,8 @@
 
 define_show_state_function(exit_latency)
 define_show_state_function(power_usage)
-define_show_state_function(usage)
-define_show_state_function(time)
+define_show_state_ull_function(usage)
+define_show_state_ull_function(time)
 define_show_state_str_function(name)
 define_show_state_str_function(desc)
 

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 125e7b7..f7cb8e0 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c

@@ -486,12 +486,7 @@
 		break;
 
 	case PCI_CLASS_BRIDGE_PCI:
-		/* don't size subtractive decoding (transparent)
-		 * PCI-to-PCI bridges */
-		if (bus->self->transparent)
-			break;
 		pci_bridge_check_ranges(bus);
-		/* fall through */
 	default:
 		pbus_size_io(bus);
 		/* If the bridge supports prefetchable range, size it

diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 174b877..9cf472a 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h

@@ -85,6 +85,7 @@
 #define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
 #define __PAGE_KERNEL_EXEC_NOCACHE	(__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
 #define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS		(__PAGE_KERNEL | _PAGE_PCD)
 #define __PAGE_KERNEL_VSYSCALL		(__PAGE_KERNEL_RX | _PAGE_USER)
 #define __PAGE_KERNEL_VSYSCALL_NOCACHE	(__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
@@ -101,6 +102,7 @@
 #define PAGE_KERNEL_EXEC		MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
 #define PAGE_KERNEL_RX			MAKE_GLOBAL(__PAGE_KERNEL_RX)
 #define PAGE_KERNEL_NOCACHE		MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS		MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
 #define PAGE_KERNEL_EXEC_NOCACHE	MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
 #define PAGE_KERNEL_LARGE		MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
 #define PAGE_KERNEL_LARGE_EXEC		MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)

diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 6b72a45..51e6b1e 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h

@@ -38,8 +38,8 @@
 	unsigned int	power_usage; /* in mW */
 	unsigned int	target_residency; /* in US */
 
-	unsigned int	usage;
-	unsigned int	time; /* in US */
+	unsigned long long	usage;
+	unsigned long long	time; /* in US */
 
 	int (*enter)	(struct cpuidle_device *dev,
 			 struct cpuidle_state *state);

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fed07d0..6a1e7af 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h

@@ -1541,6 +1541,12 @@
 
 extern void sched_idle_next(void);
 
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+extern void wake_up_idle_cpu(int cpu);
+#else
+static inline void wake_up_idle_cpu(int cpu) { }
+#endif
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;

diff --git a/kernel/relay.c b/kernel/relay.c
index 4c035a8..d6204a4 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c

@@ -736,7 +736,7 @@
 	kref_get(&buf->kref);
 	filp->private_data = buf;
 
-	return 0;
+	return nonseekable_open(inode, filp);
 }
 
 /**
@@ -1056,6 +1056,10 @@
 	.get = generic_pipe_buf_get,
 };
 
+static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+}
+
 /*
  *	subbuf_splice_actor - splice up to one subbuf's worth of data
  */
@@ -1083,6 +1087,7 @@
 		.partial = partial,
 		.flags = flags,
 		.ops = &relay_pipe_buf_ops,
+		.spd_release = relay_page_release,
 	};
 
 	if (rbuf->subbufs_produced == rbuf->subbufs_consumed)

diff --git a/kernel/sched.c b/kernel/sched.c
index 28c73f0..8dcdec6f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c

@@ -1052,6 +1052,49 @@
 	resched_task(cpu_curr(cpu));
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
+
+#ifdef CONFIG_NO_HZ
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (cpu == smp_processor_id())
+		return;
+
+	/*
+	 * This is safe, as this function is called with the timer
+	 * wheel base lock of (cpu) held. When the CPU is on the way
+	 * to idle and has not yet set rq->curr to idle then it will
+	 * be serialized on the timer wheel base lock and take the new
+	 * timer into account automatically.
+	 */
+	if (rq->curr != rq->idle)
+		return;
+
+	/*
+	 * We can set TIF_RESCHED on the idle task of the other CPU
+	 * lockless. The worst case is that the other CPU runs the
+	 * idle task through an additional NOOP schedule()
+	 */
+	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(rq->idle))
+		smp_send_reschedule(cpu);
+}
+#endif
+
 #else
 static void __resched_task(struct task_struct *p, int tif_bit)
 {

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 278534b..7f60097 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c

@@ -174,7 +174,7 @@
 			if (watchdog)
 				del_timer(&watchdog_timer);
 			watchdog = cs;
-			init_timer_deferrable(&watchdog_timer);
+			init_timer(&watchdog_timer);
 			watchdog_timer.function = clocksource_watchdog;
 
 			/* Reset watchdog cycles */

diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a2..b024106 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c

@@ -451,10 +451,18 @@
 	spin_lock_irqsave(&base->lock, flags);
 	timer_set_base(timer, base);
 	internal_add_timer(base, timer);
+	/*
+	 * Check whether the other CPU is idle and needs to be
+	 * triggered to reevaluate the timer wheel when nohz is
+	 * active. We are protected against the other CPU fiddling
+	 * with the timer by holding the timer base lock. This also
+	 * makes sure that a CPU on the way to idle can not evaluate
+	 * the timer wheel.
+	 */
+	wake_up_idle_cpu(cpu);
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
-
 /**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 74c1b6b..51c9e2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c

@@ -401,12 +401,20 @@
 	struct page *page;
 	unsigned long nr_pages;
 
+	/*
+	 * We want to release as many surplus pages as possible, spread
+	 * evenly across all nodes. Iterate across all nodes until we
+	 * can no longer free unreserved surplus pages. This occurs when
+	 * the nodes with surplus pages have no free pages.
+	 */
+	unsigned long remaining_iterations = num_online_nodes();
+
 	/* Uncommit the reservation */
 	resv_huge_pages -= unused_resv_pages;
 
 	nr_pages = min(unused_resv_pages, surplus_huge_pages);
 
-	while (nr_pages) {
+	while (remaining_iterations-- && nr_pages) {
 		nid = next_node(nid, node_online_map);
 		if (nid == MAX_NUMNODES)
 			nid = first_node(node_online_map);
@@ -424,6 +432,7 @@
 			surplus_huge_pages--;
 			surplus_huge_pages_node[nid]--;
 			nr_pages--;
+			remaining_iterations = num_online_nodes();
 		}
 	}
 }
@@ -671,9 +680,11 @@
 {
 	return sprintf(buf,
 		"Node %d HugePages_Total: %5u\n"
-		"Node %d HugePages_Free:  %5u\n",
+		"Node %d HugePages_Free:  %5u\n"
+		"Node %d HugePages_Surp:  %5u\n",
 		nid, nr_huge_pages_node[nid],
-		nid, free_huge_pages_node[nid]);
+		nid, free_huge_pages_node[nid],
+		nid, surplus_huge_pages_node[nid]);
 }
 
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */

diff --git a/mm/slab.c b/mm/slab.c
index bb4070e..04b308c 100644
--- a/mm/slab.c
+++ b/mm/slab.c

@@ -1481,7 +1481,7 @@
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
-	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
+	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
 
 	/*
 	 * struct kmem_cache size depends on nr_node_ids, which
@@ -1602,7 +1602,7 @@
 		int nid;
 
 		for_each_online_node(nid) {
-			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], nid);
+			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
 
 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
 				  &initkmem_list3[SIZE_AC + nid], nid);

diff --git a/mm/slub.c b/mm/slub.c
index ca71d5b..b72bc98 100644
--- a/mm/slub.c
+++ b/mm/slub.c

@@ -2685,6 +2685,7 @@
 }
 EXPORT_SYMBOL(kfree);
 
+#if defined(SLUB_DEBUG) || defined(CONFIG_SLABINFO)
 static unsigned long count_partial(struct kmem_cache_node *n)
 {
 	unsigned long flags;
@@ -2697,6 +2698,7 @@
 	spin_unlock_irqrestore(&n->list_lock, flags);
 	return x;
 }
+#endif
 
 /*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 9712716..c22d6b6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c

@@ -322,15 +322,6 @@
 		ctxt->direction = DMA_FROM_DEVICE;
 		clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
 		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
-		if ((ch+1)->rc_discrim == 0) {
-			/*
-			 * Checked in sq_cq_reap to see if we need to
-			 * be enqueued
-			 */
-			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
-			ctxt->next = hdr_ctxt;
-			hdr_ctxt->next = head;
-		}
 
 		/* Prepare READ WR */
 		memset(&read_wr, 0, sizeof read_wr);
@@ -348,7 +339,17 @@
 		rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
 				  &sgl_offset,
 				  read_wr.num_sge);
-
+		if (((ch+1)->rc_discrim == 0) &&
+		    (read_wr.num_sge == ch_sge_ary[ch_no].count)) {
+			/*
+			 * Mark the last RDMA_READ with a bit to
+			 * indicate all RPC data has been fetched from
+			 * the client and the RPC needs to be enqueued.
+			 */
+			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+			ctxt->next = hdr_ctxt;
+			hdr_ctxt->next = head;
+		}
 		/* Post the read */
 		err = svc_rdma_send(xprt, &read_wr);
 		if (err) {
commit	86d9fc1293aa9456677eab94e9fd2d3a10920548	[log] [tgz]
author	Len Brown <len.brown@intel.com>	Wed Mar 26 22:50:09 2008 -0400
committer	Len Brown <len.brown@intel.com>	Wed Mar 26 22:50:09 2008 -0400
tree	104309ac106d4c9328d88c6e1360e2060412c328
parent	08dcf29e01dcb786c13dc80045bd65f804117efb [diff]
parent	8e92b6605da989c0aa8ff7e33306f36f0efd957c [diff]
parent	33fd7afd66ffdc6addf1b085fe6403b6af532f8e [diff]
parent	7642d2113098f1270e9f9f0120f44d0035091636 [diff]
parent	5c9fcb5deef4d3a49798d76c48b726d2e3c7df72 [diff]