[PATCH] i386/x86-64: Share interrupt vectors when there is a large number of interrupt sources

Here's a patch that builds on Natalie Protasevich's IRQ compression
patch and tries to work for MPS boots as well as ACPI.  It is meant for
a 4-node IBM x460 NUMA box, which was dying because it had interrupt
pins with GSI numbers > NR_IRQS and thus overflowed irq_desc.

The problem is that this system has 270 GSIs (which are 1:1 mapped with
I/O APIC RTEs) and an 8-node box would have 540.  This is much bigger
than NR_IRQS (224 for both i386 and x86_64).  Also, there aren't enough
vectors to go around.  There are about 190 usable vectors, not counting
the reserved ones and the unused vectors at 0x20 to 0x2F.  So, my patch
attempts to compress the GSI range and share vectors by sharing IRQs.

Cc: "Protasevich, Natalie" <Natalie.Protasevich@unisys.com>

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index b66c13c..82754bb 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -39,17 +39,14 @@
 
 #ifdef	CONFIG_X86_64
 
-static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-}
 extern void __init clustered_apic_check(void);
-static inline int ioapic_setup_disabled(void)
-{
-	return 0;
-}
 
+extern int gsi_irq_sharing(int gsi);
 #include <asm/proto.h>
 
+static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
+
+
 #else				/* X86 */
 
 #ifdef	CONFIG_X86_LOCAL_APIC
@@ -57,6 +54,8 @@
 #include <mach_mpparse.h>
 #endif				/* CONFIG_X86_LOCAL_APIC */
 
+static inline int gsi_irq_sharing(int gsi) { return gsi; }
+
 #endif				/* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (					    \
@@ -459,7 +458,7 @@
 		*irq = IO_APIC_VECTOR(gsi);
 	else
 #endif
-		*irq = gsi;
+		*irq = gsi_irq_sharing(gsi);
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c8eee20..97154ab 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -57,7 +57,7 @@
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -85,6 +85,7 @@
 	int pin;							\
 	struct irq_pin_list *entry = irq_2_pin + irq;			\
 									\
+	BUG_ON(irq >= NR_IRQS);						\
 	for (;;) {							\
 		unsigned int reg;					\
 		pin = entry->pin;					\
@@ -127,6 +128,8 @@
 }
 #endif
 
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -137,6 +140,7 @@
 	static int first_free_entry = NR_IRQS;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 
+	BUG_ON(irq >= NR_IRQS);
 	while (entry->next)
 		entry = irq_2_pin + entry->next;
 
@@ -144,7 +148,7 @@
 		entry->next = first_free_entry;
 		entry = irq_2_pin + entry->next;
 		if (++first_free_entry >= PIN_MAP_SIZE)
-			panic("io_apic.c: whoops");
+			panic("io_apic.c: ran out of irq_2_pin entries!");
 	}
 	entry->apic = apic;
 	entry->pin = pin;
@@ -420,6 +424,7 @@
 				best_guess = irq;
 		}
 	}
+	BUG_ON(best_guess >= NR_IRQS);
 	return best_guess;
 }
 
@@ -610,6 +615,64 @@
 	return MPBIOS_trigger(idx);
 }
 
+static int next_irq = 16;
+
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+
+	BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+	if (platform_legacy_irq(gsi))
+		return gsi;
+
+	if (gsi_2_irq[gsi] != 0xFF)
+		return (int)gsi_2_irq[gsi];
+
+	tries = NR_IRQS;
+  try_again:
+	vector = assign_irq_vector(gsi);
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
+	 * use of vector and if found, return that IRQ.  However, we never want
+	 * to share legacy IRQs, which usually have a different trigger mode
+	 * than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+	if (platform_legacy_irq(i)) {
+		if (--tries >= 0) {
+			IO_APIC_VECTOR(i) = 0;
+			goto try_again;
+		}
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	if (i < NR_IRQS) {
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	i = next_irq++;
+	BUG_ON(i >= NR_IRQS);
+	gsi_2_irq[gsi] = i;
+	IO_APIC_VECTOR(i) = vector;
+	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, i);
+	return i;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
@@ -639,6 +702,7 @@
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 			break;
 		}
 		default:
@@ -648,6 +712,7 @@
 			break;
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 
 	/*
 	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -663,6 +728,7 @@
 			}
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 	return irq;
 }
 
@@ -690,8 +756,8 @@
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
-	if (IO_APIC_VECTOR(irq) > 0)
+	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
 next:
 	current_vector += 8;
@@ -699,9 +765,8 @@
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
@@ -1917,6 +1982,7 @@
 	entry.polarity = active_high_low;
 	entry.mask = 1;					 /* Disabled (masked) */
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index f16d38d..8f6958e 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -218,7 +218,7 @@
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+	if (++mp_irq_entries >= MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
diff --git a/include/asm-x86_64/desc.h b/include/asm-x86_64/desc.h
index 68ac3c6..1a3d380 100644
--- a/include/asm-x86_64/desc.h
+++ b/include/asm-x86_64/desc.h
@@ -98,16 +98,19 @@
 
 static inline void set_intr_gate(int nr, void *func) 
 { 
+	BUG_ON((unsigned)nr > 0xFF);
 	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 
 } 
 
 static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 
 { 
+	BUG_ON((unsigned)nr > 0xFF);
 	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 
 } 
 
 static inline void set_system_gate(int nr, void *func) 
 { 
+	BUG_ON((unsigned)nr > 0xFF);
 	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 
 } 
 
diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h
index f267e10..6b37538 100644
--- a/include/asm-x86_64/mpspec.h
+++ b/include/asm-x86_64/mpspec.h
@@ -157,7 +157,8 @@
  */
 
 #define MAX_MP_BUSSES 256
-#define MAX_IRQ_SOURCES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 enum mp_bustype {
 	MP_BUS_ISA = 1,
 	MP_BUS_EISA,