Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jbarnes/pci-2.6
* 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jbarnes/pci-2.6: (41 commits)
PCI: fix pci_ioremap_bar() on s390
PCI: fix AER capability check
PCI: use pci_find_ext_capability everywhere
PCI: remove #ifdef DEBUG around dev_dbg call
PCI hotplug: fix get_##name return value problem
PCI: document the pcie_aspm kernel parameter
PCI: introduce a pci_ioremap(pdev, barnr) function
powerpc/PCI: Add legacy PCI access via sysfs
PCI: Add ability to mmap legacy_io on some platforms
PCI: probing debug message uniformization
PCI: support PCIe ARI capability
PCI: centralize the capabilities code in probe.c
PCI: centralize the capabilities code in pci-sysfs.c
PCI: fix 64-bit prefetchable memory resource BARs
PCI: replace cfg space size (256/4096) by macros.
PCI: use resource_size() everywhere.
PCI: use same arg names in PCI_VDEVICE comment
PCI hotplug: rpaphp: make debug var unique
PCI: use %pF instead of print_fn_descriptor_symbol() in quirks.c
PCI: fix hotplug get_##name return value problem
...
diff --git a/CREDITS b/CREDITS
index c62dcb3..2358846 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1653,14 +1653,14 @@
S: USA
N: Dave Jones
-E: davej@codemonkey.org.uk
+E: davej@redhat.com
W: http://www.codemonkey.org.uk
-D: x86 errata/setup maintenance.
-D: AGPGART driver.
+D: Assorted VIA x86 support.
+D: 2.5 AGPGART overhaul.
D: CPUFREQ maintenance.
-D: Backport/Forwardport merge monkey.
-D: Various Janitor work.
-S: United Kingdom
+D: Fedora kernel maintenance.
+D: Misc/Other.
+S: 314 Littleton Rd, Westford, MA 01886, USA
N: Martin Josfsson
E: gandalf@wlug.westbo.se
diff --git a/Documentation/markers.txt b/Documentation/markers.txt
index d9f50a1..089f613 100644
--- a/Documentation/markers.txt
+++ b/Documentation/markers.txt
@@ -50,10 +50,12 @@
to call) for the specific marker through marker_probe_register() and can be
activated by calling marker_arm(). Marker deactivation can be done by calling
marker_disarm() as many times as marker_arm() has been called. Removing a probe
-is done through marker_probe_unregister(); it will disarm the probe and make
-sure there is no caller left using the probe when it returns. Probe removal is
-preempt-safe because preemption is disabled around the probe call. See the
-"Probe example" section below for a sample probe module.
+is done through marker_probe_unregister(); it will disarm the probe.
+marker_synchronize_unregister() must be called before the end of the module exit
+function to make sure there is no caller left using the probe. This, and the
+fact that preemption is disabled around the probe call, make sure that probe
+removal and module unload are safe. See the "Probe example" section below for a
+sample probe module.
The marker mechanism supports inserting multiple instances of the same marker.
Markers can be put in inline functions, inlined static functions, and
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 49378a9..10a0263 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -95,8 +95,9 @@
'p' - Will dump the current registers and flags to your console.
-'q' - Will dump a list of all running hrtimers.
- WARNING: Does not cover any other timers
+'q' - Will dump per CPU lists of all armed hrtimers (but NOT regular
+ timer_list timers) and detailed information about all
+ clockevent devices.
'r' - Turns off keyboard raw mode and sets it to XLATE.
diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt
new file mode 100644
index 0000000..5d354e1
--- /dev/null
+++ b/Documentation/tracepoints.txt
@@ -0,0 +1,101 @@
+ Using the Linux Kernel Tracepoints
+
+ Mathieu Desnoyers
+
+
+This document introduces Linux Kernel Tracepoints and their use. It provides
+examples of how to insert tracepoints in the kernel and connect probe functions
+to them and provides some examples of probe functions.
+
+
+* Purpose of tracepoints
+
+A tracepoint placed in code provides a hook to call a function (probe) that you
+can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or
+"off" (no probe is attached). When a tracepoint is "off" it has no effect,
+except for adding a tiny time penalty (checking a condition for a branch) and
+space penalty (adding a few bytes for the function call at the end of the
+instrumented function and adding a data structure in a separate section). When a
+tracepoint is "on", the function you provide is called each time the tracepoint
+is executed, in the execution context of the caller. When the function provided
+ends its execution, it returns to the caller (continuing from the tracepoint
+site).
+
+You can put tracepoints at important locations in the code. They are
+lightweight hooks that can pass an arbitrary number of parameters,
+whose prototypes are described in a tracepoint declaration placed in a header
+file.
+
+They can be used for tracing and performance accounting.
+
+
+* Usage
+
+Two elements are required for tracepoints :
+
+- A tracepoint definition, placed in a header file.
+- The tracepoint statement, in C code.
+
+In order to use tracepoints, you should include linux/tracepoint.h.
+
+In include/trace/subsys.h :
+
+#include <linux/tracepoint.h>
+
+DEFINE_TRACE(subsys_eventname,
+ TPPROTO(int firstarg, struct task_struct *p),
+ TPARGS(firstarg, p));
+
+In subsys/file.c (where the tracing statement must be added) :
+
+#include <trace/subsys.h>
+
+void somefct(void)
+{
+ ...
+ trace_subsys_eventname(arg, task);
+ ...
+}
+
+Where :
+- subsys_eventname is an identifier unique to your event
+ - subsys is the name of your subsystem.
+ - eventname is the name of the event to trace.
+- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the function
+ called by this tracepoint.
+- TPARGS(firstarg, p) are the parameter names, same as found in the prototype.
+
+Connecting a function (probe) to a tracepoint is done by providing a probe
+(function to call) for the specific tracepoint through
+register_trace_subsys_eventname(). Removing a probe is done through
+unregister_trace_subsys_eventname(); it will remove the probe and make sure
+there is no caller left using the probe when it returns. Probe removal is
+preempt-safe because preemption is disabled around the probe call. See the
+"Probe example" section below for a sample probe module.
+
+The tracepoint mechanism supports inserting multiple instances of the same
+tracepoint, but a single definition must be made of a given tracepoint name over
+all the kernel to make sure no type conflict will occur. Name mangling of the
+tracepoints is done using the prototypes to make sure typing is correct.
+Verification of probe type correctness is done at the registration site by the
+compiler. Tracepoints can be put in inline functions, inlined static functions,
+and unrolled loops as well as regular functions.
+
+The naming scheme "subsys_event" is suggested here as a convention intended
+to limit collisions. Tracepoint names are global to the kernel: they are
+considered as being the same whether they are in the core kernel image or in
+modules.
+
+
+* Probe / tracepoint example
+
+See the example provided in samples/tracepoints/src
+
+Compile them with your kernel.
+
+Run, as root :
+modprobe tracepoint-example (insmod order is not important)
+modprobe tracepoint-probe-example
+cat /proc/tracepoint-example (returns an expected error)
+rmmod tracepoint-example tracepoint-probe-example
+dmesg
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt
index a4afb56..5bbbe20 100644
--- a/Documentation/tracers/mmiotrace.txt
+++ b/Documentation/tracers/mmiotrace.txt
@@ -36,7 +36,7 @@
$ echo mmiotrace > /debug/tracing/current_tracer
$ cat /debug/tracing/trace_pipe > mydump.txt &
Start X or whatever.
-$ echo "X is up" > /debug/tracing/marker
+$ echo "X is up" > /debug/tracing/trace_marker
$ echo none > /debug/tracing/current_tracer
Check for lost events.
@@ -59,9 +59,8 @@
Load the driver you want to trace and use it. Mmiotrace will only catch MMIO
accesses to areas that are ioremapped while mmiotrace is active.
-[Unimplemented feature:]
During tracing you can place comments (markers) into the trace by
-$ echo "X is up" > /debug/tracing/marker
+$ echo "X is up" > /debug/tracing/trace_marker
This makes it easier to see which part of the (huge) trace corresponds to
which action. It is recommended to place descriptive markers about what you
do.
diff --git a/MAINTAINERS b/MAINTAINERS
index 355c192..5c3f79c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1198,7 +1198,7 @@
CPU FREQUENCY DRIVERS
P: Dave Jones
-M: davej@codemonkey.org.uk
+M: davej@redhat.com
L: cpufreq@vger.kernel.org
W: http://www.codemonkey.org.uk/projects/cpufreq/
T: git kernel.org/pub/scm/linux/kernel/git/davej/cpufreq.git
diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c
index 99a7f19..a4555f4 100644
--- a/arch/alpha/kernel/sys_sable.c
+++ b/arch/alpha/kernel/sys_sable.c
@@ -47,7 +47,7 @@
static irq_swizzle_t *sable_lynx_irq_swizzle;
-static void sable_lynx_init_irq(int nr_irqs);
+static void sable_lynx_init_irq(int nr_of_irqs);
#if defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_SABLE)
@@ -530,11 +530,11 @@
}
static void __init
-sable_lynx_init_irq(int nr_irqs)
+sable_lynx_init_irq(int nr_of_irqs)
{
long i;
- for (i = 0; i < nr_irqs; ++i) {
+ for (i = 0; i < nr_of_irqs; ++i) {
irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
irq_desc[i].chip = &sable_lynx_irq_type;
}
diff --git a/arch/arm/mach-ixp2000/ixdp2x00.c b/arch/arm/mach-ixp2000/ixdp2x00.c
index b0653a8..3045130 100644
--- a/arch/arm/mach-ixp2000/ixdp2x00.c
+++ b/arch/arm/mach-ixp2000/ixdp2x00.c
@@ -143,7 +143,7 @@
.unmask = ixdp2x00_irq_unmask
};
-void __init ixdp2x00_init_irq(volatile unsigned long *stat_reg, volatile unsigned long *mask_reg, unsigned long nr_irqs)
+void __init ixdp2x00_init_irq(volatile unsigned long *stat_reg, volatile unsigned long *mask_reg, unsigned long nr_of_irqs)
{
unsigned int irq;
@@ -154,7 +154,7 @@
board_irq_stat = stat_reg;
board_irq_mask = mask_reg;
- board_irq_count = nr_irqs;
+ board_irq_count = nr_of_irqs;
*board_irq_mask = 0xffffffff;
diff --git a/arch/arm/mach-omap2/irq.c b/arch/arm/mach-omap2/irq.c
index d354e0f..c40fc37 100644
--- a/arch/arm/mach-omap2/irq.c
+++ b/arch/arm/mach-omap2/irq.c
@@ -119,7 +119,7 @@
void __init omap_init_irq(void)
{
- unsigned long nr_irqs = 0;
+ unsigned long nr_of_irqs = 0;
unsigned int nr_banks = 0;
int i;
@@ -133,14 +133,14 @@
omap_irq_bank_init_one(bank);
- nr_irqs += bank->nr_irqs;
+ nr_of_irqs += bank->nr_irqs;
nr_banks++;
}
printk(KERN_INFO "Total of %ld interrupts on %d active controller%s\n",
- nr_irqs, nr_banks, nr_banks > 1 ? "s" : "");
+ nr_of_irqs, nr_banks, nr_banks > 1 ? "s" : "");
- for (i = 0; i < nr_irqs; i++) {
+ for (i = 0; i < nr_of_irqs; i++) {
set_irq_chip(i, &omap_irq_chip);
set_irq_handler(i, handle_level_irq);
set_irq_flags(i, IRQF_VALID);
diff --git a/arch/arm/mach-sa1100/include/mach/ide.h b/arch/arm/mach-sa1100/include/mach/ide.h
deleted file mode 100644
index 4c99c8f..0000000
--- a/arch/arm/mach-sa1100/include/mach/ide.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * arch/arm/mach-sa1100/include/mach/ide.h
- *
- * Copyright (c) 1998 Hugo Fiennes & Nicolas Pitre
- *
- * 18-aug-2000: Cleanup by Erik Mouw (J.A.K.Mouw@its.tudelft.nl)
- * Get rid of the special ide_init_hwif_ports() functions
- * and make a generalised function that can be used by all
- * architectures.
- */
-
-#include <asm/irq.h>
-#include <mach/hardware.h>
-#include <asm/mach-types.h>
-
-#error "This code is broken and needs update to match with current ide support"
-
-
-/*
- * Set up a hw structure for a specified data port, control port and IRQ.
- * This should follow whatever the default interface uses.
- */
-static inline void ide_init_hwif_ports(hw_regs_t *hw, unsigned long data_port,
- unsigned long ctrl_port, int *irq)
-{
- unsigned long reg = data_port;
- int i;
- int regincr = 1;
-
- /* The Empeg board has the first two address lines unused */
- if (machine_is_empeg())
- regincr = 1 << 2;
-
- /* The LART doesn't use A0 for IDE */
- if (machine_is_lart())
- regincr = 1 << 1;
-
- memset(hw, 0, sizeof(*hw));
-
- for (i = 0; i <= 7; i++) {
- hw->io_ports_array[i] = reg;
- reg += regincr;
- }
-
- hw->io_ports.ctl_addr = ctrl_port;
-
- if (irq)
- *irq = 0;
-}
-
-/*
- * This registers the standard ports for this architecture with the IDE
- * driver.
- */
-static __inline__ void
-ide_init_default_hwifs(void)
-{
- if (machine_is_lart()) {
-#ifdef CONFIG_SA1100_LART
- hw_regs_t hw;
-
- /* Enable GPIO as interrupt line */
- GPDR &= ~LART_GPIO_IDE;
- set_irq_type(LART_IRQ_IDE, IRQ_TYPE_EDGE_RISING);
-
- /* set PCMCIA interface timing */
- MECR = 0x00060006;
-
- /* init the interface */
- ide_init_hwif_ports(&hw, PCMCIA_IO_0_BASE + 0x0000, PCMCIA_IO_0_BASE + 0x1000, NULL);
- hw.irq = LART_IRQ_IDE;
- ide_register_hw(&hw);
-#endif
- }
-}
diff --git a/arch/avr32/mach-at32ap/extint.c b/arch/avr32/mach-at32ap/extint.c
index c36a6d5..310477b 100644
--- a/arch/avr32/mach-at32ap/extint.c
+++ b/arch/avr32/mach-at32ap/extint.c
@@ -191,7 +191,7 @@
struct eic *eic;
struct resource *regs;
unsigned int i;
- unsigned int nr_irqs;
+ unsigned int nr_of_irqs;
unsigned int int_irq;
int ret;
u32 pattern;
@@ -224,7 +224,7 @@
eic_writel(eic, IDR, ~0UL);
eic_writel(eic, MODE, ~0UL);
pattern = eic_readl(eic, MODE);
- nr_irqs = fls(pattern);
+ nr_of_irqs = fls(pattern);
/* Trigger on low level unless overridden by driver */
eic_writel(eic, EDGE, 0UL);
@@ -232,7 +232,7 @@
eic->chip = &eic_chip;
- for (i = 0; i < nr_irqs; i++) {
+ for (i = 0; i < nr_of_irqs; i++) {
set_irq_chip_and_handler(eic->first_irq + i, &eic_chip,
handle_level_irq);
set_irq_chip_data(eic->first_irq + i, eic);
@@ -256,7 +256,7 @@
eic->regs, int_irq);
dev_info(&pdev->dev,
"Handling %u external IRQs, starting with IRQ %u\n",
- nr_irqs, eic->first_irq);
+ nr_of_irqs, eic->first_irq);
return 0;
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index fc29948..39cb6da 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -40,6 +40,7 @@
*/
#include <linux/module.h>
+#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 64e1445..5ac51e6 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -10,9 +10,13 @@
* 2 of the License, or (at your option) any later version.
*/
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#else
+#include <asm/types.h>
+#endif
#include <asm/asm-compat.h>
#include <asm/kdump.h>
-#include <asm/types.h>
/*
* On PPC32 page size is 4K. For PPC64 we support either 4K or 64K software
diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.c b/arch/powerpc/platforms/cell/spufs/sputrace.c
index 92d20e9..2ece399 100644
--- a/arch/powerpc/platforms/cell/spufs/sputrace.c
+++ b/arch/powerpc/platforms/cell/spufs/sputrace.c
@@ -232,6 +232,7 @@
remove_proc_entry("sputrace", NULL);
kfree(sputrace_log);
+ marker_synchronize_unregister();
}
module_init(sputrace_init);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 49349ba..5b9b123 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,6 +26,7 @@
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
select HAVE_KRETPROBES
+ select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
@@ -1242,14 +1243,6 @@
resultant kernel should continue to boot on existing non-EFI
platforms.
-config IRQBALANCE
- def_bool y
- prompt "Enable kernel irq balancing"
- depends on X86_32 && SMP && X86_IO_APIC
- help
- The default yes will allow the kernel to do irq load balancing.
- Saying no will keep the kernel from doing irq load balancing.
-
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 52d0359..13b8c86 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -287,7 +287,6 @@
# CONFIG_MTRR_SANITIZER is not set
CONFIG_X86_PAT=y
CONFIG_EFI=y
-# CONFIG_IRQBALANCE is not set
CONFIG_SECCOMP=y
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0d41f03..d7e5a58 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,7 +23,7 @@
CFLAGS_tsc.o := $(nostackp)
obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
-obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o
obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
@@ -60,8 +60,8 @@
obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
@@ -108,7 +108,7 @@
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
- obj-y += bios_uv.o
+ obj-y += bios_uv.o uv_irq.o uv_sysfs.o
obj-y += genx2apic_cluster.o
obj-y += genx2apic_phys.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index eb875cd..0d1c26a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1256,7 +1256,7 @@
count =
acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
- NR_IRQ_VECTORS);
+ nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing interrupt source overrides entry\n");
@@ -1276,7 +1276,7 @@
count =
acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
- NR_IRQ_VECTORS);
+ nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 426e5d9..c44cd6d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -10,6 +10,7 @@
#include <linux/dmi.h>
#include <linux/cpumask.h>
#include <asm/segment.h>
+#include <asm/desc.h>
#include "realmode/wakeup.h"
#include "sleep.h"
@@ -98,6 +99,8 @@
header->trampoline_segment = setup_trampoline() >> 4;
#ifdef CONFIG_SMP
stack_start.sp = temp_stack + 4096;
+ early_gdt_descr.address =
+ (unsigned long)get_cpu_gdt_table(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
similarity index 78%
rename from arch/x86/kernel/apic_32.c
rename to arch/x86/kernel/apic.c
index 21c831d..04a7f96 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,13 @@
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
+#include <linux/ioport.h>
#include <linux/cpu.h>
#include <linux/clockchips.h>
#include <linux/acpi_pmtmr.h>
#include <linux/module.h>
#include <linux/dmi.h>
+#include <linux/dmar.h>
#include <asm/atomic.h>
#include <asm/smp.h>
@@ -36,8 +38,14 @@
#include <asm/desc.h>
#include <asm/arch_hooks.h>
#include <asm/hpet.h>
+#include <asm/pgalloc.h>
#include <asm/i8253.h>
#include <asm/nmi.h>
+#include <asm/idle.h>
+#include <asm/proto.h>
+#include <asm/timex.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
#include <mach_apic.h>
#include <mach_apicdef.h>
@@ -50,16 +58,58 @@
# error SPURIOUS_APIC_VECTOR definition error
#endif
-unsigned long mp_lapic_addr;
-
+#ifdef CONFIG_X86_32
/*
* Knob to control our willingness to enable the local APIC.
*
* +1=force-enable
*/
static int force_enable_local_apic;
-int disable_apic;
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ force_enable_local_apic = 1;
+ return 0;
+}
+early_param("lapic", parse_lapic);
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+#endif
+
+#ifdef CONFIG_X86_64
+static int apic_calibrate_pmtmr __initdata;
+static __init int setup_apicpmtimer(char *s)
+{
+ apic_calibrate_pmtmr = 1;
+ notsc_setup(NULL);
+ return 0;
+}
+__setup("apicpmtimer", setup_apicpmtimer);
+#endif
+
+#ifdef CONFIG_X86_64
+#define HAVE_X2APIC
+#endif
+
+#ifdef HAVE_X2APIC
+int x2apic;
+/* x2apic enabled before OS handover */
+int x2apic_preenabled;
+int disable_x2apic;
+static __init int setup_nox2apic(char *str)
+{
+ disable_x2apic = 1;
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ return 0;
+}
+early_param("nox2apic", setup_nox2apic);
+#endif
+
+unsigned long mp_lapic_addr;
+int disable_apic;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
static int disable_apic_timer __cpuinitdata;
/* Local APIC timer works in C2 */
@@ -110,9 +160,6 @@
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
-
static unsigned long apic_phys;
/*
@@ -202,6 +249,42 @@
struct apic_ops __read_mostly *apic_ops = &xapic_ops;
EXPORT_SYMBOL_GPL(apic_ops);
+#ifdef HAVE_X2APIC
+static void x2apic_wait_icr_idle(void)
+{
+ /* no need to wait for icr idle in x2apic */
+ return;
+}
+
+static u32 safe_x2apic_wait_icr_idle(void)
+{
+ /* no need to wait for icr idle in x2apic */
+ return 0;
+}
+
+void x2apic_icr_write(u32 low, u32 id)
+{
+ wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
+}
+
+u64 x2apic_icr_read(void)
+{
+ unsigned long val;
+
+ rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
+ return val;
+}
+
+static struct apic_ops x2apic_ops = {
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = x2apic_icr_read,
+ .icr_write = x2apic_icr_write,
+ .wait_icr_idle = x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
+};
+#endif
+
/**
* enable_NMI_through_LVT0 - enable NMI through local vector table 0
*/
@@ -219,6 +302,7 @@
apic_write(APIC_LVT0, v);
}
+#ifdef CONFIG_X86_32
/**
* get_physical_broadcast - Get number of physical broadcast IDs
*/
@@ -226,6 +310,7 @@
{
return modern_apic() ? 0xff : 0xf;
}
+#endif
/**
* lapic_get_maxlvt - get the maximum number of local vector table entries
@@ -247,11 +332,7 @@
*/
/* Clock divisor */
-#ifdef CONFG_X86_64
-#define APIC_DIVISOR 1
-#else
#define APIC_DIVISOR 16
-#endif
/*
* This function sets up the local APIC timer, with a timeout of
@@ -383,7 +464,7 @@
* Setup the local APIC timer for this CPU. Copy the initilized values
* of the boot CPU and register the clock event in the framework.
*/
-static void __devinit setup_APIC_timer(void)
+static void __cpuinit setup_APIC_timer(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
@@ -453,14 +534,51 @@
}
}
+static int __init calibrate_by_pmtimer(long deltapm, long *delta)
+{
+ const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
+ const long pm_thresh = pm_100ms / 100;
+ unsigned long mult;
+ u64 res;
+
+#ifndef CONFIG_X86_PM_TIMER
+ return -1;
+#endif
+
+ apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+ /* Check, if the PM timer is available */
+ if (!deltapm)
+ return -1;
+
+ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+ if (deltapm > (pm_100ms - pm_thresh) &&
+ deltapm < (pm_100ms + pm_thresh)) {
+ apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+ } else {
+ res = (((u64)deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ printk(KERN_WARNING "APIC calibration not consistent "
+ "with PM Timer: %ldms instead of 100ms\n",
+ (long)res);
+ /* Correct the lapic counter value */
+ res = (((u64)(*delta)) * pm_100ms);
+ do_div(res, deltapm);
+ printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long)res, *delta);
+ *delta = (long)res;
+ }
+
+ return 0;
+}
+
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
- const long pm_thresh = pm_100ms/100;
void (*real_handler)(struct clock_event_device *dev);
unsigned long deltaj;
- long delta, deltapm;
+ long delta;
int pm_referenced = 0;
local_irq_disable();
@@ -470,10 +588,10 @@
global_clock_event->event_handler = lapic_cal_handler;
/*
- * Setup the APIC counter to 1e9. There is no way the lapic
+ * Setup the APIC counter to maximum. There is no way the lapic
* can underflow in the 100ms detection time frame
*/
- __setup_APIC_LVTT(1000000000, 0, 0);
+ __setup_APIC_LVTT(0xffffffff, 0, 0);
/* Let the interrupts run */
local_irq_enable();
@@ -490,34 +608,9 @@
delta = lapic_cal_t1 - lapic_cal_t2;
apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
- /* Check, if the PM timer is available */
- deltapm = lapic_cal_pm2 - lapic_cal_pm1;
- apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
-
- if (deltapm) {
- unsigned long mult;
- u64 res;
-
- mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
-
- if (deltapm > (pm_100ms - pm_thresh) &&
- deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
- } else {
- res = (((u64) deltapm) * mult) >> 22;
- do_div(res, 1000000);
- printk(KERN_WARNING "APIC calibration not consistent "
- "with PM Timer: %ldms instead of 100ms\n",
- (long)res);
- /* Correct the lapic counter value */
- res = (((u64) delta) * pm_100ms);
- do_div(res, deltapm);
- printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
- "%lu (%ld)\n", (unsigned long) res, delta);
- delta = (long) res;
- }
- pm_referenced = 1;
- }
+ /* we trust the PM based calibration if possible */
+ pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
+ &delta);
/* Calculate the scaled math multiplication factor */
lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -559,7 +652,10 @@
levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
- /* We trust the pm timer based calibration */
+ /*
+ * PM timer calibration failed or not turned on
+ * so lets try APIC timer based calibration
+ */
if (!pm_referenced) {
apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -652,7 +748,7 @@
setup_APIC_timer();
}
-void __devinit setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
}
@@ -718,6 +814,9 @@
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
+#ifdef CONFIG_X86_64
+ exit_idle();
+#endif
irq_enter();
local_apic_timer_interrupt();
irq_exit();
@@ -991,40 +1090,43 @@
static void __cpuinit lapic_setup_esr(void)
{
- unsigned long oldvalue, value, maxlvt;
- if (lapic_is_integrated() && !esr_disable) {
- if (esr_disable) {
- /*
- * Something untraceable is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk(KERN_INFO "Leaving ESR disabled.\n");
- return;
- }
- /* !82489DX */
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- oldvalue = apic_read(APIC_ESR);
+ unsigned int oldvalue, value, maxlvt;
- /* enables sending errors */
- value = ERROR_APIC_VECTOR;
- apic_write(APIC_LVTERR, value);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
- value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08lx after: 0x%08lx\n",
- oldvalue, value);
- } else {
+ if (!lapic_is_integrated()) {
printk(KERN_INFO "No ESR for 82489DX.\n");
+ return;
}
+
+ if (esr_disable) {
+ /*
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ printk(KERN_INFO "Leaving ESR disabled.\n");
+ return;
+ }
+
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
+ apic_write(APIC_LVTERR, value);
+
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
}
@@ -1033,24 +1135,27 @@
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned long value, integrated;
+ unsigned int value;
int i, j;
+#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
- if (esr_disable) {
+ if (lapic_is_integrated() && esr_disable) {
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
}
+#endif
- integrated = lapic_is_integrated();
+ preempt_disable();
/*
* Double-check whether this APIC is really registered.
+ * This is meaningless in clustered apic mode, so we skip it.
*/
if (!apic_id_registered())
- WARN_ON_ONCE(1);
+ BUG();
/*
* Intel recommends to set DFR, LDR and TPR before enabling
@@ -1096,6 +1201,7 @@
*/
value |= APIC_SPIV_APIC_ENABLED;
+#ifdef CONFIG_X86_32
/*
* Some unknown Intel IO/APIC (or APIC) errata is biting us with
* certain networking cards. If high frequency interrupts are
@@ -1116,8 +1222,13 @@
* See also the comment in end_level_ioapic_irq(). --macro
*/
- /* Enable focus processor (bit==0) */
+ /*
+ * - enable focus processor (bit==0)
+ * - 64bit mode always use processor focus
+ * so no need to set it
+ */
value &= ~APIC_SPIV_FOCUS_DISABLED;
+#endif
/*
* Set spurious IRQ vector
@@ -1154,9 +1265,11 @@
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!integrated) /* 82489DX */
+ if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write(APIC_LVT1, value);
+
+ preempt_enable();
}
void __cpuinit end_local_APIC_setup(void)
@@ -1177,6 +1290,153 @@
apic_pm_activate();
}
+#ifdef HAVE_X2APIC
+void check_x2apic(void)
+{
+ int msr, msr2;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+
+ if (msr & X2APIC_ENABLE) {
+ printk("x2apic enabled by BIOS, switching to x2apic ops\n");
+ x2apic_preenabled = x2apic = 1;
+ apic_ops = &x2apic_ops;
+ }
+}
+
+void enable_x2apic(void)
+{
+ int msr, msr2;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+ if (!(msr & X2APIC_ENABLE)) {
+ printk("Enabling x2apic\n");
+ wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+ }
+}
+
+void enable_IR_x2apic(void)
+{
+#ifdef CONFIG_INTR_REMAP
+ int ret;
+ unsigned long flags;
+
+ if (!cpu_has_x2apic)
+ return;
+
+ if (!x2apic_preenabled && disable_x2apic) {
+ printk(KERN_INFO
+ "Skipped enabling x2apic and Interrupt-remapping "
+ "because of nox2apic\n");
+ return;
+ }
+
+ if (x2apic_preenabled && disable_x2apic)
+ panic("Bios already enabled x2apic, can't enforce nox2apic");
+
+ if (!x2apic_preenabled && skip_ioapic_setup) {
+ printk(KERN_INFO
+ "Skipped enabling x2apic and Interrupt-remapping "
+ "because of skipping io-apic setup\n");
+ return;
+ }
+
+ ret = dmar_table_init();
+ if (ret) {
+ printk(KERN_INFO
+ "dmar_table_init() failed with %d:\n", ret);
+
+ if (x2apic_preenabled)
+ panic("x2apic enabled by bios. But IR enabling failed");
+ else
+ printk(KERN_INFO
+ "Not enabling x2apic,Intr-remapping\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ mask_8259A();
+
+ ret = save_mask_IO_APIC_setup();
+ if (ret) {
+ printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
+ goto end;
+ }
+
+ ret = enable_intr_remapping(1);
+
+ if (ret && x2apic_preenabled) {
+ local_irq_restore(flags);
+ panic("x2apic enabled by bios. But IR enabling failed");
+ }
+
+ if (ret)
+ goto end_restore;
+
+ if (!x2apic) {
+ x2apic = 1;
+ apic_ops = &x2apic_ops;
+ enable_x2apic();
+ }
+
+end_restore:
+ if (ret)
+ /*
+ * IR enabling failed
+ */
+ restore_IO_APIC_setup();
+ else
+ reinit_intr_remapped_IO_APIC(x2apic_preenabled);
+
+end:
+ unmask_8259A();
+ local_irq_restore(flags);
+
+ if (!ret) {
+ if (!x2apic_preenabled)
+ printk(KERN_INFO
+ "Enabled x2apic and interrupt-remapping\n");
+ else
+ printk(KERN_INFO
+ "Enabled Interrupt-remapping\n");
+ } else
+ printk(KERN_ERR
+ "Failed to enable Interrupt-remapping and x2apic\n");
+#else
+ if (!cpu_has_x2apic)
+ return;
+
+ if (x2apic_preenabled)
+ panic("x2apic enabled prior OS handover,"
+ " enable CONFIG_INTR_REMAP");
+
+ printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
+ " and x2apic\n");
+#endif
+
+ return;
+}
+#endif /* HAVE_X2APIC */
+
+#ifdef CONFIG_X86_64
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.)
+ */
+static int __init detect_init_APIC(void)
+{
+ if (!cpu_has_apic) {
+ printk(KERN_INFO "No local APIC present\n");
+ return -1;
+ }
+
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ boot_cpu_physical_apicid = 0;
+ return 0;
+}
+#else
/*
* Detect and initialize APIC
*/
@@ -1255,12 +1515,46 @@
printk(KERN_INFO "No local APIC present or hardware disabled\n");
return -1;
}
+#endif
+
+#ifdef CONFIG_X86_64
+void __init early_init_lapic_mapping(void)
+{
+ unsigned long phys_addr;
+
+ /*
+ * If no SMP configuration was found then bail out:
+ * it means there is neither an MP table nor a MADT.
+ */
+ if (!smp_found_config)
+ return;
+
+ phys_addr = mp_lapic_addr;
+
+ set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ APIC_BASE, phys_addr);
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ boot_cpu_physical_apicid = read_apic_id();
+}
+#endif
/**
* init_apic_mappings - initialize APIC mappings
*/
void __init init_apic_mappings(void)
{
+#ifdef HAVE_X2APIC
+ if (x2apic) {
+ boot_cpu_physical_apicid = read_apic_id();
+ return;
+ }
+#endif
+
/*
* If no local APIC can be found then set up a fake all
* zeroes page to simulate the local APIC and another
@@ -1273,8 +1567,8 @@
apic_phys = mp_lapic_addr;
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
- apic_phys);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+ APIC_BASE, apic_phys);
/*
* Fetch the APIC ID of the BSP in case we have a
@@ -1282,18 +1576,27 @@
*/
if (boot_cpu_physical_apicid == -1U)
boot_cpu_physical_apicid = read_apic_id();
-
}
/*
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
-
int apic_version[MAX_APICS];
int __init APIC_init_uniprocessor(void)
{
+#ifdef CONFIG_X86_64
+ if (disable_apic) {
+ printk(KERN_INFO "Apic disabled\n");
+ return -1;
+ }
+ if (!cpu_has_apic) {
+ disable_apic = 1;
+ printk(KERN_INFO "Apic disabled by BIOS\n");
+ return -1;
+ }
+#else
if (!smp_found_config && !cpu_has_apic)
return -1;
@@ -1302,39 +1605,68 @@
*/
if (!cpu_has_apic &&
APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+ printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
return -1;
}
+#endif
+
+#ifdef HAVE_X2APIC
+ enable_IR_x2apic();
+#endif
+#ifdef CONFIG_X86_64
+ setup_apic_routing();
+#endif
verify_local_APIC();
-
connect_bsp_APIC();
+#ifdef CONFIG_X86_64
+ apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+#else
/*
* Hack: In case of kdump, after a crash, kernel might be booting
* on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
* might be zero if read from MP tables. Get it from LAPIC.
*/
-#ifdef CONFIG_CRASH_DUMP
+# ifdef CONFIG_CRASH_DUMP
boot_cpu_physical_apicid = read_apic_id();
+# endif
#endif
physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
-
setup_local_APIC();
+#ifdef CONFIG_X86_64
+ /*
+ * Now enable IO-APICs, actually call clear_IO_APIC
+ * We need clear_IO_APIC before enabling vector on BP
+ */
+ if (!skip_ioapic_setup && nr_ioapics)
+ enable_IO_APIC();
+#endif
+
#ifdef CONFIG_X86_IO_APIC
if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
#endif
localise_nmi_watchdog();
end_local_APIC_setup();
+
#ifdef CONFIG_X86_IO_APIC
- if (smp_found_config)
- if (!skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
+ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+# ifdef CONFIG_X86_64
+ else
+ nr_ioapics = 0;
+# endif
#endif
+
+#ifdef CONFIG_X86_64
+ setup_boot_APIC_clock();
+ check_nmi_watchdog();
+#else
setup_boot_clock();
+#endif
return 0;
}
@@ -1348,8 +1680,11 @@
*/
void smp_spurious_interrupt(struct pt_regs *regs)
{
- unsigned long v;
+ u32 v;
+#ifdef CONFIG_X86_64
+ exit_idle();
+#endif
irq_enter();
/*
* Check if this really is a spurious interrupt and ACK it
@@ -1360,10 +1695,14 @@
if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
ack_APIC_irq();
+#ifdef CONFIG_X86_64
+ add_pda(irq_spurious_count, 1);
+#else
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
"should never happen.\n", smp_processor_id());
__get_cpu_var(irq_stat).irq_spurious_count++;
+#endif
irq_exit();
}
@@ -1372,8 +1711,11 @@
*/
void smp_error_interrupt(struct pt_regs *regs)
{
- unsigned long v, v1;
+ u32 v, v1;
+#ifdef CONFIG_X86_64
+ exit_idle();
+#endif
irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
@@ -1392,7 +1734,7 @@
6: Received illegal vector
7: Illegal register address
*/
- printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+ printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
smp_processor_id(), v , v1);
irq_exit();
}
@@ -1565,6 +1907,13 @@
cpu_set(cpu, cpu_present_map);
}
+#ifdef CONFIG_X86_64
+int hard_smp_processor_id(void)
+{
+ return read_apic_id();
+}
+#endif /* CONFIG_X86_64 */
+
/*
* Power management
*/
@@ -1640,7 +1989,7 @@
local_irq_save(flags);
-#ifdef CONFIG_X86_64
+#ifdef HAVE_X2APIC
if (x2apic)
enable_x2apic();
else
@@ -1702,7 +2051,7 @@
.cls = &lapic_sysclass,
};
-static void __devinit apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
@@ -1728,16 +2077,87 @@
#endif /* CONFIG_PM */
+#ifdef CONFIG_X86_64
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series.
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis. Use available data to take a good guess.
+ * If in doubt, go HPET.
+ * Returns 1 if the box looks clustered (TSC suspect), 0 otherwise.
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+ int i, clusters, zeros;
+ unsigned id;
+ u16 *bios_cpu_apicid;
+ DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+
+ /*
+ * There are no boxes of this kind with AMD CPUs yet. Some AMD
+ * boxes with quad-core CPUs and 8 sockets have APIC IDs in
+ * [4, 0x23] or [8, 0x27] and could be mistaken for clustered
+ * boxes, so skip them here; vSMP boxes still need checking.
+ */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
+ return 0;
+
+ bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
+ bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
+
+ for (i = 0; i < NR_CPUS; i++) {
+ /* are we being called early in kernel startup? */
+ if (bios_cpu_apicid) {
+ id = bios_cpu_apicid[i];
+ }
+ else if (i < nr_cpu_ids) {
+ if (cpu_present(i))
+ id = per_cpu(x86_bios_cpu_apicid, i);
+ else
+ continue;
+ }
+ else
+ break;
+
+ if (id != BAD_APICID)
+ __set_bit(APIC_CLUSTERID(id), clustermap);
+ }
+
+ /* Problem: Partially populated chassis may not have CPUs in some of
+ * the APIC clusters they have been allocated. Only present CPUs have
+ * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
+ * Since clusters are allocated sequentially, count zeros only if
+ * they are bounded by ones.
+ */
+ clusters = 0;
+ zeros = 0;
+ for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+ if (test_bit(i, clustermap)) {
+ clusters += 1 + zeros;
+ zeros = 0;
+ } else
+ ++zeros;
+ }
+
+ /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+ * not guaranteed to be synced between boards
+ */
+ if (is_vsmp_box() && clusters > 1)
+ return 1;
+
+ /*
+ * If clusters > 2, this should be a multi-chassis box.
+ * May have to revisit this when multi-core + hyperthreaded CPUs come
+ * out, but AFAIK this will work even for them.
+ */
+ return (clusters > 2);
+}
+#endif
+
/*
* APIC command line parameters
*/
-static int __init parse_lapic(char *arg)
-{
- force_enable_local_apic = 1;
- return 0;
-}
-early_param("lapic", parse_lapic);
-
static int __init setup_disableapic(char *arg)
{
disable_apic = 1;
@@ -1779,7 +2199,6 @@
if (!arg) {
#ifdef CONFIG_X86_64
skip_ioapic_setup = 0;
- ioapic_force = 1;
return 0;
#endif
return -EINVAL;
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
deleted file mode 100644
index 94ddb69..0000000
--- a/arch/x86/kernel/apic_64.c
+++ /dev/null
@@ -1,1848 +0,0 @@
-/*
- * Local APIC handling, local APIC timers
- *
- * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
- * thanks to Eric Gilmore
- * and Rolf G. Tews
- * for testing these extensively.
- * Maciej W. Rozycki : Various updates and fixes.
- * Mikael Pettersson : Power Management for UP-APIC.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/ioport.h>
-#include <linux/clockchips.h>
-#include <linux/acpi_pmtmr.h>
-#include <linux/module.h>
-#include <linux/dmar.h>
-
-#include <asm/atomic.h>
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/hpet.h>
-#include <asm/pgalloc.h>
-#include <asm/nmi.h>
-#include <asm/idle.h>
-#include <asm/proto.h>
-#include <asm/timex.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-#include <mach_ipi.h>
-#include <mach_apic.h>
-
-/* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int disable_apic_timer __cpuinitdata;
-static int apic_calibrate_pmtmr __initdata;
-int disable_apic;
-int disable_x2apic;
-int x2apic;
-
-/* x2apic enabled before OS handover */
-int x2apic_preenabled;
-
-/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
-EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-
-/*
- * Debug level, exported for io_apic.c
- */
-unsigned int apic_verbosity;
-
-/* Have we found an MP table */
-int smp_found_config;
-
-static struct resource lapic_resource = {
- .name = "Local APIC",
- .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
-};
-
-static unsigned int calibration_result;
-
-static int lapic_next_event(unsigned long delta,
- struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
-static void apic_pm_activate(void);
-
-/*
- * The local apic timer can be used for any function which is CPU local.
- */
-static struct clock_event_device lapic_clockevent = {
- .name = "lapic",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
- | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
- .shift = 32,
- .set_mode = lapic_timer_setup,
- .set_next_event = lapic_next_event,
- .broadcast = lapic_timer_broadcast,
- .rating = 100,
- .irq = -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
-static unsigned long apic_phys;
-
-unsigned long mp_lapic_addr;
-
-/*
- * Get the LAPIC version
- */
-static inline int lapic_get_version(void)
-{
- return GET_APIC_VERSION(apic_read(APIC_LVR));
-}
-
-/*
- * Check, if the APIC is integrated or a separate chip
- */
-static inline int lapic_is_integrated(void)
-{
-#ifdef CONFIG_X86_64
- return 1;
-#else
- return APIC_INTEGRATED(lapic_get_version());
-#endif
-}
-
-/*
- * Check, whether this is a modern or a first generation APIC
- */
-static int modern_apic(void)
-{
- /* AMD systems use old APIC versions, so check the CPU */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 >= 0xf)
- return 1;
- return lapic_get_version() >= 0x14;
-}
-
-/*
- * Paravirt kernels also might be using these below ops. So we still
- * use generic apic_read()/apic_write(), which might be pointing to different
- * ops in PARAVIRT case.
- */
-void xapic_wait_icr_idle(void)
-{
- while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
- cpu_relax();
-}
-
-u32 safe_xapic_wait_icr_idle(void)
-{
- u32 send_status;
- int timeout;
-
- timeout = 0;
- do {
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- if (!send_status)
- break;
- udelay(100);
- } while (timeout++ < 1000);
-
- return send_status;
-}
-
-void xapic_icr_write(u32 low, u32 id)
-{
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
- apic_write(APIC_ICR, low);
-}
-
-u64 xapic_icr_read(void)
-{
- u32 icr1, icr2;
-
- icr2 = apic_read(APIC_ICR2);
- icr1 = apic_read(APIC_ICR);
-
- return icr1 | ((u64)icr2 << 32);
-}
-
-static struct apic_ops xapic_ops = {
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = xapic_icr_read,
- .icr_write = xapic_icr_write,
- .wait_icr_idle = xapic_wait_icr_idle,
- .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
-};
-
-struct apic_ops __read_mostly *apic_ops = &xapic_ops;
-EXPORT_SYMBOL_GPL(apic_ops);
-
-static void x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return;
-}
-
-static u32 safe_x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return 0;
-}
-
-void x2apic_icr_write(u32 low, u32 id)
-{
- wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
-}
-
-u64 x2apic_icr_read(void)
-{
- unsigned long val;
-
- rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
- return val;
-}
-
-static struct apic_ops x2apic_ops = {
- .read = native_apic_msr_read,
- .write = native_apic_msr_write,
- .icr_read = x2apic_icr_read,
- .icr_write = x2apic_icr_write,
- .wait_icr_idle = x2apic_wait_icr_idle,
- .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
-};
-
-/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void __cpuinit enable_NMI_through_LVT0(void)
-{
- unsigned int v;
-
- /* unmask and set to NMI */
- v = APIC_DM_NMI;
-
- /* Level triggered for 82489DX (32bit mode) */
- if (!lapic_is_integrated())
- v |= APIC_LVT_LEVEL_TRIGGER;
-
- apic_write(APIC_LVT0, v);
-}
-
-/**
- * lapic_get_maxlvt - get the maximum number of local vector table entries
- */
-int lapic_get_maxlvt(void)
-{
- unsigned int v;
-
- v = apic_read(APIC_LVR);
- /*
- * - we always have APIC integrated on 64bit mode
- * - 82489DXs do not report # of LVT entries
- */
- return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
-}
-
-/*
- * Local APIC timer
- */
-
-/* Clock divisor */
-#ifdef CONFG_X86_64
-#define APIC_DIVISOR 1
-#else
-#define APIC_DIVISOR 16
-#endif
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
- */
-static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
-{
- unsigned int lvtt_value, tmp_value;
-
- lvtt_value = LOCAL_TIMER_VECTOR;
- if (!oneshot)
- lvtt_value |= APIC_LVT_TIMER_PERIODIC;
- if (!lapic_is_integrated())
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
-
- if (!irqen)
- lvtt_value |= APIC_LVT_MASKED;
-
- apic_write(APIC_LVTT, lvtt_value);
-
- /*
- * Divide PICLK by 16
- */
- tmp_value = apic_read(APIC_TDCR);
- apic_write(APIC_TDCR,
- (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
- APIC_TDR_DIV_16);
-
- if (!oneshot)
- apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
-}
-
-/*
- * Setup extended LVT, AMD specific (K8, family 10h)
- *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
- *
- * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
- */
-
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
-{
- unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
-
- apic_write(reg, v);
-}
-
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
-}
-
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
-}
-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
-
-/*
- * Program the next event, relative to now
- */
-static int lapic_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- apic_write(APIC_TMICT, delta);
- return 0;
-}
-
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- unsigned long flags;
- unsigned int v;
-
- /* Lapic used as dummy for broadcast ? */
- if (evt->features & CLOCK_EVT_FEAT_DUMMY)
- return;
-
- local_irq_save(flags);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- case CLOCK_EVT_MODE_ONESHOT:
- __setup_APIC_LVTT(calibration_result,
- mode != CLOCK_EVT_MODE_PERIODIC, 1);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- v = apic_read(APIC_LVTT);
- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, v);
- break;
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
-
- local_irq_restore(flags);
-}
-
-/*
- * Local APIC timer broadcast function
- */
-static void lapic_timer_broadcast(cpumask_t mask)
-{
-#ifdef CONFIG_SMP
- send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-#endif
-}
-
-/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
- * of the boot CPU and register the clock event in the framework.
- */
-static void setup_APIC_timer(void)
-{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
-
- memcpy(levt, &lapic_clockevent, sizeof(*levt));
- levt->cpumask = cpumask_of_cpu(smp_processor_id());
-
- clockevents_register_device(levt);
-}
-
-/*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
- */
-
-#define TICK_COUNT 100000000
-
-static int __init calibrate_APIC_clock(void)
-{
- unsigned apic, apic_start;
- unsigned long tsc, tsc_start;
- int result;
-
- local_irq_disable();
-
- /*
- * Put whatever arbitrary (but long enough) timeout
- * value into the APIC clock, we just want to get the
- * counter running for calibration.
- *
- * No interrupt enable !
- */
- __setup_APIC_LVTT(250000000, 0, 0);
-
- apic_start = apic_read(APIC_TMCCT);
-#ifdef CONFIG_X86_PM_TIMER
- if (apic_calibrate_pmtmr && pmtmr_ioport) {
- pmtimer_wait(5000); /* 5ms wait */
- apic = apic_read(APIC_TMCCT);
- result = (apic_start - apic) * 1000L / 5;
- } else
-#endif
- {
- rdtscll(tsc_start);
-
- do {
- apic = apic_read(APIC_TMCCT);
- rdtscll(tsc);
- } while ((tsc - tsc_start) < TICK_COUNT &&
- (apic_start - apic) < TICK_COUNT);
-
- result = (apic_start - apic) * 1000L * tsc_khz /
- (tsc - tsc_start);
- }
-
- local_irq_enable();
-
- printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
-
- printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
- result / 1000 / 1000, result / 1000 % 1000);
-
- /* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
- lapic_clockevent.shift);
- lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
- lapic_clockevent.min_delta_ns =
- clockevent_delta2ns(0xF, &lapic_clockevent);
-
- calibration_result = (result * APIC_DIVISOR) / HZ;
-
- /*
- * Do a sanity check on the APIC calibration result
- */
- if (calibration_result < (1000000 / HZ)) {
- printk(KERN_WARNING
- "APIC frequency too slow, disabling apic timer\n");
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
-{
- /*
- * The local apic timer can be disabled via the kernel
- * commandline or from the CPU detection code. Register the lapic
- * timer as a dummy clock event source on SMP systems, so the
- * broadcast mechanism is used. On UP systems simply ignore it.
- */
- if (disable_apic_timer) {
- printk(KERN_INFO "Disabling APIC timer\n");
- /* No broadcast on UP ! */
- if (num_possible_cpus() > 1) {
- lapic_clockevent.mult = 1;
- setup_APIC_timer();
- }
- return;
- }
-
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
- "calibrating APIC timer ...\n");
-
- if (calibrate_APIC_clock()) {
- /* No broadcast on UP ! */
- if (num_possible_cpus() > 1)
- setup_APIC_timer();
- return;
- }
-
- /*
- * If nmi_watchdog is set to IO_APIC, we need the
- * PIT/HPET going. Otherwise register lapic as a dummy
- * device.
- */
- if (nmi_watchdog != NMI_IO_APIC)
- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- else
- printk(KERN_WARNING "APIC timer registered as dummy,"
- " due to nmi_watchdog=%d!\n", nmi_watchdog);
-
- /* Setup the lapic or request the broadcast */
- setup_APIC_timer();
-}
-
-void __cpuinit setup_secondary_APIC_clock(void)
-{
- setup_APIC_timer();
-}
-
-/*
- * The guts of the apic timer interrupt
- */
-static void local_apic_timer_interrupt(void)
-{
- int cpu = smp_processor_id();
- struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
-
- /*
- * Normally we should not be here till LAPIC has been initialized but
- * in some cases like kdump, its possible that there is a pending LAPIC
- * timer interrupt from previous kernel's context and is delivered in
- * new kernel the moment interrupts are enabled.
- *
- * Interrupts are enabled early and LAPIC is setup much later, hence
- * its possible that when we get here evt->event_handler is NULL.
- * Check for event_handler being NULL and discard the interrupt as
- * spurious.
- */
- if (!evt->event_handler) {
- printk(KERN_WARNING
- "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
- /* Switch it off */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
- return;
- }
-
- /*
- * the NMI deadlock-detector uses this.
- */
-#ifdef CONFIG_X86_64
- add_pda(apic_timer_irqs, 1);
-#else
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-#endif
-
- evt->event_handler(evt);
-}
-
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- * interrupt as well. Thus we cannot inline the local irq ... ]
- */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- exit_idle();
- irq_enter();
- local_apic_timer_interrupt();
- irq_exit();
-
- set_irq_regs(old_regs);
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
- return -EINVAL;
-}
-
-
-/*
- * Local APIC start and shutdown
- */
-
-/**
- * clear_local_APIC - shutdown the local APIC
- *
- * This is called, when a CPU is disabled and before rebooting, so the state of
- * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
- * leftovers during boot.
- */
-void clear_local_APIC(void)
-{
- int maxlvt;
- u32 v;
-
- /* APIC hasn't been mapped yet */
- if (!apic_phys)
- return;
-
- maxlvt = lapic_get_maxlvt();
- /*
- * Masking an LVT entry can trigger a local APIC error
- * if the vector is zero. Mask LVTERR first to prevent this.
- */
- if (maxlvt >= 3) {
- v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
- apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
- }
- /*
- * Careful: we have to set masks only first to deassert
- * any level-triggered sources.
- */
- v = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT1);
- apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
- if (maxlvt >= 4) {
- v = apic_read(APIC_LVTPC);
- apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
- }
-
- /* lets not touch this if we didn't frob it */
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
- if (maxlvt >= 5) {
- v = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
- }
-#endif
- /*
- * Clean APIC state for other OSs:
- */
- apic_write(APIC_LVTT, APIC_LVT_MASKED);
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- apic_write(APIC_LVT1, APIC_LVT_MASKED);
- if (maxlvt >= 3)
- apic_write(APIC_LVTERR, APIC_LVT_MASKED);
- if (maxlvt >= 4)
- apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-
- /* Integrated APIC (!82489DX) ? */
- if (lapic_is_integrated()) {
- if (maxlvt > 3)
- /* Clear ESR due to Pentium errata 3AP and 11AP */
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- }
-}
-
-/**
- * disable_local_APIC - clear and disable the local APIC
- */
-void disable_local_APIC(void)
-{
- unsigned int value;
-
- clear_local_APIC();
-
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_SPIV_APIC_ENABLED;
- apic_write(APIC_SPIV, value);
-
-#ifdef CONFIG_X86_32
- /*
- * When LAPIC was disabled by the BIOS and enabled by the kernel,
- * restore the disabled state.
- */
- if (enabled_via_apicbase) {
- unsigned int l, h;
-
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_ENABLE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- }
-#endif
-}
-
-/*
- * If Linux enabled the LAPIC against the BIOS default disable it down before
- * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
- * not power-off. Additionally clear all LVT entries before disable_local_APIC
- * for the case where Linux didn't enable the LAPIC.
- */
-void lapic_shutdown(void)
-{
- unsigned long flags;
-
- if (!cpu_has_apic)
- return;
-
- local_irq_save(flags);
-
-#ifdef CONFIG_X86_32
- if (!enabled_via_apicbase)
- clear_local_APIC();
- else
-#endif
- disable_local_APIC();
-
-
- local_irq_restore(flags);
-}
-
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonably.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = lapic_get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
- reg1 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
- apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ APIC_ID_MASK))
- return 0;
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
-/**
- * sync_Arb_IDs - synchronize APIC bus arbitration IDs
- */
-void __init sync_Arb_IDs(void)
-{
- /*
- * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
- * needed on AMD.
- */
- if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
- return;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC |
- APIC_INT_LEVELTRIG | APIC_DM_INIT);
-}
-
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
-{
- unsigned int value;
-
- /*
- * Don't do the setup now if we have a SMP BIOS as the
- * through-I/O-APIC virtual wire mode might be active.
- */
- if (smp_found_config || !cpu_has_apic)
- return;
-
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
-
- /*
- * Enable APIC.
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
-
-#ifdef CONFIG_X86_32
- /* This bit is reserved on P4/Xeon and should be cleared */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- (boot_cpu_data.x86 == 15))
- value &= ~APIC_SPIV_FOCUS_DISABLED;
- else
-#endif
- value |= APIC_SPIV_FOCUS_DISABLED;
- value |= SPURIOUS_APIC_VECTOR;
- apic_write(APIC_SPIV, value);
-
- /*
- * Set up the virtual wire mode.
- */
- apic_write(APIC_LVT0, APIC_DM_EXTINT);
- value = APIC_DM_NMI;
- if (!lapic_is_integrated()) /* 82489DX */
- value |= APIC_LVT_LEVEL_TRIGGER;
- apic_write(APIC_LVT1, value);
-}
-
-static void __cpuinit lapic_setup_esr(void)
-{
- unsigned long oldvalue, value, maxlvt;
- if (lapic_is_integrated() && !esr_disable) {
- if (esr_disable) {
- /*
- * Something untraceable is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk(KERN_INFO "Leaving ESR disabled.\n");
- return;
- }
- /* !82489DX */
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- oldvalue = apic_read(APIC_ESR);
-
- /* enables sending errors */
- value = ERROR_APIC_VECTOR;
- apic_write(APIC_LVTERR, value);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
- value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08lx after: 0x%08lx\n",
- oldvalue, value);
- } else {
- printk(KERN_INFO "No ESR for 82489DX.\n");
- }
-}
-
-
-/**
- * setup_local_APIC - setup the local APIC
- */
-void __cpuinit setup_local_APIC(void)
-{
- unsigned int value;
- int i, j;
-
- preempt_disable();
- value = apic_read(APIC_LVR);
-
- BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
-
- /*
- * Double-check whether this APIC is really registered.
- * This is meaningless in clustered apic mode, so we skip it.
- */
- if (!apic_id_registered())
- BUG();
-
- /*
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
- init_apic_ldr();
-
- /*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
- */
- value = apic_read(APIC_TASKPRI);
- value &= ~APIC_TPRI_MASK;
- apic_write(APIC_TASKPRI, value);
-
- /*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now CPU has serviced that pending interrupt and
- * it might not have done the ack_APIC_irq() because it thought,
- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
- * does not clear the ISR bit and cpu thinks it has already serivced
- * the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
- */
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
- }
- }
-
- /*
- * Now that we are all set up, enable the APIC
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- /*
- * Enable APIC
- */
- value |= APIC_SPIV_APIC_ENABLED;
-
- /* We always use processor focus */
-
- /*
- * Set spurious IRQ vector
- */
- value |= SPURIOUS_APIC_VECTOR;
- apic_write(APIC_SPIV, value);
-
- /*
- * Set up LVT0, LVT1:
- *
- * set up through-local-APIC on the BP's LINT0. This is not
- * strictly necessary in pure symmetric-IO mode, but sometimes
- * we delegate interrupts to the 8259A.
- */
- /*
- * TODO: set up through-local-APIC from through-I/O-APIC? --macro
- */
- value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && !value) {
- value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
- } else {
- value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
- }
- apic_write(APIC_LVT0, value);
-
- /*
- * only the BP should see the LINT1 NMI signal, obviously.
- */
- if (!smp_processor_id())
- value = APIC_DM_NMI;
- else
- value = APIC_DM_NMI | APIC_LVT_MASKED;
- apic_write(APIC_LVT1, value);
- preempt_enable();
-}
-
-void __cpuinit end_local_APIC_setup(void)
-{
- lapic_setup_esr();
-
-#ifdef CONFIG_X86_32
- {
- unsigned int value;
- /* Disable the local apic timer */
- value = apic_read(APIC_LVTT);
- value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, value);
- }
-#endif
-
- setup_apic_nmi_watchdog(NULL);
- apic_pm_activate();
-}
-
-void check_x2apic(void)
-{
- int msr, msr2;
-
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
-
- if (msr & X2APIC_ENABLE) {
- printk("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic = 1;
- apic_ops = &x2apic_ops;
- }
-}
-
-void enable_x2apic(void)
-{
- int msr, msr2;
-
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
- if (!(msr & X2APIC_ENABLE)) {
- printk("Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
- }
-}
-
-void enable_IR_x2apic(void)
-{
-#ifdef CONFIG_INTR_REMAP
- int ret;
- unsigned long flags;
-
- if (!cpu_has_x2apic)
- return;
-
- if (!x2apic_preenabled && disable_x2apic) {
- printk(KERN_INFO
- "Skipped enabling x2apic and Interrupt-remapping "
- "because of nox2apic\n");
- return;
- }
-
- if (x2apic_preenabled && disable_x2apic)
- panic("Bios already enabled x2apic, can't enforce nox2apic");
-
- if (!x2apic_preenabled && skip_ioapic_setup) {
- printk(KERN_INFO
- "Skipped enabling x2apic and Interrupt-remapping "
- "because of skipping io-apic setup\n");
- return;
- }
-
- ret = dmar_table_init();
- if (ret) {
- printk(KERN_INFO
- "dmar_table_init() failed with %d:\n", ret);
-
- if (x2apic_preenabled)
- panic("x2apic enabled by bios. But IR enabling failed");
- else
- printk(KERN_INFO
- "Not enabling x2apic,Intr-remapping\n");
- return;
- }
-
- local_irq_save(flags);
- mask_8259A();
- save_mask_IO_APIC_setup();
-
- ret = enable_intr_remapping(1);
-
- if (ret && x2apic_preenabled) {
- local_irq_restore(flags);
- panic("x2apic enabled by bios. But IR enabling failed");
- }
-
- if (ret)
- goto end;
-
- if (!x2apic) {
- x2apic = 1;
- apic_ops = &x2apic_ops;
- enable_x2apic();
- }
-end:
- if (ret)
- /*
- * IR enabling failed
- */
- restore_IO_APIC_setup();
- else
- reinit_intr_remapped_IO_APIC(x2apic_preenabled);
-
- unmask_8259A();
- local_irq_restore(flags);
-
- if (!ret) {
- if (!x2apic_preenabled)
- printk(KERN_INFO
- "Enabled x2apic and interrupt-remapping\n");
- else
- printk(KERN_INFO
- "Enabled Interrupt-remapping\n");
- } else
- printk(KERN_ERR
- "Failed to enable Interrupt-remapping and x2apic\n");
-#else
- if (!cpu_has_x2apic)
- return;
-
- if (x2apic_preenabled)
- panic("x2apic enabled prior OS handover,"
- " enable CONFIG_INTR_REMAP");
-
- printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
- " and x2apic\n");
-#endif
-
- return;
-}
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- * On AMD64 we trust the BIOS - if it says no APIC it is likely
- * not correctly set up (usually the APIC timer won't work etc.)
- */
-static int __init detect_init_APIC(void)
-{
- if (!cpu_has_apic) {
- printk(KERN_INFO "No local APIC present\n");
- return -1;
- }
-
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- boot_cpu_physical_apicid = 0;
- return 0;
-}
-
-void __init early_init_lapic_mapping(void)
-{
- unsigned long phys_addr;
-
- /*
- * If no local APIC can be found then go out
- * : it means there is no mpatable and MADT
- */
- if (!smp_found_config)
- return;
-
- phys_addr = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, phys_addr);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = read_apic_id();
-}
-
-/**
- * init_apic_mappings - initialize APIC mappings
- */
-void __init init_apic_mappings(void)
-{
- if (x2apic) {
- boot_cpu_physical_apicid = read_apic_id();
- return;
- }
-
- /*
- * If no local APIC can be found then set up a fake all
- * zeroes page to simulate the local APIC and another
- * one for the IO-APIC.
- */
- if (!smp_found_config && detect_init_APIC()) {
- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
- apic_phys = __pa(apic_phys);
- } else
- apic_phys = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, apic_phys);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = read_apic_id();
-}
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
-int apic_version[MAX_APICS];
-
-int __init APIC_init_uniprocessor(void)
-{
- if (disable_apic) {
- printk(KERN_INFO "Apic disabled\n");
- return -1;
- }
- if (!cpu_has_apic) {
- disable_apic = 1;
- printk(KERN_INFO "Apic disabled by BIOS\n");
- return -1;
- }
-
- enable_IR_x2apic();
- setup_apic_routing();
-
- verify_local_APIC();
-
- connect_bsp_APIC();
-
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
-
- setup_local_APIC();
-
- /*
- * Now enable IO-APICs, actually call clear_IO_APIC
- * We need clear_IO_APIC before enabling vector on BP
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-
- if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
- localise_nmi_watchdog();
- end_local_APIC_setup();
-
- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else
- nr_ioapics = 0;
- setup_boot_APIC_clock();
- check_nmi_watchdog();
- return 0;
-}
-
-/*
- * Local APIC interrupts
- */
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-asmlinkage void smp_spurious_interrupt(void)
-{
- unsigned int v;
- exit_idle();
- irq_enter();
- /*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
- */
- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
- ack_APIC_irq();
-
- add_pda(irq_spurious_count, 1);
- irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-asmlinkage void smp_error_interrupt(void)
-{
- unsigned int v, v1;
-
- exit_idle();
- irq_enter();
- /* First tickle the hardware, only then report what went on. -- REW */
- v = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
- ack_APIC_irq();
- atomic_inc(&irq_err_count);
-
- /* Here is what the APIC error bits mean:
- 0: Send CS error
- 1: Receive CS error
- 2: Send accept error
- 3: Receive accept error
- 4: Reserved
- 5: Send illegal vector
- 6: Received illegal vector
- 7: Illegal register address
- */
- printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
- smp_processor_id(), v , v1);
- irq_exit();
-}
-
-/**
- * connect_bsp_APIC - attach the APIC to the interrupt system
- */
-void __init connect_bsp_APIC(void)
-{
-#ifdef CONFIG_X86_32
- if (pic_mode) {
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
- /*
- * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
- * local APIC to INT and NMI lines.
- */
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
- }
-#endif
- enable_apic_mode();
-}
-
-/**
- * disconnect_bsp_APIC - detach the APIC from the interrupt system
- * @virt_wire_setup: indicates, whether virtual wire mode is selected
- *
- * Virtual wire mode is necessary to deliver legacy interrupts even when the
- * APIC is disabled.
- */
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
- unsigned int value;
-
-#ifdef CONFIG_X86_32
- if (pic_mode) {
- /*
- * Put the board back into PIC mode (has an effect only on
- * certain older boards). Note that APIC interrupts, including
- * IPIs, won't work beyond this point! The only exception are
- * INIT IPIs.
- */
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
- return;
- }
-#endif
-
- /* Go back to Virtual Wire compatibility mode */
-
- /* For the spurious interrupt use vector F, and enable it */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= 0xf;
- apic_write(APIC_SPIV, value);
-
- if (!virt_wire_setup) {
- /*
- * For LVT0 make it edge triggered, active high,
- * external and enabled
- */
- value = apic_read(APIC_LVT0);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
- apic_write(APIC_LVT0, value);
- } else {
- /* Disable LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- }
-
- /*
- * For LVT1 make it edge triggered, active high,
- * nmi and enabled
- */
- value = apic_read(APIC_LVT1);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
- apic_write(APIC_LVT1, value);
-}
-
-void __cpuinit generic_processor_info(int apicid, int version)
-{
- int cpu;
- cpumask_t tmp_map;
-
- /*
- * Validate version
- */
- if (version == 0x0) {
- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- version);
- version = 0x10;
- }
- apic_version[apicid] = version;
-
- if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
- return;
- }
-
- num_processors++;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
-
- physid_set(apicid, phys_cpu_present_map);
- if (apicid == boot_cpu_physical_apicid) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- */
- cpu = 0;
- }
- if (apicid > max_physical_apicid)
- max_physical_apicid = apicid;
-
-#ifdef CONFIG_X86_32
- /*
- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
- * but we need to work other dependencies like SMP_SUSPEND etc
- * before this can be done without some confusion.
- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
- * - Ashok Raj <ashok.raj@intel.com>
- */
- if (max_physical_apicid >= 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
-#endif
-
-#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
- /* are we being called early in kernel startup? */
- if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
- u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
- cpu_to_apicid[cpu] = apicid;
- bios_cpu_apicid[cpu] = apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
- }
-#endif
-
- cpu_set(cpu, cpu_possible_map);
- cpu_set(cpu, cpu_present_map);
-}
-
-int hard_smp_processor_id(void)
-{
- return read_apic_id();
-}
-
-/*
- * Power management
- */
-#ifdef CONFIG_PM
-
-static struct {
- /*
- * 'active' is true if the local APIC was enabled by us and
- * not the BIOS; this signifies that we are also responsible
- * for disabling it before entering apm/acpi suspend
- */
- int active;
- /* r/w apic fields */
- unsigned int apic_id;
- unsigned int apic_taskpri;
- unsigned int apic_ldr;
- unsigned int apic_dfr;
- unsigned int apic_spiv;
- unsigned int apic_lvtt;
- unsigned int apic_lvtpc;
- unsigned int apic_lvt0;
- unsigned int apic_lvt1;
- unsigned int apic_lvterr;
- unsigned int apic_tmict;
- unsigned int apic_tdcr;
- unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
- int maxlvt;
-
- if (!apic_pm_state.active)
- return 0;
-
- maxlvt = lapic_get_maxlvt();
-
- apic_pm_state.apic_id = apic_read(APIC_ID);
- apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
- apic_pm_state.apic_ldr = apic_read(APIC_LDR);
- apic_pm_state.apic_dfr = apic_read(APIC_DFR);
- apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
- apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
- if (maxlvt >= 4)
- apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
- apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
- apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
- apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
- apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
- apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
- if (maxlvt >= 5)
- apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
-
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
- return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
- unsigned int l, h;
- unsigned long flags;
- int maxlvt;
-
- if (!apic_pm_state.active)
- return 0;
-
- maxlvt = lapic_get_maxlvt();
-
- local_irq_save(flags);
-
-#ifdef CONFIG_X86_64
- if (x2apic)
- enable_x2apic();
- else
-#endif
- {
- /*
- * Make sure the APICBASE points to the right address
- *
- * FIXME! This will be wrong if we ever support suspend on
- * SMP! We'll need to do this as part of the CPU restore!
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
- }
-
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
- apic_write(APIC_ID, apic_pm_state.apic_id);
- apic_write(APIC_DFR, apic_pm_state.apic_dfr);
- apic_write(APIC_LDR, apic_pm_state.apic_ldr);
- apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
- apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
- apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
- apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
- if (maxlvt >= 5)
- apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
- if (maxlvt >= 4)
- apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
- apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
- apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
- apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
-
- local_irq_restore(flags);
-
- return 0;
-}
-
-/*
- * This device has no shutdown method - fully functioning local APICs
- * are needed on every CPU up until machine_halt/restart/poweroff.
- */
-
-static struct sysdev_class lapic_sysclass = {
- .name = "lapic",
- .resume = lapic_resume,
- .suspend = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
- .id = 0,
- .cls = &lapic_sysclass,
-};
-
-static void __cpuinit apic_pm_activate(void)
-{
- apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
- int error;
-
- if (!cpu_has_apic)
- return 0;
- /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
- error = sysdev_class_register(&lapic_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic);
- return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
-
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
-{
- int i, clusters, zeros;
- unsigned id;
- u16 *bios_cpu_apicid;
- DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
- /*
- * there is not this kind of box with AMD CPU yet.
- * Some AMD box with quadcore cpu and 8 sockets apicid
- * will be [4, 0x23] or [8, 0x27] could be thought to
- * vsmp box still need checking...
- */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
- return 0;
-
- bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
- bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
- for (i = 0; i < NR_CPUS; i++) {
- /* are we being called early in kernel startup? */
- if (bios_cpu_apicid) {
- id = bios_cpu_apicid[i];
- }
- else if (i < nr_cpu_ids) {
- if (cpu_present(i))
- id = per_cpu(x86_bios_cpu_apicid, i);
- else
- continue;
- }
- else
- break;
-
- if (id != BAD_APICID)
- __set_bit(APIC_CLUSTERID(id), clustermap);
- }
-
- /* Problem: Partially populated chassis may not have CPUs in some of
- * the APIC clusters they have been allocated. Only present CPUs have
- * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
- * Since clusters are allocated sequentially, count zeros only if
- * they are bounded by ones.
- */
- clusters = 0;
- zeros = 0;
- for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
- if (test_bit(i, clustermap)) {
- clusters += 1 + zeros;
- zeros = 0;
- } else
- ++zeros;
- }
-
- /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (is_vsmp_box() && clusters > 1)
- return 1;
-
- /*
- * If clusters > 2, then should be multi-chassis.
- * May have to revisit this when multi-core + hyperthreaded CPUs come
- * out, but AFAIK this will work even for them.
- */
- return (clusters > 2);
-}
-
-static __init int setup_nox2apic(char *str)
-{
- disable_x2apic = 1;
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC);
- return 0;
-}
-early_param("nox2apic", setup_nox2apic);
-
-
-/*
- * APIC command line parameters
- */
-static int __init setup_disableapic(char *arg)
-{
- disable_apic = 1;
- setup_clear_cpu_cap(X86_FEATURE_APIC);
- return 0;
-}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static int __init setup_nolapic(char *arg)
-{
- return setup_disableapic(arg);
-}
-early_param("nolapic", setup_nolapic);
-
-static int __init parse_lapic_timer_c2_ok(char *arg)
-{
- local_apic_timer_c2_ok = 1;
- return 0;
-}
-early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-
-static int __init parse_disable_apic_timer(char *arg)
-{
- disable_apic_timer = 1;
- return 0;
-}
-early_param("noapictimer", parse_disable_apic_timer);
-
-static int __init parse_nolapic_timer(char *arg)
-{
- disable_apic_timer = 1;
- return 0;
-}
-early_param("nolapic_timer", parse_nolapic_timer);
-
-static __init int setup_apicpmtimer(char *s)
-{
- apic_calibrate_pmtmr = 1;
- notsc_setup(NULL);
- return 0;
-}
-__setup("apicpmtimer", setup_apicpmtimer);
-
-static int __init apic_set_verbosity(char *arg)
-{
- if (!arg) {
-#ifdef CONFIG_X86_64
- skip_ioapic_setup = 0;
- ioapic_force = 1;
- return 0;
-#endif
- return -EINVAL;
- }
-
- if (strcmp("debug", arg) == 0)
- apic_verbosity = APIC_DEBUG;
- else if (strcmp("verbose", arg) == 0)
- apic_verbosity = APIC_VERBOSE;
- else {
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
- " use apic=verbose or apic=debug\n", arg);
- return -EINVAL;
- }
-
- return 0;
-}
-early_param("apic", apic_set_verbosity);
-
-static int __init lapic_insert_resource(void)
-{
- if (!apic_phys)
- return -1;
-
- /* Put local APIC into the resource map. */
- lapic_resource.start = apic_phys;
- lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
- insert_resource(&iomem_resource, &lapic_resource);
-
- return 0;
-}
-
-/*
- * need call insert after e820_reserve_resources()
- * that is using request_resource
- */
-late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index fdd585f..f0dfe6f 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -1,8 +1,6 @@
/*
* BIOS run time interface routines.
*
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -16,33 +14,128 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
*/
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <linux/io.h>
#include <asm/uv/bios.h>
+#include <asm/uv/uv_hub.h>
-const char *
-x86_bios_strerror(long status)
+struct uv_systab uv_systab;
+
+s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
- const char *str;
- switch (status) {
- case 0: str = "Call completed without error"; break;
- case -1: str = "Not implemented"; break;
- case -2: str = "Invalid argument"; break;
- case -3: str = "Call completed with error"; break;
- default: str = "Unknown BIOS status code"; break;
+ struct uv_systab *tab = &uv_systab;
+
+ if (!tab->function)
+ /*
+ * BIOS does not support UV systab
+ */
+ return BIOS_STATUS_UNIMPLEMENTED;
+
+ return efi_call6((void *)__va(tab->function),
+ (u64)which, a1, a2, a3, a4, a5);
+}
+
+s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
+{
+ unsigned long bios_flags;
+ s64 ret;
+
+ local_irq_save(bios_flags);
+ ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+ local_irq_restore(bios_flags);
+
+ return ret;
+}
+
+s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
+{
+ s64 ret;
+
+ preempt_disable();
+ ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+ preempt_enable();
+
+ return ret;
+}
+
+
+long sn_partition_id;
+EXPORT_SYMBOL_GPL(sn_partition_id);
+long uv_coherency_id;
+EXPORT_SYMBOL_GPL(uv_coherency_id);
+long uv_region_size;
+EXPORT_SYMBOL_GPL(uv_region_size);
+int uv_type;
+
+
+s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
+ long *region)
+{
+ s64 ret;
+ u64 v0, v1;
+ union partition_info_u part;
+
+ ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
+ (u64)(&v0), (u64)(&v1), 0, 0);
+ if (ret != BIOS_STATUS_SUCCESS)
+ return ret;
+
+ part.val = v0;
+ if (uvtype)
+ *uvtype = part.hub_version;
+ if (partid)
+ *partid = part.partition_id;
+ if (coher)
+ *coher = part.coherence_id;
+ if (region)
+ *region = part.region_size;
+ return ret;
+}
+
+
+s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
+{
+ return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
+ (u64)ticks_per_second, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_freq_base);
+
+
+#ifdef CONFIG_EFI
+void uv_bios_init(void)
+{
+ struct uv_systab *tab;
+
+ if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
+ (efi.uv_systab == (unsigned long)NULL)) {
+ printk(KERN_CRIT "No EFI UV System Table.\n");
+ uv_systab.function = (unsigned long)NULL;
+ return;
}
- return str;
-}
-long
-x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
- unsigned long *drift_info)
-{
- struct uv_bios_retval isrv;
+ tab = (struct uv_systab *)ioremap(efi.uv_systab,
+ sizeof(struct uv_systab));
+ if (strncmp(tab->signature, "UVST", 4) != 0)
+ printk(KERN_ERR "bad signature in UV system table!\n");
- BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
- *ticks_per_second = isrv.v0;
- *drift_info = isrv.v1;
- return isrv.status;
+ /*
+ * Copy table to a permanent spot: tab is invalid after iounmap().
+ */
+ memcpy(&uv_systab, tab, sizeof(struct uv_systab));
+ iounmap(tab);
+
+ printk(KERN_INFO "EFI UV System Table Revision %d\n", uv_systab.revision);
}
-EXPORT_SYMBOL_GPL(x86_bios_freq_base);
+#else /* !CONFIG_EFI */
+
+void uv_bios_init(void) { }
+#endif
+
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 0000000..667df55
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
+capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 32e7352..8f1e31d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -249,7 +249,7 @@
}
numa_set_node(cpu, node);
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+ printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
#endif
}
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 06fcce5..b046185 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -1,5 +1,5 @@
/*
- * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk>
+ * (C) 2001-2004 Dave Jones. <davej@redhat.com>
* (C) 2002 Padraig Brady. <padraig@antefacto.com>
*
* Licensed under the terms of the GNU GPL License version 2.
@@ -1019,7 +1019,7 @@
module_param(revid_errata, int, 0644);
MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index b5ced80..c1ac579 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -246,7 +246,7 @@
}
-MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
+MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 0a61159..7c7d56b 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -1,6 +1,6 @@
/*
* AMD K7 Powernow driver.
- * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs.
+ * (C) 2003 Dave Jones on behalf of SuSE Labs.
* (C) 2003-2004 Dave Jones <davej@redhat.com>
*
* Licensed under the terms of the GNU GPL License version 2.
@@ -692,7 +692,7 @@
module_param(acpi_force, int, 0444);
MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 84bb395..008d23b 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -7,7 +7,7 @@
* Support : mark.langsdorf@amd.com
*
* Based on the powernow-k7.c module written by Dave Jones.
- * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs
+ * (C) 2003 Dave Jones on behalf of SuSE Labs
* (C) 2004 Dominik Brodowski <linux@brodo.de>
* (C) 2004 Pavel Machek <pavel@suse.cz>
* Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 191f726..04d0376 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -431,7 +431,7 @@
}
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 99468db..cce0b61 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -174,7 +174,7 @@
node = first_node(node_online_map);
numa_set_node(cpu, node);
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+ printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
#endif
}
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index f390c9f..dd3af6e 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -1,6 +1,6 @@
/*
- * Athlon/Hammer specific Machine Check Exception Reporting
- * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk>
+ * Athlon specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Dave Jones <davej@redhat.com>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 774d87c..0ebf3fc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
/*
* mce.c - x86 Machine Check Exception Reporting
- * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
+ * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index cc1fccd..a74af12 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -1,7 +1,7 @@
/*
* Non Fatal Machine Check Exception Reporting
*
- * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk>
+ * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
*
* This file contains routines to check for non-fatal MCEs every 15s
*
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6bff382..9abd48b 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -17,6 +17,8 @@
#include <linux/bitops.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <linux/kprobes.h>
+
#include <asm/apic.h>
#include <asm/intel_arch_perfmon.h>
@@ -336,7 +338,8 @@
release_perfctr_nmi(wd_ops->perfctr);
}
-static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes
+single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
/* start the cycle over again */
write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
@@ -401,7 +404,7 @@
return 1;
}
-static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
/*
* P6 based Pentium M need to re-unmask
@@ -605,7 +608,7 @@
release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
}
-static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
unsigned dummy;
/*
@@ -784,7 +787,7 @@
return hz;
}
-int lapic_wd_event(unsigned nmi_hz)
+int __kprobes lapic_wd_event(unsigned nmi_hz)
{
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
u64 ctr;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 945a31c..1119d24 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -367,6 +367,10 @@
efi.smbios = config_tables[i].table;
printk(" SMBIOS=0x%lx ", config_tables[i].table);
} else if (!efi_guidcmp(config_tables[i].guid,
+ UV_SYSTEM_TABLE_GUID)) {
+ efi.uv_systab = config_tables[i].table;
+ printk(" UVsystab=0x%lx ", config_tables[i].table);
+ } else if (!efi_guidcmp(config_tables[i].guid,
HCDP_TABLE_GUID)) {
efi.hcdp = config_tables[i].table;
printk(" HCDP=0x%lx ", config_tables[i].table);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index b21fbfa..c356423 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -629,7 +629,7 @@
ENTRY(irq_entries_start)
RING0_INT_FRAME
vector=0
-.rept NR_IRQS
+.rept NR_VECTORS
ALIGN
.if vector
CFI_ADJUST_CFA_OFFSET -4
@@ -1153,20 +1153,6 @@
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl mcount_call
-mcount_call:
- call ftrace_stub
-
- popl %edx
- popl %ecx
- popl %eax
-
ret
END(mcount)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1db6ce4..09e7145 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -64,32 +64,6 @@
#ifdef CONFIG_FTRACE
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
-
- subq $0x38, %rsp
- movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
-
- movq 0x38(%rsp), %rdi
- subq $MCOUNT_INSN_SIZE, %rdi
-
-.globl mcount_call
-mcount_call:
- call ftrace_stub
-
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
- movq (%rsp), %rax
- addq $0x38, %rsp
-
retq
END(mcount)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ab115cd..d073d98 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -11,17 +11,18 @@
#include <linux/spinlock.h>
#include <linux/hardirq.h>
+#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/list.h>
-#include <asm/alternative.h>
#include <asm/ftrace.h>
+#include <asm/nops.h>
/* Long is fine, even if it is only 4 bytes ;-) */
-static long *ftrace_nop;
+static unsigned long *ftrace_nop;
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
@@ -60,11 +61,7 @@
ftrace_modify_code(unsigned long ip, unsigned char *old_code,
unsigned char *new_code)
{
- unsigned replaced;
- unsigned old = *(unsigned *)old_code; /* 4 bytes */
- unsigned new = *(unsigned *)new_code; /* 4 bytes */
- unsigned char newch = new_code[4];
- int faulted = 0;
+ unsigned char replaced[MCOUNT_INSN_SIZE];
/*
* Note: Due to modules and __init, code can
@@ -72,29 +69,20 @@
* as well as code changing.
*
* No real locking needed, this code is run through
- * kstop_machine.
+ * kstop_machine, or before SMP starts.
*/
- asm volatile (
- "1: lock\n"
- " cmpxchg %3, (%2)\n"
- " jnz 2f\n"
- " movb %b4, 4(%2)\n"
- "2:\n"
- ".section .fixup, \"ax\"\n"
- "3: movl $1, %0\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : "=r"(faulted), "=a"(replaced)
- : "r"(ip), "r"(new), "c"(newch),
- "0"(faulted), "a"(old)
- : "memory");
+ if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
+ return 1;
+
+ if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ return 2;
+
+ WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code,
+ MCOUNT_INSN_SIZE));
+
sync_core();
- if (replaced != old && replaced != new)
- faulted = 2;
-
- return faulted;
+ return 0;
}
notrace int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -112,30 +100,76 @@
notrace int ftrace_mcount_set(unsigned long *data)
{
- unsigned long ip = (long)(&mcount_call);
- unsigned long *addr = data;
- unsigned char old[MCOUNT_INSN_SIZE], *new;
-
- /*
- * Replace the mcount stub with a pointer to the
- * ip recorder function.
- */
- memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, *addr);
- *addr = ftrace_modify_code(ip, old, new);
-
+ /* mcount is initialized as a nop */
+ *data = 0;
return 0;
}
int __init ftrace_dyn_arch_init(void *data)
{
- const unsigned char *const *noptable = find_nop_table();
+ extern const unsigned char ftrace_test_p6nop[];
+ extern const unsigned char ftrace_test_nop5[];
+ extern const unsigned char ftrace_test_jmp[];
+ int faulted = 0;
- /* This is running in kstop_machine */
+ /*
+ * There is no good nop for all x86 archs.
+ * We will default to using the P6_NOP5, but first we
+ * will test to make sure that the nop will actually
+ * work on this CPU. If it faults, we will then
+ * go to a less efficient 5 byte nop. If that fails
+ * we then just use a jmp as our nop. This isn't the most
+ * efficient nop, but we can not use a multi part nop
+ * since we would then risk being preempted in the middle
+ * of that nop, and if we enabled tracing then, it might
+ * cause a system crash.
+ *
+ * TODO: check the cpuid to determine the best nop.
+ */
+ asm volatile (
+ "jmp ftrace_test_jmp\n"
+ /* This code needs to stay around */
+ ".section .text, \"ax\"\n"
+ "ftrace_test_jmp:"
+ "jmp ftrace_test_p6nop\n"
+ "nop\n"
+ "nop\n"
+ "nop\n" /* 2 byte jmp + 3 bytes */
+ "ftrace_test_p6nop:"
+ P6_NOP5
+ "jmp 1f\n"
+ "ftrace_test_nop5:"
+ ".byte 0x66,0x66,0x66,0x66,0x90\n"
+ "jmp 1f\n"
+ ".previous\n"
+ "1:"
+ ".section .fixup, \"ax\"\n"
+ "2: movl $1, %0\n"
+ " jmp ftrace_test_nop5\n"
+ "3: movl $2, %0\n"
+ " jmp 1b\n"
+ ".previous\n"
+ _ASM_EXTABLE(ftrace_test_p6nop, 2b)
+ _ASM_EXTABLE(ftrace_test_nop5, 3b)
+ : "=r"(faulted) : "0" (faulted));
- ftrace_mcount_set(data);
+ switch (faulted) {
+ case 0:
+ pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
+ ftrace_nop = (unsigned long *)ftrace_test_p6nop;
+ break;
+ case 1:
+ pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
+ ftrace_nop = (unsigned long *)ftrace_test_nop5;
+ break;
+ case 2:
+ pr_info("ftrace: converting mcount calls to jmp . + 5\n");
+ ftrace_nop = (unsigned long *)ftrace_test_jmp;
+ break;
+ }
- ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
+ /* The return code is returned via data */
+ *(unsigned long *)data = 0;
return 0;
}
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 9eca5ba..2ec2de8 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -179,8 +179,10 @@
* is an example).
*/
if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+ printk(KERN_DEBUG "system APIC only can use physical flat");
return 1;
+ }
#endif
return 0;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 33581d9..bfd5328 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -341,12 +341,12 @@
static __init void uv_rtc_init(void)
{
- long status, ticks_per_sec, drift;
+ long status;
+ u64 ticks_per_sec;
- status =
- x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
- &drift);
- if (status != 0 || ticks_per_sec < 100000) {
+ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
+ &ticks_per_sec);
+ if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
printk(KERN_WARNING
"unable to determine platform RTC clock frequency, "
"guessing.\n");
@@ -356,7 +356,22 @@
sn_rtc_cycles_per_second = ticks_per_sec;
}
-static bool uv_system_inited;
+/*
+ * Called on each cpu to initialize the per_cpu UV data area.
+ * ZZZ hotplug not supported yet
+ */
+void __cpuinit uv_cpu_init(void)
+{
+ /* CPU 0 initialization will be done via uv_system_init. */
+ if (!uv_blade_info)
+ return;
+
+ uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
+
+ if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
+ set_x2apic_extra_bits(uv_hub_info->pnode);
+}
+
void __init uv_system_init(void)
{
@@ -412,6 +427,9 @@
gnode_upper = (((unsigned long)node_id.s.node_id) &
~((1 << n_val) - 1)) << m_val;
+ uv_bios_init();
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
+ &uv_coherency_id, &uv_region_size);
uv_rtc_init();
for_each_present_cpu(cpu) {
@@ -433,7 +451,7 @@
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
- uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
+ uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
max_pnode = max(pnode, max_pnode);
@@ -448,21 +466,6 @@
map_mmr_high(max_pnode);
map_config_high(max_pnode);
map_mmioh_high(max_pnode);
- uv_system_inited = true;
+
+ uv_cpu_init();
}
-
-/*
- * Called on each cpu to initialize the per_cpu UV data area.
- * ZZZ hotplug not supported yet
- */
-void __cpuinit uv_cpu_init(void)
-{
- BUG_ON(!uv_system_inited);
-
- uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
-
- if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->pnode);
-}
-
-
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index acf62fc..77017e8 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,29 +1,49 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hpet.h>
#include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/cpu.h>
#include <linux/pm.h>
+#include <linux/io.h>
#include <asm/fixmap.h>
-#include <asm/hpet.h>
#include <asm/i8253.h>
-#include <asm/io.h>
+#include <asm/hpet.h>
-#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT 22
+#define HPET_MASK CLOCKSOURCE_MASK(32)
+#define HPET_SHIFT 22
/* FSEC = 10^-15
NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000L
+#define FSEC_PER_NSEC 1000000L
+
+#define HPET_DEV_USED_BIT 2
+#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
+#define HPET_DEV_VALID 0x8
+#define HPET_DEV_FSB_CAP 0x1000
+#define HPET_DEV_PERI_CAP 0x2000
+
+#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
-unsigned long hpet_address;
-static void __iomem *hpet_virt_address;
+unsigned long hpet_address;
+unsigned long hpet_num_timers;
+static void __iomem *hpet_virt_address;
+
+struct hpet_dev {
+ struct clock_event_device evt;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ unsigned int flags;
+ char name[10];
+};
unsigned long hpet_readl(unsigned long a)
{
@@ -59,7 +79,7 @@
static int boot_hpet_disable;
int hpet_force_user;
-static int __init hpet_setup(char* str)
+static int __init hpet_setup(char *str)
{
if (str) {
if (!strncmp("disable", str, 7))
@@ -80,7 +100,7 @@
static inline int is_hpet_capable(void)
{
- return (!boot_hpet_disable && hpet_address);
+ return !boot_hpet_disable && hpet_address;
}
/*
@@ -102,6 +122,9 @@
* timer 0 and timer 1 in case of RTC emulation.
*/
#ifdef CONFIG_HPET
+
+static void hpet_reserve_msi_timers(struct hpet_data *hd);
+
static void hpet_reserve_platform_timers(unsigned long id)
{
struct hpet __iomem *hpet = hpet_virt_address;
@@ -111,10 +134,10 @@
nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- memset(&hd, 0, sizeof (hd));
- hd.hd_phys_address = hpet_address;
- hd.hd_address = hpet;
- hd.hd_nirqs = nrtimers;
+ memset(&hd, 0, sizeof(hd));
+ hd.hd_phys_address = hpet_address;
+ hd.hd_address = hpet;
+ hd.hd_nirqs = nrtimers;
hpet_reserve_timer(&hd, 0);
#ifdef CONFIG_HPET_EMULATE_RTC
@@ -130,10 +153,12 @@
hd.hd_irq[1] = HPET_LEGACY_RTC;
for (i = 2; i < nrtimers; timer++, i++) {
- hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >>
- Tn_INT_ROUTE_CNF_SHIFT;
+ hd.hd_irq[i] = (readl(&timer->hpet_config) &
+ Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
}
+ hpet_reserve_msi_timers(&hd);
+
hpet_alloc(&hd);
}
@@ -227,60 +252,70 @@
printk(KERN_DEBUG "hpet clockevent registered\n");
}
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int hpet_setup_msi_irq(unsigned int irq);
+
+static void hpet_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt, int timer)
{
unsigned long cfg, cmp, now;
uint64_t delta;
- switch(mode) {
+ switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
- delta >>= hpet_clockevent.shift;
+ delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
+ delta >>= evt->shift;
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned long) delta;
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
/*
* The first write after writing TN_SETVAL to the
* config register sets the counter value, the second
* write sets the period.
*/
- hpet_writel(cmp, HPET_T0_CMP);
+ hpet_writel(cmp, HPET_Tn_CMP(timer));
udelay(1);
- hpet_writel((unsigned long) delta, HPET_T0_CMP);
+ hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
break;
case CLOCK_EVT_MODE_ONESHOT:
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
break;
case CLOCK_EVT_MODE_RESUME:
- hpet_enable_legacy_int();
+ if (timer == 0) {
+ hpet_enable_legacy_int();
+ } else {
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ hpet_setup_msi_irq(hdev->irq);
+ disable_irq(hdev->irq);
+ irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+ enable_irq(hdev->irq);
+ }
break;
}
}
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt)
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt, int timer)
{
u32 cnt;
cnt = hpet_readl(HPET_COUNTER);
cnt += (u32) delta;
- hpet_writel(cnt, HPET_T0_CMP);
+ hpet_writel(cnt, HPET_Tn_CMP(timer));
/*
* We need to read back the CMP register to make sure that
@@ -292,6 +327,347 @@
return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
}
+static void hpet_legacy_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ hpet_set_mode(mode, evt, 0);
+}
+
+static int hpet_legacy_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ return hpet_next_event(delta, evt, 0);
+}
+
+/*
+ * HPET MSI Support
+ */
+#ifdef CONFIG_PCI_MSI
+
+static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
+static struct hpet_dev *hpet_devs;
+
+void hpet_msi_unmask(unsigned int irq)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+ unsigned long cfg;
+
+ /* unmask it */
+ cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg |= HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_mask(unsigned int irq)
+{
+ unsigned long cfg;
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ /* mask it */
+ cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg &= ~HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
+ hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
+}
+
+void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
+ msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
+ msg->address_hi = 0;
+}
+
+static void hpet_msi_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ hpet_set_mode(mode, evt, hdev->num);
+}
+
+static int hpet_msi_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ return hpet_next_event(delta, evt, hdev->num);
+}
+
+static int hpet_setup_msi_irq(unsigned int irq)
+{
+ if (arch_setup_hpet_msi(irq)) {
+ destroy_irq(irq);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int hpet_assign_irq(struct hpet_dev *dev)
+{
+ unsigned int irq;
+
+ irq = create_irq();
+ if (!irq)
+ return -EINVAL;
+
+ set_irq_data(irq, dev);
+
+ if (hpet_setup_msi_irq(irq))
+ return -EINVAL;
+
+ dev->irq = irq;
+ return 0;
+}
+
+static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+{
+ struct hpet_dev *dev = (struct hpet_dev *)data;
+ struct clock_event_device *hevt = &dev->evt;
+
+ if (!hevt->event_handler) {
+ printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
+ dev->num);
+ return IRQ_HANDLED;
+ }
+
+ hevt->event_handler(hevt);
+ return IRQ_HANDLED;
+}
+
+static int hpet_setup_irq(struct hpet_dev *dev)
+{
+
+ if (request_irq(dev->irq, hpet_interrupt_handler,
+ IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev))
+ return -1;
+
+ disable_irq(dev->irq);
+ irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+ enable_irq(dev->irq);
+
+ printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
+ dev->name, dev->irq);
+
+ return 0;
+}
+
+/* This should be called in specific @cpu */
+static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+{
+ struct clock_event_device *evt = &hdev->evt;
+ uint64_t hpet_freq;
+
+ WARN_ON(cpu != smp_processor_id());
+ if (!(hdev->flags & HPET_DEV_VALID))
+ return;
+
+ if (hpet_setup_msi_irq(hdev->irq))
+ return;
+
+ hdev->cpu = cpu;
+ per_cpu(cpu_hpet_dev, cpu) = hdev;
+ evt->name = hdev->name;
+ hpet_setup_irq(hdev);
+ evt->irq = hdev->irq;
+
+ evt->rating = 110;
+ evt->features = CLOCK_EVT_FEAT_ONESHOT;
+ if (hdev->flags & HPET_DEV_PERI_CAP)
+ evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+
+ evt->set_mode = hpet_msi_set_mode;
+ evt->set_next_event = hpet_msi_next_event;
+ evt->shift = 32;
+
+ /*
+ * The period is a femtosecond value. We need to calculate the
+ * scaled math multiplication factor for nanosecond to hpet tick
+ * conversion.
+ */
+ hpet_freq = 1000000000000000ULL;
+ do_div(hpet_freq, hpet_period);
+ evt->mult = div_sc((unsigned long) hpet_freq,
+ NSEC_PER_SEC, evt->shift);
+ /* Calculate the max delta */
+ evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
+ /* 5 usec minimum reprogramming delta. */
+ evt->min_delta_ns = 5000;
+
+ evt->cpumask = cpumask_of_cpu(hdev->cpu);
+ clockevents_register_device(evt);
+}
+
+#ifdef CONFIG_HPET
+/* Reserve at least one timer for userspace (/dev/hpet) */
+#define RESERVE_TIMERS 1
+#else
+#define RESERVE_TIMERS 0
+#endif
+
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ unsigned int id;
+ unsigned int num_timers;
+ unsigned int num_timers_used = 0;
+ int i;
+
+ id = hpet_readl(HPET_ID);
+
+ num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+ num_timers++; /* Value read out starts from 0 */
+
+ hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
+ if (!hpet_devs)
+ return;
+
+ hpet_num_timers = num_timers;
+
+ for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
+ struct hpet_dev *hdev = &hpet_devs[num_timers_used];
+ unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
+
+ /* Only consider HPET timer with MSI support */
+ if (!(cfg & HPET_TN_FSB_CAP))
+ continue;
+
+ hdev->flags = 0;
+ if (cfg & HPET_TN_PERIODIC_CAP)
+ hdev->flags |= HPET_DEV_PERI_CAP;
+ hdev->num = i;
+
+ sprintf(hdev->name, "hpet%d", i);
+ if (hpet_assign_irq(hdev))
+ continue;
+
+ hdev->flags |= HPET_DEV_FSB_CAP;
+ hdev->flags |= HPET_DEV_VALID;
+ num_timers_used++;
+ if (num_timers_used == num_possible_cpus())
+ break;
+ }
+
+ printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
+ num_timers, num_timers_used);
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ int i;
+
+ if (!hpet_devs)
+ return;
+
+ for (i = 0; i < hpet_num_timers; i++) {
+ struct hpet_dev *hdev = &hpet_devs[i];
+
+ if (!(hdev->flags & HPET_DEV_VALID))
+ continue;
+
+ hd->hd_irq[hdev->num] = hdev->irq;
+ hpet_reserve_timer(hd, hdev->num);
+ }
+}
+#endif
+
+static struct hpet_dev *hpet_get_unused_timer(void)
+{
+ int i;
+
+ if (!hpet_devs)
+ return NULL;
+
+ for (i = 0; i < hpet_num_timers; i++) {
+ struct hpet_dev *hdev = &hpet_devs[i];
+
+ if (!(hdev->flags & HPET_DEV_VALID))
+ continue;
+ if (test_and_set_bit(HPET_DEV_USED_BIT,
+ (unsigned long *)&hdev->flags))
+ continue;
+ return hdev;
+ }
+ return NULL;
+}
+
+struct hpet_work_struct {
+ struct delayed_work work;
+ struct completion complete;
+};
+
+static void hpet_work(struct work_struct *w)
+{
+ struct hpet_dev *hdev;
+ int cpu = smp_processor_id();
+ struct hpet_work_struct *hpet_work;
+
+ hpet_work = container_of(w, struct hpet_work_struct, work.work);
+
+ hdev = hpet_get_unused_timer();
+ if (hdev)
+ init_one_hpet_msi_clockevent(hdev, cpu);
+
+ complete(&hpet_work->complete);
+}
+
+static int hpet_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ unsigned long cpu = (unsigned long)hcpu;
+ struct hpet_work_struct work;
+ struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
+
+ switch (action & 0xf) {
+ case CPU_ONLINE:
+ INIT_DELAYED_WORK(&work.work, hpet_work);
+ init_completion(&work.complete);
+ /* FIXME: add schedule_work_on() */
+ schedule_delayed_work_on(cpu, &work.work, 0);
+ wait_for_completion(&work.complete);
+ break;
+ case CPU_DEAD:
+ if (hdev) {
+ free_irq(hdev->irq, hdev);
+ hdev->flags &= ~HPET_DEV_USED;
+ per_cpu(cpu_hpet_dev, cpu) = NULL;
+ }
+ break;
+ }
+ return NOTIFY_OK;
+}
+#else
+
+static int hpet_setup_msi_irq(unsigned int irq)
+{
+ return 0;
+}
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ return;
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ return;
+}
+#endif
+
+static int hpet_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ return NOTIFY_OK;
+}
+
+#endif
+
/*
* Clock source related code
*/
@@ -427,8 +803,10 @@
if (id & HPET_ID_LEGSUP) {
hpet_legacy_clockevent_register();
+ hpet_msi_capability_lookup(2);
return 1;
}
+ hpet_msi_capability_lookup(0);
return 0;
out_nohpet:
@@ -445,6 +823,8 @@
*/
static __init int hpet_late_init(void)
{
+ int cpu;
+
if (boot_hpet_disable)
return -ENODEV;
@@ -460,6 +840,13 @@
hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+ for_each_online_cpu(cpu) {
+ hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
+ }
+
+ /* This notifier should be called after workqueue is ready */
+ hotcpu_notifier(hpet_cpuhp_notify, -20);
+
return 0;
}
fs_initcall(hpet_late_init);
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c
similarity index 68%
rename from arch/x86/kernel/io_apic_64.c
rename to arch/x86/kernel/io_apic.c
index 02063ae..b764d74 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic.c
@@ -27,17 +27,21 @@
#include <linux/sched.h>
#include <linux/pci.h>
#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
#include <linux/acpi.h>
+#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/msi.h>
#include <linux/htirq.h>
-#include <linux/dmar.h>
-#include <linux/jiffies.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h> /* time_after() */
#ifdef CONFIG_ACPI
#include <acpi/acpi_bus.h>
#endif
#include <linux/bootmem.h>
#include <linux/dmar.h>
+#include <linux/hpet.h>
#include <asm/idle.h>
#include <asm/io.h>
@@ -46,61 +50,28 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/dma.h>
+#include <asm/timer.h>
#include <asm/i8259.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
+#include <asm/setup.h>
#include <asm/irq_remapping.h>
+#include <asm/hpet.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_irq.h>
#include <mach_ipi.h>
#include <mach_apic.h>
+#include <mach_apicdef.h>
#define __apicdebuginit(type) static type __init
-struct irq_cfg {
- cpumask_t domain;
- cpumask_t old_domain;
- unsigned move_cleanup_count;
- u8 vector;
- u8 move_in_progress : 1;
-};
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
- [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
- [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
- [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
- [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
- [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
- [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
- [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
- [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
- [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
- [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
- [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
- [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
- [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
- [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
- [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
- [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
-};
-
-static int assign_irq_vector(int irq, cpumask_t mask);
-
-int first_system_vector = 0xfe;
-
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
-int sis_apic_bug; /* not actually supported, dummy for compile */
-
-static int no_timer_check;
-
-static int disable_timer_pin_1 __initdata;
-
-int timer_through_8259 __initdata;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+/*
+ * Is the SiS APIC rmw bug present ?
+ * -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
static DEFINE_SPINLOCK(ioapic_lock);
static DEFINE_SPINLOCK(vector_lock);
@@ -110,9 +81,6 @@
*/
int nr_ioapic_registers[MAX_IO_APICS];
-/* I/O APIC RTE contents at the OS boot up */
-struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
-
/* I/O APIC entries */
struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
@@ -123,11 +91,69 @@
/* # of MP IRQ source entries */
int mp_irq_entries;
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+int skip_ioapic_setup;
+
+static int __init parse_noapic(char *str)
+{
+ /* disable IO-APIC */
+ disable_ioapic_setup();
+ return 0;
+}
+early_param("noapic", parse_noapic);
+
+struct irq_pin_list;
+struct irq_cfg {
+ unsigned int irq;
+ struct irq_pin_list *irq_2_pin;
+ cpumask_t domain;
+ cpumask_t old_domain;
+ unsigned move_cleanup_count;
+ u8 vector;
+ u8 move_in_progress : 1;
+};
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+static struct irq_cfg irq_cfgx[NR_IRQS] = {
+ [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
+ [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
+ [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
+ [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
+ [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
+ [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
+ [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
+ [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
+ [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
+ [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
+ [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+ [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+ [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+ [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+ [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+ [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+};
+
+#define for_each_irq_cfg(irq, cfg) \
+ for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+{
+ return irq_cfg(irq);
+}
+
/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
+ * Rough estimation of how many shared IRQs there are, can be changed
+ * anytime.
*/
#define MAX_PLUS_SHARED_IRQS NR_IRQS
#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
@@ -139,9 +165,36 @@
* between pins and IRQs.
*/
-static struct irq_pin_list {
- short apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
+struct irq_pin_list {
+ int apic, pin;
+ struct irq_pin_list *next;
+};
+
+static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
+static struct irq_pin_list *irq_2_pin_ptr;
+
+static void __init irq_2_pin_init(void)
+{
+ struct irq_pin_list *pin = irq_2_pin_head;
+ int i;
+
+ for (i = 1; i < PIN_MAP_SIZE; i++)
+ pin[i-1].next = &pin[i];
+
+ irq_2_pin_ptr = &pin[0];
+}
+
+static struct irq_pin_list *get_one_free_irq_2_pin(void)
+{
+ struct irq_pin_list *pin = irq_2_pin_ptr;
+
+ if (!pin)
+ panic("can not get more irq_2_pin\n");
+
+ irq_2_pin_ptr = pin->next;
+ pin->next = NULL;
+ return pin;
+}
struct io_apic {
unsigned int index;
@@ -172,10 +225,15 @@
/*
* Re-write a value: to be used for read-modify-write
* cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
*/
-static inline void io_apic_modify(unsigned int apic, unsigned int value)
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
@@ -183,16 +241,17 @@
{
struct irq_pin_list *entry;
unsigned long flags;
+ struct irq_cfg *cfg = irq_cfg(irq);
spin_lock_irqsave(&ioapic_lock, flags);
- entry = irq_2_pin + irq;
+ entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
int pin;
- pin = entry->pin;
- if (pin == -1)
+ if (!entry)
break;
+ pin = entry->pin;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
if (reg & IO_APIC_REDIR_REMOTE_IRR) {
@@ -201,45 +260,13 @@
}
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
spin_unlock_irqrestore(&ioapic_lock, flags);
return false;
}
-/*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
-static inline void io_apic_sync(unsigned int apic)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- readl(&io_apic->data);
-}
-
-#define __DO_ACTION(R, ACTION, FINAL) \
- \
-{ \
- int pin; \
- struct irq_pin_list *entry = irq_2_pin + irq; \
- \
- BUG_ON(irq >= NR_IRQS); \
- for (;;) { \
- unsigned int reg; \
- pin = entry->pin; \
- if (pin == -1) \
- break; \
- reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
- reg ACTION; \
- io_apic_modify(entry->apic, reg); \
- FINAL; \
- if (!entry->next) \
- break; \
- entry = irq_2_pin + entry->next; \
- } \
-}
-
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
@@ -299,59 +326,71 @@
static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
{
int apic, pin;
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
- BUG_ON(irq >= NR_IRQS);
+ cfg = irq_cfg(irq);
+ entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
+
+ if (!entry)
+ break;
+
apic = entry->apic;
pin = entry->pin;
- if (pin == -1)
- break;
+#ifdef CONFIG_INTR_REMAP
/*
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
if (!irq_remapped(irq))
io_apic_write(apic, 0x11 + pin*2, dest);
+#else
+ io_apic_write(apic, 0x11 + pin*2, dest);
+#endif
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
- io_apic_modify(apic, reg);
+ io_apic_modify(apic, 0x10 + pin*2, reg);
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
}
+static int assign_irq_vector(int irq, cpumask_t mask);
+
static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
unsigned long flags;
unsigned int dest;
cpumask_t tmp;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
return;
+ cfg = irq_cfg(irq);
if (assign_irq_vector(irq, mask))
return;
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
-
/*
* Only the high 8 bits are valid.
*/
dest = SET_APIC_LOGICAL_ID(dest);
+ desc = irq_to_desc(irq);
spin_lock_irqsave(&ioapic_lock, flags);
__target_IO_APIC_irq(irq, dest, cfg->vector);
- irq_desc[irq].affinity = mask;
+ desc->affinity = mask;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#endif
+#endif /* CONFIG_SMP */
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -360,19 +399,30 @@
*/
static void add_pin_to_irq(unsigned int irq, int apic, int pin)
{
- static int first_free_entry = NR_IRQS;
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
- BUG_ON(irq >= NR_IRQS);
- while (entry->next)
- entry = irq_2_pin + entry->next;
-
- if (entry->pin != -1) {
- entry->next = first_free_entry;
- entry = irq_2_pin + entry->next;
- if (++first_free_entry >= PIN_MAP_SIZE)
- panic("io_apic.c: ran out of irq_2_pin entries!");
+ /* first time to refer irq_cfg, so with new */
+ cfg = irq_cfg_alloc(irq);
+ entry = cfg->irq_2_pin;
+ if (!entry) {
+ entry = get_one_free_irq_2_pin();
+ cfg->irq_2_pin = entry;
+ entry->apic = apic;
+ entry->pin = pin;
+ return;
}
+
+ while (entry->next) {
+ /* not again, please */
+ if (entry->apic == apic && entry->pin == pin)
+ return;
+
+ entry = entry->next;
+ }
+
+ entry->next = get_one_free_irq_2_pin();
+ entry = entry->next;
entry->apic = apic;
entry->pin = pin;
}
@@ -384,30 +434,86 @@
int oldapic, int oldpin,
int newapic, int newpin)
{
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_pin_list *entry = cfg->irq_2_pin;
+ int replaced = 0;
- while (1) {
+ while (entry) {
if (entry->apic == oldapic && entry->pin == oldpin) {
entry->apic = newapic;
entry->pin = newpin;
- }
- if (!entry->next)
+ replaced = 1;
+ /* every one is different, right? */
break;
- entry = irq_2_pin + entry->next;
+ }
+ entry = entry->next;
+ }
+
+ /* why? call replace before add? */
+ if (!replaced)
+ add_pin_to_irq(irq, newapic, newpin);
+}
+
+static inline void io_apic_modify_irq(unsigned int irq,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ int pin;
+ struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
+
+ cfg = irq_cfg(irq);
+ for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
+ unsigned int reg;
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+ reg &= mask_and;
+ reg |= mask_or;
+ io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+ if (final)
+ final(entry);
}
}
+static void __unmask_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
-#define DO_ACTION(name,R,ACTION, FINAL) \
- \
- static void name##_IO_APIC_irq (unsigned int irq) \
- __DO_ACTION(R, ACTION, FINAL)
+#ifdef CONFIG_X86_64
+void io_apic_sync(struct irq_pin_list *entry)
+{
+ /*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+ struct io_apic __iomem *io_apic;
+ io_apic = io_apic_base(entry->apic);
+ readl(&io_apic->data);
+}
-/* mask = 1 */
-DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+static void __mask_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+}
+#else /* CONFIG_X86_32 */
+static void __mask_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+}
-/* mask = 0 */
-DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
+}
+#endif /* CONFIG_X86_32 */
static void mask_IO_APIC_irq (unsigned int irq)
{
@@ -450,6 +556,68 @@
clear_IO_APIC_pin(apic, pin);
}
+#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
+void send_IPI_self(int vector)
+{
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write(APIC_ICR, cfg);
+}
+#endif /* !CONFIG_SMP && CONFIG_X86_32*/
+
+#ifdef CONFIG_X86_32
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ pirqs_enabled = 1;
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_INTR_REMAP
+/* I/O APIC RTE contents at the OS boot up */
+static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
+
/*
* Saves and masks all the unmasked IO-APIC RTE's
*/
@@ -474,7 +642,7 @@
kzalloc(sizeof(struct IO_APIC_route_entry) *
nr_ioapic_registers[apic], GFP_KERNEL);
if (!early_ioapic_entries[apic])
- return -ENOMEM;
+ goto nomem;
}
for (apic = 0; apic < nr_ioapics; apic++)
@@ -488,17 +656,31 @@
ioapic_write_entry(apic, pin, entry);
}
}
+
return 0;
+
+nomem:
+ while (apic >= 0)
+ kfree(early_ioapic_entries[apic--]);
+ memset(early_ioapic_entries, 0,
+ ARRAY_SIZE(early_ioapic_entries));
+
+ return -ENOMEM;
}
void restore_IO_APIC_setup(void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++)
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!early_ioapic_entries[apic])
+ break;
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
ioapic_write_entry(apic, pin,
early_ioapic_entries[apic][pin]);
+ kfree(early_ioapic_entries[apic]);
+ early_ioapic_entries[apic] = NULL;
+ }
}
void reinit_intr_remapped_IO_APIC(int intr_remapping)
@@ -512,25 +694,7 @@
*/
restore_IO_APIC_setup();
}
-
-int skip_ioapic_setup;
-int ioapic_force;
-
-static int __init parse_noapic(char *str)
-{
- disable_ioapic_setup();
- return 0;
-}
-early_param("noapic", parse_noapic);
-
-/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
-static int __init disable_timer_pin_setup(char *arg)
-{
- disable_timer_pin_1 = 1;
- return 1;
-}
-__setup("disable_timer_pin_1", disable_timer_pin_setup);
-
+#endif
/*
* Find the IRQ entry number of a certain pin.
@@ -634,22 +798,54 @@
best_guess = irq;
}
}
- BUG_ON(best_guess >= NR_IRQS);
return best_guess;
}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < 16) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
+
+#endif
+
/* ISA interrupts are always polarity zero edge triggered,
* when listed as conforming in the MP table. */
#define default_ISA_trigger(idx) (0)
#define default_ISA_polarity(idx) (0)
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
+
/* PCI interrupts are always polarity one level triggered,
* when listed as conforming in the MP table. */
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx) (1)
+#define default_MCA_polarity(idx) default_ISA_polarity(idx)
+
static int MPBIOS_polarity(int idx)
{
int bus = mp_irqs[idx].mp_srcbus;
@@ -707,6 +903,36 @@
trigger = default_ISA_trigger(idx);
else
trigger = default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ trigger = default_EISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ trigger = default_MCA_trigger(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ }
+#endif
break;
case 1: /* edge */
{
@@ -744,6 +970,7 @@
return MPBIOS_trigger(idx);
}
+int (*ioapic_renumber_irq)(int ioapic, int irq);
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
@@ -765,8 +992,32 @@
while (i < apic)
irq += nr_ioapic_registers[i++];
irq += pin;
+ /*
+ * For MPS mode, so far only needed by ES7000 platform
+ */
+ if (ioapic_renumber_irq)
+ irq = ioapic_renumber_irq(apic, irq);
}
- BUG_ON(irq >= NR_IRQS);
+
+#ifdef CONFIG_X86_32
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ }
+ }
+ }
+#endif
+
return irq;
}
@@ -801,8 +1052,7 @@
int cpu;
struct irq_cfg *cfg;
- BUG_ON((unsigned)irq >= NR_IRQS);
- cfg = &irq_cfg[irq];
+ cfg = irq_cfg(irq);
/* Only try and allocate irqs on cpus that are present */
cpus_and(mask, mask, cpu_online_map);
@@ -837,8 +1087,13 @@
}
if (unlikely(current_vector == vector))
continue;
+#ifdef CONFIG_X86_64
if (vector == IA32_SYSCALL_VECTOR)
goto next;
+#else
+ if (vector == SYSCALL_VECTOR)
+ goto next;
+#endif
for_each_cpu_mask_nr(new_cpu, new_mask)
if (per_cpu(vector_irq, new_cpu)[vector] != -1)
goto next;
@@ -875,8 +1130,7 @@
cpumask_t mask;
int cpu, vector;
- BUG_ON((unsigned)irq >= NR_IRQS);
- cfg = &irq_cfg[irq];
+ cfg = irq_cfg(irq);
BUG_ON(!cfg->vector);
vector = cfg->vector;
@@ -893,12 +1147,13 @@
/* Initialize vector_irq on a new cpu */
/* This function must be called with vector_lock held */
int irq, vector;
+ struct irq_cfg *cfg;
/* Mark the inuse vectors */
- for (irq = 0; irq < NR_IRQS; ++irq) {
- if (!cpu_isset(cpu, irq_cfg[irq].domain))
+ for_each_irq_cfg(irq, cfg) {
+ if (!cpu_isset(cpu, cfg->domain))
continue;
- vector = irq_cfg[irq].vector;
+ vector = cfg->vector;
per_cpu(vector_irq, cpu)[vector] = irq;
}
/* Mark the free vectors */
@@ -906,7 +1161,9 @@
irq = per_cpu(vector_irq, cpu)[vector];
if (irq < 0)
continue;
- if (!cpu_isset(cpu, irq_cfg[irq].domain))
+
+ cfg = irq_cfg(irq);
+ if (!cpu_isset(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
}
@@ -916,16 +1173,49 @@
static struct irq_chip ir_ioapic_chip;
#endif
+#define IOAPIC_AUTO -1
+#define IOAPIC_EDGE 0
+#define IOAPIC_LEVEL 1
+
+#ifdef CONFIG_X86_32
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ int apic, idx, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
+ return irq_trigger(idx);
+ }
+ }
+ /*
+ * nonexistent IRQs are edge default
+ */
+ return 0;
+}
+#else
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ return 1;
+}
+#endif
+
static void ioapic_register_intr(int irq, unsigned long trigger)
{
- if (trigger)
- irq_desc[irq].status |= IRQ_LEVEL;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
+ desc->status |= IRQ_LEVEL;
else
- irq_desc[irq].status &= ~IRQ_LEVEL;
+ desc->status &= ~IRQ_LEVEL;
#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
- irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
+ desc->status |= IRQ_MOVE_PCNTXT;
if (trigger)
set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
handle_fasteoi_irq,
@@ -936,7 +1226,8 @@
return;
}
#endif
- if (trigger)
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_fasteoi_irq,
"fasteoi");
@@ -1009,13 +1300,15 @@
static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
int trigger, int polarity)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
cpumask_t mask;
if (!IO_APIC_IRQ(irq))
return;
+ cfg = irq_cfg(irq);
+
mask = TARGET_CPUS;
if (assign_irq_vector(irq, mask))
return;
@@ -1047,37 +1340,49 @@
static void __init setup_IO_APIC_irqs(void)
{
- int apic, pin, idx, irq, first_notcon = 1;
+ int apic, pin, idx, irq;
+ int notcon = 0;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic,pin,mp_INT);
- if (idx == -1) {
- if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
- first_notcon = 0;
- } else
- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
- continue;
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if (idx == -1) {
+ if (!notcon) {
+ notcon = 1;
+ apic_printk(APIC_VERBOSE,
+ KERN_DEBUG " %d-%d",
+ mp_ioapics[apic].mp_apicid,
+ pin);
+ } else
+ apic_printk(APIC_VERBOSE, " %d-%d",
+ mp_ioapics[apic].mp_apicid,
+ pin);
+ continue;
+ }
+ if (notcon) {
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
+ notcon = 0;
+ }
+
+ irq = pin_2_irq(idx, apic, pin);
+#ifdef CONFIG_X86_32
+ if (multi_timer_check(apic, irq))
+ continue;
+#endif
+ add_pin_to_irq(irq, apic, pin);
+
+ setup_IO_APIC_irq(apic, pin, irq,
+ irq_trigger(idx), irq_polarity(idx));
}
- if (!first_notcon) {
- apic_printk(APIC_VERBOSE, " not connected.\n");
- first_notcon = 1;
- }
-
- irq = pin_2_irq(idx, apic, pin);
- add_pin_to_irq(irq, apic, pin);
-
- setup_IO_APIC_irq(apic, pin, irq,
- irq_trigger(idx), irq_polarity(idx));
- }
}
- if (!first_notcon)
- apic_printk(APIC_VERBOSE, " not connected.\n");
+ if (notcon)
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
}
/*
@@ -1088,8 +1393,10 @@
{
struct IO_APIC_route_entry entry;
+#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
return;
+#endif
memset(&entry, 0, sizeof(entry));
@@ -1124,7 +1431,10 @@
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
+ union IO_APIC_reg_03 reg_03;
unsigned long flags;
+ struct irq_cfg *cfg;
+ unsigned int irq;
if (apic_verbosity == APIC_QUIET)
return;
@@ -1147,12 +1457,16 @@
reg_01.raw = io_apic_read(apic, 1);
if (reg_01.bits.version >= 0x10)
reg_02.raw = io_apic_read(apic, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
@@ -1160,11 +1474,27 @@
printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
- if (reg_01.bits.version >= 0x10) {
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+ * but the value of reg_02 is read as the previous read register
+ * value, so ignore it if reg_02 == reg_01.
+ */
+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+ * or reg_03, but the value of reg_0[23] is read as the previous read
+ * register value, so ignore it if reg_03 == reg_0[12].
+ */
+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+ reg_03.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ }
+
printk(KERN_DEBUG ".... IRQ redirection table:\n");
printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
@@ -1193,16 +1523,16 @@
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for (i = 0; i < NR_IRQS; i++) {
- struct irq_pin_list *entry = irq_2_pin + i;
- if (entry->pin < 0)
+ for_each_irq_cfg(irq, cfg) {
+ struct irq_pin_list *entry = cfg->irq_2_pin;
+ if (!entry)
continue;
- printk(KERN_DEBUG "IRQ%d ", i);
+ printk(KERN_DEBUG "IRQ%d ", irq);
for (;;) {
printk("-> %d:%d", entry->apic, entry->pin);
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
printk("\n");
}
@@ -1236,7 +1566,7 @@
__apicdebuginit(void) print_local_APIC(void *dummy)
{
unsigned int v, ver, maxlvt;
- unsigned long icr;
+ u64 icr;
if (apic_verbosity == APIC_QUIET)
return;
@@ -1253,20 +1583,31 @@
v = apic_read(APIC_TASKPRI);
printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ }
- v = apic_read(APIC_EOI);
- printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ }
+
v = apic_read(APIC_LDR);
printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ }
v = apic_read(APIC_SPIV);
printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
@@ -1277,8 +1618,13 @@
printk(KERN_DEBUG "... APIC IRR field:\n");
print_APIC_bitfield(APIC_IRR);
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ }
icr = apic_icr_read();
printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
@@ -1312,7 +1658,12 @@
__apicdebuginit(void) print_all_local_APICs(void)
{
- on_each_cpu(print_local_APIC, NULL, 1);
+ int cpu;
+
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ preempt_enable();
}
__apicdebuginit(void) print_PIC(void)
@@ -1359,17 +1710,22 @@
fs_initcall(print_all_ICs);
+/* Where if anywhere is the i8259 connect in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
void __init enable_IO_APIC(void)
{
union IO_APIC_reg_01 reg_01;
int i8259_apic, i8259_pin;
- int i, apic;
+ int apic;
unsigned long flags;
- for (i = 0; i < PIN_MAP_SIZE; i++) {
- irq_2_pin[i].pin = -1;
- irq_2_pin[i].next = 0;
- }
+#ifdef CONFIG_X86_32
+ int i;
+ if (!pirqs_enabled)
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+#endif
/*
* The number of IO-APIC IRQ registers (== #pins):
@@ -1399,6 +1755,10 @@
}
found_i8259:
/* Look to see what if the MP table has reported the ExtINT */
+ /* If we could not find the appropriate pin by looking at the ioapic
+ * the i8259 probably is not connected the ioapic but give the
+ * mptable a chance anyway.
+ */
i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
@@ -1458,6 +1818,133 @@
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
+#ifdef CONFIG_X86_32
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+
+static void __init setup_ioapic_ids_from_mpc(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int apic;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
+ return;
+
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ /* Read the register 0 value */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mp_ioapics[apic].mp_apicid;
+
+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ apic, mp_ioapics[apic].mp_apicid);
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+ }
+
+ /*
+ * Sanity check, is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic].mp_apicid)) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ apic, mp_ioapics[apic].mp_apicid);
+ for (i = 0; i < get_physical_broadcast(); i++)
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ mp_ioapics[apic].mp_apicid = i;
+ } else {
+ physid_mask_t tmp;
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mp_ioapics[apic].mp_apicid);
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mp_ioapics[apic].mp_apicid)
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mp_dstapic == old_id)
+ mp_irqs[i].mp_dstapic
+ = mp_ioapics[apic].mp_apicid;
+
+ /*
+ * Read the right value from the MPC table and
+ * write it into the ID register.
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mp_ioapics[apic].mp_apicid);
+
+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0, reg_00.raw);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+ printk("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -1471,6 +1958,9 @@
unsigned long t1 = jiffies;
unsigned long flags;
+ if (no_timer_check)
+ return 1;
+
local_save_flags(flags);
local_irq_enable();
/* Let ten ticks pass... */
@@ -1531,9 +2021,11 @@
return was_pending;
}
+#ifdef CONFIG_X86_64
static int ioapic_retrigger_irq(unsigned int irq)
{
- struct irq_cfg *cfg = &irq_cfg[irq];
+
+ struct irq_cfg *cfg = irq_cfg(irq);
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
@@ -1542,6 +2034,14 @@
return 1;
}
+#else
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+ send_IPI_self(irq_cfg(irq)->vector);
+
+ return 1;
+}
+#endif
/*
* Level and edge triggered IO-APIC interrupts need different handling,
@@ -1580,11 +2080,11 @@
*/
static void migrate_ioapic_irq(int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
cpumask_t tmp, cleanup_mask;
struct irte irte;
- int modify_ioapic_rte = desc->status & IRQ_LEVEL;
+ int modify_ioapic_rte;
unsigned int dest;
unsigned long flags;
@@ -1598,9 +2098,12 @@
if (assign_irq_vector(irq, mask))
return;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
+ desc = irq_to_desc(irq);
+ modify_ioapic_rte = desc->status & IRQ_LEVEL;
if (modify_ioapic_rte) {
spin_lock_irqsave(&ioapic_lock, flags);
__target_IO_APIC_irq(irq, dest, cfg->vector);
@@ -1622,18 +2125,19 @@
cfg->move_in_progress = 0;
}
- irq_desc[irq].affinity = mask;
+ desc->affinity = mask;
}
static int migrate_irq_remapped_level(int irq)
{
int ret = -1;
+ struct irq_desc *desc = irq_to_desc(irq);
mask_IO_APIC_irq(irq);
if (io_apic_level_ack_pending(irq)) {
/*
- * Interrupt in progress. Migrating irq now will change the
+ * Interrupt in progress. Migrating irq now will change the
* vector information in the IO-APIC RTE and that will confuse
* the EOI broadcast performed by cpu.
* So, delay the irq migration to the next instance.
@@ -1643,11 +2147,11 @@
}
/* everthing is clear. we have right of way */
- migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
+ migrate_ioapic_irq(irq, desc->pending_mask);
ret = 0;
- irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
- cpus_clear(irq_desc[irq].pending_mask);
+ desc->status &= ~IRQ_MOVE_PENDING;
+ cpus_clear(desc->pending_mask);
unmask:
unmask_IO_APIC_irq(irq);
@@ -1656,10 +2160,10 @@
static void ir_irq_migration(struct work_struct *work)
{
- int irq;
+ unsigned int irq;
+ struct irq_desc *desc;
- for (irq = 0; irq < NR_IRQS; irq++) {
- struct irq_desc *desc = irq_desc + irq;
+ for_each_irq_desc(irq, desc) {
if (desc->status & IRQ_MOVE_PENDING) {
unsigned long flags;
@@ -1671,8 +2175,7 @@
continue;
}
- desc->chip->set_affinity(irq,
- irq_desc[irq].pending_mask);
+ desc->chip->set_affinity(irq, desc->pending_mask);
spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -1683,9 +2186,11 @@
*/
static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
{
- if (irq_desc[irq].status & IRQ_LEVEL) {
- irq_desc[irq].status |= IRQ_MOVE_PENDING;
- irq_desc[irq].pending_mask = mask;
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc->status & IRQ_LEVEL) {
+ desc->status |= IRQ_MOVE_PENDING;
+ desc->pending_mask = mask;
migrate_irq_remapped_level(irq);
return;
}
@@ -1698,7 +2203,9 @@
{
unsigned vector, me;
ack_APIC_irq();
+#ifdef CONFIG_X86_64
exit_idle();
+#endif
irq_enter();
me = smp_processor_id();
@@ -1707,11 +2214,12 @@
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
- if (irq >= NR_IRQS)
+
+ desc = irq_to_desc(irq);
+ if (!desc)
continue;
- desc = irq_desc + irq;
- cfg = irq_cfg + irq;
+ cfg = irq_cfg(irq);
spin_lock(&desc->lock);
if (!cfg->move_cleanup_count)
goto unlock;
@@ -1730,7 +2238,7 @@
static void irq_complete_move(unsigned int irq)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg = irq_cfg(irq);
unsigned vector, me;
if (likely(!cfg->move_in_progress))
@@ -1769,19 +2277,50 @@
ack_APIC_irq();
}
+atomic_t irq_mis_count;
+
static void ack_apic_level(unsigned int irq)
{
+#ifdef CONFIG_X86_32
+ unsigned long v;
+ int i;
+#endif
int do_unmask_irq = 0;
irq_complete_move(irq);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
- if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+ if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
mask_IO_APIC_irq(irq);
}
#endif
+#ifdef CONFIG_X86_32
+ /*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ */
+ i = irq_cfg(irq)->vector;
+
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+#endif
+
/*
* We must acknowledge the irq before we move it or the acknowledge will
* not propagate properly.
@@ -1820,31 +2359,41 @@
move_masked_irq(irq);
unmask_IO_APIC_irq(irq);
}
+
+#ifdef CONFIG_X86_32
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+ spin_lock(&ioapic_lock);
+ __mask_and_edge_IO_APIC_irq(irq);
+ __unmask_and_level_IO_APIC_irq(irq);
+ spin_unlock(&ioapic_lock);
+ }
+#endif
}
static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_apic_edge,
- .eoi = ack_apic_level,
+ .name = "IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_apic_edge,
+ .eoi = ack_apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
+ .set_affinity = set_ioapic_affinity_irq,
#endif
.retrigger = ioapic_retrigger_irq,
};
#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip __read_mostly = {
- .name = "IR-IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_x2apic_edge,
- .eoi = ack_x2apic_level,
+ .name = "IR-IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_x2apic_edge,
+ .eoi = ack_x2apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ir_ioapic_affinity_irq,
+ .set_affinity = set_ir_ioapic_affinity_irq,
#endif
.retrigger = ioapic_retrigger_irq,
};
@@ -1853,6 +2402,8 @@
static inline void init_IO_APIC_traps(void)
{
int irq;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
/*
* NOTE! The local APIC isn't very good at handling
@@ -1865,8 +2416,8 @@
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for (irq = 0; irq < NR_IRQS ; irq++) {
- if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
+ for_each_irq_cfg(irq, cfg) {
+ if (IO_APIC_IRQ(irq) && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
@@ -1874,13 +2425,27 @@
*/
if (irq < 16)
make_8259A_irq(irq);
- else
+ else {
+ desc = irq_to_desc(irq);
/* Strange. Oh, well.. */
- irq_desc[irq].chip = &no_irq_chip;
+ desc->chip = &no_irq_chip;
+ }
}
}
}
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void mask_lapic_irq(unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -1889,14 +2454,6 @@
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void mask_lapic_irq(unsigned int irq)
-{
- unsigned long v;
-
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
-}
-
static void ack_lapic_irq (unsigned int irq)
{
ack_APIC_irq();
@@ -1911,7 +2468,10 @@
static void lapic_register_intr(int irq)
{
- irq_desc[irq].status &= ~IRQ_LEVEL;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+ desc->status &= ~IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
}
@@ -1919,19 +2479,19 @@
static void __init setup_nmi(void)
{
/*
- * Dirty trick to enable the NMI watchdog ...
+ * Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
*
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
- */
- printk(KERN_INFO "activating NMI Watchdog ...");
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
- printk(" done.\n");
+ apic_printk(APIC_VERBOSE, " done.\n");
}
/*
@@ -1948,12 +2508,17 @@
unsigned char save_control, save_freq_select;
pin = find_isa_irq_pin(8, mp_INT);
- apic = find_isa_irq_apic(8, mp_INT);
- if (pin == -1)
+ if (pin == -1) {
+ WARN_ON_ONCE(1);
return;
+ }
+ apic = find_isa_irq_apic(8, mp_INT);
+ if (apic == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
entry0 = ioapic_read_entry(apic, pin);
-
clear_IO_APIC_pin(apic, pin);
memset(&entry1, 0, sizeof(entry1));
@@ -1988,23 +2553,38 @@
ioapic_write_entry(apic, pin, entry0);
}
+static int disable_timer_pin_1 __initdata;
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", disable_timer_pin_setup);
+
+int timer_through_8259 __initdata;
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
*
- * FIXME: really need to revamp this for modern platforms only.
+ * FIXME: really need to revamp this for all platforms.
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_cfg + 0;
+ struct irq_cfg *cfg = irq_cfg(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
+ unsigned int ver;
int no_pin1 = 0;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+
/*
* get/set the timer IRQ vector:
*/
@@ -2013,10 +2593,18 @@
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
- * wire has to be disabled in the local APIC.
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
+#ifdef CONFIG_X86_32
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+#endif
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2035,8 +2623,10 @@
* 8259A.
*/
if (pin1 == -1) {
+#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
+#endif
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2054,7 +2644,7 @@
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
}
unmask_IO_APIC_irq(0);
- if (!no_timer_check && timer_irq_works()) {
+ if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
enable_8259A_irq(0);
@@ -2063,8 +2653,10 @@
clear_IO_APIC_pin(0, pin1);
goto out;
}
+#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
+#endif
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2104,6 +2696,9 @@
"through the IO-APIC - disabling NMI Watchdog!\n");
nmi_watchdog = NMI_NONE;
}
+#ifdef CONFIG_X86_32
+ timer_ack = 0;
+#endif
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
@@ -2140,13 +2735,6 @@
local_irq_restore(flags);
}
-static int __init notimercheck(char *s)
-{
- no_timer_check = 1;
- return 1;
-}
-__setup("no_timer_check", notimercheck);
-
/*
* Traditionally ISA IRQ2 is the cascade IRQ, and is not available
* to devices. However there may be an I/O APIC pin available for
@@ -2164,25 +2752,49 @@
* the I/O APIC in all cases now. No actual device should request
* it anyway. --macro
*/
-#define PIC_IRQS (1<<2)
+#define PIC_IRQS (1 << PIC_CASCADE_IR)
void __init setup_IO_APIC(void)
{
+#ifdef CONFIG_X86_32
+ enable_IO_APIC();
+#else
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
+#endif
io_apic_irqs = ~PIC_IRQS;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
-
+ /*
+ * Set up IO-APIC IRQ routing.
+ */
+#ifdef CONFIG_X86_32
+ if (!acpi_ioapic)
+ setup_ioapic_ids_from_mpc();
+#endif
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
check_timer();
}
+/*
+ * Called after all the initialization is done. If we didn't find any
+ * APIC bugs then we can allow the modify fast path
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+ if (sis_apic_bug == -1)
+ sis_apic_bug = 0;
+ return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
@@ -2270,32 +2882,51 @@
/*
* Dynamic irq allocate and deallocation
*/
-int create_irq(void)
+unsigned int create_irq_nr(unsigned int irq_want)
{
/* Allocate an unused irq */
- int irq;
- int new;
+ unsigned int irq;
+ unsigned int new;
unsigned long flags;
+ struct irq_cfg *cfg_new;
- irq = -ENOSPC;
+ irq_want = nr_irqs - 1;
+
+ irq = 0;
spin_lock_irqsave(&vector_lock, flags);
- for (new = (NR_IRQS - 1); new >= 0; new--) {
+ for (new = irq_want; new > 0; new--) {
if (platform_legacy_irq(new))
continue;
- if (irq_cfg[new].vector != 0)
+ cfg_new = irq_cfg(new);
+ if (cfg_new && cfg_new->vector != 0)
continue;
+ /* check if need to create one */
+ if (!cfg_new)
+ cfg_new = irq_cfg_alloc(new);
if (__assign_irq_vector(new, TARGET_CPUS) == 0)
irq = new;
break;
}
spin_unlock_irqrestore(&vector_lock, flags);
- if (irq >= 0) {
+ if (irq > 0) {
dynamic_irq_init(irq);
}
return irq;
}
+int create_irq(void)
+{
+ int irq;
+
+ irq = create_irq_nr(nr_irqs - 1);
+
+ if (irq == 0)
+ irq = -1;
+
+ return irq;
+}
+
void destroy_irq(unsigned int irq)
{
unsigned long flags;
@@ -2316,7 +2947,7 @@
#ifdef CONFIG_PCI_MSI
static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
int err;
unsigned dest;
cpumask_t tmp;
@@ -2326,6 +2957,7 @@
if (err)
return err;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, tmp);
dest = cpu_mask_to_apicid(tmp);
@@ -2383,10 +3015,11 @@
#ifdef CONFIG_SMP
static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
@@ -2395,6 +3028,7 @@
if (assign_irq_vector(irq, mask))
return;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
@@ -2406,7 +3040,8 @@
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg(irq, &msg);
- irq_desc[irq].affinity = mask;
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
}
#ifdef CONFIG_INTR_REMAP
@@ -2416,10 +3051,11 @@
*/
static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
unsigned int dest;
cpumask_t tmp, cleanup_mask;
struct irte irte;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
@@ -2431,6 +3067,7 @@
if (assign_irq_vector(irq, mask))
return;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
@@ -2454,7 +3091,8 @@
cfg->move_in_progress = 0;
}
- irq_desc[irq].affinity = mask;
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
}
#endif
#endif /* CONFIG_SMP */
@@ -2507,7 +3145,7 @@
if (index < 0) {
printk(KERN_ERR
"Unable to allocate %d IRTE for PCI %s\n", nvec,
- pci_name(dev));
+ pci_name(dev));
return -ENOSPC;
}
return index;
@@ -2528,7 +3166,7 @@
#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
/*
* irq migration in process context
*/
@@ -2538,16 +3176,34 @@
#endif
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+
return 0;
}
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+ unsigned int irq;
+
+ irq = dev->bus->number;
+ irq <<= 8;
+ irq |= dev->devfn;
+ irq <<= 12;
+
+ return irq;
+}
+
int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
{
- int irq, ret;
+ unsigned int irq;
+ int ret;
+ unsigned int irq_want;
- irq = create_irq();
- if (irq < 0)
- return irq;
+ irq_want = build_irq_for_pci_dev(dev) + 0x100;
+
+ irq = create_irq_nr(irq_want);
+ if (irq == 0)
+ return -1;
#ifdef CONFIG_INTR_REMAP
if (!intr_remapping_enabled)
@@ -2574,18 +3230,22 @@
int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- int irq, ret, sub_handle;
+ unsigned int irq;
+ int ret, sub_handle;
struct msi_desc *desc;
+ unsigned int irq_want;
+
#ifdef CONFIG_INTR_REMAP
struct intel_iommu *iommu = 0;
int index = 0;
#endif
+ irq_want = build_irq_for_pci_dev(dev) + 0x100;
sub_handle = 0;
list_for_each_entry(desc, &dev->msi_list, list) {
- irq = create_irq();
- if (irq < 0)
- return irq;
+ irq = create_irq_nr(irq_want--);
+ if (irq == 0)
+ return -1;
#ifdef CONFIG_INTR_REMAP
if (!intr_remapping_enabled)
goto no_ir;
@@ -2636,10 +3296,11 @@
#ifdef CONFIG_SMP
static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
@@ -2648,6 +3309,7 @@
if (assign_irq_vector(irq, mask))
return;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
@@ -2659,7 +3321,8 @@
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
- irq_desc[irq].affinity = mask;
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
}
#endif /* CONFIG_SMP */
@@ -2689,6 +3352,69 @@
}
#endif
+#ifdef CONFIG_HPET_TIMER
+
+#ifdef CONFIG_SMP
+static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ struct msi_msg msg;
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ if (assign_irq_vector(irq, mask))
+ return;
+
+ cfg = irq_cfg(irq);
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ hpet_msi_read(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ hpet_msi_write(irq, &msg);
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+struct irq_chip hpet_msi_type = {
+ .name = "HPET_MSI",
+ .unmask = hpet_msi_unmask,
+ .mask = hpet_msi_mask,
+ .ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .set_affinity = hpet_msi_set_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_hpet_msi(unsigned int irq)
+{
+ int ret;
+ struct msi_msg msg;
+
+ ret = msi_compose_msg(NULL, irq, &msg);
+ if (ret < 0)
+ return ret;
+
+ hpet_msi_write(irq, &msg);
+ set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
+ "edge");
+
+ return 0;
+}
+#endif
+
#endif /* CONFIG_PCI_MSI */
/*
* Hypertransport interrupt support
@@ -2713,9 +3439,10 @@
static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
unsigned int dest;
cpumask_t tmp;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
@@ -2724,11 +3451,13 @@
if (assign_irq_vector(irq, mask))
return;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
target_ht_irq(irq, dest, cfg->vector);
- irq_desc[irq].affinity = mask;
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
}
#endif
@@ -2745,7 +3474,7 @@
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
int err;
cpumask_t tmp;
@@ -2755,6 +3484,7 @@
struct ht_irq_msg msg;
unsigned dest;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, tmp);
dest = cpu_mask_to_apicid(tmp);
@@ -2777,18 +3507,78 @@
set_irq_chip_and_handler_name(irq, &ht_irq_chip,
handle_edge_irq, "edge");
+
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
}
return err;
}
#endif /* CONFIG_HT_IRQ */
-/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
- -------------------------------------------------------------------------- */
+#ifdef CONFIG_X86_64
+/*
+ * Re-target the irq to the specified CPU and enable the specified MMR located
+ * on the specified blade to allow the sending of MSIs to the specified CPU.
+ */
+int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
+ unsigned long mmr_offset)
+{
+ const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+ struct irq_cfg *cfg;
+ int mmr_pnode;
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ unsigned long flags;
+ int err;
-#ifdef CONFIG_ACPI
+ err = assign_irq_vector(irq, *eligible_cpu);
+ if (err != 0)
+ return err;
-#define IO_APIC_MAX_ID 0xFE
+ spin_lock_irqsave(&vector_lock, flags);
+ set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
+ irq_name);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ cfg = irq_cfg(irq);
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
+ entry->vector = cfg->vector;
+ entry->delivery_mode = INT_DELIVERY_MODE;
+ entry->dest_mode = INT_DEST_MODE;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+
+ return irq;
+}
+
+/*
+ * Disable the specified MMR located on the specified blade so that MSIs are
+ * no longer allowed to be sent.
+ */
+void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
+{
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ int mmr_pnode;
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
+ entry->mask = 1;
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+}
+#endif /* CONFIG_X86_64 */
int __init io_apic_get_redir_entries (int ioapic)
{
@@ -2802,6 +3592,122 @@
return reg_01.bits.entries;
}
+int __init probe_nr_irqs(void)
+{
+ int idx;
+ int nr = 0;
+#ifndef CONFIG_XEN
+ int nr_min = 32;
+#else
+ int nr_min = NR_IRQS;
+#endif
+
+ for (idx = 0; idx < nr_ioapics; idx++)
+ nr += io_apic_get_redir_entries(idx) + 1;
+
+ /* double it for hotplug and msi and nmi */
+ nr <<= 1;
+
+ /* something wrong ? */
+ if (nr < nr_min)
+ nr = nr_min;
+
+ return nr;
+}
+
+/* --------------------------------------------------------------------------
+ ACPI-based IOAPIC Configuration
+ -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+#ifdef CONFIG_X86_32
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
+{
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+ physid_mask_t tmp;
+ unsigned long flags;
+ int i = 0;
+
+ /*
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * supports up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ if (physids_empty(apic_id_map))
+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= get_physical_broadcast()) {
+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(apic_id_map, apic_id)) {
+
+ for (i = 0; i < get_physical_broadcast(); i++) {
+ if (!check_apicid_used(apic_id_map, i))
+ break;
+ }
+
+ if (i == get_physical_broadcast())
+ panic("Max apic_id exceeded!\n");
+
+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ tmp = apicid_to_cpu_present(apic_id);
+ physids_or(apic_id_map, apic_id_map, tmp);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+ return -1;
+ }
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+ return apic_id;
+}
+
+int __init io_apic_get_version(int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.version;
+}
+#endif
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
@@ -2853,6 +3759,7 @@
void __init setup_ioapic_dest(void)
{
int pin, ioapic, irq, irq_entry;
+ struct irq_cfg *cfg;
if (skip_ioapic_setup == 1)
return;
@@ -2868,7 +3775,8 @@
* when you have too many devices, because at that time only boot
* cpu is online.
*/
- if (!irq_cfg[irq].vector)
+ cfg = irq_cfg(irq);
+ if (!cfg->vector)
setup_IO_APIC_irq(ioapic, pin, irq,
irq_trigger(irq_entry),
irq_polarity(irq_entry));
@@ -2926,18 +3834,33 @@
struct resource *ioapic_res;
int i;
+ irq_2_pin_init();
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
ioapic_phys = mp_ioapics[i].mp_apicaddr;
+#ifdef CONFIG_X86_32
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+#endif
} else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
ioapic_phys = (unsigned long)
alloc_bootmem_pages(PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
set_fixmap_nocache(idx, ioapic_phys);
apic_printk(APIC_VERBOSE,
- "mapped IOAPIC to %016lx (%016lx)\n",
+ "mapped IOAPIC to %08lx (%08lx)\n",
__fix_to_virt(idx), ioapic_phys);
idx++;
@@ -2971,4 +3894,3 @@
/* Insert the IO APIC resources after PCI initialization has occured to handle
* IO APICS that are mapped in on a BAR in PCI space. */
late_initcall(ioapic_insert_resources);
-
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
deleted file mode 100644
index e710289..0000000
--- a/arch/x86/kernel/io_apic_32.c
+++ /dev/null
@@ -1,2908 +0,0 @@
-/*
- * Intel IO-APIC support for multi-Pentium hosts.
- *
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
- *
- * Many thanks to Stig Venaas for trying out countless experimental
- * patches and reporting/debugging problems patiently!
- *
- * (c) 1999, Multiple IO-APIC support, developed by
- * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
- * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
- * further tested and cleaned up by Zach Brown <zab@redhat.com>
- * and Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
- * thanks to Eric Gilmore
- * and Rolf G. Tews
- * for testing these extensively
- * Paul Diefenbaugh : Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/bootmem.h>
-#include <linux/mc146818rtc.h>
-#include <linux/compiler.h>
-#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/pci.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/jiffies.h> /* time_after() */
-
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/desc.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/nmi.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
-#include <asm/setup.h>
-
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-
-#define __apicdebuginit(type) static type __init
-
-int (*ioapic_renumber_irq)(int ioapic, int irq);
-atomic_t irq_mis_count;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
-
-static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
-
-int timer_through_8259 __initdata;
-
-/*
- * Is the SiS APIC rmw bug present ?
- * -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
-/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
-
-/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* # of MP IRQ source entries */
-int mp_irq_entries;
-
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-
-static int disable_timer_pin_1 __initdata;
-
-/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-
-static struct irq_pin_list {
- int apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
-
-struct io_apic {
- unsigned int index;
- unsigned int unused[3];
- unsigned int data;
-};
-
-static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
-{
- return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
-}
-
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- writel(reg, &io_apic->index);
- return readl(&io_apic->data);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
-}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
- volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
- if (sis_apic_bug)
- writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
-}
-
-union entry_union {
- struct { u32 w1, w2; };
- struct IO_APIC_route_entry entry;
-};
-
-static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
-{
- union entry_union eu;
- unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- return eu.entry;
-}
-
-/*
- * When we write a new IO APIC routing entry, we need to write the high
- * word first! If the mask bit in the low word is clear, we will enable
- * the interrupt, and we need to make sure the entry is fully populated
- * before that happens.
- */
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
- union entry_union eu;
- eu.entry = e;
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-}
-
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
- unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
- __ioapic_write_entry(apic, pin, e);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * When we mask an IO APIC routing entry, we need to write the low
- * word first, in order to set the mask bit before we change the
- * high bits!
- */
-static void ioapic_mask_entry(int apic, int pin)
-{
- unsigned long flags;
- union entry_union eu = { .entry.mask = 1 };
-
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
- * shared ISA-space IRQs, so we have to support them. We are super
- * fast in the common case, and fast for shared ISA-space IRQs.
- */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
-{
- static int first_free_entry = NR_IRQS;
- struct irq_pin_list *entry = irq_2_pin + irq;
-
- while (entry->next)
- entry = irq_2_pin + entry->next;
-
- if (entry->pin != -1) {
- entry->next = first_free_entry;
- entry = irq_2_pin + entry->next;
- if (++first_free_entry >= PIN_MAP_SIZE)
- panic("io_apic.c: whoops");
- }
- entry->apic = apic;
- entry->pin = pin;
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq(unsigned int irq,
- int oldapic, int oldpin,
- int newapic, int newpin)
-{
- struct irq_pin_list *entry = irq_2_pin + irq;
-
- while (1) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- }
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
-}
-
-static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
-{
- struct irq_pin_list *entry = irq_2_pin + irq;
- unsigned int pin, reg;
-
- for (;;) {
- pin = entry->pin;
- if (pin == -1)
- break;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- reg &= ~disable;
- reg |= enable;
- io_apic_modify(entry->apic, 0x10 + pin*2, reg);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
-}
-
-/* mask = 1 */
-static void __mask_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
-}
-
-/* mask = 0 */
-static void __unmask_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
-}
-
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
- IO_APIC_REDIR_LEVEL_TRIGGER);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
- IO_APIC_REDIR_MASKED);
-}
-
-static void mask_IO_APIC_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void unmask_IO_APIC_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
-{
- struct IO_APIC_route_entry entry;
-
- /* Check delivery_mode to be sure we're not clearing an SMI pin */
- entry = ioapic_read_entry(apic, pin);
- if (entry.delivery_mode == dest_SMI)
- return;
-
- /*
- * Disable it in the IO-APIC irq-routing table:
- */
- ioapic_mask_entry(apic, pin);
-}
-
-static void clear_IO_APIC(void)
-{
- int apic, pin;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- clear_IO_APIC_pin(apic, pin);
-}
-
-#ifdef CONFIG_SMP
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
-{
- unsigned long flags;
- int pin;
- struct irq_pin_list *entry = irq_2_pin + irq;
- unsigned int apicid_value;
- cpumask_t tmp;
-
- cpus_and(tmp, cpumask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(cpumask, tmp, CPU_MASK_ALL);
-
- apicid_value = cpu_mask_to_apicid(cpumask);
- /* Prepare to do the io_apic_write */
- apicid_value = apicid_value << 24;
- spin_lock_irqsave(&ioapic_lock, flags);
- for (;;) {
- pin = entry->pin;
- if (pin == -1)
- break;
- io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
- irq_desc[irq].affinity = cpumask;
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#if defined(CONFIG_IRQBALANCE)
-# include <asm/processor.h> /* kernel_thread() */
-# include <linux/kernel_stat.h> /* kstat */
-# include <linux/slab.h> /* kmalloc() */
-# include <linux/timer.h>
-
-#define IRQBALANCE_CHECK_ARCH -999
-#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
-#define BALANCED_IRQ_MORE_DELTA (HZ/10)
-#define BALANCED_IRQ_LESS_DELTA (HZ)
-
-static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
-static int physical_balance __read_mostly;
-static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
-
-static struct irq_cpu_info {
- unsigned long *last_irq;
- unsigned long *irq_delta;
- unsigned long irq;
-} irq_cpu_data[NR_CPUS];
-
-#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
-
-#define IDLE_ENOUGH(cpu,now) \
- (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
-
-#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
-
-#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
-
-static cpumask_t balance_irq_affinity[NR_IRQS] = {
- [0 ... NR_IRQS-1] = CPU_MASK_ALL
-};
-
-void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- balance_irq_affinity[irq] = mask;
-}
-
-static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
- unsigned long now, int direction)
-{
- int search_idle = 1;
- int cpu = curr_cpu;
-
- goto inside;
-
- do {
- if (unlikely(cpu == curr_cpu))
- search_idle = 0;
-inside:
- if (direction == 1) {
- cpu++;
- if (cpu >= NR_CPUS)
- cpu = 0;
- } else {
- cpu--;
- if (cpu == -1)
- cpu = NR_CPUS-1;
- }
- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
- (search_idle && !IDLE_ENOUGH(cpu, now)));
-
- return cpu;
-}
-
-static inline void balance_irq(int cpu, int irq)
-{
- unsigned long now = jiffies;
- cpumask_t allowed_mask;
- unsigned int new_cpu;
-
- if (irqbalance_disabled)
- return;
-
- cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
- new_cpu = move(cpu, allowed_mask, now, 1);
- if (cpu != new_cpu)
- set_pending_irq(irq, cpumask_of_cpu(new_cpu));
-}
-
-static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
-{
- int i, j;
-
- for_each_online_cpu(i) {
- for (j = 0; j < NR_IRQS; j++) {
- if (!irq_desc[j].action)
- continue;
- /* Is it a significant load ? */
- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
- useful_load_threshold)
- continue;
- balance_irq(i, j);
- }
- }
- balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
- return;
-}
-
-static void do_irq_balance(void)
-{
- int i, j;
- unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
- unsigned long move_this_load = 0;
- int max_loaded = 0, min_loaded = 0;
- int load;
- unsigned long useful_load_threshold = balanced_irq_interval + 10;
- int selected_irq;
- int tmp_loaded, first_attempt = 1;
- unsigned long tmp_cpu_irq;
- unsigned long imbalance = 0;
- cpumask_t allowed_mask, target_cpu_mask, tmp;
-
- for_each_possible_cpu(i) {
- int package_index;
- CPU_IRQ(i) = 0;
- if (!cpu_online(i))
- continue;
- package_index = CPU_TO_PACKAGEINDEX(i);
- for (j = 0; j < NR_IRQS; j++) {
- unsigned long value_now, delta;
- /* Is this an active IRQ or balancing disabled ? */
- if (!irq_desc[j].action || irq_balancing_disabled(j))
- continue;
- if (package_index == i)
- IRQ_DELTA(package_index, j) = 0;
- /* Determine the total count per processor per IRQ */
- value_now = (unsigned long) kstat_cpu(i).irqs[j];
-
- /* Determine the activity per processor per IRQ */
- delta = value_now - LAST_CPU_IRQ(i, j);
-
- /* Update last_cpu_irq[][] for the next time */
- LAST_CPU_IRQ(i, j) = value_now;
-
- /* Ignore IRQs whose rate is less than the clock */
- if (delta < useful_load_threshold)
- continue;
- /* update the load for the processor or package total */
- IRQ_DELTA(package_index, j) += delta;
-
- /* Keep track of the higher numbered sibling as well */
- if (i != package_index)
- CPU_IRQ(i) += delta;
- /*
- * We have sibling A and sibling B in the package
- *
- * cpu_irq[A] = load for cpu A + load for cpu B
- * cpu_irq[B] = load for cpu B
- */
- CPU_IRQ(package_index) += delta;
- }
- }
- /* Find the least loaded processor package */
- for_each_online_cpu(i) {
- if (i != CPU_TO_PACKAGEINDEX(i))
- continue;
- if (min_cpu_irq > CPU_IRQ(i)) {
- min_cpu_irq = CPU_IRQ(i);
- min_loaded = i;
- }
- }
- max_cpu_irq = ULONG_MAX;
-
-tryanothercpu:
- /*
- * Look for heaviest loaded processor.
- * We may come back to get the next heaviest loaded processor.
- * Skip processors with trivial loads.
- */
- tmp_cpu_irq = 0;
- tmp_loaded = -1;
- for_each_online_cpu(i) {
- if (i != CPU_TO_PACKAGEINDEX(i))
- continue;
- if (max_cpu_irq <= CPU_IRQ(i))
- continue;
- if (tmp_cpu_irq < CPU_IRQ(i)) {
- tmp_cpu_irq = CPU_IRQ(i);
- tmp_loaded = i;
- }
- }
-
- if (tmp_loaded == -1) {
- /*
- * In the case of small number of heavy interrupt sources,
- * loading some of the cpus too much. We use Ingo's original
- * approach to rotate them around.
- */
- if (!first_attempt && imbalance >= useful_load_threshold) {
- rotate_irqs_among_cpus(useful_load_threshold);
- return;
- }
- goto not_worth_the_effort;
- }
-
- first_attempt = 0; /* heaviest search */
- max_cpu_irq = tmp_cpu_irq; /* load */
- max_loaded = tmp_loaded; /* processor */
- imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
- /*
- * if imbalance is less than approx 10% of max load, then
- * observe diminishing returns action. - quit
- */
- if (imbalance < (max_cpu_irq >> 3))
- goto not_worth_the_effort;
-
-tryanotherirq:
- /* if we select an IRQ to move that can't go where we want, then
- * see if there is another one to try.
- */
- move_this_load = 0;
- selected_irq = -1;
- for (j = 0; j < NR_IRQS; j++) {
- /* Is this an active IRQ? */
- if (!irq_desc[j].action)
- continue;
- if (imbalance <= IRQ_DELTA(max_loaded, j))
- continue;
- /* Try to find the IRQ that is closest to the imbalance
- * without going over.
- */
- if (move_this_load < IRQ_DELTA(max_loaded, j)) {
- move_this_load = IRQ_DELTA(max_loaded, j);
- selected_irq = j;
- }
- }
- if (selected_irq == -1)
- goto tryanothercpu;
-
- imbalance = move_this_load;
-
- /* For physical_balance case, we accumulated both load
- * values in the one of the siblings cpu_irq[],
- * to use the same code for physical and logical processors
- * as much as possible.
- *
- * NOTE: the cpu_irq[] array holds the sum of the load for
- * sibling A and sibling B in the slot for the lowest numbered
- * sibling (A), _AND_ the load for sibling B in the slot for
- * the higher numbered sibling.
- *
- * We seek the least loaded sibling by making the comparison
- * (A+B)/2 vs B
- */
- load = CPU_IRQ(min_loaded) >> 1;
- for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
- if (load > CPU_IRQ(j)) {
- /* This won't change cpu_sibling_map[min_loaded] */
- load = CPU_IRQ(j);
- min_loaded = j;
- }
- }
-
- cpus_and(allowed_mask,
- cpu_online_map,
- balance_irq_affinity[selected_irq]);
- target_cpu_mask = cpumask_of_cpu(min_loaded);
- cpus_and(tmp, target_cpu_mask, allowed_mask);
-
- if (!cpus_empty(tmp)) {
- /* mark for change destination */
- set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
-
- /* Since we made a change, come back sooner to
- * check for more variation.
- */
- balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
- return;
- }
- goto tryanotherirq;
-
-not_worth_the_effort:
- /*
- * if we did not find an IRQ to move, then adjust the time interval
- * upward
- */
- balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
- return;
-}
-
-static int balanced_irq(void *unused)
-{
- int i;
- unsigned long prev_balance_time = jiffies;
- long time_remaining = balanced_irq_interval;
-
- /* push everything to CPU 0 to give us a starting point. */
- for (i = 0 ; i < NR_IRQS ; i++) {
- irq_desc[i].pending_mask = cpumask_of_cpu(0);
- set_pending_irq(i, cpumask_of_cpu(0));
- }
-
- set_freezable();
- for ( ; ; ) {
- time_remaining = schedule_timeout_interruptible(time_remaining);
- try_to_freeze();
- if (time_after(jiffies,
- prev_balance_time+balanced_irq_interval)) {
- preempt_disable();
- do_irq_balance();
- prev_balance_time = jiffies;
- time_remaining = balanced_irq_interval;
- preempt_enable();
- }
- }
- return 0;
-}
-
-static int __init balanced_irq_init(void)
-{
- int i;
- struct cpuinfo_x86 *c;
- cpumask_t tmp;
-
- cpus_shift_right(tmp, cpu_online_map, 2);
- c = &boot_cpu_data;
- /* When not overwritten by the command line ask subarchitecture. */
- if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
- irqbalance_disabled = NO_BALANCE_IRQ;
- if (irqbalance_disabled)
- return 0;
-
- /* disable irqbalance completely if there is only one processor online */
- if (num_online_cpus() < 2) {
- irqbalance_disabled = 1;
- return 0;
- }
- /*
- * Enable physical balance only if more than 1 physical processor
- * is present
- */
- if (smp_num_siblings > 1 && !cpus_empty(tmp))
- physical_balance = 1;
-
- for_each_online_cpu(i) {
- irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
- printk(KERN_ERR "balanced_irq_init: out of memory");
- goto failed;
- }
- }
-
- printk(KERN_INFO "Starting balanced_irq\n");
- if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
- return 0;
- printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
-failed:
- for_each_possible_cpu(i) {
- kfree(irq_cpu_data[i].irq_delta);
- irq_cpu_data[i].irq_delta = NULL;
- kfree(irq_cpu_data[i].last_irq);
- irq_cpu_data[i].last_irq = NULL;
- }
- return 0;
-}
-
-int __devinit irqbalance_disable(char *str)
-{
- irqbalance_disabled = 1;
- return 1;
-}
-
-__setup("noirqbalance", irqbalance_disable);
-
-late_initcall(balanced_irq_init);
-#endif /* CONFIG_IRQBALANCE */
-#endif /* CONFIG_SMP */
-
-#ifndef CONFIG_SMP
-void send_IPI_self(int vector)
-{
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
- cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP */
-
-
-/*
- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
- * specific CPU-side IRQs.
- */
-
-#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
-int skip_ioapic_setup;
-
-static int __init ioapic_pirq_setup(char *str)
-{
- int i, max;
- int ints[MAX_PIRQS+1];
-
- get_options(str, ARRAY_SIZE(ints), ints);
-
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- pirqs_enabled = 1;
- apic_printk(APIC_VERBOSE, KERN_INFO
- "PIRQ redirection, working around broken MP-BIOS.\n");
- max = MAX_PIRQS;
- if (ints[0] < MAX_PIRQS)
- max = ints[0];
-
- for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
- pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
- }
- return 1;
-}
-
-__setup("pirq=", ioapic_pirq_setup);
-
-/*
- * Find the IRQ entry number of a certain pin.
- */
-static int find_irq_entry(int apic, int pin, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == type &&
- (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mp_dstirq == pin)
- return i;
-
- return -1;
-}
-
-/*
- * Find the pin to which IRQ[irq] (ISA) is connected
- */
-static int __init find_isa_irq_pin(int irq, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
-
- return mp_irqs[i].mp_dstirq;
- }
- return -1;
-}
-
-static int __init find_isa_irq_apic(int irq, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
- break;
- }
- if (i < mp_irq_entries) {
- int apic;
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
- return apic;
- }
- }
-
- return -1;
-}
-
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
- int apic, i, best_guess = -1;
-
- apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
- "slot:%d, pin:%d.\n", bus, slot, pin);
- if (test_bit(bus, mp_bus_not_pci)) {
- printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
- return -1;
- }
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL)
- break;
-
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mp_irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
-
- if (!(apic || IO_APIC_IRQ(irq)))
- continue;
-
- if (pin == (mp_irqs[i].mp_srcbusirq & 3))
- return irq;
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0)
- best_guess = irq;
- }
- }
- return best_guess;
-}
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
-/*
- * This function currently is only a helper for the i386 smp boot process where
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
- */
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
-{
- int pin, ioapic, irq, irq_entry;
-
- if (skip_ioapic_setup == 1)
- return;
-
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
- set_ioapic_affinity_irq(irq, TARGET_CPUS);
- }
-
- }
-}
-#endif
-
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < 16) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-#endif
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always polarity one level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (1)
-#define default_PCI_polarity(idx) (1)
-
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
-static int MPBIOS_polarity(int idx)
-{
- int bus = mp_irqs[idx].mp_srcbus;
- int polarity;
-
- /*
- * Determine IRQ line polarity (high active or low active):
- */
- switch (mp_irqs[idx].mp_irqflag & 3) {
- case 0: /* conforms, ie. bus-type dependent polarity */
- {
- polarity = test_bit(bus, mp_bus_not_pci)?
- default_ISA_polarity(idx):
- default_PCI_polarity(idx);
- break;
- }
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- }
- return polarity;
-}
-
-static int MPBIOS_trigger(int idx)
-{
- int bus = mp_irqs[idx].mp_srcbus;
- int trigger;
-
- /*
- * Determine IRQ trigger mode (edge or level sensitive):
- */
- switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
- case 0: /* conforms, ie. bus-type dependent */
- {
- trigger = test_bit(bus, mp_bus_not_pci)?
- default_ISA_trigger(idx):
- default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus]) {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
- break;
- }
- case 1: /* edge */
- {
- trigger = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- case 3: /* level */
- {
- trigger = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
- break;
- }
- }
- return trigger;
-}
-
-static inline int irq_polarity(int idx)
-{
- return MPBIOS_polarity(idx);
-}
-
-static inline int irq_trigger(int idx)
-{
- return MPBIOS_trigger(idx);
-}
-
-static int pin_2_irq(int idx, int apic, int pin)
-{
- int irq, i;
- int bus = mp_irqs[idx].mp_srcbus;
-
- /*
- * Debugging check, we are in big trouble if this message pops up!
- */
- if (mp_irqs[idx].mp_dstirq != pin)
- printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
-
- if (test_bit(bus, mp_bus_not_pci))
- irq = mp_irqs[idx].mp_srcbusirq;
- else {
- /*
- * PCI IRQs are mapped in order
- */
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
-
- /*
- * For MPS mode, so far only needed by ES7000 platform
- */
- if (ioapic_renumber_irq)
- irq = ioapic_renumber_irq(apic, irq);
- }
-
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
- if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "disabling PIRQ%d\n", pin-16);
- } else {
- irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
- }
- }
- }
- return irq;
-}
-
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
- return irq_trigger(idx);
- }
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-
-/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
-
-static int __assign_irq_vector(int irq)
-{
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
- int vector, offset;
-
- BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
-
- if (irq_vector[irq] > 0)
- return irq_vector[irq];
-
- vector = current_vector;
- offset = current_offset;
-next:
- vector += 8;
- if (vector >= first_system_vector) {
- offset = (offset + 1) % 8;
- vector = FIRST_DEVICE_VECTOR + offset;
- }
- if (vector == current_vector)
- return -ENOSPC;
- if (test_and_set_bit(vector, used_vectors))
- goto next;
-
- current_vector = vector;
- current_offset = offset;
- irq_vector[irq] = vector;
-
- return vector;
-}
-
-static int assign_irq_vector(int irq)
-{
- unsigned long flags;
- int vector;
-
- spin_lock_irqsave(&vector_lock, flags);
- vector = __assign_irq_vector(irq);
- spin_unlock_irqrestore(&vector_lock, flags);
-
- return vector;
-}
-
-static struct irq_chip ioapic_chip;
-
-#define IOAPIC_AUTO -1
-#define IOAPIC_EDGE 0
-#define IOAPIC_LEVEL 1
-
-static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
-{
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL) {
- irq_desc[irq].status |= IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_fasteoi_irq, "fasteoi");
- } else {
- irq_desc[irq].status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_edge_irq, "edge");
- }
- set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_IO_APIC_irqs(void)
-{
- struct IO_APIC_route_entry entry;
- int apic, pin, idx, irq, first_notcon = 1, vector;
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
- /*
- * add it to the IO-APIC irq-routing table:
- */
- memset(&entry, 0, sizeof(entry));
-
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest =
- cpu_mask_to_apicid(TARGET_CPUS);
-
- idx = find_irq_entry(apic, pin, mp_INT);
- if (idx == -1) {
- if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- " IO-APIC (apicid-pin) %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
- first_notcon = 0;
- } else
- apic_printk(APIC_VERBOSE, ", %d-%d",
- mp_ioapics[apic].mp_apicid, pin);
- continue;
- }
-
- if (!first_notcon) {
- apic_printk(APIC_VERBOSE, " not connected.\n");
- first_notcon = 1;
- }
-
- entry.trigger = irq_trigger(idx);
- entry.polarity = irq_polarity(idx);
-
- if (irq_trigger(idx)) {
- entry.trigger = 1;
- entry.mask = 1;
- }
-
- irq = pin_2_irq(idx, apic, pin);
- /*
- * skip adding the timer int on secondary nodes, which causes
- * a small but painful rift in the time-space continuum
- */
- if (multi_timer_check(apic, irq))
- continue;
- else
- add_pin_to_irq(irq, apic, pin);
-
- if (!apic && !IO_APIC_IRQ(irq))
- continue;
-
- if (IO_APIC_IRQ(irq)) {
- vector = assign_irq_vector(irq);
- entry.vector = vector;
- ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-
- if (!apic && (irq < 16))
- disable_8259A_irq(irq);
- }
- ioapic_write_entry(apic, pin, entry);
- }
- }
-
- if (!first_notcon)
- apic_printk(APIC_VERBOSE, " not connected.\n");
-}
-
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
- int vector)
-{
- struct IO_APIC_route_entry entry;
-
- memset(&entry, 0, sizeof(entry));
-
- /*
- * We use logical delivery to get the timer IRQ
- * to the first CPU.
- */
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 1; /* mask IRQ now */
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.polarity = 0;
- entry.trigger = 0;
- entry.vector = vector;
-
- /*
- * The timer IRQ doesn't have to know that behind the
- * scene we may have a 8259A-master in AEOI mode ...
- */
- ioapic_register_intr(0, vector, IOAPIC_EDGE);
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(apic, pin, entry);
-}
-
-
-__apicdebuginit(void) print_IO_APIC(void)
-{
- int apic, i;
- union IO_APIC_reg_00 reg_00;
- union IO_APIC_reg_01 reg_01;
- union IO_APIC_reg_02 reg_02;
- union IO_APIC_reg_03 reg_03;
- unsigned long flags;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for (i = 0; i < nr_ioapics; i++)
- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
-
- /*
- * We are a bit conservative about what we expect. We have to
- * know about every hardware change ASAP.
- */
- printk(KERN_INFO "testing the IO APIC.......................\n");
-
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- reg_01.raw = io_apic_read(apic, 1);
- if (reg_01.bits.version >= 0x10)
- reg_02.raw = io_apic_read(apic, 2);
- if (reg_01.bits.version >= 0x20)
- reg_03.raw = io_apic_read(apic, 3);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
-
- printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
-
- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
-
- /*
- * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
- * but the value of reg_02 is read as the previous read register
- * value, so ignore it if reg_02 == reg_01.
- */
- if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
- printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
- }
-
- /*
- * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
- * or reg_03, but the value of reg_0[23] is read as the previous read
- * register value, so ignore it if reg_03 == reg_0[12].
- */
- if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
- reg_03.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
- }
-
- printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
- printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
- " Stat Dest Deli Vect: \n");
-
- for (i = 0; i <= reg_01.bits.entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- printk(KERN_DEBUG " %02x %03X %02X ",
- i,
- entry.dest.logical.logical_dest,
- entry.dest.physical.physical_dest
- );
-
- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
- }
- }
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for (i = 0; i < NR_IRQS; i++) {
- struct irq_pin_list *entry = irq_2_pin + i;
- if (entry->pin < 0)
- continue;
- printk(KERN_DEBUG "IRQ%d ", i);
- for (;;) {
- printk("-> %d:%d", entry->apic, entry->pin);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
- printk("\n");
- }
-
- printk(KERN_INFO ".................................... done.\n");
-
- return;
-}
-
-__apicdebuginit(void) print_APIC_bitfield(int base)
-{
- unsigned int v;
- int i, j;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
- for (i = 0; i < 8; i++) {
- v = apic_read(base + i*0x10);
- for (j = 0; j < 32; j++) {
- if (v & (1<<j))
- printk("1");
- else
- printk("0");
- }
- printk("\n");
- }
-}
-
-__apicdebuginit(void) print_local_APIC(void *dummy)
-{
- unsigned int v, ver, maxlvt;
- u64 icr;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
- smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
- GET_APIC_ID(v));
- v = apic_read(APIC_LVR);
- printk(KERN_INFO "... APIC VERSION: %08x\n", v);
- ver = GET_APIC_VERSION(v);
- maxlvt = lapic_get_maxlvt();
-
- v = apic_read(APIC_TASKPRI);
- printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
- }
-
- v = apic_read(APIC_EOI);
- printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
- v = apic_read(APIC_LDR);
- printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
- v = apic_read(APIC_SPIV);
- printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
- printk(KERN_DEBUG "... APIC ISR field:\n");
- print_APIC_bitfield(APIC_ISR);
- printk(KERN_DEBUG "... APIC TMR field:\n");
- print_APIC_bitfield(APIC_TMR);
- printk(KERN_DEBUG "... APIC IRR field:\n");
- print_APIC_bitfield(APIC_IRR);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
- }
-
- icr = apic_icr_read();
- printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
- printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
-
- v = apic_read(APIC_LVTT);
- printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
-
- if (maxlvt > 3) { /* PC is LVT#4. */
- v = apic_read(APIC_LVTPC);
- printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
- }
- v = apic_read(APIC_LVT0);
- printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
- v = apic_read(APIC_LVT1);
- printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
- if (maxlvt > 2) { /* ERR is LVT#3. */
- v = apic_read(APIC_LVTERR);
- printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
- }
-
- v = apic_read(APIC_TMICT);
- printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
- v = apic_read(APIC_TMCCT);
- printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
- v = apic_read(APIC_TDCR);
- printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
- printk("\n");
-}
-
-__apicdebuginit(void) print_all_local_APICs(void)
-{
- on_each_cpu(print_local_APIC, NULL, 1);
-}
-
-__apicdebuginit(void) print_PIC(void)
-{
- unsigned int v;
- unsigned long flags;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "\nprinting PIC contents\n");
-
- spin_lock_irqsave(&i8259A_lock, flags);
-
- v = inb(0xa1) << 8 | inb(0x21);
- printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
-
- v = inb(0xa0) << 8 | inb(0x20);
- printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
-
- outb(0x0b, 0xa0);
- outb(0x0b, 0x20);
- v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a, 0xa0);
- outb(0x0a, 0x20);
-
- spin_unlock_irqrestore(&i8259A_lock, flags);
-
- printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
-
- v = inb(0x4d1) << 8 | inb(0x4d0);
- printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
-
-__apicdebuginit(int) print_all_ICs(void)
-{
- print_PIC();
- print_all_local_APICs();
- print_IO_APIC();
-
- return 0;
-}
-
-fs_initcall(print_all_ICs);
-
-
-static void __init enable_IO_APIC(void)
-{
- union IO_APIC_reg_01 reg_01;
- int i8259_apic, i8259_pin;
- int i, apic;
- unsigned long flags;
-
- for (i = 0; i < PIN_MAP_SIZE; i++) {
- irq_2_pin[i].pin = -1;
- irq_2_pin[i].next = 0;
- }
- if (!pirqs_enabled)
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- /*
- * The number of IO-APIC IRQ registers (== #pins):
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- nr_ioapic_registers[apic] = reg_01.bits.entries+1;
- }
- for (apic = 0; apic < nr_ioapics; apic++) {
- int pin;
- /* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- struct IO_APIC_route_entry entry;
- entry = ioapic_read_entry(apic, pin);
-
-
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
- */
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
- ioapic_i8259.apic = apic;
- ioapic_i8259.pin = pin;
- goto found_i8259;
- }
- }
- }
- found_i8259:
- /* Look to see what if the MP table has reported the ExtINT */
- /* If we could not find the appropriate pin by looking at the ioapic
- * the i8259 probably is not connected the ioapic but give the
- * mptable a chance anyway.
- */
- i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
- i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
- /* Trust the MP table if nothing is setup in the hardware */
- if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
- ioapic_i8259.pin = i8259_pin;
- ioapic_i8259.apic = i8259_apic;
- }
- /* Complain if the MP table and the hardware disagree */
- if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
- {
- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
- }
-
- /*
- * Do not trust the IO-APIC being empty at bootup
- */
- clear_IO_APIC();
-}
-
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
-{
- /*
- * Clear the IO-APIC before rebooting:
- */
- clear_IO_APIC();
-
- /*
- * If the i8259 is routed through an IOAPIC
- * Put that IOAPIC in virtual wire mode
- * so legacy interrupts can be delivered.
- */
- if (ioapic_i8259.pin != -1) {
- struct IO_APIC_route_entry entry;
-
- memset(&entry, 0, sizeof(entry));
- entry.mask = 0; /* Enabled */
- entry.trigger = 0; /* Edge */
- entry.irr = 0;
- entry.polarity = 0; /* High */
- entry.delivery_status = 0;
- entry.dest_mode = 0; /* Physical */
- entry.delivery_mode = dest_ExtINT; /* ExtInt */
- entry.vector = 0;
- entry.dest.physical.physical_dest = read_apic_id();
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
- }
- disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-}
-
-/*
- * function to set the IO-APIC physical IDs based on the
- * values stored in the MPC table.
- *
- * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
- */
-
-static void __init setup_ioapic_ids_from_mpc(void)
-{
- union IO_APIC_reg_00 reg_00;
- physid_mask_t phys_id_present_map;
- int apic;
- int i;
- unsigned char old_id;
- unsigned long flags;
-
- if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
- return;
-
- /*
- * Don't check I/O APIC IDs for xAPIC systems. They have
- * no meaning without the serial APIC bus.
- */
- if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return;
- /*
- * This is broken; anything with a real cpu count has to
- * circumvent this idiocy regardless.
- */
- phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
-
- /*
- * Set the IOAPIC ID to the value stored in the MPC table.
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- /* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mp_apicid;
-
- if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mp_apicid);
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- reg_00.bits.ID);
- mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
- }
-
- /*
- * Sanity check, is the ID really free? Every APIC in a
- * system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mp_apicid)) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mp_apicid);
- for (i = 0; i < get_physical_broadcast(); i++)
- if (!physid_isset(i, phys_id_present_map))
- break;
- if (i >= get_physical_broadcast())
- panic("Max APIC ID exceeded!\n");
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- i);
- physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mp_apicid = i;
- } else {
- physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
- apic_printk(APIC_VERBOSE, "Setting %d in the "
- "phys_id_present_map\n",
- mp_ioapics[apic].mp_apicid);
- physids_or(phys_id_present_map, phys_id_present_map, tmp);
- }
-
-
- /*
- * We need to adjust the IRQ routing table
- * if the ID changed.
- */
- if (old_id != mp_ioapics[apic].mp_apicid)
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_dstapic == old_id)
- mp_irqs[i].mp_dstapic
- = mp_ioapics[apic].mp_apicid;
-
- /*
- * Read the right value from the MPC table and
- * write it into the ID register.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO
- "...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mp_apicid);
-
- reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /*
- * Sanity check
- */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
- printk("could not set ID!\n");
- else
- apic_printk(APIC_VERBOSE, " ok.\n");
- }
-}
-
-int no_timer_check __initdata;
-
-static int __init notimercheck(char *s)
-{
- no_timer_check = 1;
- return 1;
-}
-__setup("no_timer_check", notimercheck);
-
-/*
- * There is a nasty bug in some older SMP boards, their mptable lies
- * about the timer IRQ. We do the following to work around the situation:
- *
- * - timer IRQ defaults to IO-APIC IRQ
- * - if this function detects that timer IRQs are defunct, then we fall
- * back to ISA timer IRQs
- */
-static int __init timer_irq_works(void)
-{
- unsigned long t1 = jiffies;
- unsigned long flags;
-
- if (no_timer_check)
- return 1;
-
- local_save_flags(flags);
- local_irq_enable();
- /* Let ten ticks pass... */
- mdelay((10 * 1000) / HZ);
- local_irq_restore(flags);
-
- /*
- * Expect a few ticks at least, to be sure some possible
- * glue logic does not lock up after one or two first
- * ticks in a non-ExtINT mode. Also the local APIC
- * might have cached one ExtINT interrupt. Finally, at
- * least one tick may be lost due to delays.
- */
- if (time_after(jiffies, t1 + 4))
- return 1;
-
- return 0;
-}
-
-/*
- * In the SMP+IOAPIC case it might happen that there are an unspecified
- * number of pending IRQ events unhandled. These cases are very rare,
- * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
- * better to do it this way as thus we do not have to be aware of
- * 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Startup quirk:
- *
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
- *
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
- *
- * (We do this for level-triggered IRQs too - it cannot hurt.)
- */
-static unsigned int startup_ioapic_irq(unsigned int irq)
-{
- int was_pending = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < 16) {
- disable_8259A_irq(irq);
- if (i8259A_irq_pending(irq))
- was_pending = 1;
- }
- __unmask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return was_pending;
-}
-
-static void ack_ioapic_irq(unsigned int irq)
-{
- move_native_irq(irq);
- ack_APIC_irq();
-}
-
-static void ack_ioapic_quirk_irq(unsigned int irq)
-{
- unsigned long v;
- int i;
-
- move_native_irq(irq);
-/*
- * It appears there is an erratum which affects at least version 0x11
- * of I/O APIC (that's the 82093AA and cores integrated into various
- * chipsets). Under certain conditions a level-triggered interrupt is
- * erroneously delivered as edge-triggered one but the respective IRR
- * bit gets set nevertheless. As a result the I/O unit expects an EOI
- * message but it will never arrive and further interrupts are blocked
- * from the source. The exact reason is so far unknown, but the
- * phenomenon was observed when two consecutive interrupt requests
- * from a given source get delivered to the same CPU and the source is
- * temporarily disabled in between.
- *
- * A workaround is to simulate an EOI message manually. We achieve it
- * by setting the trigger mode to edge and then to level when the edge
- * trigger mode gets detected in the TMR of a local APIC for a
- * level-triggered interrupt. We mask the source for the time of the
- * operation to prevent an edge-triggered interrupt escaping meanwhile.
- * The idea is from Manfred Spraul. --macro
- */
- i = irq_vector[irq];
-
- v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
-
- ack_APIC_irq();
-
- if (!(v & (1 << (i & 0x1f)))) {
- atomic_inc(&irq_mis_count);
- spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(irq);
- __unmask_and_level_IO_APIC_irq(irq);
- spin_unlock(&ioapic_lock);
- }
-}
-
-static int ioapic_retrigger_irq(unsigned int irq)
-{
- send_IPI_self(irq_vector[irq]);
-
- return 1;
-}
-
-static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_ioapic_irq,
- .eoi = ack_ioapic_quirk_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-
-static inline void init_IO_APIC_traps(void)
-{
- int irq;
-
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- for (irq = 0; irq < NR_IRQS ; irq++) {
- if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
- /*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
- */
- if (irq < 16)
- make_8259A_irq(irq);
- else
- /* Strange. Oh, well.. */
- irq_desc[irq].chip = &no_irq_chip;
- }
- }
-}
-
-/*
- * The local APIC irq-chip implementation:
- */
-
-static void ack_lapic_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static void mask_lapic_irq(unsigned int irq)
-{
- unsigned long v;
-
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
-}
-
-static void unmask_lapic_irq(unsigned int irq)
-{
- unsigned long v;
-
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
-}
-
-static struct irq_chip lapic_chip __read_mostly = {
- .name = "local-APIC",
- .mask = mask_lapic_irq,
- .unmask = unmask_lapic_irq,
- .ack = ack_lapic_irq,
-};
-
-static void lapic_register_intr(int irq, int vector)
-{
- irq_desc[irq].status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
- set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_nmi(void)
-{
- /*
- * Dirty trick to enable the NMI watchdog ...
- * We put the 8259A master into AEOI mode and
- * unmask on all local APICs LVT0 as NMI.
- *
- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- * is from Maciej W. Rozycki - so we do not have to EOI from
- * the NMI handler or the timer interrupt.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
- enable_NMI_through_LVT0();
-
- apic_printk(APIC_VERBOSE, " done.\n");
-}
-
-/*
- * This looks a bit hackish but it's about the only one way of sending
- * a few INTA cycles to 8259As and any associated glue logic. ICR does
- * not support the ExtINT mode, unfortunately. We need to send these
- * cycles as some i82489DX-based boards have glue logic that keeps the
- * 8259A interrupt line asserted until INTA. --macro
- */
-static inline void __init unlock_ExtINT_logic(void)
-{
- int apic, pin, i;
- struct IO_APIC_route_entry entry0, entry1;
- unsigned char save_control, save_freq_select;
-
- pin = find_isa_irq_pin(8, mp_INT);
- if (pin == -1) {
- WARN_ON_ONCE(1);
- return;
- }
- apic = find_isa_irq_apic(8, mp_INT);
- if (apic == -1) {
- WARN_ON_ONCE(1);
- return;
- }
-
- entry0 = ioapic_read_entry(apic, pin);
- clear_IO_APIC_pin(apic, pin);
-
- memset(&entry1, 0, sizeof(entry1));
-
- entry1.dest_mode = 0; /* physical delivery */
- entry1.mask = 0; /* unmask IRQ now */
- entry1.dest.physical.physical_dest = hard_smp_processor_id();
- entry1.delivery_mode = dest_ExtINT;
- entry1.polarity = entry0.polarity;
- entry1.trigger = 0;
- entry1.vector = 0;
-
- ioapic_write_entry(apic, pin, entry1);
-
- save_control = CMOS_READ(RTC_CONTROL);
- save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
- CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
- RTC_FREQ_SELECT);
- CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
-
- i = 100;
- while (i-- > 0) {
- mdelay(10);
- if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
- i -= 10;
- }
-
- CMOS_WRITE(save_control, RTC_CONTROL);
- CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
- clear_IO_APIC_pin(apic, pin);
-
- ioapic_write_entry(apic, pin, entry0);
-}
-
-/*
- * This code may look a bit paranoid, but it's supposed to cooperate with
- * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
- * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
- * fanatically on his truly buggy board.
- */
-static inline void __init check_timer(void)
-{
- int apic1, pin1, apic2, pin2;
- int no_pin1 = 0;
- int vector;
- unsigned int ver;
- unsigned long flags;
-
- local_irq_save(flags);
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
-
- /*
- * get/set the timer IRQ vector:
- */
- disable_8259A_irq(0);
- vector = assign_irq_vector(0);
- set_intr_gate(vector, interrupt[0]);
-
- /*
- * As IRQ0 is to be enabled in the 8259A, the virtual
- * wire has to be disabled in the local APIC. Also
- * timer interrupts need to be acknowledged manually in
- * the 8259A for the i82489DX when using the NMI
- * watchdog as that APIC treats NMIs as level-triggered.
- * The AEOI mode will finish them in the 8259A
- * automatically.
- */
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
- init_8259A(1);
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
-
- pin1 = find_isa_irq_pin(0, mp_INT);
- apic1 = find_isa_irq_apic(0, mp_INT);
- pin2 = ioapic_i8259.pin;
- apic2 = ioapic_i8259.apic;
-
- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
- "apic1=%d pin1=%d apic2=%d pin2=%d\n",
- vector, apic1, pin1, apic2, pin2);
-
- /*
- * Some BIOS writers are clueless and report the ExtINTA
- * I/O APIC input from the cascaded 8259A as the timer
- * interrupt input. So just in case, if only one pin
- * was found above, try it both directly and through the
- * 8259A.
- */
- if (pin1 == -1) {
- pin1 = pin2;
- apic1 = apic2;
- no_pin1 = 1;
- } else if (pin2 == -1) {
- pin2 = pin1;
- apic2 = apic1;
- }
-
- if (pin1 != -1) {
- /*
- * Ok, does IRQ0 through the IOAPIC work?
- */
- if (no_pin1) {
- add_pin_to_irq(0, apic1, pin1);
- setup_timer_IRQ0_pin(apic1, pin1, vector);
- }
- unmask_IO_APIC_irq(0);
- if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
- enable_8259A_irq(0);
- }
- if (disable_timer_pin_1 > 0)
- clear_IO_APIC_pin(0, pin1);
- goto out;
- }
- clear_IO_APIC_pin(apic1, pin1);
- if (!no_pin1)
- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
- "8254 timer not connected to IO-APIC\n");
-
- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
- "(IRQ0) through the 8259A ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "..... (found apic %d pin %d) ...\n", apic2, pin2);
- /*
- * legacy devices should be connected to IO APIC #0
- */
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
- setup_timer_IRQ0_pin(apic2, pin2, vector);
- unmask_IO_APIC_irq(0);
- enable_8259A_irq(0);
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
- timer_through_8259 = 1;
- if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
- setup_nmi();
- enable_8259A_irq(0);
- }
- goto out;
- }
- /*
- * Cleanup, just in case ...
- */
- disable_8259A_irq(0);
- clear_IO_APIC_pin(apic2, pin2);
- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
- }
-
- if (nmi_watchdog == NMI_IO_APIC) {
- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- "through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = NMI_NONE;
- }
- timer_ack = 0;
-
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
-
- lapic_register_intr(0, vector);
- apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
- enable_8259A_irq(0);
-
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- goto out;
- }
- disable_8259A_irq(0);
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
-
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as ExtINT IRQ...\n");
-
- init_8259A(0);
- make_8259A_irq(0);
- apic_write(APIC_LVT0, APIC_DM_EXTINT);
-
- unlock_ExtINT_logic();
-
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- goto out;
- }
- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
- panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
- "report. Then try booting with the 'noapic' option.\n");
-out:
- local_irq_restore(flags);
-}
-
-/*
- * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
- * to devices. However there may be an I/O APIC pin available for
- * this interrupt regardless. The pin may be left unconnected, but
- * typically it will be reused as an ExtINT cascade interrupt for
- * the master 8259A. In the MPS case such a pin will normally be
- * reported as an ExtINT interrupt in the MP table. With ACPI
- * there is no provision for ExtINT interrupts, and in the absence
- * of an override it would be treated as an ordinary ISA I/O APIC
- * interrupt, that is edge-triggered and unmasked by default. We
- * used to do this, but it caused problems on some systems because
- * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
- * the same ExtINT cascade interrupt to drive the local APIC of the
- * bootstrap processor. Therefore we refrain from routing IRQ2 to
- * the I/O APIC in all cases now. No actual device should request
- * it anyway. --macro
- */
-#define PIC_IRQS (1 << PIC_CASCADE_IR)
-
-void __init setup_IO_APIC(void)
-{
- int i;
-
- /* Reserve all the system vectors. */
- for (i = first_system_vector; i < NR_VECTORS; i++)
- set_bit(i, used_vectors);
-
- enable_IO_APIC();
-
- io_apic_irqs = ~PIC_IRQS;
-
- printk("ENABLING IO-APIC IRQs\n");
-
- /*
- * Set up IO-APIC IRQ routing.
- */
- if (!acpi_ioapic)
- setup_ioapic_ids_from_mpc();
- sync_Arb_IDs();
- setup_IO_APIC_irqs();
- init_IO_APIC_traps();
- check_timer();
-}
-
-/*
- * Called after all the initialization is done. If we didnt find any
- * APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
- if (sis_apic_bug == -1)
- sis_apic_bug = 0;
- return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
-struct sysfs_ioapic_data {
- struct sys_device dev;
- struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
-
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
- entry[i] = ioapic_read_entry(dev->id, i);
-
- return 0;
-}
-
-static int ioapic_resume(struct sys_device *dev)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- unsigned long flags;
- union IO_APIC_reg_00 reg_00;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
- io_apic_write(dev->id, 0, reg_00.raw);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
- ioapic_write_entry(dev->id, i, entry[i]);
-
- return 0;
-}
-
-static struct sysdev_class ioapic_sysdev_class = {
- .name = "ioapic",
- .suspend = ioapic_suspend,
- .resume = ioapic_resume,
-};
-
-static int __init ioapic_init_sysfs(void)
-{
- struct sys_device *dev;
- int i, size, error = 0;
-
- error = sysdev_class_register(&ioapic_sysdev_class);
- if (error)
- return error;
-
- for (i = 0; i < nr_ioapics; i++) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
- * sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
- if (!mp_ioapic_data[i]) {
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
- dev->cls = &ioapic_sysdev_class;
- error = sysdev_register(dev);
- if (error) {
- kfree(mp_ioapic_data[i]);
- mp_ioapic_data[i] = NULL;
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- }
-
- return 0;
-}
-
-device_initcall(ioapic_init_sysfs);
-
-/*
- * Dynamic irq allocate and deallocation
- */
-int create_irq(void)
-{
- /* Allocate an unused irq */
- int irq, new, vector = 0;
- unsigned long flags;
-
- irq = -ENOSPC;
- spin_lock_irqsave(&vector_lock, flags);
- for (new = (NR_IRQS - 1); new >= 0; new--) {
- if (platform_legacy_irq(new))
- continue;
- if (irq_vector[new] != 0)
- continue;
- vector = __assign_irq_vector(new);
- if (likely(vector > 0))
- irq = new;
- break;
- }
- spin_unlock_irqrestore(&vector_lock, flags);
-
- if (irq >= 0) {
- set_intr_gate(vector, interrupt[irq]);
- dynamic_irq_init(irq);
- }
- return irq;
-}
-
-void destroy_irq(unsigned int irq)
-{
- unsigned long flags;
-
- dynamic_irq_cleanup(irq);
-
- spin_lock_irqsave(&vector_lock, flags);
- clear_bit(irq_vector[irq], used_vectors);
- irq_vector[irq] = 0;
- spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * MSI message composition
- */
-#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
-{
- int vector;
- unsigned dest;
-
- vector = assign_irq_vector(irq);
- if (vector >= 0) {
- dest = cpu_mask_to_apicid(TARGET_CPUS);
-
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((INT_DEST_MODE == 0) ?
-MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(vector);
- }
- return vector;
-}
-
-#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- struct msi_msg msg;
- unsigned int dest;
- cpumask_t tmp;
- int vector;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- vector = assign_irq_vector(irq);
- if (vector < 0)
- return;
-
- dest = cpu_mask_to_apicid(mask);
-
- read_msi_msg(irq, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- write_msi_msg(irq, &msg);
- irq_desc[irq].affinity = mask;
-}
-#endif /* CONFIG_SMP */
-
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
- .ack = ack_ioapic_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_msi_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
- struct msi_msg msg;
- int irq, ret;
- irq = create_irq();
- if (irq < 0)
- return irq;
-
- ret = msi_compose_msg(dev, irq, &msg);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
- }
-
- set_irq_msi(irq, desc);
- write_msi_msg(irq, &msg);
-
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
- "edge");
-
- return 0;
-}
-
-void arch_teardown_msi_irq(unsigned int irq)
-{
- destroy_irq(irq);
-}
-
-#endif /* CONFIG_PCI_MSI */
-
-/*
- * Hypertransport interrupt support
- */
-#ifdef CONFIG_HT_IRQ
-
-#ifdef CONFIG_SMP
-
-static void target_ht_irq(unsigned int irq, unsigned int dest)
-{
- struct ht_irq_msg msg;
- fetch_ht_irq_msg(irq, &msg);
-
- msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
- msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
- msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
- write_ht_irq_msg(irq, &msg);
-}
-
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- unsigned int dest;
- cpumask_t tmp;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(mask, tmp, CPU_MASK_ALL);
-
- dest = cpu_mask_to_apicid(mask);
-
- target_ht_irq(irq, dest);
- irq_desc[irq].affinity = mask;
-}
-#endif
-
-static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .mask = mask_ht_irq,
- .unmask = unmask_ht_irq,
- .ack = ack_ioapic_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_ht_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
-{
- int vector;
-
- vector = assign_irq_vector(irq);
- if (vector >= 0) {
- struct ht_irq_msg msg;
- unsigned dest;
- cpumask_t tmp;
-
- cpus_clear(tmp);
- cpu_set(vector >> 8, tmp);
- dest = cpu_mask_to_apicid(tmp);
-
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
-
- msg.address_lo =
- HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(dest) |
- HT_IRQ_LOW_VECTOR(vector) |
- ((INT_DEST_MODE == 0) ?
- HT_IRQ_LOW_DM_PHYSICAL :
- HT_IRQ_LOW_DM_LOGICAL) |
- HT_IRQ_LOW_RQEOI_EDGE |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
- HT_IRQ_LOW_MT_FIXED :
- HT_IRQ_LOW_MT_ARBITRATED) |
- HT_IRQ_LOW_IRQ_MASKED;
-
- write_ht_irq_msg(irq, &msg);
-
- set_irq_chip_and_handler_name(irq, &ht_irq_chip,
- handle_edge_irq, "edge");
- }
- return vector;
-}
-#endif /* CONFIG_HT_IRQ */
-
-/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
- -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-int __init io_apic_get_unique_id(int ioapic, int apic_id)
-{
- union IO_APIC_reg_00 reg_00;
- static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
- physid_mask_t tmp;
- unsigned long flags;
- int i = 0;
-
- /*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
- * supports up to 16 on one shared APIC bus.
- *
- * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
- * advantage of new APIC bus architecture.
- */
-
- if (physids_empty(apic_id_map))
- apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- if (apic_id >= get_physical_broadcast()) {
- printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
- "%d\n", ioapic, apic_id, reg_00.bits.ID);
- apic_id = reg_00.bits.ID;
- }
-
- /*
- * Every APIC in a system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (check_apicid_used(apic_id_map, apic_id)) {
-
- for (i = 0; i < get_physical_broadcast(); i++) {
- if (!check_apicid_used(apic_id_map, i))
- break;
- }
-
- if (i == get_physical_broadcast())
- panic("Max apic_id exceeded!\n");
-
- printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
- "trying %d\n", ioapic, apic_id, i);
-
- apic_id = i;
- }
-
- tmp = apicid_to_cpu_present(apic_id);
- physids_or(apic_id_map, apic_id_map, tmp);
-
- if (reg_00.bits.ID != apic_id) {
- reg_00.bits.ID = apic_id;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0, reg_00.raw);
- reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /* Sanity check */
- if (reg_00.bits.ID != apic_id) {
- printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
- return -1;
- }
- }
-
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
-
- return apic_id;
-}
-
-
-int __init io_apic_get_version(int ioapic)
-{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.version;
-}
-
-
-int __init io_apic_get_redir_entries(int ioapic)
-{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.entries;
-}
-
-
-int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
-{
- struct IO_APIC_route_entry entry;
-
- if (!IO_APIC_IRQ(irq)) {
- printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- ioapic);
- return -EINVAL;
- }
-
- /*
- * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
- * Note that we mask (disable) IRQs now -- these get enabled when the
- * corresponding device driver registers for this IRQ.
- */
-
- memset(&entry, 0, sizeof(entry));
-
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.dest_mode = INT_DEST_MODE;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.trigger = edge_level;
- entry.polarity = active_high_low;
- entry.mask = 1;
-
- /*
- * IRQs < 16 are already in the irq_2_pin[] map
- */
- if (irq >= 16)
- add_pin_to_irq(irq, ioapic, pin);
-
- entry.vector = assign_irq_vector(irq);
-
- apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
- "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
- edge_level, active_high_low);
-
- ioapic_register_intr(irq, entry.vector, edge_level);
-
- if (!ioapic && (irq < 16))
- disable_8259A_irq(irq);
-
- ioapic_write_entry(ioapic, pin, entry);
-
- return 0;
-}
-
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
-{
- int i;
-
- if (skip_ioapic_setup)
- return -1;
-
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == mp_INT &&
- mp_irqs[i].mp_srcbusirq == bus_irq)
- break;
- if (i >= mp_irq_entries)
- return -1;
-
- *trigger = irq_trigger(i);
- *polarity = irq_polarity(i);
- return 0;
-}
-
-#endif /* CONFIG_ACPI */
-
-static int __init parse_disable_timer_pin_1(char *arg)
-{
- disable_timer_pin_1 = 1;
- return 0;
-}
-early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
-
-static int __init parse_enable_timer_pin_1(char *arg)
-{
- disable_timer_pin_1 = -1;
- return 0;
-}
-early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
-
-static int __init parse_noapic(char *arg)
-{
- /* disable IO-APIC */
- disable_ioapic_setup();
- return 0;
-}
-early_param("noapic", parse_noapic);
-
-void __init ioapic_init_mappings(void)
-{
- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mp_apicaddr;
- if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
- "disabling IO/APIC support!\n");
- smp_found_config = 0;
- skip_ioapic_setup = 1;
- goto fake_ioapic_page;
- }
- } else {
-fake_ioapic_page:
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
- ioapic_phys = __pa(ioapic_phys);
- }
- set_fixmap_nocache(idx, ioapic_phys);
- printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
- idx++;
- }
-}
-
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 0000000..ccf6c50
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,189 @@
+/*
+ * Common interrupt code for 32 and 64 bit
+ */
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/seq_file.h>
+
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/smp.h>
+
+atomic_t irq_err_count;
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ *
+ * Here: log the value passed in @irq (reported as a vector number) at
+ * KERN_ERR and, when built with CONFIG_X86_LOCAL_APIC and the CPU has an
+ * APIC, acknowledge the interrupt at the local APIC.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+ printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ * But only ack when the APIC is enabled -AK
+ */
+ if (cpu_has_apic)
+ ack_APIC_irq();
+#endif
+}
+
+/*
+ * irq_stats(x): pointer to the interrupt statistics of CPU x.
+ * 32-bit uses the per-CPU irq_stat variable, 64-bit uses cpu_pda(x);
+ * the counters read through it below are spelled the same on both.
+ */
+#ifdef CONFIG_X86_32
+# define irq_stats(x) (&per_cpu(irq_stat,x))
+#else
+# define irq_stats(x) cpu_pda(x)
+#endif
+/*
+ * /proc/interrupts printing:
+ *
+ * Emit the architecture-specific counter rows that are not tied to an
+ * irq descriptor.  Each per-CPU row prints a three-letter tag, one
+ * column per online CPU and a description; ERR and MIS are single
+ * system-wide values.  Rows are compiled in according to the config
+ * options guarding them.  Always returns 0.
+ */
+static int show_other_interrupts(struct seq_file *p)
+{
+ int j;
+
+ seq_printf(p, "NMI: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
+ seq_printf(p, " Non-maskable interrupts\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "LOC: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
+ seq_printf(p, " Local timer interrupts\n");
+#endif
+#ifdef CONFIG_SMP
+ seq_printf(p, "RES: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+ seq_printf(p, " Rescheduling interrupts\n");
+ seq_printf(p, "CAL: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+ seq_printf(p, " Function call interrupts\n");
+ seq_printf(p, "TLB: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+ seq_printf(p, " TLB shootdowns\n");
+#endif
+#ifdef CONFIG_X86_MCE
+ seq_printf(p, "TRM: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
+ seq_printf(p, " Thermal event interrupts\n");
+ /* the threshold counter is only read on 64-bit builds */
+# ifdef CONFIG_X86_64
+ seq_printf(p, "THR: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
+ seq_printf(p, " Threshold APIC interrupts\n");
+# endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "SPU: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
+#endif
+ /* system-wide (not per-CPU) counters */
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+ return 0;
+}
+
+/*
+ * seq_file show callback for /proc/interrupts.
+ *
+ * @p: seq_file to print into
+ * @v: iterator position, read as a loff_t index
+ *
+ * An index below nr_irqs prints one row for that irq (nothing if the irq
+ * has neither an action nor any counts).  Index nr_irqs prints the
+ * architecture counters via show_other_interrupts().  Larger indexes
+ * print nothing.  The CPU header row is emitted together with index 0.
+ * Returns 0.
+ */
+int show_interrupts(struct seq_file *p, void *v)
+{
+ unsigned long flags, any_count = 0;
+ int i = *(loff_t *) v, j;
+ struct irqaction *action;
+ struct irq_desc *desc;
+
+ if (i > nr_irqs)
+ return 0;
+
+ if (i == nr_irqs)
+ return show_other_interrupts(p);
+
+ /* print header */
+ if (i == 0) {
+ seq_printf(p, " ");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d",j);
+ seq_putc(p, '\n');
+ }
+
+ desc = irq_to_desc(i);
+ /*
+ * irq_to_desc() is treated as fallible by its other callers; do not
+ * take the lock of a descriptor that does not exist.
+ */
+ if (!desc)
+ return 0;
+
+ spin_lock_irqsave(&desc->lock, flags);
+#ifndef CONFIG_SMP
+ any_count = kstat_irqs(i);
+#else
+ /* only "is any counter non-zero" matters, so OR is sufficient */
+ for_each_online_cpu(j)
+ any_count |= kstat_irqs_cpu(i, j);
+#endif
+ action = desc->action;
+ if (!action && !any_count)
+ goto out;
+
+ seq_printf(p, "%3d: ", i);
+#ifndef CONFIG_SMP
+ seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+#endif
+ seq_printf(p, " %8s", desc->chip->name);
+ seq_printf(p, "-%-8s", desc->name);
+
+ /* list every handler sharing this irq, comma separated */
+ if (action) {
+ seq_printf(p, " %s", action->name);
+ while ((action = action->next) != NULL)
+ seq_printf(p, ", %s", action->name);
+ }
+
+ seq_putc(p, '\n');
+out:
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+
+/*
+ * /proc/stat helpers
+ *
+ * arch_irq_stat_cpu() sums, for one CPU, the same per-CPU counters that
+ * show_other_interrupts() prints, under the same config guards.
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+ u64 sum = irq_stats(cpu)->__nmi_count;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += irq_stats(cpu)->apic_timer_irqs;
+#endif
+#ifdef CONFIG_SMP
+ sum += irq_stats(cpu)->irq_resched_count;
+ sum += irq_stats(cpu)->irq_call_count;
+ sum += irq_stats(cpu)->irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += irq_stats(cpu)->irq_thermal_count;
+# ifdef CONFIG_X86_64
+ sum += irq_stats(cpu)->irq_threshold_count;
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_X86_MCE */
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += irq_stats(cpu)->irq_spurious_count;
+#endif
+ return sum;
+}
+
+/*
+ * System-wide part of the /proc/stat interrupt total: the error count,
+ * plus the irq_mis_count counter when the IO-APIC is configured.
+ */
+u64 arch_irq_stat(void)
+{
+ u64 errors = atomic_read(&irq_err_count);
+ u64 misrouted = 0;
+
+#ifdef CONFIG_X86_IO_APIC
+ misrouted = atomic_read(&irq_mis_count);
+#endif
+ return errors + misrouted;
+}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index b71e02d..a513826 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -25,29 +25,6 @@
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
- printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Currently unexpected vectors happen only on SMP and APIC.
- * We _must_ ack these because every local APIC has only N
- * irq slots per priority level, and a 'hanging, unacked' IRQ
- * holds up an irq slot - in excessive cases (when multiple
- * unexpected vectors occur) that might lock up the APIC
- * completely.
- * But only ack when the APIC is enabled -AK
- */
- if (cpu_has_apic)
- ack_APIC_irq();
-#endif
-}
-
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
static int check_stack_overflow(void)
@@ -223,20 +200,25 @@
{
struct pt_regs *old_regs;
/* high bit used in ret_from_ code */
- int overflow, irq = ~regs->orig_ax;
- struct irq_desc *desc = irq_desc + irq;
+ int overflow;
+ unsigned vector = ~regs->orig_ax;
+ struct irq_desc *desc;
+ unsigned irq;
- if (unlikely((unsigned)irq >= NR_IRQS)) {
- printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
- __func__, irq);
- BUG();
- }
old_regs = set_irq_regs(regs);
irq_enter();
+ irq = __get_cpu_var(vector_irq)[vector];
overflow = check_stack_overflow();
+ desc = irq_to_desc(irq);
+ if (unlikely(!desc)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
+ __func__, irq, vector, smp_processor_id());
+ BUG();
+ }
+
if (!execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
print_stack_overflow();
@@ -248,146 +230,6 @@
return 1;
}
-/*
- * Interrupt statistics:
- */
-
-atomic_t irq_err_count;
-
-/*
- * /proc/interrupts printing:
- */
-
-int show_interrupts(struct seq_file *p, void *v)
-{
- int i = *(loff_t *) v, j;
- struct irqaction * action;
- unsigned long flags;
-
- if (i == 0) {
- seq_printf(p, " ");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d",j);
- seq_putc(p, '\n');
- }
-
- if (i < NR_IRQS) {
- unsigned any_count = 0;
-
- spin_lock_irqsave(&irq_desc[i].lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
- for_each_online_cpu(j)
- any_count |= kstat_cpu(j).irqs[i];
-#endif
- action = irq_desc[i].action;
- if (!action && !any_count)
- goto skip;
- seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
- seq_printf(p, " %8s", irq_desc[i].chip->name);
- seq_printf(p, "-%-8s", irq_desc[i].name);
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
- }
-
- seq_putc(p, '\n');
-skip:
- spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- } else if (i == NR_IRQS) {
- seq_printf(p, "NMI: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", nmi_count(j));
- seq_printf(p, " Non-maskable interrupts\n");
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
-#endif
-#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_call_count);
- seq_printf(p, " Function call interrupts\n");
- seq_printf(p, "TLB: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
-#endif
-#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "SPU: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
-#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-#if defined(CONFIG_X86_IO_APIC)
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
-#endif
- }
- return 0;
-}
-
-/*
- * /proc/stat helpers
- */
-u64 arch_irq_stat_cpu(unsigned int cpu)
-{
- u64 sum = nmi_count(cpu);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
-#endif
-#ifdef CONFIG_SMP
- sum += per_cpu(irq_stat, cpu).irq_resched_count;
- sum += per_cpu(irq_stat, cpu).irq_call_count;
- sum += per_cpu(irq_stat, cpu).irq_tlb_count;
-#endif
-#ifdef CONFIG_X86_MCE
- sum += per_cpu(irq_stat, cpu).irq_thermal_count;
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += per_cpu(irq_stat, cpu).irq_spurious_count;
-#endif
- return sum;
-}
-
-u64 arch_irq_stat(void)
-{
- u64 sum = atomic_read(&irq_err_count);
-
-#ifdef CONFIG_X86_IO_APIC
- sum += atomic_read(&irq_mis_count);
-#endif
- return sum;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
#include <mach_apic.h>
@@ -395,20 +237,22 @@
{
unsigned int irq;
static int warned;
+ struct irq_desc *desc;
- for (irq = 0; irq < NR_IRQS; irq++) {
+ for_each_irq_desc(irq, desc) {
cpumask_t mask;
+
if (irq == 2)
continue;
- cpus_and(mask, irq_desc[irq].affinity, map);
+ cpus_and(mask, desc->affinity, map);
if (any_online_cpu(mask) == NR_CPUS) {
printk("Breaking affinity for irq %i\n", irq);
mask = map;
}
- if (irq_desc[irq].chip->set_affinity)
- irq_desc[irq].chip->set_affinity(irq, mask);
- else if (irq_desc[irq].action && !(warned++))
+ if (desc->chip->set_affinity)
+ desc->chip->set_affinity(irq, mask);
+ else if (desc->action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index f065fe9..60eb84e 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,28 +18,6 @@
#include <asm/idle.h>
#include <asm/smp.h>
-atomic_t irq_err_count;
-
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
- printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
- /*
- * Currently unexpected vectors happen only on SMP and APIC.
- * We _must_ ack these because every local APIC has only N
- * irq slots per priority level, and a 'hanging, unacked' IRQ
- * holds up an irq slot - in excessive cases (when multiple
- * unexpected vectors occur) that might lock up the APIC
- * completely.
- * But don't ack when the APIC is disabled. -AK
- */
- if (!disable_apic)
- ack_APIC_irq();
-}
-
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/*
* Probabilistic stack overflow check:
@@ -65,122 +43,6 @@
#endif
/*
- * Generic, controller-independent functions:
- */
-
-int show_interrupts(struct seq_file *p, void *v)
-{
- int i = *(loff_t *) v, j;
- struct irqaction * action;
- unsigned long flags;
-
- if (i == 0) {
- seq_printf(p, " ");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d",j);
- seq_putc(p, '\n');
- }
-
- if (i < NR_IRQS) {
- unsigned any_count = 0;
-
- spin_lock_irqsave(&irq_desc[i].lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
- for_each_online_cpu(j)
- any_count |= kstat_cpu(j).irqs[i];
-#endif
- action = irq_desc[i].action;
- if (!action && !any_count)
- goto skip;
- seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
- seq_printf(p, " %8s", irq_desc[i].chip->name);
- seq_printf(p, "-%-8s", irq_desc[i].name);
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
- }
- seq_putc(p, '\n');
-skip:
- spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- } else if (i == NR_IRQS) {
- seq_printf(p, "NMI: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
- seq_printf(p, " Non-maskable interrupts\n");
- seq_printf(p, "LOC: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
-#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
- seq_printf(p, " Function call interrupts\n");
- seq_printf(p, "TLB: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
-#endif
-#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
- seq_printf(p, "THR: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
- seq_printf(p, " Threshold APIC interrupts\n");
-#endif
- seq_printf(p, "SPU: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
- }
- return 0;
-}
-
-/*
- * /proc/stat helpers
- */
-u64 arch_irq_stat_cpu(unsigned int cpu)
-{
- u64 sum = cpu_pda(cpu)->__nmi_count;
-
- sum += cpu_pda(cpu)->apic_timer_irqs;
-#ifdef CONFIG_SMP
- sum += cpu_pda(cpu)->irq_resched_count;
- sum += cpu_pda(cpu)->irq_call_count;
- sum += cpu_pda(cpu)->irq_tlb_count;
-#endif
-#ifdef CONFIG_X86_MCE
- sum += cpu_pda(cpu)->irq_thermal_count;
- sum += cpu_pda(cpu)->irq_threshold_count;
-#endif
- sum += cpu_pda(cpu)->irq_spurious_count;
- return sum;
-}
-
-u64 arch_irq_stat(void)
-{
- return atomic_read(&irq_err_count);
-}
-
-/*
* do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
@@ -188,6 +50,7 @@
asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
+ struct irq_desc *desc;
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
@@ -201,8 +64,9 @@
stack_overflow_check(regs);
#endif
- if (likely(irq < NR_IRQS))
- generic_handle_irq(irq);
+ desc = irq_to_desc(irq);
+ if (likely(desc))
+ generic_handle_irq_desc(irq, desc);
else {
if (!disable_apic)
ack_APIC_irq();
@@ -223,8 +87,9 @@
{
unsigned int irq;
static int warned;
+ struct irq_desc *desc;
- for (irq = 0; irq < NR_IRQS; irq++) {
+ for_each_irq_desc(irq, desc) {
cpumask_t mask;
int break_affinity = 0;
int set_affinity = 1;
@@ -233,32 +98,32 @@
continue;
/* interrupt's are disabled at this point */
- spin_lock(&irq_desc[irq].lock);
+ spin_lock(&desc->lock);
if (!irq_has_action(irq) ||
- cpus_equal(irq_desc[irq].affinity, map)) {
- spin_unlock(&irq_desc[irq].lock);
+ cpus_equal(desc->affinity, map)) {
+ spin_unlock(&desc->lock);
continue;
}
- cpus_and(mask, irq_desc[irq].affinity, map);
+ cpus_and(mask, desc->affinity, map);
if (cpus_empty(mask)) {
break_affinity = 1;
mask = map;
}
- if (irq_desc[irq].chip->mask)
- irq_desc[irq].chip->mask(irq);
+ if (desc->chip->mask)
+ desc->chip->mask(irq);
- if (irq_desc[irq].chip->set_affinity)
- irq_desc[irq].chip->set_affinity(irq, mask);
+ if (desc->chip->set_affinity)
+ desc->chip->set_affinity(irq, mask);
else if (!(warned++))
set_affinity = 0;
- if (irq_desc[irq].chip->unmask)
- irq_desc[irq].chip->unmask(irq);
+ if (desc->chip->unmask)
+ desc->chip->unmask(irq);
- spin_unlock(&irq_desc[irq].lock);
+ spin_unlock(&desc->lock);
if (break_affinity && set_affinity)
printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 9200a1e..845aa98 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -69,6 +69,13 @@
* 16 old-style INTA-cycle interrupts:
*/
for (i = 0; i < 16; i++) {
+ /* first time call this irq_desc */
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
+
set_irq_chip_and_handler_name(i, &i8259A_chip,
handle_level_irq, "XT");
}
@@ -83,6 +90,27 @@
.name = "cascade",
};
+/*
+ * Per-CPU vector -> irq lookup table (indexed by vector in do_IRQ).
+ * Initially only the sixteen IRQn_VECTOR entries are populated, mapping
+ * to irqs 0..15; every other vector starts out as -1.
+ */
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+ [0 ... IRQ0_VECTOR - 1] = -1,
+ [IRQ0_VECTOR] = 0,
+ [IRQ1_VECTOR] = 1,
+ [IRQ2_VECTOR] = 2,
+ [IRQ3_VECTOR] = 3,
+ [IRQ4_VECTOR] = 4,
+ [IRQ5_VECTOR] = 5,
+ [IRQ6_VECTOR] = 6,
+ [IRQ7_VECTOR] = 7,
+ [IRQ8_VECTOR] = 8,
+ [IRQ9_VECTOR] = 9,
+ [IRQ10_VECTOR] = 10,
+ [IRQ11_VECTOR] = 11,
+ [IRQ12_VECTOR] = 12,
+ [IRQ13_VECTOR] = 13,
+ [IRQ14_VECTOR] = 14,
+ [IRQ15_VECTOR] = 15,
+ [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
/* Overridden in paravirt.c */
void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
@@ -98,22 +126,14 @@
* us. (some of these will be overridden and become
* 'special' SMP interrupts)
*/
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (i >= NR_IRQS)
- break;
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* SYSCALL_VECTOR was reserved in trap_init. */
- if (!test_bit(vector, used_vectors))
- set_intr_gate(vector, interrupt[i]);
+ if (i != SYSCALL_VECTOR)
+ set_intr_gate(i, interrupt[i]);
}
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
- /*
- * IRQ0 must be given a fixed assignment and initialized,
- * because it's used before the IO-APIC is set up.
- */
- set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
@@ -128,6 +148,9 @@
/* IPI for single call function */
set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+
+ /* Low priority IPI to cleanup after moving an irq */
+ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 5b5be9d..ff02353 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -142,23 +142,19 @@
init_bsp_APIC();
init_8259A(0);
- for (i = 0; i < NR_IRQS; i++) {
- irq_desc[i].status = IRQ_DISABLED;
- irq_desc[i].action = NULL;
- irq_desc[i].depth = 1;
+ for (i = 0; i < 16; i++) {
+ /* first time call this irq_desc */
+ struct irq_desc *desc = irq_to_desc(i);
- if (i < 16) {
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- set_irq_chip_and_handler_name(i, &i8259A_chip,
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
+
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
handle_level_irq, "XT");
- } else {
- /*
- * 'high' PCI IRQs filled in on demand
- */
- irq_desc[i].chip = &no_irq_chip;
- }
}
}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index f6a11b9..67465ed 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@
if (!(word & (1 << 13))) {
dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
"disabling irq balancing and affinity\n");
-#ifdef CONFIG_IRQBALANCE
- irqbalance_disable("");
-#endif
noirqdebug_setup("");
#ifdef CONFIG_PROC_FS
no_irq_affinity = 1;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b2c9787..0fa6790 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1073,6 +1073,7 @@
#endif
prefill_possible_map();
+
#ifdef CONFIG_X86_64
init_cpu_to_node();
#endif
@@ -1080,6 +1081,9 @@
init_apic_mappings();
ioapic_init_mappings();
+ /* must wait until the io_apic is mapped */
+ nr_irqs = probe_nr_irqs();
+
kvm_guest_init();
e820_reserve_resources();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0e67f72..410c88f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,25 +140,30 @@
*/
void __init setup_per_cpu_areas(void)
{
- ssize_t size = PERCPU_ENOUGH_ROOM;
+ ssize_t size, old_size;
char *ptr;
int cpu;
+ unsigned long align = 1;
/* Setup cpu_pda map */
setup_cpu_pda_map();
/* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
+ old_size = PERCPU_ENOUGH_ROOM;
+ align = max_t(unsigned long, PAGE_SIZE, align);
+ size = roundup(old_size, align);
printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
size);
for_each_possible_cpu(cpu) {
#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
+ ptr = __alloc_bootmem(size, align,
+ __pa(MAX_DMA_ADDRESS));
#else
int node = early_cpu_to_node(cpu);
if (!node_online(node) || !NODE_DATA(node)) {
- ptr = alloc_bootmem_pages(size);
+ ptr = __alloc_bootmem(size, align,
+ __pa(MAX_DMA_ADDRESS));
printk(KERN_INFO
"cpu %d has no node %d or node-local memory\n",
cpu, node);
@@ -167,7 +172,8 @@
cpu, __pa(ptr));
}
else {
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
+ __pa(MAX_DMA_ADDRESS));
if (ptr)
printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
cpu, node, __pa(ptr));
@@ -175,7 +181,6 @@
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
}
printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7ed9e07..7ece815 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -543,10 +543,10 @@
int timeout;
u32 status;
- printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
+ printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
for (i = 0; i < ARRAY_SIZE(regs); i++) {
- printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
+ printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
/*
* Wait for idle.
@@ -874,7 +874,7 @@
start_ip = setup_trampoline();
/* So we see what's up */
- printk(KERN_INFO "Booting processor %d/%d ip %lx\n",
+ printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
cpu, apicid, start_ip);
/*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
new file mode 100644
index 0000000..aeef529
--- /dev/null
+++ b/arch/x86/kernel/uv_irq.c
@@ -0,0 +1,79 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ functions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/irq.h>
+
+#include <asm/apic.h>
+#include <asm/uv/uv_irq.h>
+
+/* Empty callback for the uv_irq_chip operations that need no work. */
+static void uv_noop(unsigned int irq)
+{
+}
+
+/* Like uv_noop() but for callbacks that return a value; always 0. */
+static unsigned int uv_noop_ret(unsigned int irq)
+{
+ return 0;
+}
+
+/* Acknowledge the interrupt at the local APIC; @irq itself is unused. */
+static void uv_ack_apic(unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+/*
+ * irq_chip for UV interrupts: only .eoi does anything (it acks the local
+ * APIC); every other operation is a no-op stub.
+ */
+struct irq_chip uv_irq_chip = {
+ .name = "UV-CORE",
+ .startup = uv_noop_ret,
+ .shutdown = uv_noop,
+ .enable = uv_noop,
+ .disable = uv_noop,
+ .ack = uv_noop,
+ .mask = uv_noop,
+ .unmask = uv_noop,
+ .eoi = uv_ack_apic,
+ .end = uv_noop,
+};
+
+/*
+ * Allocate an irq with create_irq() and hand it to arch_enable_uv_irq()
+ * together with the name, target CPU and MMR location, so that the MMR
+ * describes the MSI sent to that CPU when the interrupt is raised.
+ *
+ * Returns -EBUSY when no irq could be created, otherwise whatever
+ * arch_enable_uv_irq() returned.  If that value is not the irq that was
+ * just created, the irq is destroyed again before returning.
+ */
+int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
+ unsigned long mmr_offset)
+{
+ int irq = create_irq();
+ int ret;
+
+ if (irq <= 0)
+ return -EBUSY;
+
+ ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
+ if (ret == irq)
+ return ret;
+
+ /* enabling did not hand back our irq: give the irq back */
+ destroy_irq(irq);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uv_setup_irq);
+
+/*
+ * Tear down a mapping of an irq and vector, and disable the specified MMR that
+ * defined the MSI that was to be sent to the specified CPU when an interrupt
+ * was raised.
+ *
+ * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
+ *
+ * The MMR is disabled first; @irq is released with destroy_irq() afterwards.
+ */
+void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
+{
+ arch_disable_uv_irq(mmr_blade, mmr_offset);
+ destroy_irq(irq);
+}
+EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
new file mode 100644
index 0000000..67f9b9d
--- /dev/null
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -0,0 +1,72 @@
+/*
+ * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
+ */
+
+#include <linux/sysdev.h>
+#include <asm/uv/bios.h>
+
+struct kobject *sgi_uv_kobj;
+
+/*
+ * sysfs show method for "partition_id": formats sn_partition_id as a
+ * decimal line into @buf (at most PAGE_SIZE bytes) and returns the
+ * snprintf() result.
+ * NOTE(review): "%ld" assumes sn_partition_id is a long - confirm.
+ */
+static ssize_t partition_id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
+}
+
+/*
+ * sysfs show method for "coherence_id": formats the value returned by
+ * partition_coherence_id() as a decimal line into @buf (at most PAGE_SIZE
+ * bytes) and returns the snprintf() result.
+ */
+static ssize_t coherence_id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
+}
+
+/* Read-only (S_IRUGO) attributes: a show method each, no store method. */
+static struct kobj_attribute partition_id_attr =
+ __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
+
+static struct kobj_attribute coherence_id_attr =
+ __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
+
+
+/*
+ * Create the "sgi_uv" kobject under firmware_kobj (unless sgi_uv_kobj is
+ * already set) and add the partition_id and coherence_id attribute files
+ * to it.
+ *
+ * Returns 0 on success, -EINVAL if the kobject could not be created, or
+ * the error returned by sysfs_create_file().
+ */
+static int __init sgi_uv_sysfs_init(void)
+{
+ /* sysfs_create_file() returns an int errno; keep it in an int */
+ int ret;
+
+ if (!sgi_uv_kobj)
+ sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
+ if (!sgi_uv_kobj) {
+ printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
+ return -EINVAL;
+ }
+
+ ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
+ if (ret) {
+ printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
+ return ret;
+ }
+
+ ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
+ if (ret) {
+ printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
+ return ret;
+ }
+
+ return 0;
+}
+
+device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 61a97e6..0c9667f 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -484,10 +484,11 @@
static unsigned int startup_cobalt_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_desc *desc = irq_to_desc(irq);
spin_lock_irqsave(&cobalt_lock, flags);
- if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
- irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
+ if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
+ desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
enable_cobalt_irq(irq);
spin_unlock_irqrestore(&cobalt_lock, flags);
return 0;
@@ -506,9 +507,10 @@
static void end_cobalt_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_desc *desc = irq_to_desc(irq);
spin_lock_irqsave(&cobalt_lock, flags);
- if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
+ if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
enable_cobalt_irq(irq);
spin_unlock_irqrestore(&cobalt_lock, flags);
}
@@ -626,12 +628,12 @@
spin_unlock_irqrestore(&i8259A_lock, flags);
- desc = irq_desc + realirq;
+ desc = irq_to_desc(realirq);
/*
* handle this 'virtual interrupt' as a Cobalt one now.
*/
- kstat_cpu(smp_processor_id()).irqs[realirq]++;
+ kstat_incr_irqs_this_cpu(realirq, desc);
if (likely(desc->action != NULL))
handle_IRQ_event(realirq, desc->action);
@@ -662,27 +664,29 @@
int i;
for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
- irq_desc[i].status = IRQ_DISABLED;
- irq_desc[i].action = 0;
- irq_desc[i].depth = 1;
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = 0;
+ desc->depth = 1;
if (i == 0) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_IDE0) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_IDE1) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_8259) {
- irq_desc[i].chip = &piix4_master_irq_type;
+ desc->chip = &piix4_master_irq_type;
}
else if (i < CO_IRQ_APIC0) {
- irq_desc[i].chip = &piix4_virtual_irq_type;
+ desc->chip = &piix4_virtual_irq_type;
}
else if (IS_CO_APIC(i)) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
}
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 6953859..254ee07 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -235,11 +235,14 @@
void __init vmi_time_init(void)
{
+ unsigned int cpu;
/* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
vmi_time_init_clockevent();
setup_irq(0, &vmi_clock_action);
+ for_each_possible_cpu(cpu)
+ per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
}
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 65f0b8a..48ee4f9 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -582,7 +582,7 @@
for (i = 0; i < LGUEST_IRQS; i++) {
int vector = FIRST_EXTERNAL_VECTOR + i;
if (vector != SYSCALL_VECTOR) {
- set_intr_gate(vector, interrupt[i]);
+ set_intr_gate(vector, interrupt[vector]);
set_irq_chip_and_handler_name(i, &lguest_irq_controller,
handle_level_irq,
"level");
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index df37fc9..3c3b471 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -41,6 +41,10 @@
{ }
};
+/* bigsmp: the allocation domain is cpumask_of_cpu(@cpu), nothing wider. */
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ return cpumask_of_cpu(cpu);
+}
static int probe_bigsmp(void)
{
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 6513d41..28459ca 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -75,4 +75,18 @@
}
#endif
+/*
+ * es7000: @cpu is ignored; the returned mask has APIC_ALL_CPUS in its
+ * first word (see below for why it is not narrowed to one cpu).
+ */
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8cf5839..71a309b 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -38,4 +38,18 @@
return 0;
}
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 6ad6b67..6272b5e 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -23,4 +23,18 @@
return 0;
}
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 199a5f4..0f6e8a6 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -1483,7 +1483,7 @@
* the interrupt off to another CPU */
static void before_handle_vic_irq(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ irq_desc_t *desc = irq_to_desc(irq);
__u8 cpu = smp_processor_id();
_raw_spin_lock(&vic_irq_lock);
@@ -1518,7 +1518,7 @@
/* Finish the VIC interrupt: basically mask */
static void after_handle_vic_irq(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ irq_desc_t *desc = irq_to_desc(irq);
_raw_spin_lock(&vic_irq_lock);
{
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 635b50e..2c4baa8 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -56,13 +56,6 @@
static DEFINE_PER_CPU(struct trap_reason, pf_reason);
static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
-#if 0 /* XXX: no way gather this info anymore */
-/* Access to this is not per-cpu. */
-static DEFINE_PER_CPU(atomic_t, dropped);
-#endif
-
-static struct dentry *marker_file;
-
static DEFINE_MUTEX(mmiotrace_mutex);
static DEFINE_SPINLOCK(trace_lock);
static atomic_t mmiotrace_enabled;
@@ -75,7 +68,7 @@
* and trace_lock.
* - Routines depending on is_enabled() must take trace_lock.
* - trace_list users must hold trace_lock.
- * - is_enabled() guarantees that mmio_trace_record is allowed.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
* - pre/post callbacks assume the effect of is_enabled() being true.
*/
@@ -97,44 +90,6 @@
return atomic_read(&mmiotrace_enabled);
}
-#if 0 /* XXX: needs rewrite */
-/*
- * Write callback for the debugfs entry:
- * Read a marker and write it to the mmio trace log
- */
-static ssize_t write_marker(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- char *event = NULL;
- struct mm_io_header *headp;
- ssize_t len = (count > 65535) ? 65535 : count;
-
- event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
- if (!event)
- return -ENOMEM;
-
- headp = (struct mm_io_header *)event;
- headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
- headp->data_len = len;
-
- if (copy_from_user(event + sizeof(*headp), buffer, len)) {
- kfree(event);
- return -EFAULT;
- }
-
- spin_lock_irq(&trace_lock);
-#if 0 /* XXX: convert this to use tracing */
- if (is_enabled())
- relay_write(chan, event, sizeof(*headp) + len);
- else
-#endif
- len = -EINVAL;
- spin_unlock_irq(&trace_lock);
- kfree(event);
- return len;
-}
-#endif
-
static void print_pte(unsigned long address)
{
unsigned int level;
@@ -307,8 +262,10 @@
map.map_id = trace->id;
spin_lock_irq(&trace_lock);
- if (!is_enabled())
+ if (!is_enabled()) {
+ kfree(trace);
goto not_enabled;
+ }
mmio_trace_mapping(&map);
list_add_tail(&trace->list, &trace_list);
@@ -377,6 +334,23 @@
iounmap_trace_core(addr);
}
+int mmiotrace_printk(const char *fmt, ...)
+{
+ int ret = 0;
+ va_list args;
+ unsigned long flags;
+ va_start(args, fmt);
+
+ spin_lock_irqsave(&trace_lock, flags);
+ if (is_enabled())
+ ret = mmio_trace_printk(fmt, args);
+ spin_unlock_irqrestore(&trace_lock, flags);
+
+ va_end(args);
+ return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
static void clear_trace_list(void)
{
struct remap_trace *trace;
@@ -462,26 +436,12 @@
}
#endif
-#if 0 /* XXX: out of order */
-static struct file_operations fops_marker = {
- .owner = THIS_MODULE,
- .write = write_marker
-};
-#endif
-
void enable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (is_enabled())
goto out;
-#if 0 /* XXX: tracing does not support text entries */
- marker_file = debugfs_create_file("marker", 0660, dir, NULL,
- &fops_marker);
- if (!marker_file)
- pr_err(NAME "marker file creation failed.\n");
-#endif
-
if (nommiotrace)
pr_info(NAME "MMIO tracing disabled.\n");
enter_uniprocessor();
@@ -506,11 +466,6 @@
clear_trace_list(); /* guarantees: no more kmmio callbacks */
leave_uniprocessor();
- if (marker_file) {
- debugfs_remove(marker_file);
- marker_file = NULL;
- }
-
pr_info(NAME "disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index efa1911..df3d5c8 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -79,25 +79,34 @@
static unsigned int mw64[] = { 0x89, 0x8B };
#endif /* not __i386__ */
-static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
- int *rexr)
+struct prefix_bits {
+ unsigned shorted:1;
+ unsigned enlarged:1;
+ unsigned rexr:1;
+ unsigned rex:1;
+};
+
+static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
{
int i;
unsigned char *p = addr;
- *shorted = 0;
- *enlarged = 0;
- *rexr = 0;
+ prf->shorted = 0;
+ prf->enlarged = 0;
+ prf->rexr = 0;
+ prf->rex = 0;
restart:
for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
if (*p == prefix_codes[i]) {
if (*p == 0x66)
- *shorted = 1;
+ prf->shorted = 1;
#ifdef __amd64__
if ((*p & 0xf8) == 0x48)
- *enlarged = 1;
+ prf->enlarged = 1;
if ((*p & 0xf4) == 0x44)
- *rexr = 1;
+ prf->rexr = 1;
+ if ((*p & 0xf0) == 0x40)
+ prf->rex = 1;
#endif
p++;
goto restart;
@@ -135,12 +144,12 @@
{
unsigned int opcode;
unsigned char *p;
- int shorted, enlarged, rexr;
+ struct prefix_bits prf;
int i;
enum reason_type rv = OTHERS;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
@@ -156,10 +165,11 @@
{
unsigned int opcode;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(rw8); i++)
@@ -168,7 +178,7 @@
for (i = 0; i < ARRAY_SIZE(rw32); i++)
if (rw32[i] == opcode)
- return (shorted ? 2 : (enlarged ? 8 : 4));
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
@@ -178,10 +188,11 @@
{
unsigned int opcode;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(mw8); i++)
@@ -194,11 +205,11 @@
for (i = 0; i < ARRAY_SIZE(mw32); i++)
if (mw32[i] == opcode)
- return shorted ? 2 : 4;
+ return prf.shorted ? 2 : 4;
for (i = 0; i < ARRAY_SIZE(mw64); i++)
if (mw64[i] == opcode)
- return shorted ? 2 : (enlarged ? 8 : 4);
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
@@ -238,7 +249,7 @@
#endif
};
-static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
+static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
{
unsigned char *rv = NULL;
@@ -255,18 +266,6 @@
case arg_DL:
rv = (unsigned char *)&regs->dx;
break;
- case arg_AH:
- rv = 1 + (unsigned char *)&regs->ax;
- break;
- case arg_BH:
- rv = 1 + (unsigned char *)&regs->bx;
- break;
- case arg_CH:
- rv = 1 + (unsigned char *)&regs->cx;
- break;
- case arg_DH:
- rv = 1 + (unsigned char *)&regs->dx;
- break;
#ifdef __amd64__
case arg_R8:
rv = (unsigned char *)&regs->r8;
@@ -294,9 +293,55 @@
break;
#endif
default:
- printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
break;
}
+
+ if (rv)
+ return rv;
+
+ if (rex) {
+ /*
+ * If REX prefix exists, access low bytes of SI etc.
+ * instead of AH etc.
+ */
+ switch (no) {
+ case arg_SI:
+ rv = (unsigned char *)&regs->si;
+ break;
+ case arg_DI:
+ rv = (unsigned char *)&regs->di;
+ break;
+ case arg_BP:
+ rv = (unsigned char *)&regs->bp;
+ break;
+ case arg_SP:
+ rv = (unsigned char *)&regs->sp;
+ break;
+ default:
+ break;
+ }
+ } else {
+ switch (no) {
+ case arg_AH:
+ rv = 1 + (unsigned char *)&regs->ax;
+ break;
+ case arg_BH:
+ rv = 1 + (unsigned char *)&regs->bx;
+ break;
+ case arg_CH:
+ rv = 1 + (unsigned char *)&regs->cx;
+ break;
+ case arg_DH:
+ rv = 1 + (unsigned char *)&regs->dx;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!rv)
+ printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+
return rv;
}
@@ -368,11 +413,12 @@
unsigned char mod_rm;
int reg;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
unsigned long rv;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
if (reg_rop[i] == opcode) {
@@ -392,10 +438,10 @@
do_work:
mod_rm = *p;
- reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
+ reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
switch (get_ins_reg_width(ins_addr)) {
case 1:
- return *get_reg_w8(reg, regs);
+ return *get_reg_w8(reg, prf.rex, regs);
case 2:
return *(unsigned short *)get_reg_w32(reg, regs);
@@ -422,11 +468,12 @@
unsigned char mod_rm;
unsigned char mod;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
unsigned long rv;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
if (imm_wop[i] == opcode) {
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index d877c5b..ab50a8d 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -3,6 +3,7 @@
*/
#include <linux/module.h>
#include <linux/io.h>
+#include <linux/mmiotrace.h>
#define MODULE_NAME "testmmiotrace"
@@ -13,6 +14,7 @@
static void do_write_test(void __iomem *p)
{
unsigned int i;
+ mmiotrace_printk("Write test.\n");
for (i = 0; i < 256; i++)
iowrite8(i, p + i);
for (i = 1024; i < (5 * 1024); i += 2)
@@ -24,6 +26,7 @@
static void do_read_test(void __iomem *p)
{
unsigned int i;
+ mmiotrace_printk("Read test.\n");
for (i = 0; i < 256; i++)
ioread8(p + i);
for (i = 1024; i < (5 * 1024); i += 2)
@@ -39,6 +42,7 @@
pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
return;
}
+ mmiotrace_printk("ioremap returned %p.\n", p);
do_write_test(p);
do_read_test(p);
iounmap(p);
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 28b85ab..bb04260 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -21,7 +21,6 @@
static void __init __xen_init_IRQ(void)
{
-#ifdef CONFIG_X86_64
int i;
/* Create identity vector->irq map */
@@ -31,7 +30,6 @@
for_each_possible_cpu(cpu)
per_cpu(vector_irq, cpu)[i] = i;
}
-#endif /* CONFIG_X86_64 */
xen_init_IRQ();
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index dd71e3a..5601506 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -241,7 +241,7 @@
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
- kstat_this_cpu.irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
out:
raw_local_irq_restore(flags);
diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
index e8362c1..dcbf1be 100644
--- a/crypto/async_tx/async_tx.c
+++ b/crypto/async_tx/async_tx.c
@@ -115,34 +115,32 @@
* (start) dependent operations on their target channel
* @tx: transaction with dependencies
*/
-void
-async_tx_run_dependencies(struct dma_async_tx_descriptor *tx)
+void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx)
{
- struct dma_async_tx_descriptor *next = tx->next;
+ struct dma_async_tx_descriptor *dep = tx->next;
+ struct dma_async_tx_descriptor *dep_next;
struct dma_chan *chan;
- if (!next)
+ if (!dep)
return;
- tx->next = NULL;
- chan = next->chan;
+ chan = dep->chan;
/* keep submitting up until a channel switch is detected
* in that case we will be called again as a result of
* processing the interrupt from async_tx_channel_switch
*/
- while (next && next->chan == chan) {
- struct dma_async_tx_descriptor *_next;
+ for (; dep; dep = dep_next) {
+ spin_lock_bh(&dep->lock);
+ dep->parent = NULL;
+ dep_next = dep->next;
+ if (dep_next && dep_next->chan == chan)
+ dep->next = NULL; /* ->next will be submitted */
+ else
+ dep_next = NULL; /* submit current dep and terminate */
+ spin_unlock_bh(&dep->lock);
- spin_lock_bh(&next->lock);
- next->parent = NULL;
- _next = next->next;
- if (_next && _next->chan == chan)
- next->next = NULL;
- spin_unlock_bh(&next->lock);
-
- next->tx_submit(next);
- next = _next;
+ dep->tx_submit(dep);
}
chan->device->device_issue_pending(chan);
diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c
index 31dcd91..dc8d1a9 100644
--- a/drivers/char/agp/ali-agp.c
+++ b/drivers/char/agp/ali-agp.c
@@ -417,6 +417,6 @@
module_init(agp_ali_init);
module_exit(agp_ali_cleanup);
-MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
MODULE_LICENSE("GPL and additional rights");
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 2812ee2..52f4361 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -772,6 +772,6 @@
module_exit(agp_amd64_cleanup);
#endif
-MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>, Andi Kleen");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>, Andi Kleen");
module_param(agp_try_unsupported, bool, 0);
MODULE_LICENSE("GPL");
diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c
index ae2791b..f1537ee 100644
--- a/drivers/char/agp/ati-agp.c
+++ b/drivers/char/agp/ati-agp.c
@@ -561,6 +561,6 @@
module_init(agp_ati_init);
module_exit(agp_ati_cleanup);
-MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
MODULE_LICENSE("GPL and additional rights");
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index 3a3cc03..8c617ad 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -349,7 +349,7 @@
__setup("agp=", agp_setup);
#endif
-MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
MODULE_DESCRIPTION("AGP GART driver");
MODULE_LICENSE("GPL and additional rights");
MODULE_ALIAS_MISCDEV(AGPGART_MINOR);
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 1108665..9cf6e9b 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -2390,5 +2390,5 @@
module_init(agp_intel_init);
module_exit(agp_intel_cleanup);
-MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
MODULE_LICENSE("GPL and additional rights");
diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c
index 5bbed3d..16acee2 100644
--- a/drivers/char/agp/nvidia-agp.c
+++ b/drivers/char/agp/nvidia-agp.c
@@ -1,7 +1,7 @@
/*
* Nvidia AGPGART routines.
* Based upon a 2.4 agpgart diff by the folks from NVIDIA, and hacked up
- * to work in 2.5 by Dave Jones <davej@codemonkey.org.uk>
+ * to work in 2.5 by Dave Jones <davej@redhat.com>
*/
#include <linux/module.h>
diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c
index 9f4d49e..d3bd243 100644
--- a/drivers/char/agp/via-agp.c
+++ b/drivers/char/agp/via-agp.c
@@ -595,4 +595,4 @@
module_exit(agp_via_cleanup);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index f3cfb4c..408f5f9 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -219,7 +219,7 @@
for (irq = find_first_bit(&v, HPET_MAX_IRQ); irq < HPET_MAX_IRQ;
irq = find_next_bit(&v, HPET_MAX_IRQ, 1 + irq)) {
- if (irq >= NR_IRQS) {
+ if (irq >= nr_irqs) {
irq = HPET_MAX_IRQ;
break;
}
diff --git a/drivers/char/random.c b/drivers/char/random.c
index c8752ea..705a839 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -558,9 +558,26 @@
unsigned dont_count_entropy:1;
};
-static struct timer_rand_state input_timer_state;
static struct timer_rand_state *irq_timer_state[NR_IRQS];
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+ if (irq >= nr_irqs)
+ return NULL;
+
+ return irq_timer_state[irq];
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+ if (irq >= nr_irqs)
+ return;
+
+ irq_timer_state[irq] = state;
+}
+
+static struct timer_rand_state input_timer_state;
+
/*
* This function adds entropy to the entropy "pool" by using timing
* delays. It uses the timer_rand_state structure to make an estimate
@@ -648,11 +665,15 @@
void add_interrupt_randomness(int irq)
{
- if (irq >= NR_IRQS || irq_timer_state[irq] == NULL)
+ struct timer_rand_state *state;
+
+ state = get_timer_rand_state(irq);
+
+ if (state == NULL)
return;
DEBUG_ENT("irq event %d\n", irq);
- add_timer_randomness(irq_timer_state[irq], 0x100 + irq);
+ add_timer_randomness(state, 0x100 + irq);
}
#ifdef CONFIG_BLOCK
@@ -912,7 +933,12 @@
{
struct timer_rand_state *state;
- if (irq >= NR_IRQS || irq_timer_state[irq])
+ if (irq >= nr_irqs)
+ return;
+
+ state = get_timer_rand_state(irq);
+
+ if (state)
return;
/*
@@ -921,7 +947,7 @@
*/
state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL);
if (state)
- irq_timer_state[irq] = state;
+ set_timer_rand_state(irq, state);
}
#ifdef CONFIG_BLOCK
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index d0c0d64..ce0d9da 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -168,7 +168,7 @@
static struct sysrq_key_op sysrq_show_timers_op = {
.handler = sysrq_handle_show_timers,
.help_msg = "show-all-timers(Q)",
- .action_msg = "Show pending hrtimers (no others)",
+ .action_msg = "Show clockevent devices & pending hrtimers (no others)",
};
static void sysrq_handle_mountro(int key, struct tty_struct *tty)
diff --git a/drivers/char/vr41xx_giu.c b/drivers/char/vr41xx_giu.c
index ffe9b4e..54c8372 100644
--- a/drivers/char/vr41xx_giu.c
+++ b/drivers/char/vr41xx_giu.c
@@ -641,7 +641,7 @@
}
irq = platform_get_irq(dev, 0);
- if (irq < 0 || irq >= NR_IRQS)
+ if (irq < 0 || irq >= nr_irqs)
return -EBUSY;
return cascade_irq(irq, giu_get_irq);
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index 71d2ac4..c201710 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -237,9 +237,12 @@
if (strict_strtoul(arg, 16, &base))
return -EINVAL;
-
+#ifdef CONFIG_X86_64
+ if (base > UINT_MAX)
+ return -ERANGE;
+#endif
printk(KERN_INFO "PMTMR IOPort override: 0x%04x -> 0x%04lx\n",
- (unsigned int)pmtmr_ioport, base);
+ pmtmr_ioport, base);
pmtmr_ioport = base;
return 1;
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index cd30390..904e575 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -48,13 +48,13 @@
can be integrated in chips such as the Atmel AT32ap7000.
config FSL_DMA
- bool "Freescale MPC85xx/MPC83xx DMA support"
- depends on PPC
+ tristate "Freescale Elo and Elo Plus DMA support"
+ depends on FSL_SOC
select DMA_ENGINE
---help---
- Enable support for the Freescale DMA engine. Now, it support
- MPC8560/40, MPC8555, MPC8548 and MPC8641 processors.
- The MPC8349, MPC8360 is also supported.
+ Enable support for the Freescale Elo and Elo Plus DMA controllers.
+ The Elo is the DMA controller on some 82xx and 83xx parts, and the
+ Elo Plus is the DMA controller on 85xx and 86xx parts.
config MV_XOR
bool "Marvell XOR engine support"
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index a08d197..d1e381e 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -325,7 +325,12 @@
struct dmatest_thread *thread;
unsigned int i;
- dtc = kmalloc(sizeof(struct dmatest_chan), GFP_ATOMIC);
+ /* Have we already been told about this channel? */
+ list_for_each_entry(dtc, &dmatest_channels, node)
+ if (dtc->chan == chan)
+ return DMA_DUP;
+
+ dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL);
if (!dtc) {
pr_warning("dmatest: No memory for %s\n", chan->dev.bus_id);
return DMA_NAK;
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index c0059ca..0b95dcc 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -370,7 +370,10 @@
struct dma_client *client)
{
struct fsl_dma_chan *fsl_chan = to_fsl_chan(chan);
- LIST_HEAD(tmp_list);
+
+ /* Has this channel already been allocated? */
+ if (fsl_chan->desc_pool)
+ return 1;
/* We need the descriptor to be aligned to 32bytes
* for meeting FSL DMA specification requirement.
@@ -410,6 +413,8 @@
}
spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
dma_pool_destroy(fsl_chan->desc_pool);
+
+ fsl_chan->desc_pool = NULL;
}
static struct dma_async_tx_descriptor *
@@ -786,159 +791,29 @@
fsl_chan_ld_cleanup(fsl_chan);
}
-static void fsl_dma_callback_test(void *param)
+static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev,
+ struct device_node *node, u32 feature, const char *compatible)
{
- struct fsl_dma_chan *fsl_chan = param;
- if (fsl_chan)
- dev_dbg(fsl_chan->dev, "selftest: callback is ok!\n");
-}
-
-static int fsl_dma_self_test(struct fsl_dma_chan *fsl_chan)
-{
- struct dma_chan *chan;
- int err = 0;
- dma_addr_t dma_dest, dma_src;
- dma_cookie_t cookie;
- u8 *src, *dest;
- int i;
- size_t test_size;
- struct dma_async_tx_descriptor *tx1, *tx2, *tx3;
-
- test_size = 4096;
-
- src = kmalloc(test_size * 2, GFP_KERNEL);
- if (!src) {
- dev_err(fsl_chan->dev,
- "selftest: Cannot alloc memory for test!\n");
- return -ENOMEM;
- }
-
- dest = src + test_size;
-
- for (i = 0; i < test_size; i++)
- src[i] = (u8) i;
-
- chan = &fsl_chan->common;
-
- if (fsl_dma_alloc_chan_resources(chan, NULL) < 1) {
- dev_err(fsl_chan->dev,
- "selftest: Cannot alloc resources for DMA\n");
- err = -ENODEV;
- goto out;
- }
-
- /* TX 1 */
- dma_src = dma_map_single(fsl_chan->dev, src, test_size / 2,
- DMA_TO_DEVICE);
- dma_dest = dma_map_single(fsl_chan->dev, dest, test_size / 2,
- DMA_FROM_DEVICE);
- tx1 = fsl_dma_prep_memcpy(chan, dma_dest, dma_src, test_size / 2, 0);
- async_tx_ack(tx1);
-
- cookie = fsl_dma_tx_submit(tx1);
- fsl_dma_memcpy_issue_pending(chan);
- msleep(2);
-
- if (fsl_dma_is_complete(chan, cookie, NULL, NULL) != DMA_SUCCESS) {
- dev_err(fsl_chan->dev, "selftest: Time out!\n");
- err = -ENODEV;
- goto free_resources;
- }
-
- /* Test free and re-alloc channel resources */
- fsl_dma_free_chan_resources(chan);
-
- if (fsl_dma_alloc_chan_resources(chan, NULL) < 1) {
- dev_err(fsl_chan->dev,
- "selftest: Cannot alloc resources for DMA\n");
- err = -ENODEV;
- goto free_resources;
- }
-
- /* Continue to test
- * TX 2
- */
- dma_src = dma_map_single(fsl_chan->dev, src + test_size / 2,
- test_size / 4, DMA_TO_DEVICE);
- dma_dest = dma_map_single(fsl_chan->dev, dest + test_size / 2,
- test_size / 4, DMA_FROM_DEVICE);
- tx2 = fsl_dma_prep_memcpy(chan, dma_dest, dma_src, test_size / 4, 0);
- async_tx_ack(tx2);
-
- /* TX 3 */
- dma_src = dma_map_single(fsl_chan->dev, src + test_size * 3 / 4,
- test_size / 4, DMA_TO_DEVICE);
- dma_dest = dma_map_single(fsl_chan->dev, dest + test_size * 3 / 4,
- test_size / 4, DMA_FROM_DEVICE);
- tx3 = fsl_dma_prep_memcpy(chan, dma_dest, dma_src, test_size / 4, 0);
- async_tx_ack(tx3);
-
- /* Interrupt tx test */
- tx1 = fsl_dma_prep_interrupt(chan, 0);
- async_tx_ack(tx1);
- cookie = fsl_dma_tx_submit(tx1);
-
- /* Test exchanging the prepared tx sort */
- cookie = fsl_dma_tx_submit(tx3);
- cookie = fsl_dma_tx_submit(tx2);
-
- if (dma_has_cap(DMA_INTERRUPT, ((struct fsl_dma_device *)
- dev_get_drvdata(fsl_chan->dev->parent))->common.cap_mask)) {
- tx3->callback = fsl_dma_callback_test;
- tx3->callback_param = fsl_chan;
- }
- fsl_dma_memcpy_issue_pending(chan);
- msleep(2);
-
- if (fsl_dma_is_complete(chan, cookie, NULL, NULL) != DMA_SUCCESS) {
- dev_err(fsl_chan->dev, "selftest: Time out!\n");
- err = -ENODEV;
- goto free_resources;
- }
-
- err = memcmp(src, dest, test_size);
- if (err) {
- for (i = 0; (*(src + i) == *(dest + i)) && (i < test_size);
- i++);
- dev_err(fsl_chan->dev, "selftest: Test failed, data %d/%ld is "
- "error! src 0x%x, dest 0x%x\n",
- i, (long)test_size, *(src + i), *(dest + i));
- }
-
-free_resources:
- fsl_dma_free_chan_resources(chan);
-out:
- kfree(src);
- return err;
-}
-
-static int __devinit of_fsl_dma_chan_probe(struct of_device *dev,
- const struct of_device_id *match)
-{
- struct fsl_dma_device *fdev;
struct fsl_dma_chan *new_fsl_chan;
int err;
- fdev = dev_get_drvdata(dev->dev.parent);
- BUG_ON(!fdev);
-
/* alloc channel */
new_fsl_chan = kzalloc(sizeof(struct fsl_dma_chan), GFP_KERNEL);
if (!new_fsl_chan) {
- dev_err(&dev->dev, "No free memory for allocating "
+ dev_err(fdev->dev, "No free memory for allocating "
"dma channels!\n");
return -ENOMEM;
}
/* get dma channel register base */
- err = of_address_to_resource(dev->node, 0, &new_fsl_chan->reg);
+ err = of_address_to_resource(node, 0, &new_fsl_chan->reg);
if (err) {
- dev_err(&dev->dev, "Can't get %s property 'reg'\n",
- dev->node->full_name);
+ dev_err(fdev->dev, "Can't get %s property 'reg'\n",
+ node->full_name);
goto err_no_reg;
}
- new_fsl_chan->feature = *(u32 *)match->data;
+ new_fsl_chan->feature = feature;
if (!fdev->feature)
fdev->feature = new_fsl_chan->feature;
@@ -948,13 +823,13 @@
*/
WARN_ON(fdev->feature != new_fsl_chan->feature);
- new_fsl_chan->dev = &dev->dev;
+ new_fsl_chan->dev = &new_fsl_chan->common.dev;
new_fsl_chan->reg_base = ioremap(new_fsl_chan->reg.start,
new_fsl_chan->reg.end - new_fsl_chan->reg.start + 1);
new_fsl_chan->id = ((new_fsl_chan->reg.start - 0x100) & 0xfff) >> 7;
if (new_fsl_chan->id > FSL_DMA_MAX_CHANS_PER_DEVICE) {
- dev_err(&dev->dev, "There is no %d channel!\n",
+ dev_err(fdev->dev, "There is no %d channel!\n",
new_fsl_chan->id);
err = -EINVAL;
goto err_no_chan;
@@ -988,29 +863,23 @@
&fdev->common.channels);
fdev->common.chancnt++;
- new_fsl_chan->irq = irq_of_parse_and_map(dev->node, 0);
+ new_fsl_chan->irq = irq_of_parse_and_map(node, 0);
if (new_fsl_chan->irq != NO_IRQ) {
err = request_irq(new_fsl_chan->irq,
&fsl_dma_chan_do_interrupt, IRQF_SHARED,
"fsldma-channel", new_fsl_chan);
if (err) {
- dev_err(&dev->dev, "DMA channel %s request_irq error "
- "with return %d\n", dev->node->full_name, err);
+ dev_err(fdev->dev, "DMA channel %s request_irq error "
+ "with return %d\n", node->full_name, err);
goto err_no_irq;
}
}
- err = fsl_dma_self_test(new_fsl_chan);
- if (err)
- goto err_self_test;
-
- dev_info(&dev->dev, "#%d (%s), irq %d\n", new_fsl_chan->id,
- match->compatible, new_fsl_chan->irq);
+ dev_info(fdev->dev, "#%d (%s), irq %d\n", new_fsl_chan->id,
+ compatible, new_fsl_chan->irq);
return 0;
-err_self_test:
- free_irq(new_fsl_chan->irq, new_fsl_chan);
err_no_irq:
list_del(&new_fsl_chan->common.device_node);
err_no_chan:
@@ -1020,38 +889,20 @@
return err;
}
-const u32 mpc8540_dma_ip_feature = FSL_DMA_IP_85XX | FSL_DMA_BIG_ENDIAN;
-const u32 mpc8349_dma_ip_feature = FSL_DMA_IP_83XX | FSL_DMA_LITTLE_ENDIAN;
-
-static struct of_device_id of_fsl_dma_chan_ids[] = {
- {
- .compatible = "fsl,eloplus-dma-channel",
- .data = (void *)&mpc8540_dma_ip_feature,
- },
- {
- .compatible = "fsl,elo-dma-channel",
- .data = (void *)&mpc8349_dma_ip_feature,
- },
- {}
-};
-
-static struct of_platform_driver of_fsl_dma_chan_driver = {
- .name = "of-fsl-dma-channel",
- .match_table = of_fsl_dma_chan_ids,
- .probe = of_fsl_dma_chan_probe,
-};
-
-static __init int of_fsl_dma_chan_init(void)
+static void fsl_dma_chan_remove(struct fsl_dma_chan *fchan)
{
- return of_register_platform_driver(&of_fsl_dma_chan_driver);
+ free_irq(fchan->irq, fchan);
+ list_del(&fchan->common.device_node);
+ iounmap(fchan->reg_base);
+ kfree(fchan);
}
static int __devinit of_fsl_dma_probe(struct of_device *dev,
const struct of_device_id *match)
{
int err;
- unsigned int irq;
struct fsl_dma_device *fdev;
+ struct device_node *child;
fdev = kzalloc(sizeof(struct fsl_dma_device), GFP_KERNEL);
if (!fdev) {
@@ -1085,9 +936,9 @@
fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending;
fdev->common.dev = &dev->dev;
- irq = irq_of_parse_and_map(dev->node, 0);
- if (irq != NO_IRQ) {
- err = request_irq(irq, &fsl_dma_do_interrupt, IRQF_SHARED,
+ fdev->irq = irq_of_parse_and_map(dev->node, 0);
+ if (fdev->irq != NO_IRQ) {
+ err = request_irq(fdev->irq, &fsl_dma_do_interrupt, IRQF_SHARED,
"fsldma-device", fdev);
if (err) {
dev_err(&dev->dev, "DMA device request_irq error "
@@ -1097,7 +948,21 @@
}
dev_set_drvdata(&(dev->dev), fdev);
- of_platform_bus_probe(dev->node, of_fsl_dma_chan_ids, &dev->dev);
+
+ /* We cannot use of_platform_bus_probe() because there is no
+ * of_platform_bus_remove. Instead, we manually instantiate every DMA
+ * channel object.
+ */
+ for_each_child_of_node(dev->node, child) {
+ if (of_device_is_compatible(child, "fsl,eloplus-dma-channel"))
+ fsl_dma_chan_probe(fdev, child,
+ FSL_DMA_IP_85XX | FSL_DMA_BIG_ENDIAN,
+ "fsl,eloplus-dma-channel");
+ if (of_device_is_compatible(child, "fsl,elo-dma-channel"))
+ fsl_dma_chan_probe(fdev, child,
+ FSL_DMA_IP_83XX | FSL_DMA_LITTLE_ENDIAN,
+ "fsl,elo-dma-channel");
+ }
dma_async_device_register(&fdev->common);
return 0;
@@ -1109,6 +974,30 @@
return err;
}
+static int of_fsl_dma_remove(struct of_device *of_dev)
+{
+ struct fsl_dma_device *fdev;
+ unsigned int i;
+
+ fdev = dev_get_drvdata(&of_dev->dev);
+
+ dma_async_device_unregister(&fdev->common);
+
+ for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++)
+ if (fdev->chan[i])
+ fsl_dma_chan_remove(fdev->chan[i]);
+
+ if (fdev->irq != NO_IRQ)
+ free_irq(fdev->irq, fdev);
+
+ iounmap(fdev->reg_base);
+
+ kfree(fdev);
+ dev_set_drvdata(&of_dev->dev, NULL);
+
+ return 0;
+}
+
static struct of_device_id of_fsl_dma_ids[] = {
{ .compatible = "fsl,eloplus-dma", },
{ .compatible = "fsl,elo-dma", },
@@ -1116,15 +1005,32 @@
};
static struct of_platform_driver of_fsl_dma_driver = {
- .name = "of-fsl-dma",
+ .name = "fsl-elo-dma",
.match_table = of_fsl_dma_ids,
.probe = of_fsl_dma_probe,
+ .remove = of_fsl_dma_remove,
};
static __init int of_fsl_dma_init(void)
{
- return of_register_platform_driver(&of_fsl_dma_driver);
+ int ret;
+
+ pr_info("Freescale Elo / Elo Plus DMA driver\n");
+
+ ret = of_register_platform_driver(&of_fsl_dma_driver);
+ if (ret)
+ pr_err("fsldma: failed to register platform driver\n");
+
+ return ret;
}
-subsys_initcall(of_fsl_dma_chan_init);
+static void __exit of_fsl_dma_exit(void)
+{
+ of_unregister_platform_driver(&of_fsl_dma_driver);
+}
+
subsys_initcall(of_fsl_dma_init);
+module_exit(of_fsl_dma_exit);
+
+MODULE_DESCRIPTION("Freescale Elo / Elo Plus DMA driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index 6faf07b..4f21a51 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -114,6 +114,7 @@
struct dma_device common;
struct fsl_dma_chan *chan[FSL_DMA_MAX_CHANS_PER_DEVICE];
u32 feature; /* The same as DMA channels */
+ int irq; /* Device (controller-level) IRQ */
};
/* Define macros for fsl_dma_chan->feature property */
diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index bc8c6e3..1ef68b3 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -971,11 +971,9 @@
switch (ioat_chan->device->version) {
case IOAT_VER_1_2:
return ioat1_dma_get_next_descriptor(ioat_chan);
- break;
case IOAT_VER_2_0:
case IOAT_VER_3_0:
return ioat2_dma_get_next_descriptor(ioat_chan);
- break;
}
return NULL;
}
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 22edc42..faa1cc6 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1143,7 +1143,7 @@
if (!is_out) {
int irq = gpio_to_irq(gpio);
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
/* This races with request_irq(), set_irq_type(),
* and set_irq_wake() ... but those are "rare".
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 74a369a..a820ca6 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -84,21 +84,40 @@
If unsure, say N.
-config BLK_DEV_IDEDISK
- tristate "Include IDE/ATA-2 DISK support"
- ---help---
- This will include enhanced support for MFM/RLL/IDE hard disks. If
- you have a MFM/RLL/IDE disk, and there is no special reason to use
- the old hard disk driver instead, say Y. If you have an SCSI-only
- system, you can say N here.
+config IDE_GD
+ tristate "generic ATA/ATAPI disk support"
+ default y
+ help
+ Support for ATA/ATAPI disks (including ATAPI floppy drives).
- To compile this driver as a module, choose M here: the
- module will be called ide-disk.
- Do not compile this driver as a module if your root file system
- (the one containing the directory /) is located on the IDE disk.
+ To compile this driver as a module, choose M here.
+ The module will be called ide-gd_mod.
If unsure, say Y.
+config IDE_GD_ATA
+ bool "ATA disk support"
+ depends on IDE_GD
+ default y
+ help
+ This will include support for ATA hard disks.
+
+ If unsure, say Y.
+
+config IDE_GD_ATAPI
+ bool "ATAPI floppy support"
+ depends on IDE_GD
+ select IDE_ATAPI
+ help
+ This will include support for ATAPI floppy drives
+ (i.e. Iomega ZIP or MKE LS-120).
+
+ For information about jumper settings and the question
+ of when a ZIP drive uses a partition table, see
+ <http://www.win.tue.nl/~aeb/linux/zip/zip-1.html>.
+
+ If unsure, say N.
+
config BLK_DEV_IDECS
tristate "PCMCIA IDE support"
depends on PCMCIA
@@ -163,29 +182,6 @@
To compile this driver as a module, choose M here: the
module will be called ide-tape.
-config BLK_DEV_IDEFLOPPY
- tristate "Include IDE/ATAPI FLOPPY support"
- select IDE_ATAPI
- ---help---
- If you have an IDE floppy drive which uses the ATAPI protocol,
- answer Y. ATAPI is a newer protocol used by IDE CD-ROM/tape/floppy
- drives, similar to the SCSI protocol.
-
- The LS-120 and the IDE/ATAPI Iomega ZIP drive are also supported by
- this driver. For information about jumper settings and the question
- of when a ZIP drive uses a partition table, see
- <http://www.win.tue.nl/~aeb/linux/zip/zip-1.html>.
- (ATAPI PD-CD/CDR drives are not supported by this driver; support
- for PD-CD/CDR drives is available if you answer Y to
- "SCSI emulation support", below).
-
- If you say Y here, the FLOPPY drive will be identified along with
- other IDE devices, as "hdb" or "hdc", or something similar (check
- the boot messages with dmesg).
-
- To compile this driver as a module, choose M here: the
- module will be called ide-floppy.
-
config BLK_DEV_IDESCSI
tristate "SCSI emulation support (DEPRECATED)"
depends on SCSI
@@ -332,7 +328,7 @@
# TODO: split it on per host driver config options (or module parameters)
config BLK_DEV_OFFBOARD
bool "Boot off-board chipsets first support (DEPRECATED)"
- depends on BLK_DEV_IDEPCI && (BLK_DEV_AEC62XX || BLK_DEV_GENERIC || BLK_DEV_HPT34X || BLK_DEV_HPT366 || BLK_DEV_PDC202XX_NEW || BLK_DEV_PDC202XX_OLD || BLK_DEV_TC86C001)
+ depends on BLK_DEV_IDEPCI && (BLK_DEV_AEC62XX || BLK_DEV_GENERIC || BLK_DEV_HPT366 || BLK_DEV_PDC202XX_NEW || BLK_DEV_PDC202XX_OLD || BLK_DEV_TC86C001)
help
Normally, IDE controllers built into the motherboard (on-board
controllers) are assigned to ide0 and ide1 while those on add-in PCI
@@ -482,28 +478,6 @@
It is safe to say Y to this question.
-config BLK_DEV_HPT34X
- tristate "HPT34X chipset support"
- depends on BROKEN
- select BLK_DEV_IDEDMA_PCI
- help
- This driver adds up to 4 more EIDE devices sharing a single
- interrupt. The HPT343 chipset in its current form is a non-bootable
- controller; the HPT345/HPT363 chipset is a bootable (needs BIOS FIX)
- PCI UDMA controllers. This driver requires dynamic tuning of the
- chipset during the ide-probe at boot time. It is reported to support
- DVD II drives, by the manufacturer.
-
-config HPT34X_AUTODMA
- bool "HPT34X AUTODMA support (EXPERIMENTAL)"
- depends on BLK_DEV_HPT34X && EXPERIMENTAL
- help
- This is a dangerous thing to attempt currently! Please read the
- comments at the top of <file:drivers/ide/pci/hpt34x.c>. If you say Y
- here, then say Y to "Use DMA by default when available" as well.
-
- If unsure, say N.
-
config BLK_DEV_HPT366
tristate "HPT36X/37X chipset support"
select BLK_DEV_IDEDMA_PCI
diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
index ceaf779..093d324 100644
--- a/drivers/ide/Makefile
+++ b/drivers/ide/Makefile
@@ -37,18 +37,25 @@
obj-$(CONFIG_IDE_GENERIC) += ide-generic.o
obj-$(CONFIG_BLK_DEV_IDEPNP) += ide-pnp.o
-ide-disk_mod-y += ide-disk.o ide-disk_ioctl.o
+ide-gd_mod-y += ide-gd.o
ide-cd_mod-y += ide-cd.o ide-cd_ioctl.o ide-cd_verbose.o
-ide-floppy_mod-y += ide-floppy.o ide-floppy_ioctl.o
+ifeq ($(CONFIG_IDE_GD_ATA), y)
+ ide-gd_mod-y += ide-disk.o ide-disk_ioctl.o
ifeq ($(CONFIG_IDE_PROC_FS), y)
- ide-disk_mod-y += ide-disk_proc.o
- ide-floppy_mod-y += ide-floppy_proc.o
+ ide-gd_mod-y += ide-disk_proc.o
+endif
endif
-obj-$(CONFIG_BLK_DEV_IDEDISK) += ide-disk_mod.o
+ifeq ($(CONFIG_IDE_GD_ATAPI), y)
+ ide-gd_mod-y += ide-floppy.o ide-floppy_ioctl.o
+ifeq ($(CONFIG_IDE_PROC_FS), y)
+ ide-gd_mod-y += ide-floppy_proc.o
+endif
+endif
+
+obj-$(CONFIG_IDE_GD) += ide-gd_mod.o
obj-$(CONFIG_BLK_DEV_IDECD) += ide-cd_mod.o
-obj-$(CONFIG_BLK_DEV_IDEFLOPPY) += ide-floppy_mod.o
obj-$(CONFIG_BLK_DEV_IDETAPE) += ide-tape.o
ifeq ($(CONFIG_BLK_DEV_IDECS), y)
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2e30571..4e58b9e 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -191,7 +191,7 @@
{
struct ide_atapi_pc pc;
- if (drive->atapi_flags & IDE_AFLAG_NO_DOORLOCK)
+ if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0)
return 0;
ide_init_pc(&pc);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3308b1c..13265a8 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -99,7 +99,7 @@
/* Mark that we've seen a media change and invalidate our internal buffers. */
static void cdrom_saw_media_change(ide_drive_t *drive)
{
- drive->atapi_flags |= IDE_AFLAG_MEDIA_CHANGED;
+ drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
drive->atapi_flags &= ~IDE_AFLAG_TOC_VALID;
}
@@ -340,8 +340,8 @@
}
ide_debug_log(IDE_DBG_RQ, "%s: stat: 0x%x, good_stat: 0x%x, "
- "rq->cmd_type: 0x%x, err: 0x%x\n", __func__, stat,
- good_stat, rq->cmd_type, err);
+ "rq->cmd[0]: 0x%x, rq->cmd_type: 0x%x, err: 0x%x\n",
+ __func__, stat, good_stat, rq->cmd[0], rq->cmd_type, err);
if (blk_sense_request(rq)) {
/*
@@ -843,13 +843,10 @@
rq->q->prep_rq_fn(rq->q, rq);
}
-/*
- * All other packet commands.
- */
static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq)
{
-
- ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
+ ide_debug_log(IDE_DBG_FUNC, "Call %s, rq->cmd[0]: 0x%x\n",
+ __func__, rq->cmd[0]);
/*
* Some of the trailing request sense fields are optional,
@@ -876,7 +873,7 @@
if (!sense)
sense = &local_sense;
- ide_debug_log(IDE_DBG_PC, "Call %s, rq->cmd[0]: 0x%x, write: 0x%x, "
+ ide_debug_log(IDE_DBG_PC, "Call %s, cmd[0]: 0x%x, write: 0x%x, "
"timeout: %d, cmd_flags: 0x%x\n", __func__, cmd[0], write,
timeout, cmd_flags);
@@ -1177,8 +1174,9 @@
unsigned short sectors_per_frame =
queue_hardsect_size(drive->queue) >> SECTOR_BITS;
- ide_debug_log(IDE_DBG_RQ, "Call %s, write: 0x%x, secs_per_frame: %u\n",
- __func__, write, sectors_per_frame);
+ ide_debug_log(IDE_DBG_RQ, "Call %s, rq->cmd[0]: 0x%x, write: 0x%x, "
+ "secs_per_frame: %u\n",
+ __func__, rq->cmd[0], write, sectors_per_frame);
if (write) {
/* disk has become write protected */
@@ -1221,7 +1219,8 @@
static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
{
- ide_debug_log(IDE_DBG_PC, "Call %s, rq->cmd_type: 0x%x\n", __func__,
+ ide_debug_log(IDE_DBG_PC, "Call %s, rq->cmd[0]: 0x%x, "
+ "rq->cmd_type: 0x%x\n", __func__, rq->cmd[0],
rq->cmd_type);
if (blk_pc_request(rq))
@@ -1257,9 +1256,6 @@
}
}
-/*
- * cdrom driver request routine.
- */
static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
sector_t block)
{
@@ -1267,8 +1263,10 @@
ide_handler_t *fn;
int xferlen;
- ide_debug_log(IDE_DBG_RQ, "Call %s, rq->cmd_type: 0x%x, block: %llu\n",
- __func__, rq->cmd_type, (unsigned long long)block);
+ ide_debug_log(IDE_DBG_RQ, "Call %s, rq->cmd[0]: 0x%x, "
+ "rq->cmd_type: 0x%x, block: %llu\n",
+ __func__, rq->cmd[0], rq->cmd_type,
+ (unsigned long long)block);
if (blk_fs_request(rq)) {
if (drive->atapi_flags & IDE_AFLAG_SEEKING) {
@@ -1412,6 +1410,10 @@
*capacity = 1 + be32_to_cpu(capbuf.lba);
*sectors_per_frame = blocklen >> SECTOR_BITS;
+
+ ide_debug_log(IDE_DBG_PROBE, "%s: cap: %lu, sectors_per_frame: %lu\n",
+ __func__, *capacity, *sectors_per_frame);
+
return 0;
}
@@ -1643,6 +1645,9 @@
maxspeed = be16_to_cpup((__be16 *)&buf[8 + 8]);
}
+ ide_debug_log(IDE_DBG_PROBE, "%s: curspeed: %u, maxspeed: %u\n",
+ __func__, curspeed, maxspeed);
+
cd->current_speed = (curspeed + (176/2)) / 176;
cd->max_speed = (maxspeed + (176/2)) / 176;
}
@@ -1732,7 +1737,7 @@
return 0;
if ((buf[8 + 6] & 0x01) == 0)
- drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+ drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
if (buf[8 + 6] & 0x08)
drive->atapi_flags &= ~IDE_AFLAG_NO_EJECT;
if (buf[8 + 3] & 0x01)
@@ -1777,7 +1782,7 @@
if ((cdi->mask & CDC_DVD_R) == 0 || (cdi->mask & CDC_DVD_RAM) == 0)
printk(KERN_CONT " DVD%s%s",
(cdi->mask & CDC_DVD_R) ? "" : "-R",
- (cdi->mask & CDC_DVD_RAM) ? "" : "-RAM");
+ (cdi->mask & CDC_DVD_RAM) ? "" : "/RAM");
if ((cdi->mask & CDC_CD_R) == 0 || (cdi->mask & CDC_CD_RW) == 0)
printk(KERN_CONT " CD%s%s",
@@ -1908,6 +1913,16 @@
IDE_PROC_DEVSET(dsc_overlap, 0, 1),
{ 0 },
};
+
+static ide_proc_entry_t *ide_cd_proc_entries(ide_drive_t *drive)
+{
+ return idecd_proc;
+}
+
+static const struct ide_proc_devset *ide_cd_proc_devsets(ide_drive_t *drive)
+{
+ return idecd_settings;
+}
#endif
static const struct cd_list_entry ide_cd_quirks_list[] = {
@@ -1986,8 +2001,8 @@
if (!drive->queue->unplug_delay)
drive->queue->unplug_delay = 1;
- drive->atapi_flags = IDE_AFLAG_MEDIA_CHANGED | IDE_AFLAG_NO_EJECT |
- ide_cd_flags(id);
+ drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
+ drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id);
if ((drive->atapi_flags & IDE_AFLAG_VERTOS_300_SSD) &&
fw_rev[4] == '1' && fw_rev[6] <= '2')
@@ -2069,8 +2084,8 @@
.end_request = ide_end_request,
.error = __ide_error,
#ifdef CONFIG_IDE_PROC_FS
- .proc = idecd_proc,
- .settings = idecd_settings,
+ .proc_entries = ide_cd_proc_entries,
+ .proc_devsets = ide_cd_proc_devsets,
#endif
};
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 74231b4..df3df00 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -86,8 +86,8 @@
if (slot_nr == CDSL_CURRENT) {
(void) cdrom_check_status(drive, NULL);
- retval = (drive->atapi_flags & IDE_AFLAG_MEDIA_CHANGED) ? 1 : 0;
- drive->atapi_flags &= ~IDE_AFLAG_MEDIA_CHANGED;
+ retval = (drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED) ? 1 : 0;
+ drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
return retval;
} else {
return -EINVAL;
@@ -136,7 +136,7 @@
sense = &my_sense;
/* If the drive cannot lock the door, just pretend. */
- if (drive->atapi_flags & IDE_AFLAG_NO_DOORLOCK) {
+ if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) {
stat = 0;
} else {
unsigned char cmd[BLK_MAX_CDB];
@@ -157,7 +157,7 @@
(sense->asc == 0x24 || sense->asc == 0x20)) {
printk(KERN_ERR "%s: door locking not supported\n",
drive->name);
- drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+ drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
stat = 0;
}
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 3853bde..223750c 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -14,9 +14,6 @@
* This is the IDE/ATA disk driver, as evolved from hd.c and ide.c.
*/
-#define IDEDISK_VERSION "1.18"
-
-#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kernel.h>
@@ -39,46 +36,8 @@
#include <asm/io.h>
#include <asm/div64.h>
-#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
-#define IDE_DISK_MINORS (1 << PARTN_BITS)
-#else
-#define IDE_DISK_MINORS 0
-#endif
-
#include "ide-disk.h"
-static DEFINE_MUTEX(idedisk_ref_mutex);
-
-#define to_ide_disk(obj) container_of(obj, struct ide_disk_obj, kref)
-
-static void ide_disk_release(struct kref *);
-
-static struct ide_disk_obj *ide_disk_get(struct gendisk *disk)
-{
- struct ide_disk_obj *idkp = NULL;
-
- mutex_lock(&idedisk_ref_mutex);
- idkp = ide_disk_g(disk);
- if (idkp) {
- if (ide_device_get(idkp->drive))
- idkp = NULL;
- else
- kref_get(&idkp->kref);
- }
- mutex_unlock(&idedisk_ref_mutex);
- return idkp;
-}
-
-static void ide_disk_put(struct ide_disk_obj *idkp)
-{
- ide_drive_t *drive = idkp->drive;
-
- mutex_lock(&idedisk_ref_mutex);
- kref_put(&idkp->kref, ide_disk_release);
- ide_device_put(drive);
- mutex_unlock(&idedisk_ref_mutex);
-}
-
static const u8 ide_rw_cmds[] = {
ATA_CMD_READ_MULTI,
ATA_CMD_WRITE_MULTI,
@@ -374,7 +333,7 @@
}
}
-static void init_idedisk_capacity(ide_drive_t *drive)
+static int ide_disk_get_capacity(ide_drive_t *drive)
{
u16 *id = drive->id;
int lba;
@@ -403,11 +362,28 @@
if (ata_id_hpa_enabled(id))
idedisk_check_hpa(drive);
}
-}
-sector_t ide_disk_capacity(ide_drive_t *drive)
-{
- return drive->capacity64;
+ /* limit drive capacity to 137GB if LBA48 cannot be used */
+ if ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 &&
+ drive->capacity64 > 1ULL << 28) {
+ printk(KERN_WARNING "%s: cannot use LBA48 - full capacity "
+ "%llu sectors (%llu MB)\n",
+ drive->name, (unsigned long long)drive->capacity64,
+ sectors_to_MB(drive->capacity64));
+ drive->capacity64 = 1ULL << 28;
+ }
+
+ if ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) &&
+ (drive->dev_flags & IDE_DFLAG_LBA48)) {
+ if (drive->capacity64 > 1ULL << 28) {
+ printk(KERN_INFO "%s: cannot use LBA48 DMA - PIO mode"
+ " will be used for accessing sectors "
+ "> %u\n", drive->name, 1 << 28);
+ } else
+ drive->dev_flags &= ~IDE_DFLAG_LBA48;
+ }
+
+ return 0;
}
static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
@@ -508,7 +484,7 @@
* time we have trimmed the drive capacity if LBA48 is
* not available so we don't need to recheck that.
*/
- capacity = ide_disk_capacity(drive);
+ capacity = ide_gd_capacity(drive);
barrier = ata_id_flush_enabled(id) &&
(drive->dev_flags & IDE_DFLAG_NOFLUSH) == 0 &&
((drive->dev_flags & IDE_DFLAG_LBA48) == 0 ||
@@ -616,7 +592,12 @@
ide_ext_devset_rw_sync(nowerr, nowerr);
-static void idedisk_setup(ide_drive_t *drive)
+static int ide_disk_check(ide_drive_t *drive, const char *s)
+{
+ return 1;
+}
+
+static void ide_disk_setup(ide_drive_t *drive)
{
struct ide_disk_obj *idkp = drive->driver_data;
ide_hwif_t *hwif = drive->hwif;
@@ -652,33 +633,13 @@
drive->queue->max_sectors / 2);
/* calculate drive capacity, and select LBA if possible */
- init_idedisk_capacity(drive);
-
- /* limit drive capacity to 137GB if LBA48 cannot be used */
- if ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 &&
- drive->capacity64 > 1ULL << 28) {
- printk(KERN_WARNING "%s: cannot use LBA48 - full capacity "
- "%llu sectors (%llu MB)\n",
- drive->name, (unsigned long long)drive->capacity64,
- sectors_to_MB(drive->capacity64));
- drive->capacity64 = 1ULL << 28;
- }
-
- if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) &&
- (drive->dev_flags & IDE_DFLAG_LBA48)) {
- if (drive->capacity64 > 1ULL << 28) {
- printk(KERN_INFO "%s: cannot use LBA48 DMA - PIO mode"
- " will be used for accessing sectors "
- "> %u\n", drive->name, 1 << 28);
- } else
- drive->dev_flags &= ~IDE_DFLAG_LBA48;
- }
+ ide_disk_get_capacity(drive);
/*
* if possible, give fdisk access to more of the drive,
* by correcting bios_cyls:
*/
- capacity = ide_disk_capacity(drive);
+ capacity = ide_gd_capacity(drive);
if ((drive->dev_flags & IDE_DFLAG_FORCED_GEOM) == 0) {
if (ata_id_lba48_enabled(drive->id)) {
@@ -718,9 +679,17 @@
drive->dev_flags |= IDE_DFLAG_WCACHE;
set_wcache(drive, 1);
+
+ if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 &&
+ (drive->head == 0 || drive->head > 16)) {
+ printk(KERN_ERR "%s: invalid geometry: %d physical heads?\n",
+ drive->name, drive->head);
+ drive->dev_flags &= ~IDE_DFLAG_ATTACH;
+ } else
+ drive->dev_flags |= IDE_DFLAG_ATTACH;
}
-static void ide_cacheflush_p(ide_drive_t *drive)
+static void ide_disk_flush(ide_drive_t *drive)
{
if (ata_id_flush_enabled(drive->id) == 0 ||
(drive->dev_flags & IDE_DFLAG_WCACHE) == 0)
@@ -730,267 +699,40 @@
printk(KERN_INFO "%s: wcache flush failed!\n", drive->name);
}
-static void ide_disk_remove(ide_drive_t *drive)
+static int ide_disk_init_media(ide_drive_t *drive, struct gendisk *disk)
{
- struct ide_disk_obj *idkp = drive->driver_data;
- struct gendisk *g = idkp->disk;
-
- ide_proc_unregister_driver(drive, idkp->driver);
-
- del_gendisk(g);
-
- ide_cacheflush_p(drive);
-
- ide_disk_put(idkp);
+ return 0;
}
-static void ide_disk_release(struct kref *kref)
-{
- struct ide_disk_obj *idkp = to_ide_disk(kref);
- ide_drive_t *drive = idkp->drive;
- struct gendisk *g = idkp->disk;
-
- drive->driver_data = NULL;
- g->private_data = NULL;
- put_disk(g);
- kfree(idkp);
-}
-
-static int ide_disk_probe(ide_drive_t *drive);
-
-/*
- * On HPA drives the capacity needs to be
- * reinitilized on resume otherwise the disk
- * can not be used and a hard reset is required
- */
-static void ide_disk_resume(ide_drive_t *drive)
-{
- if (ata_id_hpa_enabled(drive->id))
- init_idedisk_capacity(drive);
-}
-
-static void ide_device_shutdown(ide_drive_t *drive)
-{
-#ifdef CONFIG_ALPHA
- /* On Alpha, halt(8) doesn't actually turn the machine off,
- it puts you into the sort of firmware monitor. Typically,
- it's used to boot another kernel image, so it's not much
- different from reboot(8). Therefore, we don't need to
- spin down the disk in this case, especially since Alpha
- firmware doesn't handle disks in standby mode properly.
- On the other hand, it's reasonably safe to turn the power
- off when the shutdown process reaches the firmware prompt,
- as the firmware initialization takes rather long time -
- at least 10 seconds, which should be sufficient for
- the disk to expire its write cache. */
- if (system_state != SYSTEM_POWER_OFF) {
-#else
- if (system_state == SYSTEM_RESTART) {
-#endif
- ide_cacheflush_p(drive);
- return;
- }
-
- printk(KERN_INFO "Shutdown: %s\n", drive->name);
-
- drive->gendev.bus->suspend(&drive->gendev, PMSG_SUSPEND);
-}
-
-static ide_driver_t idedisk_driver = {
- .gen_driver = {
- .owner = THIS_MODULE,
- .name = "ide-disk",
- .bus = &ide_bus_type,
- },
- .probe = ide_disk_probe,
- .remove = ide_disk_remove,
- .resume = ide_disk_resume,
- .shutdown = ide_device_shutdown,
- .version = IDEDISK_VERSION,
- .do_request = ide_do_rw_disk,
- .end_request = ide_end_request,
- .error = __ide_error,
-#ifdef CONFIG_IDE_PROC_FS
- .proc = ide_disk_proc,
- .settings = ide_disk_settings,
-#endif
-};
-
-static int idedisk_set_doorlock(ide_drive_t *drive, int on)
+static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk,
+ int on)
{
ide_task_t task;
+ int ret;
+
+ if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0)
+ return 0;
memset(&task, 0, sizeof(task));
task.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK;
task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
- return ide_no_data_taskfile(drive, &task);
+ ret = ide_no_data_taskfile(drive, &task);
+
+ if (ret)
+ drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
+
+ return ret;
}
-static int idedisk_open(struct inode *inode, struct file *filp)
-{
- struct gendisk *disk = inode->i_bdev->bd_disk;
- struct ide_disk_obj *idkp;
- ide_drive_t *drive;
-
- idkp = ide_disk_get(disk);
- if (idkp == NULL)
- return -ENXIO;
-
- drive = idkp->drive;
-
- idkp->openers++;
-
- if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) {
- check_disk_change(inode->i_bdev);
- /*
- * Ignore the return code from door_lock,
- * since the open() has already succeeded,
- * and the door_lock is irrelevant at this point.
- */
- if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) &&
- idedisk_set_doorlock(drive, 1))
- drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
- }
- return 0;
-}
-
-static int idedisk_release(struct inode *inode, struct file *filp)
-{
- struct gendisk *disk = inode->i_bdev->bd_disk;
- struct ide_disk_obj *idkp = ide_disk_g(disk);
- ide_drive_t *drive = idkp->drive;
-
- if (idkp->openers == 1)
- ide_cacheflush_p(drive);
-
- if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) {
- if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) &&
- idedisk_set_doorlock(drive, 0))
- drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
- }
-
- idkp->openers--;
-
- ide_disk_put(idkp);
-
- return 0;
-}
-
-static int idedisk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk);
- ide_drive_t *drive = idkp->drive;
-
- geo->heads = drive->bios_head;
- geo->sectors = drive->bios_sect;
- geo->cylinders = (u16)drive->bios_cyl; /* truncate */
- return 0;
-}
-
-static int idedisk_media_changed(struct gendisk *disk)
-{
- struct ide_disk_obj *idkp = ide_disk_g(disk);
- ide_drive_t *drive = idkp->drive;
-
- /* do not scan partitions twice if this is a removable device */
- if (drive->dev_flags & IDE_DFLAG_ATTACH) {
- drive->dev_flags &= ~IDE_DFLAG_ATTACH;
- return 0;
- }
-
- /* if removable, always assume it was changed */
- return !!(drive->dev_flags & IDE_DFLAG_REMOVABLE);
-}
-
-static int idedisk_revalidate_disk(struct gendisk *disk)
-{
- struct ide_disk_obj *idkp = ide_disk_g(disk);
- set_capacity(disk, ide_disk_capacity(idkp->drive));
- return 0;
-}
-
-static struct block_device_operations idedisk_ops = {
- .owner = THIS_MODULE,
- .open = idedisk_open,
- .release = idedisk_release,
- .ioctl = ide_disk_ioctl,
- .getgeo = idedisk_getgeo,
- .media_changed = idedisk_media_changed,
- .revalidate_disk = idedisk_revalidate_disk
+const struct ide_disk_ops ide_ata_disk_ops = {
+ .check = ide_disk_check,
+ .get_capacity = ide_disk_get_capacity,
+ .setup = ide_disk_setup,
+ .flush = ide_disk_flush,
+ .init_media = ide_disk_init_media,
+ .set_doorlock = ide_disk_set_doorlock,
+ .do_request = ide_do_rw_disk,
+ .end_request = ide_end_request,
+ .ioctl = ide_disk_ioctl,
};
-
-MODULE_DESCRIPTION("ATA DISK Driver");
-
-static int ide_disk_probe(ide_drive_t *drive)
-{
- struct ide_disk_obj *idkp;
- struct gendisk *g;
-
- /* strstr("foo", "") is non-NULL */
- if (!strstr("ide-disk", drive->driver_req))
- goto failed;
-
- if (drive->media != ide_disk)
- goto failed;
-
- idkp = kzalloc(sizeof(*idkp), GFP_KERNEL);
- if (!idkp)
- goto failed;
-
- g = alloc_disk_node(IDE_DISK_MINORS, hwif_to_node(drive->hwif));
- if (!g)
- goto out_free_idkp;
-
- ide_init_disk(g, drive);
-
- kref_init(&idkp->kref);
-
- idkp->drive = drive;
- idkp->driver = &idedisk_driver;
- idkp->disk = g;
-
- g->private_data = &idkp->driver;
-
- drive->driver_data = idkp;
-
- idedisk_setup(drive);
- if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 &&
- (drive->head == 0 || drive->head > 16)) {
- printk(KERN_ERR "%s: INVALID GEOMETRY: %d PHYSICAL HEADS?\n",
- drive->name, drive->head);
- drive->dev_flags &= ~IDE_DFLAG_ATTACH;
- } else
- drive->dev_flags |= IDE_DFLAG_ATTACH;
-
- g->minors = IDE_DISK_MINORS;
- g->driverfs_dev = &drive->gendev;
- g->flags |= GENHD_FL_EXT_DEVT;
- if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
- g->flags = GENHD_FL_REMOVABLE;
- set_capacity(g, ide_disk_capacity(drive));
- g->fops = &idedisk_ops;
- add_disk(g);
- return 0;
-
-out_free_idkp:
- kfree(idkp);
-failed:
- return -ENODEV;
-}
-
-static void __exit idedisk_exit(void)
-{
- driver_unregister(&idedisk_driver.gen_driver);
-}
-
-static int __init idedisk_init(void)
-{
- return driver_register(&idedisk_driver.gen_driver);
-}
-
-MODULE_ALIAS("ide:*m-disk*");
-MODULE_ALIAS("ide-disk");
-module_init(idedisk_init);
-module_exit(idedisk_exit);
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide-disk.h b/drivers/ide/ide-disk.h
index a82fa43..b234b0f 100644
--- a/drivers/ide/ide-disk.h
+++ b/drivers/ide/ide-disk.h
@@ -1,19 +1,11 @@
#ifndef __IDE_DISK_H
#define __IDE_DISK_H
-struct ide_disk_obj {
- ide_drive_t *drive;
- ide_driver_t *driver;
- struct gendisk *disk;
- struct kref kref;
- unsigned int openers; /* protected by BKL for now */
-};
+#include "ide-gd.h"
-#define ide_disk_g(disk) \
- container_of((disk)->private_data, struct ide_disk_obj, driver)
-
+#ifdef CONFIG_IDE_GD_ATA
/* ide-disk.c */
-sector_t ide_disk_capacity(ide_drive_t *);
+extern const struct ide_disk_ops ide_ata_disk_ops;
ide_decl_devset(address);
ide_decl_devset(multcount);
ide_decl_devset(nowerr);
@@ -21,12 +13,17 @@
ide_decl_devset(acoustic);
/* ide-disk_ioctl.c */
-int ide_disk_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
+int ide_disk_ioctl(ide_drive_t *, struct inode *, struct file *, unsigned int,
+ unsigned long);
#ifdef CONFIG_IDE_PROC_FS
/* ide-disk_proc.c */
extern ide_proc_entry_t ide_disk_proc[];
extern const struct ide_proc_devset ide_disk_settings[];
#endif
+#else
+#define ide_disk_proc NULL
+#define ide_disk_settings NULL
+#endif
#endif /* __IDE_DISK_H */
diff --git a/drivers/ide/ide-disk_ioctl.c b/drivers/ide/ide-disk_ioctl.c
index a6cf1a0..a49698b 100644
--- a/drivers/ide/ide-disk_ioctl.c
+++ b/drivers/ide/ide-disk_ioctl.c
@@ -13,12 +13,10 @@
{ 0 }
};
-int ide_disk_ioctl(struct inode *inode, struct file *file,
+int ide_disk_ioctl(ide_drive_t *drive, struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg)
{
struct block_device *bdev = inode->i_bdev;
- struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk);
- ide_drive_t *drive = idkp->drive;
int err;
err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_disk_ioctl_settings);
diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c
index 4724976..1146f42 100644
--- a/drivers/ide/ide-disk_proc.c
+++ b/drivers/ide/ide-disk_proc.c
@@ -56,7 +56,7 @@
ide_drive_t*drive = (ide_drive_t *)data;
int len;
- len = sprintf(page, "%llu\n", (long long)ide_disk_capacity(drive));
+ len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive));
PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
}
diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c
index 0903782..cac431f 100644
--- a/drivers/ide/ide-dma-sff.c
+++ b/drivers/ide/ide-dma-sff.c
@@ -130,7 +130,7 @@
xcount = bcount & 0xffff;
if (is_trm290)
xcount = ((xcount >> 2) - 1) << 16;
- if (xcount == 0x0000) {
+ else if (xcount == 0x0000) {
if (count++ >= PRD_ENTRIES)
goto use_pio_instead;
*table++ = cpu_to_le32(0x8000);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index cf0aa25..aeb1ad7 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -15,12 +15,6 @@
* Documentation/ide/ChangeLog.ide-floppy.1996-2002
*/
-#define DRV_NAME "ide-floppy"
-#define PFX DRV_NAME ": "
-
-#define IDEFLOPPY_VERSION "1.00"
-
-#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kernel.h>
@@ -49,19 +43,6 @@
#include "ide-floppy.h"
-/* module parameters */
-static unsigned long debug_mask;
-module_param(debug_mask, ulong, 0644);
-
-/* define to see debug info */
-#define IDEFLOPPY_DEBUG_LOG 0
-
-#if IDEFLOPPY_DEBUG_LOG
-#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, args)
-#else
-#define ide_debug_log(lvl, fmt, args...) do {} while (0)
-#endif
-
/*
* After each failed packet command we issue a request sense command and retry
* the packet command IDEFLOPPY_MAX_PC_RETRIES times.
@@ -83,43 +64,13 @@
/* Error code returned in rq->errors to the higher part of the driver. */
#define IDEFLOPPY_ERROR_GENERAL 101
-static DEFINE_MUTEX(idefloppy_ref_mutex);
-
-static void idefloppy_cleanup_obj(struct kref *);
-
-static struct ide_floppy_obj *ide_floppy_get(struct gendisk *disk)
-{
- struct ide_floppy_obj *floppy = NULL;
-
- mutex_lock(&idefloppy_ref_mutex);
- floppy = ide_drv_g(disk, ide_floppy_obj);
- if (floppy) {
- if (ide_device_get(floppy->drive))
- floppy = NULL;
- else
- kref_get(&floppy->kref);
- }
- mutex_unlock(&idefloppy_ref_mutex);
- return floppy;
-}
-
-static void ide_floppy_put(struct ide_floppy_obj *floppy)
-{
- ide_drive_t *drive = floppy->drive;
-
- mutex_lock(&idefloppy_ref_mutex);
- kref_put(&floppy->kref, idefloppy_cleanup_obj);
- ide_device_put(drive);
- mutex_unlock(&idefloppy_ref_mutex);
-}
-
/*
* Used to finish servicing a request. For read/write requests, we will call
* ide_end_request to pass to the next buffer.
*/
-static int idefloppy_end_request(ide_drive_t *drive, int uptodate, int nsecs)
+static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct request *rq = HWGROUP(drive)->rq;
int error;
@@ -161,12 +112,12 @@
struct bio *bio = rq->bio;
while ((bio = rq->bio) != NULL)
- idefloppy_end_request(drive, 1, 0);
+ ide_floppy_end_request(drive, 1, 0);
}
static void ide_floppy_callback(ide_drive_t *drive, int dsc)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct ide_atapi_pc *pc = drive->pc;
int uptodate = pc->error ? 0 : 1;
@@ -200,10 +151,10 @@
"Aborting request!\n");
}
- idefloppy_end_request(drive, uptodate, 0);
+ ide_floppy_end_request(drive, uptodate, 0);
}
-static void ide_floppy_report_error(idefloppy_floppy_t *floppy,
+static void ide_floppy_report_error(struct ide_disk_obj *floppy,
struct ide_atapi_pc *pc)
{
/* supress error messages resulting from Medium not present */
@@ -222,7 +173,7 @@
static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive,
struct ide_atapi_pc *pc)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
if (floppy->failed_pc == NULL &&
pc->c[0] != GPCMD_REQUEST_SENSE)
@@ -286,7 +237,7 @@
struct ide_atapi_pc *pc, struct request *rq,
unsigned long sector)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
int block = sector / floppy->bs_factor;
int blocks = rq->nr_sectors / floppy->bs_factor;
int cmd = rq_data_dir(rq);
@@ -310,7 +261,7 @@
pc->flags |= PC_FLAG_DMA_OK;
}
-static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy,
+static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
struct ide_atapi_pc *pc, struct request *rq)
{
ide_init_pc(pc);
@@ -329,13 +280,12 @@
pc->req_xfer = pc->buf_size = rq->data_len;
}
-static ide_startstop_t idefloppy_do_request(ide_drive_t *drive,
- struct request *rq, sector_t block_s)
+static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
+ struct request *rq, sector_t block)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
ide_hwif_t *hwif = drive->hwif;
struct ide_atapi_pc *pc;
- unsigned long block = (unsigned long)block_s;
ide_debug_log(IDE_DBG_FUNC, "%s: dev: %s, cmd: 0x%x, cmd_type: %x, "
"errors: %d\n",
@@ -353,7 +303,7 @@
else
printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
- idefloppy_end_request(drive, 0, 0);
+ ide_floppy_end_request(drive, 0, 0);
return ide_stopped;
}
if (blk_fs_request(rq)) {
@@ -361,11 +311,11 @@
(rq->nr_sectors % floppy->bs_factor)) {
printk(KERN_ERR PFX "%s: unsupported r/w rq size\n",
drive->name);
- idefloppy_end_request(drive, 0, 0);
+ ide_floppy_end_request(drive, 0, 0);
return ide_stopped;
}
pc = &floppy->queued_pc;
- idefloppy_create_rw_cmd(drive, pc, rq, block);
+ idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
} else if (blk_special_request(rq)) {
pc = (struct ide_atapi_pc *) rq->buffer;
} else if (blk_pc_request(rq)) {
@@ -373,7 +323,7 @@
idefloppy_blockpc_cmd(floppy, pc, rq);
} else {
blk_dump_rq_flags(rq, PFX "unsupported command in queue");
- idefloppy_end_request(drive, 0, 0);
+ ide_floppy_end_request(drive, 0, 0);
return ide_stopped;
}
@@ -394,7 +344,7 @@
*/
static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct gendisk *disk = floppy->disk;
struct ide_atapi_pc pc;
u8 *page;
@@ -410,11 +360,11 @@
}
if (pc.buf[3] & 0x80)
- drive->atapi_flags |= IDE_AFLAG_WP;
+ drive->dev_flags |= IDE_DFLAG_WP;
else
- drive->atapi_flags &= ~IDE_AFLAG_WP;
+ drive->dev_flags &= ~IDE_DFLAG_WP;
- set_disk_ro(disk, !!(drive->atapi_flags & IDE_AFLAG_WP));
+ set_disk_ro(disk, !!(drive->dev_flags & IDE_DFLAG_WP));
page = &pc.buf[8];
@@ -445,7 +395,9 @@
drive->name, lba_capacity, capacity);
floppy->blocks = floppy->block_size ?
capacity / floppy->block_size : 0;
+ drive->capacity64 = floppy->blocks * floppy->bs_factor;
}
+
return 0;
}
@@ -455,7 +407,7 @@
*/
static int ide_floppy_get_capacity(ide_drive_t *drive)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct gendisk *disk = floppy->disk;
struct ide_atapi_pc pc;
u8 *cap_desc;
@@ -466,7 +418,7 @@
drive->bios_head = drive->bios_sect = 0;
floppy->blocks = 0;
floppy->bs_factor = 1;
- set_capacity(floppy->disk, 0);
+ drive->capacity64 = 0;
ide_floppy_create_read_capacity_cmd(&pc);
if (ide_queue_pc_tail(drive, disk, &pc)) {
@@ -523,6 +475,8 @@
"non 512 bytes block size not "
"fully supported\n",
drive->name);
+ drive->capacity64 =
+ floppy->blocks * floppy->bs_factor;
rc = 0;
}
break;
@@ -547,21 +501,12 @@
if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE))
(void) ide_floppy_get_flexible_disk_page(drive);
- set_capacity(disk, floppy->blocks * floppy->bs_factor);
-
return rc;
}
-sector_t ide_floppy_capacity(ide_drive_t *drive)
+static void ide_floppy_setup(ide_drive_t *drive)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
- unsigned long capacity = floppy->blocks * floppy->bs_factor;
-
- return capacity;
-}
-
-static void idefloppy_setup(ide_drive_t *drive, idefloppy_floppy_t *floppy)
-{
+ struct ide_disk_obj *floppy = drive->driver_data;
u16 *id = drive->id;
drive->pc_callback = ide_floppy_callback;
@@ -592,252 +537,42 @@
blk_queue_max_sectors(drive->queue, 64);
drive->atapi_flags |= IDE_AFLAG_CLIK_DRIVE;
/* IOMEGA Clik! drives do not support lock/unlock commands */
- drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+ drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
}
(void) ide_floppy_get_capacity(drive);
ide_proc_register_driver(drive, floppy->driver);
+
+ drive->dev_flags |= IDE_DFLAG_ATTACH;
}
-static void ide_floppy_remove(ide_drive_t *drive)
+static void ide_floppy_flush(ide_drive_t *drive)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
- struct gendisk *g = floppy->disk;
-
- ide_proc_unregister_driver(drive, floppy->driver);
-
- del_gendisk(g);
-
- ide_floppy_put(floppy);
}
-static void idefloppy_cleanup_obj(struct kref *kref)
+static int ide_floppy_init_media(ide_drive_t *drive, struct gendisk *disk)
{
- struct ide_floppy_obj *floppy = to_ide_drv(kref, ide_floppy_obj);
- ide_drive_t *drive = floppy->drive;
- struct gendisk *g = floppy->disk;
-
- drive->driver_data = NULL;
- g->private_data = NULL;
- put_disk(g);
- kfree(floppy);
-}
-
-static int ide_floppy_probe(ide_drive_t *);
-
-static ide_driver_t idefloppy_driver = {
- .gen_driver = {
- .owner = THIS_MODULE,
- .name = "ide-floppy",
- .bus = &ide_bus_type,
- },
- .probe = ide_floppy_probe,
- .remove = ide_floppy_remove,
- .version = IDEFLOPPY_VERSION,
- .do_request = idefloppy_do_request,
- .end_request = idefloppy_end_request,
- .error = __ide_error,
-#ifdef CONFIG_IDE_PROC_FS
- .proc = ide_floppy_proc,
- .settings = ide_floppy_settings,
-#endif
-};
-
-static int idefloppy_open(struct inode *inode, struct file *filp)
-{
- struct gendisk *disk = inode->i_bdev->bd_disk;
- struct ide_floppy_obj *floppy;
- ide_drive_t *drive;
int ret = 0;
- floppy = ide_floppy_get(disk);
- if (!floppy)
- return -ENXIO;
+ if (ide_do_test_unit_ready(drive, disk))
+ ide_do_start_stop(drive, disk, 1);
- drive = floppy->drive;
+ ret = ide_floppy_get_capacity(drive);
- ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
+ set_capacity(disk, ide_gd_capacity(drive));
- floppy->openers++;
-
- if (floppy->openers == 1) {
- drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
- /* Just in case */
-
- if (ide_do_test_unit_ready(drive, disk))
- ide_do_start_stop(drive, disk, 1);
-
- if (ide_floppy_get_capacity(drive)
- && (filp->f_flags & O_NDELAY) == 0
- /*
- * Allow O_NDELAY to open a drive without a disk, or with an
- * unreadable disk, so that we can get the format capacity
- * of the drive or begin the format - Sam
- */
- ) {
- ret = -EIO;
- goto out_put_floppy;
- }
-
- if ((drive->atapi_flags & IDE_AFLAG_WP) && (filp->f_mode & 2)) {
- ret = -EROFS;
- goto out_put_floppy;
- }
-
- drive->atapi_flags |= IDE_AFLAG_MEDIA_CHANGED;
- ide_set_media_lock(drive, disk, 1);
- check_disk_change(inode->i_bdev);
- } else if (drive->atapi_flags & IDE_AFLAG_FORMAT_IN_PROGRESS) {
- ret = -EBUSY;
- goto out_put_floppy;
- }
- return 0;
-
-out_put_floppy:
- floppy->openers--;
- ide_floppy_put(floppy);
return ret;
}
-static int idefloppy_release(struct inode *inode, struct file *filp)
-{
- struct gendisk *disk = inode->i_bdev->bd_disk;
- struct ide_floppy_obj *floppy = ide_drv_g(disk, ide_floppy_obj);
- ide_drive_t *drive = floppy->drive;
-
- ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
-
- if (floppy->openers == 1) {
- ide_set_media_lock(drive, disk, 0);
- drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
- }
-
- floppy->openers--;
-
- ide_floppy_put(floppy);
-
- return 0;
-}
-
-static int idefloppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct ide_floppy_obj *floppy = ide_drv_g(bdev->bd_disk,
- ide_floppy_obj);
- ide_drive_t *drive = floppy->drive;
-
- geo->heads = drive->bios_head;
- geo->sectors = drive->bios_sect;
- geo->cylinders = (u16)drive->bios_cyl; /* truncate */
- return 0;
-}
-
-static int idefloppy_media_changed(struct gendisk *disk)
-{
- struct ide_floppy_obj *floppy = ide_drv_g(disk, ide_floppy_obj);
- ide_drive_t *drive = floppy->drive;
- int ret;
-
- /* do not scan partitions twice if this is a removable device */
- if (drive->dev_flags & IDE_DFLAG_ATTACH) {
- drive->dev_flags &= ~IDE_DFLAG_ATTACH;
- return 0;
- }
- ret = !!(drive->atapi_flags & IDE_AFLAG_MEDIA_CHANGED);
- drive->atapi_flags &= ~IDE_AFLAG_MEDIA_CHANGED;
- return ret;
-}
-
-static int idefloppy_revalidate_disk(struct gendisk *disk)
-{
- struct ide_floppy_obj *floppy = ide_drv_g(disk, ide_floppy_obj);
- set_capacity(disk, ide_floppy_capacity(floppy->drive));
- return 0;
-}
-
-static struct block_device_operations idefloppy_ops = {
- .owner = THIS_MODULE,
- .open = idefloppy_open,
- .release = idefloppy_release,
- .ioctl = ide_floppy_ioctl,
- .getgeo = idefloppy_getgeo,
- .media_changed = idefloppy_media_changed,
- .revalidate_disk = idefloppy_revalidate_disk
+const struct ide_disk_ops ide_atapi_disk_ops = {
+ .check = ide_check_atapi_device,
+ .get_capacity = ide_floppy_get_capacity,
+ .setup = ide_floppy_setup,
+ .flush = ide_floppy_flush,
+ .init_media = ide_floppy_init_media,
+ .set_doorlock = ide_set_media_lock,
+ .do_request = ide_floppy_do_request,
+ .end_request = ide_floppy_end_request,
+ .ioctl = ide_floppy_ioctl,
};
-
-static int ide_floppy_probe(ide_drive_t *drive)
-{
- idefloppy_floppy_t *floppy;
- struct gendisk *g;
-
- if (!strstr("ide-floppy", drive->driver_req))
- goto failed;
-
- if (drive->media != ide_floppy)
- goto failed;
-
- if (!ide_check_atapi_device(drive, DRV_NAME)) {
- printk(KERN_ERR PFX "%s: not supported by this version of "
- DRV_NAME "\n", drive->name);
- goto failed;
- }
- floppy = kzalloc(sizeof(idefloppy_floppy_t), GFP_KERNEL);
- if (!floppy) {
- printk(KERN_ERR PFX "%s: Can't allocate a floppy structure\n",
- drive->name);
- goto failed;
- }
-
- g = alloc_disk(1 << PARTN_BITS);
- if (!g)
- goto out_free_floppy;
-
- ide_init_disk(g, drive);
-
- kref_init(&floppy->kref);
-
- floppy->drive = drive;
- floppy->driver = &idefloppy_driver;
- floppy->disk = g;
-
- g->private_data = &floppy->driver;
-
- drive->driver_data = floppy;
-
- drive->debug_mask = debug_mask;
-
- idefloppy_setup(drive, floppy);
- drive->dev_flags |= IDE_DFLAG_ATTACH;
-
- g->minors = 1 << PARTN_BITS;
- g->driverfs_dev = &drive->gendev;
- if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
- g->flags = GENHD_FL_REMOVABLE;
- g->fops = &idefloppy_ops;
- add_disk(g);
- return 0;
-
-out_free_floppy:
- kfree(floppy);
-failed:
- return -ENODEV;
-}
-
-static void __exit idefloppy_exit(void)
-{
- driver_unregister(&idefloppy_driver.gen_driver);
-}
-
-static int __init idefloppy_init(void)
-{
- printk(KERN_INFO DRV_NAME " driver " IDEFLOPPY_VERSION "\n");
- return driver_register(&idefloppy_driver.gen_driver);
-}
-
-MODULE_ALIAS("ide:*m-floppy*");
-MODULE_ALIAS("ide-floppy");
-module_init(idefloppy_init);
-module_exit(idefloppy_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("ATAPI FLOPPY Driver");
-
diff --git a/drivers/ide/ide-floppy.h b/drivers/ide/ide-floppy.h
index 17cf865..c17124d 100644
--- a/drivers/ide/ide-floppy.h
+++ b/drivers/ide/ide-floppy.h
@@ -1,37 +1,9 @@
#ifndef __IDE_FLOPPY_H
#define __IDE_FLOPPY_H
-/*
- * Most of our global data which we need to save even as we leave the driver
- * due to an interrupt or a timer event is stored in a variable of type
- * idefloppy_floppy_t, defined below.
- */
-typedef struct ide_floppy_obj {
- ide_drive_t *drive;
- ide_driver_t *driver;
- struct gendisk *disk;
- struct kref kref;
- unsigned int openers; /* protected by BKL for now */
+#include "ide-gd.h"
- /* Last failed packet command */
- struct ide_atapi_pc *failed_pc;
- /* used for blk_{fs,pc}_request() requests */
- struct ide_atapi_pc queued_pc;
-
- /* Last error information */
- u8 sense_key, asc, ascq;
-
- int progress_indication;
-
- /* Device information */
- /* Current format */
- int blocks, block_size, bs_factor;
- /* Last format capacity descriptor */
- u8 cap_desc[8];
- /* Copy of the flexible disk page */
- u8 flexible_disk_page[32];
-} idefloppy_floppy_t;
-
+#ifdef CONFIG_IDE_GD_ATAPI
/*
* Pages of the SELECT SENSE / MODE SENSE packet commands.
* See SFF-8070i spec.
@@ -46,17 +18,22 @@
#define IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS 0x4603
/* ide-floppy.c */
+extern const struct ide_disk_ops ide_atapi_disk_ops;
void ide_floppy_create_mode_sense_cmd(struct ide_atapi_pc *, u8);
void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *);
-sector_t ide_floppy_capacity(ide_drive_t *);
/* ide-floppy_ioctl.c */
-int ide_floppy_ioctl(struct inode *, struct file *, unsigned, unsigned long);
+int ide_floppy_ioctl(ide_drive_t *, struct inode *, struct file *, unsigned int,
+ unsigned long);
#ifdef CONFIG_IDE_PROC_FS
/* ide-floppy_proc.c */
extern ide_proc_entry_t ide_floppy_proc[];
extern const struct ide_proc_devset ide_floppy_settings[];
#endif
+#else
+#define ide_floppy_proc NULL
+#define ide_floppy_settings NULL
+#endif
#endif /*__IDE_FLOPPY_H */
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index a3a7a08..409e4c1 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -33,7 +33,7 @@
static int ide_floppy_get_format_capacities(ide_drive_t *drive, int __user *arg)
{
- struct ide_floppy_obj *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct ide_atapi_pc pc;
u8 header_len, desc_cnt;
int i, blocks, length, u_array_size, u_index;
@@ -113,7 +113,7 @@
static int ide_floppy_get_sfrp_bit(ide_drive_t *drive)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct ide_atapi_pc pc;
drive->atapi_flags &= ~IDE_AFLAG_SRFP;
@@ -132,17 +132,17 @@
static int ide_floppy_format_unit(ide_drive_t *drive, int __user *arg)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct ide_atapi_pc pc;
int blocks, length, flags, err = 0;
if (floppy->openers > 1) {
/* Don't format if someone is using the disk */
- drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
+ drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
return -EBUSY;
}
- drive->atapi_flags |= IDE_AFLAG_FORMAT_IN_PROGRESS;
+ drive->dev_flags |= IDE_DFLAG_FORMAT_IN_PROGRESS;
/*
* Send ATAPI_FORMAT_UNIT to the drive.
@@ -174,7 +174,7 @@
out:
if (err)
- drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
+ drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
return err;
}
@@ -190,7 +190,7 @@
static int ide_floppy_get_format_progress(ide_drive_t *drive, int __user *arg)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct ide_atapi_pc pc;
int progress_indication = 0x10000;
@@ -226,7 +226,7 @@
static int ide_floppy_lockdoor(ide_drive_t *drive, struct ide_atapi_pc *pc,
unsigned long arg, unsigned int cmd)
{
- idefloppy_floppy_t *floppy = drive->driver_data;
+ struct ide_disk_obj *floppy = drive->driver_data;
struct gendisk *disk = floppy->disk;
int prevent = (arg && cmd != CDROMEJECT) ? 1 : 0;
@@ -260,13 +260,10 @@
}
}
-int ide_floppy_ioctl(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
+int ide_floppy_ioctl(ide_drive_t *drive, struct inode *inode,
+ struct file *file, unsigned int cmd, unsigned long arg)
{
struct block_device *bdev = inode->i_bdev;
- struct ide_floppy_obj *floppy = ide_drv_g(bdev->bd_disk,
- ide_floppy_obj);
- ide_drive_t *drive = floppy->drive;
struct ide_atapi_pc pc;
void __user *argp = (void __user *)arg;
int err;
diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c
index 76f0c6c..3ec762c 100644
--- a/drivers/ide/ide-floppy_proc.c
+++ b/drivers/ide/ide-floppy_proc.c
@@ -9,7 +9,7 @@
ide_drive_t*drive = (ide_drive_t *)data;
int len;
- len = sprintf(page, "%llu\n", (long long)ide_floppy_capacity(drive));
+ len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive));
PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
}
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
new file mode 100644
index 0000000..d44898f
--- /dev/null
+++ b/drivers/ide/ide-gd.c
@@ -0,0 +1,398 @@
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/genhd.h>
+#include <linux/mutex.h>
+#include <linux/ide.h>
+#include <linux/hdreg.h>
+
+#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
+#define IDE_DISK_MINORS (1 << PARTN_BITS)
+#else
+#define IDE_DISK_MINORS 0
+#endif
+
+#include "ide-disk.h"
+#include "ide-floppy.h"
+
+#define IDE_GD_VERSION "1.18"
+
+/* module parameters */
+static unsigned long debug_mask;
+module_param(debug_mask, ulong, 0644);
+
+static DEFINE_MUTEX(ide_disk_ref_mutex);
+
+static void ide_disk_release(struct kref *);
+
+/*
+ * Look up the ide_disk_obj attached to @disk and take a reference on it.
+ *
+ * Under ide_disk_ref_mutex the object is resolved with ide_drv_g(); if one
+ * exists, ide_device_get() is called on its drive and, only when that
+ * returns zero, the object's kref is incremented.
+ *
+ * Returns the referenced object, or NULL when no object is attached to
+ * @disk or ide_device_get() returned non-zero.
+ */
+static struct ide_disk_obj *ide_disk_get(struct gendisk *disk)
+{
+ struct ide_disk_obj *idkp = NULL;
+
+ mutex_lock(&ide_disk_ref_mutex);
+ idkp = ide_drv_g(disk, ide_disk_obj);
+ if (idkp) {
+ if (ide_device_get(idkp->drive))
+ idkp = NULL;
+ else
+ kref_get(&idkp->kref);
+ }
+ mutex_unlock(&ide_disk_ref_mutex);
+ return idkp;
+}
+
+/*
+ * Drop one reference on @idkp and call ide_device_put() on its drive,
+ * both under ide_disk_ref_mutex.
+ */
+static void ide_disk_put(struct ide_disk_obj *idkp)
+{
+ /* read the drive first: kref_put() may free idkp via ide_disk_release() */
+ ide_drive_t *drive = idkp->drive;
+
+ mutex_lock(&ide_disk_ref_mutex);
+ kref_put(&idkp->kref, ide_disk_release);
+ ide_device_put(drive);
+ mutex_unlock(&ide_disk_ref_mutex);
+}
+
+/* Return the capacity cached in drive->capacity64. */
+sector_t ide_gd_capacity(ide_drive_t *drive)
+{
+ return drive->capacity64;
+}
+
+static int ide_gd_probe(ide_drive_t *);
+
+/*
+ * Driver ->remove callback (see ide_gd_driver).
+ *
+ * Unregisters the proc entries, deletes the gendisk, invokes the disk_ops
+ * flush callback and finally drops a reference on the ide_disk_obj stored
+ * in drive->driver_data.
+ */
+static void ide_gd_remove(ide_drive_t *drive)
+{
+ struct ide_disk_obj *idkp = drive->driver_data;
+ struct gendisk *g = idkp->disk;
+
+ ide_proc_unregister_driver(drive, idkp->driver);
+
+ del_gendisk(g);
+
+ drive->disk_ops->flush(drive);
+
+ ide_disk_put(idkp);
+}
+
+/*
+ * kref release callback passed to kref_put() in ide_disk_put().
+ *
+ * Clears the drive's disk_ops and driver_data pointers and the gendisk's
+ * private_data, drops the gendisk with put_disk() and frees the object.
+ */
+static void ide_disk_release(struct kref *kref)
+{
+ struct ide_disk_obj *idkp = to_ide_drv(kref, ide_disk_obj);
+ ide_drive_t *drive = idkp->drive;
+ struct gendisk *g = idkp->disk;
+
+ drive->disk_ops = NULL;
+ drive->driver_data = NULL;
+ g->private_data = NULL;
+ put_disk(g);
+ kfree(idkp);
+}
+
+/*
+ * Driver ->resume callback (see ide_gd_driver).
+ *
+ * On HPA drives the capacity needs to be
+ * reinitialized on resume, otherwise the disk
+ * cannot be used and a hard reset is required.
+ *
+ * The result of the get_capacity callback is deliberately ignored.
+ */
+static void ide_gd_resume(ide_drive_t *drive)
+{
+ if (ata_id_hpa_enabled(drive->id))
+ (void)drive->disk_ops->get_capacity(drive);
+}
+
+/*
+ * Driver ->shutdown callback (see ide_gd_driver).
+ *
+ * When the system is restarting (on Alpha: in any state other than
+ * SYSTEM_POWER_OFF) only the disk_ops flush callback is invoked.
+ * Otherwise the drive name is logged and the bus suspend callback is
+ * called with PMSG_SUSPEND.
+ */
+static void ide_gd_shutdown(ide_drive_t *drive)
+{
+#ifdef CONFIG_ALPHA
+ /* On Alpha, halt(8) doesn't actually turn the machine off,
+ it puts you into a sort of firmware monitor. Typically,
+ it's used to boot another kernel image, so it's not much
+ different from reboot(8). Therefore, we don't need to
+ spin down the disk in this case, especially since Alpha
+ firmware doesn't handle disks in standby mode properly.
+ On the other hand, it's reasonably safe to turn the power
+ off when the shutdown process reaches the firmware prompt,
+ as the firmware initialization takes a rather long time -
+ at least 10 seconds, which should be sufficient for
+ the disk to expire its write cache. */
+ if (system_state != SYSTEM_POWER_OFF) {
+#else
+ if (system_state == SYSTEM_RESTART) {
+#endif
+ drive->disk_ops->flush(drive);
+ return;
+ }
+
+ printk(KERN_INFO "Shutdown: %s\n", drive->name);
+
+ drive->gendev.bus->suspend(&drive->gendev, PMSG_SUSPEND);
+}
+
+#ifdef CONFIG_IDE_PROC_FS
+/*
+ * Select the proc entry table for @drive: ide_disk_proc when the media
+ * type is ide_disk, ide_floppy_proc for anything else.
+ */
+static ide_proc_entry_t *ide_disk_proc_entries(ide_drive_t *drive)
+{
+ return (drive->media == ide_disk) ? ide_disk_proc : ide_floppy_proc;
+}
+
+/*
+ * Select the proc settings table for @drive: ide_disk_settings when the
+ * media type is ide_disk, ide_floppy_settings for anything else.
+ */
+static const struct ide_proc_devset *ide_disk_proc_devsets(ide_drive_t *drive)
+{
+ return (drive->media == ide_disk) ? ide_disk_settings
+ : ide_floppy_settings;
+}
+#endif
+
+/* Forward a request to the do_request callback of the drive's disk_ops. */
+static ide_startstop_t ide_gd_do_request(ide_drive_t *drive,
+ struct request *rq, sector_t sector)
+{
+ return drive->disk_ops->do_request(drive, rq, sector);
+}
+
+/* Forward request completion to the end_request callback of disk_ops. */
+static int ide_gd_end_request(ide_drive_t *drive, int uptodate, int nrsecs)
+{
+ return drive->disk_ops->end_request(drive, uptodate, nrsecs);
+}
+
+/*
+ * Driver descriptor; its gen_driver member is what ide_gd_init() and
+ * ide_gd_exit() register and unregister.  The proc callbacks are only
+ * wired up when CONFIG_IDE_PROC_FS is enabled.
+ */
+static ide_driver_t ide_gd_driver = {
+ .gen_driver = {
+ .owner = THIS_MODULE,
+ .name = "ide-gd",
+ .bus = &ide_bus_type,
+ },
+ .probe = ide_gd_probe,
+ .remove = ide_gd_remove,
+ .resume = ide_gd_resume,
+ .shutdown = ide_gd_shutdown,
+ .version = IDE_GD_VERSION,
+ .do_request = ide_gd_do_request,
+ .end_request = ide_gd_end_request,
+ .error = __ide_error,
+#ifdef CONFIG_IDE_PROC_FS
+ .proc_entries = ide_disk_proc_entries,
+ .proc_devsets = ide_disk_proc_devsets,
+#endif
+};
+
+/*
+ * Block-device ->open callback (see ide_gd_ops).
+ *
+ * Takes a reference on the ide_disk_obj behind the gendisk and bumps its
+ * opener count.  On the first open of a removable drive the media is
+ * (re)initialised through the init_media callback, the door is locked and
+ * a media change is signalled.
+ *
+ * Returns 0 on success, or
+ *   -ENXIO  no object could be referenced for this gendisk,
+ *   -EIO    init_media failed and O_NDELAY was not requested,
+ *   -EROFS  the drive is flagged write-protected and (f_mode & 2) is set
+ *           (presumably FMODE_WRITE; confirm against the fs headers),
+ *   -EBUSY  a format is in progress (checked on every path that is not
+ *           the first open of a removable drive).
+ * On the error paths after the reference was taken, the opener count and
+ * the reference are rolled back.
+ */
+static int ide_gd_open(struct inode *inode, struct file *filp)
+{
+ struct gendisk *disk = inode->i_bdev->bd_disk;
+ struct ide_disk_obj *idkp;
+ ide_drive_t *drive;
+ int ret = 0;
+
+ idkp = ide_disk_get(disk);
+ if (idkp == NULL)
+ return -ENXIO;
+
+ drive = idkp->drive;
+
+ ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
+
+ idkp->openers++;
+
+ if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) {
+ drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
+ /* Just in case */
+
+ ret = drive->disk_ops->init_media(drive, disk);
+
+ /*
+ * Allow O_NDELAY to open a drive without a disk, or with an
+ * unreadable disk, so that we can get the format capacity
+ * of the drive or begin the format - Sam
+ */
+ if (ret && (filp->f_flags & O_NDELAY) == 0) {
+ ret = -EIO;
+ goto out_put_idkp;
+ }
+
+ if ((drive->dev_flags & IDE_DFLAG_WP) && (filp->f_mode & 2)) {
+ ret = -EROFS;
+ goto out_put_idkp;
+ }
+
+ /*
+ * Ignore the return code from door_lock,
+ * since the open() has already succeeded,
+ * and the door_lock is irrelevant at this point.
+ */
+ drive->disk_ops->set_doorlock(drive, disk, 1);
+ drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
+ check_disk_change(inode->i_bdev);
+ } else if (drive->dev_flags & IDE_DFLAG_FORMAT_IN_PROGRESS) {
+ ret = -EBUSY;
+ goto out_put_idkp;
+ }
+ /* success: the reference and opener count stay held */
+ return 0;
+
+out_put_idkp:
+ idkp->openers--;
+ ide_disk_put(idkp);
+ return ret;
+}
+
+/*
+ * Block-device ->release callback (see ide_gd_ops).
+ *
+ * When the last opener goes away the disk_ops flush callback is invoked;
+ * for removable drives the door is additionally unlocked and the
+ * format-in-progress flag cleared.  The opener count is then decremented
+ * and one reference dropped (presumably the one taken in ide_gd_open()).
+ *
+ * Always returns 0.
+ */
+static int ide_gd_release(struct inode *inode, struct file *filp)
+{
+ struct gendisk *disk = inode->i_bdev->bd_disk;
+ struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
+ ide_drive_t *drive = idkp->drive;
+
+ ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
+
+ if (idkp->openers == 1)
+ drive->disk_ops->flush(drive);
+
+ if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) {
+ drive->disk_ops->set_doorlock(drive, disk, 0);
+ drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
+ }
+
+ idkp->openers--;
+
+ ide_disk_put(idkp);
+
+ return 0;
+}
+
+/*
+ * Block-device ->getgeo callback: report the drive's bios_head, bios_sect
+ * and bios_cyl values in @geo.  The cylinder count is truncated to 16
+ * bits.  Always returns 0.
+ */
+static int ide_gd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj);
+ ide_drive_t *drive = idkp->drive;
+
+ geo->heads = drive->bios_head;
+ geo->sectors = drive->bios_sect;
+ geo->cylinders = (u16)drive->bios_cyl; /* truncate */
+ return 0;
+}
+
+/*
+ * Block-device ->media_changed callback.
+ *
+ * If IDE_DFLAG_ATTACH is still set, it is cleared and 0 is returned.
+ * Otherwise returns 1 when IDE_DFLAG_MEDIA_CHANGED was set (0 if not) and
+ * clears that flag, so each change is reported once.
+ */
+static int ide_gd_media_changed(struct gendisk *disk)
+{
+ struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
+ ide_drive_t *drive = idkp->drive;
+ int ret;
+
+ /* do not scan partitions twice if this is a removable device */
+ if (drive->dev_flags & IDE_DFLAG_ATTACH) {
+ drive->dev_flags &= ~IDE_DFLAG_ATTACH;
+ return 0;
+ }
+
+ ret = !!(drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED);
+ drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
+
+ return ret;
+}
+
+/*
+ * Block-device ->revalidate_disk callback: copy the drive's cached
+ * capacity (ide_gd_capacity()) into the gendisk.  Always returns 0.
+ */
+static int ide_gd_revalidate_disk(struct gendisk *disk)
+{
+ struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
+ set_capacity(disk, ide_gd_capacity(idkp->drive));
+ return 0;
+}
+
+/*
+ * Block-device ->ioctl callback: resolve the drive behind the inode's
+ * gendisk and forward the call to the ioctl callback of its disk_ops,
+ * returning that callback's result.
+ */
+static int ide_gd_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct block_device *bdev = inode->i_bdev;
+ struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj);
+ ide_drive_t *drive = idkp->drive;
+
+ return drive->disk_ops->ioctl(drive, inode, file, cmd, arg);
+}
+
+/* Block-device operations installed as g->fops by ide_gd_probe(). */
+static struct block_device_operations ide_gd_ops = {
+ .owner = THIS_MODULE,
+ .open = ide_gd_open,
+ .release = ide_gd_release,
+ .ioctl = ide_gd_ioctl,
+ .getgeo = ide_gd_getgeo,
+ .media_changed = ide_gd_media_changed,
+ .revalidate_disk = ide_gd_revalidate_disk
+};
+
+/*
+ * Driver ->probe callback (see ide_gd_driver).
+ *
+ * Picks the disk_ops matching the drive's media type (ATA disk or ATAPI
+ * floppy, each only when its CONFIG_IDE_GD_* option is enabled), lets the
+ * ops' check callback vet the device, then allocates the ide_disk_obj and
+ * the gendisk, wires them to the drive, runs the ops' setup callback and
+ * registers the disk.
+ *
+ * Returns 0 on success and -ENODEV on every failure, including allocation
+ * failures.
+ */
+static int ide_gd_probe(ide_drive_t *drive)
+{
+ const struct ide_disk_ops *disk_ops = NULL;
+ struct ide_disk_obj *idkp;
+ struct gendisk *g;
+
+ /* strstr("foo", "") is non-NULL */
+ if (!strstr("ide-gd", drive->driver_req))
+ goto failed;
+
+#ifdef CONFIG_IDE_GD_ATA
+ if (drive->media == ide_disk)
+ disk_ops = &ide_ata_disk_ops;
+#endif
+#ifdef CONFIG_IDE_GD_ATAPI
+ if (drive->media == ide_floppy)
+ disk_ops = &ide_atapi_disk_ops;
+#endif
+ if (disk_ops == NULL)
+ goto failed;
+
+ if (disk_ops->check(drive, DRV_NAME) == 0) {
+ printk(KERN_ERR PFX "%s: not supported by this driver\n",
+ drive->name);
+ goto failed;
+ }
+
+ idkp = kzalloc(sizeof(*idkp), GFP_KERNEL);
+ if (!idkp) {
+ printk(KERN_ERR PFX "%s: can't allocate a disk structure\n",
+ drive->name);
+ goto failed;
+ }
+
+ g = alloc_disk_node(IDE_DISK_MINORS, hwif_to_node(drive->hwif));
+ if (!g)
+ goto out_free_idkp;
+
+ ide_init_disk(g, drive);
+
+ kref_init(&idkp->kref);
+
+ idkp->drive = drive;
+ idkp->driver = &ide_gd_driver;
+ idkp->disk = g;
+
+ g->private_data = &idkp->driver;
+
+ drive->driver_data = idkp;
+ drive->debug_mask = debug_mask;
+ drive->disk_ops = disk_ops;
+
+ disk_ops->setup(drive);
+
+ set_capacity(g, ide_gd_capacity(drive));
+
+ g->minors = IDE_DISK_MINORS;
+ g->driverfs_dev = &drive->gendev;
+ g->flags |= GENHD_FL_EXT_DEVT;
+ /*
+ * OR the removable bit in: a plain assignment here would discard the
+ * GENHD_FL_EXT_DEVT bit set on the line above.
+ */
+ if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
+ g->flags |= GENHD_FL_REMOVABLE;
+ g->fops = &ide_gd_ops;
+ add_disk(g);
+ return 0;
+
+out_free_idkp:
+ kfree(idkp);
+failed:
+ return -ENODEV;
+}
+
+/*
+ * Module init: log the driver name and version, then register
+ * ide_gd_driver.gen_driver.  Returns driver_register()'s result.
+ */
+static int __init ide_gd_init(void)
+{
+ printk(KERN_INFO DRV_NAME " driver " IDE_GD_VERSION "\n");
+ return driver_register(&ide_gd_driver.gen_driver);
+}
+
+/* Module exit: unregister ide_gd_driver.gen_driver. */
+static void __exit ide_gd_exit(void)
+{
+ driver_unregister(&ide_gd_driver.gen_driver);
+}
+
+MODULE_ALIAS("ide:*m-disk*");
+MODULE_ALIAS("ide-disk");
+MODULE_ALIAS("ide:*m-floppy*");
+MODULE_ALIAS("ide-floppy");
+module_init(ide_gd_init);
+module_exit(ide_gd_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("generic ATA/ATAPI disk driver");
diff --git a/drivers/ide/ide-gd.h b/drivers/ide/ide-gd.h
new file mode 100644
index 0000000..7d3d101
--- /dev/null
+++ b/drivers/ide/ide-gd.h
@@ -0,0 +1,44 @@
+#ifndef __IDE_GD_H
+#define __IDE_GD_H
+
+#define DRV_NAME "ide-gd"
+#define PFX DRV_NAME ": "
+
+/* define to see debug info */
+#define IDE_GD_DEBUG_LOG 0
+
+#if IDE_GD_DEBUG_LOG
+#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, args)
+#else
+#define ide_debug_log(lvl, fmt, args...) do {} while (0)
+#endif
+
+/*
+ * Per-drive driver object; a pointer to it is kept in drive->driver_data.
+ * The packet-command, sense and format fields are the ones that
+ * previously lived in the ide-floppy object.
+ */
+struct ide_disk_obj {
+ ide_drive_t *drive;
+ ide_driver_t *driver;
+ struct gendisk *disk;
+ struct kref kref;
+ unsigned int openers; /* protected by BKL for now */
+
+ /* Last failed packet command */
+ struct ide_atapi_pc *failed_pc;
+ /* used for blk_{fs,pc}_request() requests */
+ struct ide_atapi_pc queued_pc;
+
+ /* Last error information */
+ u8 sense_key, asc, ascq;
+
+ int progress_indication;
+
+ /* Device information */
+ /* Current format */
+ int blocks, block_size, bs_factor;
+ /* Last format capacity descriptor */
+ u8 cap_desc[8];
+ /* Copy of the flexible disk page */
+ u8 flexible_disk_page[32];
+};
+
+sector_t ide_gd_capacity(ide_drive_t *);
+
+#endif /* __IDE_GD_H */
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index b762deb..bb7a1ed 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -755,7 +755,7 @@
udelay(1);
SELECT_DRIVE(drive);
- SELECT_MASK(drive, 0);
+ SELECT_MASK(drive, 1);
udelay(1);
tp_ops->set_irq(hwif, 0);
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 19f8c77..1649ea5 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -208,6 +208,7 @@
drive->ready_stat = 0;
if (ata_id_cdb_intr(id))
drive->atapi_flags |= IDE_AFLAG_DRQ_INTERRUPT;
+ drive->dev_flags |= IDE_DFLAG_DOORLOCKING;
/* we don't do head unloading on ATAPI devices */
drive->dev_flags |= IDE_DFLAG_NO_UNLOAD;
return;
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index b269264..c31d0dd 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -567,10 +567,10 @@
void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver)
{
mutex_lock(&ide_setting_mtx);
- drive->settings = driver->settings;
+ drive->settings = driver->proc_devsets(drive);
mutex_unlock(&ide_setting_mtx);
- ide_add_proc_entries(drive->proc, driver->proc, drive);
+ ide_add_proc_entries(drive->proc, driver->proc_entries(drive), drive);
}
EXPORT_SYMBOL(ide_proc_register_driver);
@@ -591,7 +591,7 @@
{
unsigned long flags;
- ide_remove_proc_entries(drive->proc, driver->proc);
+ ide_remove_proc_entries(drive->proc, driver->proc_entries(drive));
mutex_lock(&ide_setting_mtx);
spin_lock_irqsave(&ide_lock, flags);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d879c77..b2b2e5e 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -2108,7 +2108,7 @@
/* device lacks locking support according to capabilities page */
if ((caps[6] & 1) == 0)
- drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+ drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
if (caps[7] & 0x02)
tape->blk_size = 512;
@@ -2298,6 +2298,16 @@
{ "name", S_IFREG|S_IRUGO, proc_idetape_read_name, NULL },
{ NULL, 0, NULL, NULL }
};
+
+/* proc_entries callback: the tape driver always uses idetape_proc. */
+static ide_proc_entry_t *ide_tape_proc_entries(ide_drive_t *drive)
+{
+ return idetape_proc;
+}
+
+/* proc_devsets callback: the tape driver always uses idetape_settings. */
+static const struct ide_proc_devset *ide_tape_proc_devsets(ide_drive_t *drive)
+{
+ return idetape_settings;
+}
#endif
static int ide_tape_probe(ide_drive_t *);
@@ -2315,8 +2325,8 @@
.end_request = idetape_end_request,
.error = __ide_error,
#ifdef CONFIG_IDE_PROC_FS
- .proc = idetape_proc,
- .settings = idetape_settings,
+ .proc_entries = ide_tape_proc_entries,
+ .proc_devsets = ide_tape_proc_devsets,
#endif
};
diff --git a/drivers/ide/pci/Makefile b/drivers/ide/pci/Makefile
index 02e6ee7..ab44a1f 100644
--- a/drivers/ide/pci/Makefile
+++ b/drivers/ide/pci/Makefile
@@ -11,7 +11,6 @@
obj-$(CONFIG_BLK_DEV_SC1200) += sc1200.o
obj-$(CONFIG_BLK_DEV_CY82C693) += cy82c693.o
obj-$(CONFIG_BLK_DEV_DELKIN) += delkin_cb.o
-obj-$(CONFIG_BLK_DEV_HPT34X) += hpt34x.o
obj-$(CONFIG_BLK_DEV_HPT366) += hpt366.o
obj-$(CONFIG_BLK_DEV_IT8213) += it8213.o
obj-$(CONFIG_BLK_DEV_IT821X) += it821x.o
diff --git a/drivers/ide/pci/delkin_cb.c b/drivers/ide/pci/delkin_cb.c
index 8689a70..8f1b2d9 100644
--- a/drivers/ide/pci/delkin_cb.c
+++ b/drivers/ide/pci/delkin_cb.c
@@ -46,10 +46,27 @@
.quirkproc = ide_undecoded_slave,
};
+/*
+ * Program the controller's registers, using the start of PCI resource 0
+ * as the I/O base: block and clear interrupts, then write every non-zero
+ * entry of the setup[] table to base + index.  setup[] is defined outside
+ * this hunk; using sizeof(setup) as the element count assumes byte-sized
+ * entries (TODO confirm).
+ *
+ * Always returns 0.
+ */
+static unsigned int delkin_cb_init_chipset(struct pci_dev *dev)
+{
+ unsigned long base = pci_resource_start(dev, 0);
+ int i;
+
+ outb(0x02, base + 0x1e); /* set nIEN to block interrupts */
+ inb(base + 0x17); /* read status to clear interrupts */
+
+ for (i = 0; i < sizeof(setup); ++i) {
+ if (setup[i])
+ outb(setup[i], base + i);
+ }
+
+ return 0;
+}
+
static const struct ide_port_info delkin_cb_port_info = {
.port_ops = &delkin_cb_port_ops,
.host_flags = IDE_HFLAG_IO_32BIT | IDE_HFLAG_UNMASK_IRQS |
IDE_HFLAG_NO_DMA,
+ .init_chipset = delkin_cb_init_chipset,
};
static int __devinit
@@ -57,7 +74,7 @@
{
struct ide_host *host;
unsigned long base;
- int i, rc;
+ int rc;
hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
rc = pci_enable_device(dev);
@@ -72,12 +89,8 @@
return rc;
}
base = pci_resource_start(dev, 0);
- outb(0x02, base + 0x1e); /* set nIEN to block interrupts */
- inb(base + 0x17); /* read status to clear interrupts */
- for (i = 0; i < sizeof(setup); ++i) {
- if (setup[i])
- outb(setup[i], base + i);
- }
+
+ delkin_cb_init_chipset(dev);
memset(&hw, 0, sizeof(hw));
ide_std_init_ports(&hw, base + 0x10, base + 0x1e);
@@ -110,6 +123,40 @@
pci_disable_device(dev);
}
+#ifdef CONFIG_PM
+/*
+ * PCI ->suspend callback: save the PCI state, disable the device and
+ * move it to the power state chosen by pci_choose_state() for @state.
+ * Always returns 0.
+ */
+static int delkin_cb_suspend(struct pci_dev *dev, pm_message_t state)
+{
+ pci_save_state(dev);
+ pci_disable_device(dev);
+ pci_set_power_state(dev, pci_choose_state(dev, state));
+
+ return 0;
+}
+
+/*
+ * PCI ->resume callback: bring the device back to D0, re-enable it,
+ * restore the saved PCI state, re-enable bus mastering and re-run the
+ * host's init_chipset hook when one is set.
+ *
+ * Returns 0 on success or the error from pci_enable_device().
+ */
+static int delkin_cb_resume(struct pci_dev *dev)
+{
+ struct ide_host *host = pci_get_drvdata(dev);
+ int rc;
+
+ pci_set_power_state(dev, PCI_D0);
+
+ rc = pci_enable_device(dev);
+ if (rc)
+ return rc;
+
+ pci_restore_state(dev);
+ pci_set_master(dev);
+
+ if (host->init_chipset)
+ host->init_chipset(dev);
+
+ return 0;
+}
+#else
+#define delkin_cb_suspend NULL
+#define delkin_cb_resume NULL
+#endif
+
static struct pci_device_id delkin_cb_pci_tbl[] __devinitdata = {
{ 0x1145, 0xf021, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
{ 0x1145, 0xf024, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
@@ -122,6 +169,8 @@
.id_table = delkin_cb_pci_tbl,
.probe = delkin_cb_probe,
.remove = delkin_cb_remove,
+ .suspend = delkin_cb_suspend,
+ .resume = delkin_cb_resume,
};
static int __init delkin_cb_init(void)
diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c
deleted file mode 100644
index fb1a3aa..0000000
--- a/drivers/ide/pci/hpt34x.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org>
- *
- * May be copied or modified under the terms of the GNU General Public License
- *
- *
- * 00:12.0 Unknown mass storage controller:
- * Triones Technologies, Inc.
- * Unknown device 0003 (rev 01)
- *
- * hde: UDMA 2 (0x0000 0x0002) (0x0000 0x0010)
- * hdf: UDMA 2 (0x0002 0x0012) (0x0010 0x0030)
- * hde: DMA 2 (0x0000 0x0002) (0x0000 0x0010)
- * hdf: DMA 2 (0x0002 0x0012) (0x0010 0x0030)
- * hdg: DMA 1 (0x0012 0x0052) (0x0030 0x0070)
- * hdh: DMA 1 (0x0052 0x0252) (0x0070 0x00f0)
- *
- * ide-pci.c reference
- *
- * Since there are two cards that report almost identically,
- * the only discernable difference is the values reported in pcicmd.
- * Booting-BIOS card or HPT363 :: pcicmd == 0x07
- * Non-bootable card or HPT343 :: pcicmd == 0x05
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-
-#define DRV_NAME "hpt34x"
-
-#define HPT343_DEBUG_DRIVE_INFO 0
-
-static void hpt34x_set_mode(ide_drive_t *drive, const u8 speed)
-{
- struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
- u32 reg1= 0, tmp1 = 0, reg2 = 0, tmp2 = 0;
- u8 hi_speed, lo_speed;
-
- hi_speed = speed >> 4;
- lo_speed = speed & 0x0f;
-
- if (hi_speed & 7) {
- hi_speed = (hi_speed & 4) ? 0x01 : 0x10;
- } else {
- lo_speed <<= 5;
- lo_speed >>= 5;
- }
-
- pci_read_config_dword(dev, 0x44, ®1);
- pci_read_config_dword(dev, 0x48, ®2);
- tmp1 = ((lo_speed << (3*drive->dn)) | (reg1 & ~(7 << (3*drive->dn))));
- tmp2 = ((hi_speed << drive->dn) | (reg2 & ~(0x11 << drive->dn)));
- pci_write_config_dword(dev, 0x44, tmp1);
- pci_write_config_dword(dev, 0x48, tmp2);
-
-#if HPT343_DEBUG_DRIVE_INFO
- printk("%s: %s drive%d (0x%04x 0x%04x) (0x%04x 0x%04x)" \
- " (0x%02x 0x%02x)\n",
- drive->name, ide_xfer_verbose(speed),
- drive->dn, reg1, tmp1, reg2, tmp2,
- hi_speed, lo_speed);
-#endif /* HPT343_DEBUG_DRIVE_INFO */
-}
-
-static void hpt34x_set_pio_mode(ide_drive_t *drive, const u8 pio)
-{
- hpt34x_set_mode(drive, XFER_PIO_0 + pio);
-}
-
-/*
- * If the BIOS does not set the IO base addaress to XX00, 343 will fail.
- */
-#define HPT34X_PCI_INIT_REG 0x80
-
-static unsigned int init_chipset_hpt34x(struct pci_dev *dev)
-{
- int i = 0;
- unsigned long hpt34xIoBase = pci_resource_start(dev, 4);
- unsigned long hpt_addr[4] = { 0x20, 0x34, 0x28, 0x3c };
- unsigned long hpt_addr_len[4] = { 7, 3, 7, 3 };
- u16 cmd;
- unsigned long flags;
-
- local_irq_save(flags);
-
- pci_write_config_byte(dev, HPT34X_PCI_INIT_REG, 0x00);
- pci_read_config_word(dev, PCI_COMMAND, &cmd);
-
- if (cmd & PCI_COMMAND_MEMORY)
- pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0xF0);
- else
- pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x20);
-
- /*
- * Since 20-23 can be assigned and are R/W, we correct them.
- */
- pci_write_config_word(dev, PCI_COMMAND, cmd & ~PCI_COMMAND_IO);
- for(i=0; i<4; i++) {
- dev->resource[i].start = (hpt34xIoBase + hpt_addr[i]);
- dev->resource[i].end = dev->resource[i].start + hpt_addr_len[i];
- dev->resource[i].flags = IORESOURCE_IO;
- pci_write_config_dword(dev,
- (PCI_BASE_ADDRESS_0 + (i * 4)),
- dev->resource[i].start);
- }
- pci_write_config_word(dev, PCI_COMMAND, cmd);
-
- local_irq_restore(flags);
-
- return dev->irq;
-}
-
-static const struct ide_port_ops hpt34x_port_ops = {
- .set_pio_mode = hpt34x_set_pio_mode,
- .set_dma_mode = hpt34x_set_mode,
-};
-
-#define IDE_HFLAGS_HPT34X \
- (IDE_HFLAG_NO_ATAPI_DMA | \
- IDE_HFLAG_NO_DSC | \
- IDE_HFLAG_NO_AUTODMA)
-
-static const struct ide_port_info hpt34x_chipsets[] __devinitdata = {
- { /* 0: HPT343 */
- .name = DRV_NAME,
- .init_chipset = init_chipset_hpt34x,
- .port_ops = &hpt34x_port_ops,
- .host_flags = IDE_HFLAGS_HPT34X | IDE_HFLAG_NON_BOOTABLE,
- .pio_mask = ATA_PIO5,
- },
- { /* 1: HPT345 */
- .name = DRV_NAME,
- .init_chipset = init_chipset_hpt34x,
- .port_ops = &hpt34x_port_ops,
- .host_flags = IDE_HFLAGS_HPT34X | IDE_HFLAG_OFF_BOARD,
- .pio_mask = ATA_PIO5,
-#ifdef CONFIG_HPT34X_AUTODMA
- .swdma_mask = ATA_SWDMA2,
- .mwdma_mask = ATA_MWDMA2,
- .udma_mask = ATA_UDMA2,
-#endif
- }
-};
-
-static int __devinit hpt34x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
- const struct ide_port_info *d;
- u16 pcicmd = 0;
-
- pci_read_config_word(dev, PCI_COMMAND, &pcicmd);
-
- d = &hpt34x_chipsets[(pcicmd & PCI_COMMAND_MEMORY) ? 1 : 0];
-
- return ide_pci_init_one(dev, d, NULL);
-}
-
-static const struct pci_device_id hpt34x_pci_tbl[] = {
- { PCI_VDEVICE(TTI, PCI_DEVICE_ID_TTI_HPT343), 0 },
- { 0, },
-};
-MODULE_DEVICE_TABLE(pci, hpt34x_pci_tbl);
-
-static struct pci_driver hpt34x_pci_driver = {
- .name = "HPT34x_IDE",
- .id_table = hpt34x_pci_tbl,
- .probe = hpt34x_init_one,
- .remove = ide_pci_remove,
- .suspend = ide_pci_suspend,
- .resume = ide_pci_resume,
-};
-
-static int __init hpt34x_ide_init(void)
-{
- return ide_pci_register_driver(&hpt34x_pci_driver);
-}
-
-static void __exit hpt34x_ide_exit(void)
-{
- pci_unregister_driver(&hpt34x_pci_driver);
-}
-
-module_init(hpt34x_ide_init);
-module_exit(hpt34x_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick");
-MODULE_DESCRIPTION("PCI driver module for Highpoint 34x IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index 9cf171c..a7909e9 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -3,7 +3,7 @@
* Portions Copyright (C) 2001 Sun Microsystems, Inc.
* Portions Copyright (C) 2003 Red Hat Inc
* Portions Copyright (C) 2007 Bartlomiej Zolnierkiewicz
- * Portions Copyright (C) 2005-2007 MontaVista Software, Inc.
+ * Portions Copyright (C) 2005-2008 MontaVista Software, Inc.
*
* Thanks to HighPoint Technologies for their assistance, and hardware.
* Special Thanks to Jon Burchmore in SanDiego for the deep pockets, his
@@ -748,26 +748,24 @@
struct pci_dev *dev = to_pci_dev(hwif->dev);
struct hpt_info *info = hpt3xx_get_info(hwif->dev);
- if (drive->quirk_list) {
- if (info->chip_type >= HPT370) {
- u8 scr1 = 0;
+ if (drive->quirk_list == 0)
+ return;
- pci_read_config_byte(dev, 0x5a, &scr1);
- if (((scr1 & 0x10) >> 4) != mask) {
- if (mask)
- scr1 |= 0x10;
- else
- scr1 &= ~0x10;
- pci_write_config_byte(dev, 0x5a, scr1);
- }
- } else {
+ if (info->chip_type >= HPT370) {
+ u8 scr1 = 0;
+
+ pci_read_config_byte(dev, 0x5a, &scr1);
+ if (((scr1 & 0x10) >> 4) != mask) {
if (mask)
- disable_irq(hwif->irq);
+ scr1 |= 0x10;
else
- enable_irq (hwif->irq);
+ scr1 &= ~0x10;
+ pci_write_config_byte(dev, 0x5a, scr1);
}
- } else
- outb(ATA_DEVCTL_OBS | (mask ? 2 : 0), hwif->io_ports.ctl_addr);
+ } else if (mask)
+ disable_irq(hwif->irq);
+ else
+ enable_irq(hwif->irq);
}
/*
@@ -1289,7 +1287,6 @@
static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
{
- struct pci_dev *dev = to_pci_dev(hwif->dev);
struct hpt_info *info = hpt3xx_get_info(hwif->dev);
int serialize = HPT_SERIALIZE_IO;
u8 chip_type = info->chip_type;
diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c
index 9ce1d80..49f163a 100644
--- a/drivers/ide/pci/scc_pata.c
+++ b/drivers/ide/pci/scc_pata.c
@@ -617,7 +617,6 @@
unsigned long intmask_port;
unsigned long mode_port;
unsigned long ecmode_port;
- unsigned long dma_status_port;
u32 reg = 0;
struct scc_ports *ports;
int rc;
@@ -637,7 +636,6 @@
intmask_port = dma_base + 0x010;
mode_port = ctl_base + 0x024;
ecmode_port = ctl_base + 0xf00;
- dma_status_port = dma_base + 0x004;
/* controller initialization */
reg = 0;
@@ -843,8 +841,6 @@
static void __devinit init_hwif_scc(ide_hwif_t *hwif)
{
- struct scc_ports *ports = ide_get_hwifdata(hwif);
-
/* PTERADD */
out_be32((void __iomem *)(hwif->dma_base + 0x018), hwif->dmatable_dma);
diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c
index dd63454..8af9b23 100644
--- a/drivers/ide/pci/sgiioc4.c
+++ b/drivers/ide/pci/sgiioc4.c
@@ -101,18 +101,8 @@
for (i = 0; i <= 7; i++)
hw->io_ports_array[i] = reg + i * 4;
- if (ctrl_port)
- hw->io_ports.ctl_addr = ctrl_port;
-
- if (irq_port)
- hw->io_ports.irq_addr = irq_port;
-}
-
-static void
-sgiioc4_maskproc(ide_drive_t * drive, int mask)
-{
- writeb(ATA_DEVCTL_OBS | (mask ? 2 : 0),
- (void __iomem *)drive->hwif->io_ports.ctl_addr);
+ hw->io_ports.ctl_addr = ctrl_port;
+ hw->io_ports.irq_addr = irq_port;
}
static int
@@ -310,16 +300,14 @@
unsigned long port = hwif->io_ports.status_addr;
u8 reg = (u8) readb((void __iomem *) port);
- if ((port & 0xFFF) == 0x11C) { /* Status register of IOC4 */
- if (!(reg & ATA_BUSY)) { /* Not busy... check for interrupt */
- unsigned long other_ir = port - 0x110;
- unsigned int intr_reg = (u32) readl((void __iomem *) other_ir);
+ if (!(reg & ATA_BUSY)) { /* Not busy... check for interrupt */
+ unsigned long other_ir = port - 0x110;
+ unsigned int intr_reg = (u32) readl((void __iomem *) other_ir);
- /* Clear the Interrupt, Error bits on the IOC4 */
- if (intr_reg & 0x03) {
- writel(0x03, (void __iomem *) other_ir);
- intr_reg = (u32) readl((void __iomem *) other_ir);
- }
+ /* Clear the Interrupt, Error bits on the IOC4 */
+ if (intr_reg & 0x03) {
+ writel(0x03, (void __iomem *) other_ir);
+ intr_reg = (u32) readl((void __iomem *) other_ir);
}
}
@@ -332,13 +320,9 @@
{
struct pci_dev *dev = to_pci_dev(hwif->dev);
unsigned long dma_base = pci_resource_start(dev, 0) + IOC4_DMA_OFFSET;
- void __iomem *virt_dma_base;
int num_ports = sizeof (ioc4_dma_regs_t);
void *pad;
- if (dma_base == 0)
- return -1;
-
printk(KERN_INFO " %s: MMIO-DMA\n", hwif->name);
if (request_mem_region(dma_base, num_ports, hwif->name) == NULL) {
@@ -348,14 +332,8 @@
return -1;
}
- virt_dma_base = ioremap(dma_base, num_ports);
- if (virt_dma_base == NULL) {
- printk(KERN_ERR "%s(%s) -- ERROR: unable to map addresses "
- "0x%lx to 0x%lx\n", __func__, hwif->name,
- dma_base, dma_base + num_ports - 1);
- goto dma_remap_failure;
- }
- hwif->dma_base = (unsigned long) virt_dma_base;
+ hwif->dma_base = (unsigned long)hwif->io_ports.irq_addr +
+ IOC4_DMA_OFFSET;
hwif->sg_max_nents = IOC4_PRD_ENTRIES;
@@ -379,9 +357,6 @@
printk(KERN_INFO "%s: changing from DMA to PIO mode", hwif->name);
dma_pci_alloc_failure:
- iounmap(virt_dma_base);
-
-dma_remap_failure:
release_mem_region(dma_base, num_ports);
return -1;
@@ -563,8 +538,6 @@
.set_dma_mode = sgiioc4_set_dma_mode,
/* reset DMA engine, clear IRQs */
.resetproc = sgiioc4_resetproc,
- /* mask on/off NIEN register */
- .maskproc = sgiioc4_maskproc,
};
static const struct ide_dma_ops sgiioc4_dma_ops = {
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index e3e4042..c7ff1e1 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -179,7 +179,7 @@
config LEDS_TRIGGER_IDE_DISK
bool "LED IDE Disk Trigger"
- depends on LEDS_TRIGGERS && BLK_DEV_IDEDISK
+ depends on LEDS_TRIGGERS && IDE_GD_ATA
help
This allows LEDs to be controlled by IDE disk activity.
If unsure, say Y.
diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index ba5aa20..e4c0db4 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -123,7 +123,7 @@
irqnr = asic->irq_base +
(ASIC3_GPIOS_PER_BANK * bank)
+ i;
- desc = irq_desc + irqnr;
+ desc = irq_to_desc(irqnr);
desc->handle_irq(irqnr, desc);
if (asic->irq_bothedge[bank] & bit)
asic3_irq_flip_edge(asic, base,
@@ -136,7 +136,7 @@
for (i = ASIC3_NUM_GPIOS; i < ASIC3_NR_IRQS; i++) {
/* They start at bit 4 and go up */
if (status & (1 << (i - ASIC3_NUM_GPIOS + 4))) {
- desc = irq_desc + asic->irq_base + i;
+ desc = irq_to_desc(asic->irq_base + i);
desc->handle_irq(asic->irq_base + i,
desc);
}
diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c
index 50dff6e..1a4d046 100644
--- a/drivers/mfd/htc-egpio.c
+++ b/drivers/mfd/htc-egpio.c
@@ -112,7 +112,7 @@
/* Run irq handler */
pr_debug("got IRQ %d\n", irqpin);
irq = ei->irq_start + irqpin;
- desc = &irq_desc[irq];
+ desc = irq_to_desc(irq);
desc->handle_irq(irq, desc);
}
}
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 406989e..7a72e75 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -132,6 +132,7 @@
blk_queue_prep_rq(mq->queue, mmc_prep_request);
blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN, NULL);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
#ifdef CONFIG_MMC_BLOCK_BOUNCE
if (host->max_hw_segs == 1) {
diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c
index ae16d84..3b2085b 100644
--- a/drivers/mmc/host/s3cmci.c
+++ b/drivers/mmc/host/s3cmci.c
@@ -3,6 +3,9 @@
*
* Copyright (C) 2004-2006 maintech GmbH, Thomas Kleffel <tk@maintech.de>
*
+ * Current driver maintained by Ben Dooks and Simtec Electronics
+ * Copyright (C) 2008 Simtec Electronics <ben-linux@fluff.org>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
@@ -13,6 +16,7 @@
#include <linux/clk.h>
#include <linux/mmc/host.h>
#include <linux/platform_device.h>
+#include <linux/cpufreq.h>
#include <linux/irq.h>
#include <linux/io.h>
@@ -39,9 +43,9 @@
dbg_conf = (1 << 8),
};
-static const int dbgmap_err = dbg_err | dbg_fail;
+static const int dbgmap_err = dbg_fail;
static const int dbgmap_info = dbg_info | dbg_conf;
-static const int dbgmap_debug = dbg_debug;
+static const int dbgmap_debug = dbg_err | dbg_debug;
#define dbg(host, channels, args...) \
do { \
@@ -189,7 +193,7 @@
}
static inline int get_data_buffer(struct s3cmci_host *host,
- u32 *words, u32 **pointer)
+ u32 *bytes, u32 **pointer)
{
struct scatterlist *sg;
@@ -206,7 +210,7 @@
}
sg = &host->mrq->data->sg[host->pio_sgptr];
- *words = sg->length >> 2;
+ *bytes = sg->length;
*pointer = sg_virt(sg);
host->pio_sgptr++;
@@ -222,7 +226,7 @@
u32 fifostat = readl(host->base + S3C2410_SDIFSTA);
fifostat &= S3C2410_SDIFSTA_COUNTMASK;
- return fifostat >> 2;
+ return fifostat;
}
static inline u32 fifo_free(struct s3cmci_host *host)
@@ -230,13 +234,15 @@
u32 fifostat = readl(host->base + S3C2410_SDIFSTA);
fifostat &= S3C2410_SDIFSTA_COUNTMASK;
- return (63 - fifostat) >> 2;
+ return 63 - fifostat;
}
static void do_pio_read(struct s3cmci_host *host)
{
int res;
u32 fifo;
+ u32 *ptr;
+ u32 fifo_words;
void __iomem *from_ptr;
/* write real prescaler to host, it might be set slow to fix */
@@ -245,8 +251,8 @@
from_ptr = host->base + host->sdidata;
while ((fifo = fifo_count(host))) {
- if (!host->pio_words) {
- res = get_data_buffer(host, &host->pio_words,
+ if (!host->pio_bytes) {
+ res = get_data_buffer(host, &host->pio_bytes,
&host->pio_ptr);
if (res) {
host->pio_active = XFER_NONE;
@@ -259,26 +265,47 @@
dbg(host, dbg_pio,
"pio_read(): new target: [%i]@[%p]\n",
- host->pio_words, host->pio_ptr);
+ host->pio_bytes, host->pio_ptr);
}
dbg(host, dbg_pio,
"pio_read(): fifo:[%02i] buffer:[%03i] dcnt:[%08X]\n",
- fifo, host->pio_words,
+ fifo, host->pio_bytes,
readl(host->base + S3C2410_SDIDCNT));
- if (fifo > host->pio_words)
- fifo = host->pio_words;
+ /* If we have reached the end of the block, we can
+ * read a word and get 1 to 3 bytes. If we in the
+ * middle of the block, we have to read full words,
+ * otherwise we will write garbage, so round down to
+ * an even multiple of 4. */
+ if (fifo >= host->pio_bytes)
+ fifo = host->pio_bytes;
+ else
+ fifo -= fifo & 3;
- host->pio_words -= fifo;
+ host->pio_bytes -= fifo;
host->pio_count += fifo;
- while (fifo--)
- *(host->pio_ptr++) = readl(from_ptr);
+ fifo_words = fifo >> 2;
+ ptr = host->pio_ptr;
+ while (fifo_words--)
+ *ptr++ = readl(from_ptr);
+ host->pio_ptr = ptr;
+
+ if (fifo & 3) {
+ u32 n = fifo & 3;
+ u32 data = readl(from_ptr);
+ u8 *p = (u8 *)host->pio_ptr;
+
+ while (n--) {
+ *p++ = data;
+ data >>= 8;
+ }
+ }
}
- if (!host->pio_words) {
- res = get_data_buffer(host, &host->pio_words, &host->pio_ptr);
+ if (!host->pio_bytes) {
+ res = get_data_buffer(host, &host->pio_bytes, &host->pio_ptr);
if (res) {
dbg(host, dbg_pio,
"pio_read(): complete (no more buffers).\n");
@@ -298,12 +325,13 @@
void __iomem *to_ptr;
int res;
u32 fifo;
+ u32 *ptr;
to_ptr = host->base + host->sdidata;
while ((fifo = fifo_free(host))) {
- if (!host->pio_words) {
- res = get_data_buffer(host, &host->pio_words,
+ if (!host->pio_bytes) {
+ res = get_data_buffer(host, &host->pio_bytes,
&host->pio_ptr);
if (res) {
dbg(host, dbg_pio,
@@ -315,18 +343,27 @@
dbg(host, dbg_pio,
"pio_write(): new source: [%i]@[%p]\n",
- host->pio_words, host->pio_ptr);
+ host->pio_bytes, host->pio_ptr);
}
- if (fifo > host->pio_words)
- fifo = host->pio_words;
+ /* If we have reached the end of the block, we have to
+ * write exactly the remaining number of bytes. If we
+ * in the middle of the block, we have to write full
+ * words, so round down to an even multiple of 4. */
+ if (fifo >= host->pio_bytes)
+ fifo = host->pio_bytes;
+ else
+ fifo -= fifo & 3;
- host->pio_words -= fifo;
+ host->pio_bytes -= fifo;
host->pio_count += fifo;
+ fifo = (fifo + 3) >> 2;
+ ptr = host->pio_ptr;
while (fifo--)
- writel(*(host->pio_ptr++), to_ptr);
+ writel(*ptr++, to_ptr);
+ host->pio_ptr = ptr;
}
enable_imask(host, S3C2410_SDIIMSK_TXFIFOHALF);
@@ -349,9 +386,9 @@
clear_imask(host);
if (host->pio_active != XFER_NONE) {
dbg(host, dbg_err, "unfinished %s "
- "- pio_count:[%u] pio_words:[%u]\n",
+ "- pio_count:[%u] pio_bytes:[%u]\n",
(host->pio_active == XFER_READ) ? "read" : "write",
- host->pio_count, host->pio_words);
+ host->pio_count, host->pio_bytes);
if (host->mrq->data)
host->mrq->data->error = -EINVAL;
@@ -812,11 +849,10 @@
/* We cannot deal with unaligned blocks with more than
* one block being transfered. */
- if (data->blocks > 1)
+ if (data->blocks > 1) {
+ pr_warning("%s: can't do non-word sized block transfers (blksz %d)\n", __func__, data->blksz);
return -EINVAL;
-
- /* No support yet for non-word block transfers. */
- return -EINVAL;
+ }
}
while (readl(host->base + S3C2410_SDIDSTA) &
@@ -896,7 +932,7 @@
BUG_ON((data->flags & BOTH_DIR) == BOTH_DIR);
host->pio_sgptr = 0;
- host->pio_words = 0;
+ host->pio_bytes = 0;
host->pio_count = 0;
host->pio_active = rw ? XFER_WRITE : XFER_READ;
@@ -1033,10 +1069,33 @@
s3cmci_send_request(mmc);
}
+static void s3cmci_set_clk(struct s3cmci_host *host, struct mmc_ios *ios)
+{
+ u32 mci_psc;
+
+ /* Set clock */
+ for (mci_psc = 0; mci_psc < 255; mci_psc++) {
+ host->real_rate = host->clk_rate / (host->clk_div*(mci_psc+1));
+
+ if (host->real_rate <= ios->clock)
+ break;
+ }
+
+ if (mci_psc > 255)
+ mci_psc = 255;
+
+ host->prescaler = mci_psc;
+ writel(host->prescaler, host->base + S3C2410_SDIPRE);
+
+ /* If requested clock is 0, real_rate will be 0, too */
+ if (ios->clock == 0)
+ host->real_rate = 0;
+}
+
static void s3cmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
{
struct s3cmci_host *host = mmc_priv(mmc);
- u32 mci_psc, mci_con;
+ u32 mci_con;
/* Set the power state */
@@ -1074,23 +1133,7 @@
break;
}
- /* Set clock */
- for (mci_psc = 0; mci_psc < 255; mci_psc++) {
- host->real_rate = host->clk_rate / (host->clk_div*(mci_psc+1));
-
- if (host->real_rate <= ios->clock)
- break;
- }
-
- if (mci_psc > 255)
- mci_psc = 255;
-
- host->prescaler = mci_psc;
- writel(host->prescaler, host->base + S3C2410_SDIPRE);
-
- /* If requested clock is 0, real_rate will be 0, too */
- if (ios->clock == 0)
- host->real_rate = 0;
+ s3cmci_set_clk(host, ios);
/* Set CLOCK_ENABLE */
if (ios->clock)
@@ -1148,6 +1191,61 @@
* checks. Any zero fields to ensure reaonable defaults are picked. */
};
+#ifdef CONFIG_CPU_FREQ
+
+static int s3cmci_cpufreq_transition(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct s3cmci_host *host;
+ struct mmc_host *mmc;
+ unsigned long newclk;
+ unsigned long flags;
+
+ host = container_of(nb, struct s3cmci_host, freq_transition);
+ newclk = clk_get_rate(host->clk);
+ mmc = host->mmc;
+
+ if ((val == CPUFREQ_PRECHANGE && newclk > host->clk_rate) ||
+ (val == CPUFREQ_POSTCHANGE && newclk < host->clk_rate)) {
+ spin_lock_irqsave(&mmc->lock, flags);
+
+ host->clk_rate = newclk;
+
+ if (mmc->ios.power_mode != MMC_POWER_OFF &&
+ mmc->ios.clock != 0)
+ s3cmci_set_clk(host, &mmc->ios);
+
+ spin_unlock_irqrestore(&mmc->lock, flags);
+ }
+
+ return 0;
+}
+
+static inline int s3cmci_cpufreq_register(struct s3cmci_host *host)
+{
+ host->freq_transition.notifier_call = s3cmci_cpufreq_transition;
+
+ return cpufreq_register_notifier(&host->freq_transition,
+ CPUFREQ_TRANSITION_NOTIFIER);
+}
+
+static inline void s3cmci_cpufreq_deregister(struct s3cmci_host *host)
+{
+ cpufreq_unregister_notifier(&host->freq_transition,
+ CPUFREQ_TRANSITION_NOTIFIER);
+}
+
+#else
+static inline int s3cmci_cpufreq_register(struct s3cmci_host *host)
+{
+ return 0;
+}
+
+static inline void s3cmci_cpufreq_deregister(struct s3cmci_host *host)
+{
+}
+#endif
+
static int __devinit s3cmci_probe(struct platform_device *pdev, int is2440)
{
struct s3cmci_host *host;
@@ -1298,10 +1396,16 @@
(host->is2440?"2440":""),
host->base, host->irq, host->irq_cd, host->dma);
+ ret = s3cmci_cpufreq_register(host);
+ if (ret) {
+ dev_err(&pdev->dev, "failed to register cpufreq\n");
+ goto free_dmabuf;
+ }
+
ret = mmc_add_host(mmc);
if (ret) {
dev_err(&pdev->dev, "failed to add mmc host.\n");
- goto free_dmabuf;
+ goto free_cpufreq;
}
platform_set_drvdata(pdev, mmc);
@@ -1309,6 +1413,9 @@
return 0;
+ free_cpufreq:
+ s3cmci_cpufreq_deregister(host);
+
free_dmabuf:
clk_disable(host->clk);
@@ -1342,6 +1449,7 @@
if (host->irq_cd >= 0)
free_irq(host->irq_cd, host);
+ s3cmci_cpufreq_deregister(host);
mmc_remove_host(mmc);
clk_disable(host->clk);
}
@@ -1455,7 +1563,7 @@
MODULE_DESCRIPTION("Samsung S3C MMC/SD Card Interface driver");
MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Thomas Kleffel <tk@maintech.de>");
+MODULE_AUTHOR("Thomas Kleffel <tk@maintech.de>, Ben Dooks <ben-linux@fluff.org>");
MODULE_ALIAS("platform:s3c2410-sdi");
MODULE_ALIAS("platform:s3c2412-sdi");
MODULE_ALIAS("platform:s3c2440-sdi");
diff --git a/drivers/mmc/host/s3cmci.h b/drivers/mmc/host/s3cmci.h
index 37d9c60..ca1ba3d 100644
--- a/drivers/mmc/host/s3cmci.h
+++ b/drivers/mmc/host/s3cmci.h
@@ -51,7 +51,7 @@
int dma_complete;
u32 pio_sgptr;
- u32 pio_words;
+ u32 pio_bytes;
u32 pio_count;
u32 *pio_ptr;
#define XFER_NONE 0
@@ -67,4 +67,8 @@
unsigned int ccnt, dcnt;
struct tasklet_struct pio_tasklet;
+
+#ifdef CONFIG_CPU_FREQ
+ struct notifier_block freq_transition;
+#endif
};
diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index 491ee16..9ba295d 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -90,7 +90,7 @@
#include <linux/eisa.h>
#include <linux/bitops.h>
#include <linux/jiffies.h>
-#include <asm/irq.h> /* For NR_IRQS only. */
+#include <asm/irq.h> /* For nr_irqs only. */
#include <asm/io.h>
#include <asm/uaccess.h>
@@ -1221,7 +1221,7 @@
if (print_info)
printk(", IRQ %d\n", dev->irq);
/* Tell them about an invalid IRQ. */
- if (dev->irq <= 0 || dev->irq >= NR_IRQS)
+ if (dev->irq <= 0 || dev->irq >= nr_irqs)
printk(KERN_WARNING " *** Warning: IRQ %d is unlikely to work! ***\n",
dev->irq);
diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c
index 17ac697..b6a816e 100644
--- a/drivers/net/hamradio/baycom_ser_fdx.c
+++ b/drivers/net/hamradio/baycom_ser_fdx.c
@@ -416,10 +416,10 @@
if (!dev || !bc)
return -ENXIO;
if (!dev->base_addr || dev->base_addr > 0xffff-SER12_EXTENT ||
- dev->irq < 2 || dev->irq > NR_IRQS) {
+ dev->irq < 2 || dev->irq > nr_irqs) {
printk(KERN_INFO "baycom_ser_fdx: invalid portnumber (max %u) "
"or irq (2 <= irq <= %d)\n",
- 0xffff-SER12_EXTENT, NR_IRQS);
+ 0xffff-SER12_EXTENT, nr_irqs);
return -ENXIO;
}
if (bc->baud < 300 || bc->baud > 4800) {
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
index 45ae9d1..c17e39b 100644
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -1465,7 +1465,7 @@
printk(KERN_INFO "Init Z8530 driver: %u channels, IRQ", Nchips*2);
flag=" ";
- for (k = 0; k < NR_IRQS; k++)
+ for (k = 0; k < nr_irqs; k++)
if (Ivec[k].used)
{
printk("%s%d", flag, k);
@@ -1728,7 +1728,7 @@
if (hwcfg.irq == 2) hwcfg.irq = 9;
- if (hwcfg.irq < 0 || hwcfg.irq >= NR_IRQS)
+ if (hwcfg.irq < 0 || hwcfg.irq >= nr_irqs)
return -EINVAL;
if (!Ivec[hwcfg.irq].used && hwcfg.irq)
@@ -2148,7 +2148,7 @@
}
/* To unload the port must be closed so no real IRQ pending */
- for (k=0; k < NR_IRQS ; k++)
+ for (k = 0; k < nr_irqs ; k++)
if (Ivec[k].used) free_irq(k, NULL);
local_irq_enable();
diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c
index f972fef..ee51b6a 100644
--- a/drivers/net/wan/sbni.c
+++ b/drivers/net/wan/sbni.c
@@ -318,7 +318,7 @@
continue;
}
- if( pci_irq_line <= 0 || pci_irq_line >= NR_IRQS )
+ if (pci_irq_line <= 0 || pci_irq_line >= nr_irqs)
printk( KERN_WARNING " WARNING: The PCI BIOS assigned "
"this PCI card to IRQ %d, which is unlikely "
"to work!.\n"
diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index fd56128..3bc54b3 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -298,7 +298,8 @@
static void dino_disable_irq(unsigned int irq)
{
- struct dino_device *dino_dev = irq_desc[irq].chip_data;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct dino_device *dino_dev = desc->chip_data;
int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, irq);
@@ -310,7 +311,8 @@
static void dino_enable_irq(unsigned int irq)
{
- struct dino_device *dino_dev = irq_desc[irq].chip_data;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct dino_device *dino_dev = desc->chip_data;
int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
u32 tmp;
diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c
index 771cef5..7891db5 100644
--- a/drivers/parisc/eisa.c
+++ b/drivers/parisc/eisa.c
@@ -346,10 +346,10 @@
}
/* Reserve IRQ2 */
- irq_desc[2].action = &irq2_action;
+ irq_to_desc(2)->action = &irq2_action;
for (i = 0; i < 16; i++) {
- irq_desc[i].chip = &eisa_interrupt_type;
+ irq_to_desc(i)->chip = &eisa_interrupt_type;
}
EISA_bus = 1;
diff --git a/drivers/parisc/gsc.c b/drivers/parisc/gsc.c
index f7d088b..e76db9e 100644
--- a/drivers/parisc/gsc.c
+++ b/drivers/parisc/gsc.c
@@ -108,7 +108,8 @@
static void gsc_asic_disable_irq(unsigned int irq)
{
- struct gsc_asic *irq_dev = irq_desc[irq].chip_data;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct gsc_asic *irq_dev = desc->chip_data;
int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32);
u32 imr;
@@ -123,7 +124,8 @@
static void gsc_asic_enable_irq(unsigned int irq)
{
- struct gsc_asic *irq_dev = irq_desc[irq].chip_data;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct gsc_asic *irq_dev = desc->chip_data;
int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32);
u32 imr;
@@ -159,12 +161,14 @@
int gsc_assign_irq(struct hw_interrupt_type *type, void *data)
{
static int irq = GSC_IRQ_BASE;
+ struct irq_desc *desc;
if (irq > GSC_IRQ_MAX)
return NO_IRQ;
- irq_desc[irq].chip = type;
- irq_desc[irq].chip_data = data;
+ desc = irq_to_desc(irq);
+ desc->chip = type;
+ desc->chip_data = data;
return irq++;
}
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 6fb3f79..7beffca 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -619,7 +619,9 @@
static struct vector_info *iosapic_get_vector(unsigned int irq)
{
- return irq_desc[irq].chip_data;
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc->chip_data;
}
static void iosapic_disable_irq(unsigned int irq)
diff --git a/drivers/parisc/superio.c b/drivers/parisc/superio.c
index 1e8d2d1..1e93c83 100644
--- a/drivers/parisc/superio.c
+++ b/drivers/parisc/superio.c
@@ -363,7 +363,9 @@
#endif
for (i = 0; i < 16; i++) {
- irq_desc[i].chip = &superio_interrupt_type;
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->chip = &superio_interrupt_type;
}
/*
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index e842e75..8b29c30 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -193,7 +193,7 @@
{
struct acpi_dmar_hardware_unit *drhd;
static int include_all;
- int ret;
+ int ret = 0;
drhd = (struct acpi_dmar_hardware_unit *) dmaru->hdr;
@@ -212,7 +212,7 @@
include_all = 1;
}
- if (ret || (dmaru->devices_cnt == 0 && !dmaru->include_all)) {
+ if (ret) {
list_del(&dmaru->list);
kfree(dmaru);
}
@@ -289,6 +289,24 @@
}
}
+/**
+ * dmar_table_detect - checks to see if the platform supports DMAR devices
+ */
+static int __init dmar_table_detect(void)
+{
+ acpi_status status = AE_OK;
+
+ /* if we could find DMAR table, then there are DMAR devices */
+ status = acpi_get_table(ACPI_SIG_DMAR, 0,
+ (struct acpi_table_header **)&dmar_tbl);
+
+ if (ACPI_SUCCESS(status) && !dmar_tbl) {
+ printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
+ status = AE_NOT_FOUND;
+ }
+
+ return (ACPI_SUCCESS(status) ? 1 : 0);
+}
/**
* parse_dmar_table - parses the DMA reporting table
@@ -300,6 +318,12 @@
struct acpi_dmar_header *entry_header;
int ret = 0;
+ /*
+ * Do it again, earlier dmar_tbl mapping could be mapped with
+ * fixed map.
+ */
+ dmar_table_detect();
+
dmar = (struct acpi_table_dmar *)dmar_tbl;
if (!dmar)
return -ENODEV;
@@ -373,10 +397,10 @@
int __init dmar_dev_scope_init(void)
{
- struct dmar_drhd_unit *drhd;
+ struct dmar_drhd_unit *drhd, *drhd_n;
int ret = -ENODEV;
- for_each_drhd_unit(drhd) {
+ list_for_each_entry_safe(drhd, drhd_n, &dmar_drhd_units, list) {
ret = dmar_parse_dev(drhd);
if (ret)
return ret;
@@ -384,8 +408,8 @@
#ifdef CONFIG_DMAR
{
- struct dmar_rmrr_unit *rmrr;
- for_each_rmrr_units(rmrr) {
+ struct dmar_rmrr_unit *rmrr, *rmrr_n;
+ list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
ret = rmrr_parse_dev(rmrr);
if (ret)
return ret;
@@ -430,30 +454,11 @@
return 0;
}
-/**
- * early_dmar_detect - checks to see if the platform supports DMAR devices
- */
-int __init early_dmar_detect(void)
-{
- acpi_status status = AE_OK;
-
- /* if we could find DMAR table, then there are DMAR devices */
- status = acpi_get_table(ACPI_SIG_DMAR, 0,
- (struct acpi_table_header **)&dmar_tbl);
-
- if (ACPI_SUCCESS(status) && !dmar_tbl) {
- printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
- status = AE_NOT_FOUND;
- }
-
- return (ACPI_SUCCESS(status) ? 1 : 0);
-}
-
void __init detect_intel_iommu(void)
{
int ret;
- ret = early_dmar_detect();
+ ret = dmar_table_detect();
#ifdef CONFIG_DMAR
{
@@ -479,14 +484,16 @@
" x2apic support\n");
dmar_disabled = 1;
- return;
+ goto end;
}
if (ret && !no_iommu && !iommu_detected && !swiotlb &&
!dmar_disabled)
iommu_detected = 1;
}
+end:
#endif
+ dmar_tbl = NULL;
}
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index 279c940..bf7d6ce 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -126,7 +126,8 @@
cfg->msg.address_hi = 0xffffffff;
irq = create_irq();
- if (irq < 0) {
+
+ if (irq <= 0) {
kfree(cfg);
return -EBUSY;
}
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 738d4c8..2de5a32 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -1,3 +1,4 @@
+#include <linux/interrupt.h>
#include <linux/dmar.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
@@ -11,41 +12,64 @@
static int ir_ioapic_num;
int intr_remapping_enabled;
-static struct {
+struct irq_2_iommu {
struct intel_iommu *iommu;
u16 irte_index;
u16 sub_handle;
u8 irte_mask;
-} irq_2_iommu[NR_IRQS];
+};
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+ return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+ return irq_2_iommu(irq);
+}
static DEFINE_SPINLOCK(irq_2_ir_lock);
+static struct irq_2_iommu *valid_irq_2_iommu(unsigned int irq)
+{
+ struct irq_2_iommu *irq_iommu;
+
+ irq_iommu = irq_2_iommu(irq);
+
+ if (!irq_iommu)
+ return NULL;
+
+ if (!irq_iommu->iommu)
+ return NULL;
+
+ return irq_iommu;
+}
+
int irq_remapped(int irq)
{
- if (irq > NR_IRQS)
- return 0;
-
- if (!irq_2_iommu[irq].iommu)
- return 0;
-
- return 1;
+ return valid_irq_2_iommu(irq) != NULL;
}
int get_irte(int irq, struct irte *entry)
{
int index;
+ struct irq_2_iommu *irq_iommu;
- if (!entry || irq > NR_IRQS)
+ if (!entry)
return -1;
spin_lock(&irq_2_ir_lock);
- if (!irq_2_iommu[irq].iommu) {
+ irq_iommu = valid_irq_2_iommu(irq);
+ if (!irq_iommu) {
spin_unlock(&irq_2_ir_lock);
return -1;
}
- index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
- *entry = *(irq_2_iommu[irq].iommu->ir_table->base + index);
+ index = irq_iommu->irte_index + irq_iommu->sub_handle;
+ *entry = *(irq_iommu->iommu->ir_table->base + index);
spin_unlock(&irq_2_ir_lock);
return 0;
@@ -54,6 +78,7 @@
int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
{
struct ir_table *table = iommu->ir_table;
+ struct irq_2_iommu *irq_iommu;
u16 index, start_index;
unsigned int mask = 0;
int i;
@@ -61,6 +86,10 @@
if (!count)
return -1;
+ /* protect irq_2_iommu_alloc later */
+ if (irq >= nr_irqs)
+ return -1;
+
/*
* start the IRTE search from index 0.
*/
@@ -100,10 +129,11 @@
for (i = index; i < index + count; i++)
table->base[i].present = 1;
- irq_2_iommu[irq].iommu = iommu;
- irq_2_iommu[irq].irte_index = index;
- irq_2_iommu[irq].sub_handle = 0;
- irq_2_iommu[irq].irte_mask = mask;
+ irq_iommu = irq_2_iommu_alloc(irq);
+ irq_iommu->iommu = iommu;
+ irq_iommu->irte_index = index;
+ irq_iommu->sub_handle = 0;
+ irq_iommu->irte_mask = mask;
spin_unlock(&irq_2_ir_lock);
@@ -124,31 +154,33 @@
int map_irq_to_irte_handle(int irq, u16 *sub_handle)
{
int index;
+ struct irq_2_iommu *irq_iommu;
spin_lock(&irq_2_ir_lock);
- if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+ irq_iommu = valid_irq_2_iommu(irq);
+ if (!irq_iommu) {
spin_unlock(&irq_2_ir_lock);
return -1;
}
- *sub_handle = irq_2_iommu[irq].sub_handle;
- index = irq_2_iommu[irq].irte_index;
+ *sub_handle = irq_iommu->sub_handle;
+ index = irq_iommu->irte_index;
spin_unlock(&irq_2_ir_lock);
return index;
}
int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
{
- spin_lock(&irq_2_ir_lock);
- if (irq >= NR_IRQS || irq_2_iommu[irq].iommu) {
- spin_unlock(&irq_2_ir_lock);
- return -1;
- }
+ struct irq_2_iommu *irq_iommu;
- irq_2_iommu[irq].iommu = iommu;
- irq_2_iommu[irq].irte_index = index;
- irq_2_iommu[irq].sub_handle = subhandle;
- irq_2_iommu[irq].irte_mask = 0;
+ spin_lock(&irq_2_ir_lock);
+
+ irq_iommu = irq_2_iommu_alloc(irq);
+
+ irq_iommu->iommu = iommu;
+ irq_iommu->irte_index = index;
+ irq_iommu->sub_handle = subhandle;
+ irq_iommu->irte_mask = 0;
spin_unlock(&irq_2_ir_lock);
@@ -157,16 +189,19 @@
int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
{
+ struct irq_2_iommu *irq_iommu;
+
spin_lock(&irq_2_ir_lock);
- if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+ irq_iommu = valid_irq_2_iommu(irq);
+ if (!irq_iommu) {
spin_unlock(&irq_2_ir_lock);
return -1;
}
- irq_2_iommu[irq].iommu = NULL;
- irq_2_iommu[irq].irte_index = 0;
- irq_2_iommu[irq].sub_handle = 0;
- irq_2_iommu[irq].irte_mask = 0;
+ irq_iommu->iommu = NULL;
+ irq_iommu->irte_index = 0;
+ irq_iommu->sub_handle = 0;
+ irq_2_iommu(irq)->irte_mask = 0;
spin_unlock(&irq_2_ir_lock);
@@ -178,16 +213,18 @@
int index;
struct irte *irte;
struct intel_iommu *iommu;
+ struct irq_2_iommu *irq_iommu;
spin_lock(&irq_2_ir_lock);
- if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+ irq_iommu = valid_irq_2_iommu(irq);
+ if (!irq_iommu) {
spin_unlock(&irq_2_ir_lock);
return -1;
}
- iommu = irq_2_iommu[irq].iommu;
+ iommu = irq_iommu->iommu;
- index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+ index = irq_iommu->irte_index + irq_iommu->sub_handle;
irte = &iommu->ir_table->base[index];
set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1));
@@ -203,18 +240,20 @@
{
int index;
struct intel_iommu *iommu;
+ struct irq_2_iommu *irq_iommu;
spin_lock(&irq_2_ir_lock);
- if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+ irq_iommu = valid_irq_2_iommu(irq);
+ if (!irq_iommu) {
spin_unlock(&irq_2_ir_lock);
return -1;
}
- iommu = irq_2_iommu[irq].iommu;
+ iommu = irq_iommu->iommu;
- index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+ index = irq_iommu->irte_index + irq_iommu->sub_handle;
- qi_flush_iec(iommu, index, irq_2_iommu[irq].irte_mask);
+ qi_flush_iec(iommu, index, irq_iommu->irte_mask);
spin_unlock(&irq_2_ir_lock);
return 0;
@@ -246,28 +285,30 @@
int index, i;
struct irte *irte;
struct intel_iommu *iommu;
+ struct irq_2_iommu *irq_iommu;
spin_lock(&irq_2_ir_lock);
- if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) {
+ irq_iommu = valid_irq_2_iommu(irq);
+ if (!irq_iommu) {
spin_unlock(&irq_2_ir_lock);
return -1;
}
- iommu = irq_2_iommu[irq].iommu;
+ iommu = irq_iommu->iommu;
- index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+ index = irq_iommu->irte_index + irq_iommu->sub_handle;
irte = &iommu->ir_table->base[index];
- if (!irq_2_iommu[irq].sub_handle) {
- for (i = 0; i < (1 << irq_2_iommu[irq].irte_mask); i++)
+ if (!irq_iommu->sub_handle) {
+ for (i = 0; i < (1 << irq_iommu->irte_mask); i++)
set_64bit((unsigned long *)irte, 0);
- qi_flush_iec(iommu, index, irq_2_iommu[irq].irte_mask);
+ qi_flush_iec(iommu, index, irq_iommu->irte_mask);
}
- irq_2_iommu[irq].iommu = NULL;
- irq_2_iommu[irq].irte_index = 0;
- irq_2_iommu[irq].sub_handle = 0;
- irq_2_iommu[irq].irte_mask = 0;
+ irq_iommu->iommu = NULL;
+ irq_iommu->irte_index = 0;
+ irq_iommu->sub_handle = 0;
+ irq_iommu->irte_mask = 0;
spin_unlock(&irq_2_ir_lock);
diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c
index a0ffb8e..9e1140f 100644
--- a/drivers/pcmcia/at91_cf.c
+++ b/drivers/pcmcia/at91_cf.c
@@ -273,7 +273,7 @@
goto fail0d;
cf->socket.pci_irq = board->irq_pin;
} else
- cf->socket.pci_irq = NR_IRQS + 1;
+ cf->socket.pci_irq = nr_irqs + 1;
/* pcmcia layer only remaps "real" memory not iospace */
cf->socket.io_offset = (unsigned long)
diff --git a/drivers/pcmcia/hd64465_ss.c b/drivers/pcmcia/hd64465_ss.c
index 117dc12..9ef69cd 100644
--- a/drivers/pcmcia/hd64465_ss.c
+++ b/drivers/pcmcia/hd64465_ss.c
@@ -233,15 +233,18 @@
*/
static void hs_map_irq(hs_socket_t *sp, unsigned int irq)
{
+ struct irq_desc *desc;
+
DPRINTK("hs_map_irq(sock=%d irq=%d)\n", sp->number, irq);
if (irq >= HS_NUM_MAPPED_IRQS)
return;
+ desc = irq_to_desc(irq);
hs_mapped_irq[irq].sock = sp;
/* insert ourselves as the irq controller */
- hs_mapped_irq[irq].old_handler = irq_desc[irq].chip;
- irq_desc[irq].chip = &hd64465_ss_irq_type;
+ hs_mapped_irq[irq].old_handler = desc->chip;
+ desc->chip = &hd64465_ss_irq_type;
}
@@ -250,13 +253,16 @@
*/
static void hs_unmap_irq(hs_socket_t *sp, unsigned int irq)
{
+ struct irq_desc *desc;
+
DPRINTK("hs_unmap_irq(sock=%d irq=%d)\n", sp->number, irq);
if (irq >= HS_NUM_MAPPED_IRQS)
return;
+ desc = irq_to_desc(irq);
/* restore the original irq controller */
- irq_desc[irq].chip = hs_mapped_irq[irq].old_handler;
+ desc->chip = hs_mapped_irq[irq].old_handler;
}
/*============================================================*/
diff --git a/drivers/pcmcia/vrc4171_card.c b/drivers/pcmcia/vrc4171_card.c
index eee2f1c..b2c4124 100644
--- a/drivers/pcmcia/vrc4171_card.c
+++ b/drivers/pcmcia/vrc4171_card.c
@@ -639,7 +639,7 @@
int irq;
options += 4;
irq = simple_strtoul(options, &options, 0);
- if (irq >= 0 && irq < NR_IRQS)
+ if (irq >= 0 && irq < nr_irqs)
vrc4171_irq = irq;
if (*options != ',')
diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c
index 884b635..834dcc6 100644
--- a/drivers/rtc/rtc-vr41xx.c
+++ b/drivers/rtc/rtc-vr41xx.c
@@ -360,7 +360,7 @@
spin_unlock_irq(&rtc_lock);
aie_irq = platform_get_irq(pdev, 0);
- if (aie_irq < 0 || aie_irq >= NR_IRQS) {
+ if (aie_irq < 0 || aie_irq >= nr_irqs) {
retval = -EBUSY;
goto err_device_unregister;
}
@@ -371,7 +371,7 @@
goto err_device_unregister;
pie_irq = platform_get_irq(pdev, 1);
- if (pie_irq < 0 || pie_irq >= NR_IRQS)
+ if (pie_irq < 0 || pie_irq >= nr_irqs)
goto err_free_irq;
retval = request_irq(pie_irq, rtclong1_interrupt, IRQF_DISABLED,
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index b5a868d..1e5478a 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -337,7 +337,7 @@
#else
#define IRQ_MIN 9
#if defined(__PPC)
-#define IRQ_MAX (NR_IRQS-1)
+#define IRQ_MAX (nr_irqs-1)
#else
#define IRQ_MAX 12
#endif
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 740bad4..afc96e8 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -343,6 +343,11 @@
}
#ifdef CONFIG_IDE_PROC_FS
+static ide_proc_entry_t idescsi_proc[] = {
+ { "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL },
+ { NULL, 0, NULL, NULL }
+};
+
#define ide_scsi_devset_get(name, field) \
static int get_##name(ide_drive_t *drive) \
{ \
@@ -378,6 +383,16 @@
IDE_PROC_DEVSET(transform, 0, 3),
{ 0 },
};
+
+static ide_proc_entry_t *ide_scsi_proc_entries(ide_drive_t *drive)
+{
+ return idescsi_proc;
+}
+
+static const struct ide_proc_devset *ide_scsi_proc_devsets(ide_drive_t *drive)
+{
+ return idescsi_settings;
+}
#endif
/*
@@ -419,13 +434,6 @@
static int ide_scsi_probe(ide_drive_t *);
-#ifdef CONFIG_IDE_PROC_FS
-static ide_proc_entry_t idescsi_proc[] = {
- { "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL },
- { NULL, 0, NULL, NULL }
-};
-#endif
-
static ide_driver_t idescsi_driver = {
.gen_driver = {
.owner = THIS_MODULE,
@@ -439,8 +447,8 @@
.end_request = idescsi_end_request,
.error = idescsi_atapi_error,
#ifdef CONFIG_IDE_PROC_FS
- .proc = idescsi_proc,
- .settings = idescsi_settings,
+ .proc_entries = ide_scsi_proc_entries,
+ .proc_devsets = ide_scsi_proc_devsets,
#endif
};
diff --git a/drivers/serial/68328serial.c b/drivers/serial/68328serial.c
index 381b12a..d935b2d 100644
--- a/drivers/serial/68328serial.c
+++ b/drivers/serial/68328serial.c
@@ -66,7 +66,6 @@
#endif
static struct m68k_serial m68k_soft[NR_PORTS];
-struct m68k_serial *IRQ_ports[NR_IRQS];
static unsigned int uart_irqs[NR_PORTS] = UART_IRQ_DEFNS;
@@ -375,15 +374,11 @@
*/
irqreturn_t rs_interrupt(int irq, void *dev_id)
{
- struct m68k_serial * info;
+ struct m68k_serial *info = dev_id;
m68328_uart *uart;
unsigned short rx;
unsigned short tx;
- info = IRQ_ports[irq];
- if(!info)
- return IRQ_NONE;
-
uart = &uart_addr[info->line];
rx = uart->urx.w;
@@ -1383,8 +1378,6 @@
info->port, info->irq);
printk(" is a builtin MC68328 UART\n");
- IRQ_ports[info->irq] = info; /* waste of space */
-
#ifdef CONFIG_M68VZ328
if (i > 0 )
PJSEL &= 0xCF; /* PSW enable second port output */
@@ -1393,7 +1386,7 @@
if (request_irq(uart_irqs[i],
rs_interrupt,
IRQF_DISABLED,
- "M68328_UART", NULL))
+ "M68328_UART", info))
panic("Unable to attach 68328 serial interrupt\n");
}
local_irq_restore(flags);
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 1528de2..303272a 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -156,11 +156,15 @@
};
struct irq_info {
- spinlock_t lock;
+ struct hlist_node node;
+ int irq;
+ spinlock_t lock; /* Protects list not the hash */
struct list_head *head;
};
-static struct irq_info irq_lists[NR_IRQS];
+#define NR_IRQ_HASH 32 /* Can be adjusted later */
+static struct hlist_head irq_lists[NR_IRQ_HASH];
+static DEFINE_MUTEX(hash_mutex); /* Used to walk the hash */
/*
* Here we define the default xmit fifo size used for each type of UART.
@@ -1545,15 +1549,43 @@
BUG_ON(i->head != &up->list);
i->head = NULL;
}
-
spin_unlock_irq(&i->lock);
+ /* List empty so throw away the hash node */
+ if (i->head == NULL) {
+ hlist_del(&i->node);
+ kfree(i);
+ }
}
static int serial_link_irq_chain(struct uart_8250_port *up)
{
- struct irq_info *i = irq_lists + up->port.irq;
+ struct hlist_head *h;
+ struct hlist_node *n;
+ struct irq_info *i;
int ret, irq_flags = up->port.flags & UPF_SHARE_IRQ ? IRQF_SHARED : 0;
+ mutex_lock(&hash_mutex);
+
+ h = &irq_lists[up->port.irq % NR_IRQ_HASH];
+
+ hlist_for_each(n, h) {
+ i = hlist_entry(n, struct irq_info, node);
+ if (i->irq == up->port.irq)
+ break;
+ }
+
+ if (n == NULL) {
+ i = kzalloc(sizeof(struct irq_info), GFP_KERNEL);
+ if (i == NULL) {
+ mutex_unlock(&hash_mutex);
+ return -ENOMEM;
+ }
+ spin_lock_init(&i->lock);
+ i->irq = up->port.irq;
+ hlist_add_head(&i->node, h);
+ }
+ mutex_unlock(&hash_mutex);
+
spin_lock_irq(&i->lock);
if (i->head) {
@@ -1577,14 +1609,28 @@
static void serial_unlink_irq_chain(struct uart_8250_port *up)
{
- struct irq_info *i = irq_lists + up->port.irq;
+ struct irq_info *i;
+ struct hlist_node *n;
+ struct hlist_head *h;
+ mutex_lock(&hash_mutex);
+
+ h = &irq_lists[up->port.irq % NR_IRQ_HASH];
+
+ hlist_for_each(n, h) {
+ i = hlist_entry(n, struct irq_info, node);
+ if (i->irq == up->port.irq)
+ break;
+ }
+
+ BUG_ON(n == NULL);
BUG_ON(i->head == NULL);
if (list_empty(i->head))
free_irq(up->port.irq, i);
serial_do_unlink(i, up);
+ mutex_unlock(&hash_mutex);
}
/* Base timer interval for polling */
@@ -2447,7 +2493,7 @@
static int
serial8250_verify_port(struct uart_port *port, struct serial_struct *ser)
{
- if (ser->irq >= NR_IRQS || ser->irq < 0 ||
+ if (ser->irq >= nr_irqs || ser->irq < 0 ||
ser->baud_base < 9600 || ser->type < PORT_UNKNOWN ||
ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS ||
ser->type == PORT_STARTECH)
@@ -2967,7 +3013,7 @@
static int __init serial8250_init(void)
{
- int ret, i;
+ int ret;
if (nr_uarts > UART_NR)
nr_uarts = UART_NR;
@@ -2976,9 +3022,6 @@
"%d ports, IRQ sharing %sabled\n", nr_uarts,
share_irqs ? "en" : "dis");
- for (i = 0; i < NR_IRQS; i++)
- spin_lock_init(&irq_lists[i].lock);
-
#ifdef CONFIG_SPARC
ret = sunserial_register_minors(&serial8250_reg, UART_NR);
#else
@@ -3006,15 +3049,15 @@
goto out;
platform_device_del(serial8250_isa_devs);
- put_dev:
+put_dev:
platform_device_put(serial8250_isa_devs);
- unreg_uart_drv:
+unreg_uart_drv:
#ifdef CONFIG_SPARC
sunserial_unregister_minors(&serial8250_reg, UART_NR);
#else
uart_unregister_driver(&serial8250_reg);
#endif
- out:
+out:
return ret;
}
diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index 90b56c2..7156268 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -512,7 +512,7 @@
int ret = 0;
if (ser->type != PORT_UNKNOWN && ser->type != PORT_AMBA)
ret = -EINVAL;
- if (ser->irq < 0 || ser->irq >= NR_IRQS)
+ if (ser->irq < 0 || ser->irq >= nr_irqs)
ret = -EINVAL;
if (ser->baud_base < 9600)
ret = -EINVAL;
diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index 9d08f27..b718004 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -572,7 +572,7 @@
int ret = 0;
if (ser->type != PORT_UNKNOWN && ser->type != PORT_AMBA)
ret = -EINVAL;
- if (ser->irq < 0 || ser->irq >= NR_IRQS)
+ if (ser->irq < 0 || ser->irq >= nr_irqs)
ret = -EINVAL;
if (ser->baud_base < 9600)
ret = -EINVAL;
diff --git a/drivers/serial/cpm_uart/cpm_uart_core.c b/drivers/serial/cpm_uart/cpm_uart_core.c
index a6c4d74..bde4b4b 100644
--- a/drivers/serial/cpm_uart/cpm_uart_core.c
+++ b/drivers/serial/cpm_uart/cpm_uart_core.c
@@ -623,7 +623,7 @@
if (ser->type != PORT_UNKNOWN && ser->type != PORT_CPM)
ret = -EINVAL;
- if (ser->irq < 0 || ser->irq >= NR_IRQS)
+ if (ser->irq < 0 || ser->irq >= nr_irqs)
ret = -EINVAL;
if (ser->baud_base < 9600)
ret = -EINVAL;
diff --git a/drivers/serial/m32r_sio.c b/drivers/serial/m32r_sio.c
index 23d0305..611c97a 100644
--- a/drivers/serial/m32r_sio.c
+++ b/drivers/serial/m32r_sio.c
@@ -922,7 +922,7 @@
static int
m32r_sio_verify_port(struct uart_port *port, struct serial_struct *ser)
{
- if (ser->irq >= NR_IRQS || ser->irq < 0 ||
+ if (ser->irq >= nr_irqs || ser->irq < 0 ||
ser->baud_base < 9600 || ser->type < PORT_UNKNOWN ||
ser->type >= ARRAY_SIZE(uart_config))
return -EINVAL;
@@ -1162,7 +1162,7 @@
printk(KERN_INFO "Serial: M32R SIO driver\n");
- for (i = 0; i < NR_IRQS; i++)
+ for (i = 0; i < nr_irqs; i++)
spin_lock_init(&irq_lists[i].lock);
ret = uart_register_driver(&m32r_sio_reg);
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 6bdf336..874786a 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -741,7 +741,7 @@
if (port->ops->verify_port)
retval = port->ops->verify_port(port, &new_serial);
- if ((new_serial.irq >= NR_IRQS) || (new_serial.irq < 0) ||
+ if ((new_serial.irq >= nr_irqs) || (new_serial.irq < 0) ||
(new_serial.baud_base < 9600))
retval = -EINVAL;
diff --git a/drivers/serial/serial_lh7a40x.c b/drivers/serial/serial_lh7a40x.c
index cb49a5a..61dc8b3 100644
--- a/drivers/serial/serial_lh7a40x.c
+++ b/drivers/serial/serial_lh7a40x.c
@@ -460,7 +460,7 @@
if (ser->type != PORT_UNKNOWN && ser->type != PORT_LH7A40X)
ret = -EINVAL;
- if (ser->irq < 0 || ser->irq >= NR_IRQS)
+ if (ser->irq < 0 || ser->irq >= nr_irqs)
ret = -EINVAL;
if (ser->baud_base < 9600) /* *** FIXME: is this true? */
ret = -EINVAL;
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 3b9d2d8..f0658d2 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1149,7 +1149,7 @@
{
struct sci_port *s = &sci_ports[port->line];
- if (ser->irq != s->irqs[SCIx_TXI_IRQ] || ser->irq > NR_IRQS)
+ if (ser->irq != s->irqs[SCIx_TXI_IRQ] || ser->irq > nr_irqs)
return -EINVAL;
if (ser->baud_base < 2400)
/* No paper tape reader for Mitch.. */
diff --git a/drivers/serial/ucc_uart.c b/drivers/serial/ucc_uart.c
index 539c933..315a933 100644
--- a/drivers/serial/ucc_uart.c
+++ b/drivers/serial/ucc_uart.c
@@ -1066,7 +1066,7 @@
if (ser->type != PORT_UNKNOWN && ser->type != PORT_CPM)
return -EINVAL;
- if (ser->irq < 0 || ser->irq >= NR_IRQS)
+ if (ser->irq < 0 || ser->irq >= nr_irqs)
return -EINVAL;
if (ser->baud_base < 9600)
diff --git a/drivers/video/imacfb.c b/drivers/video/imacfb.c
deleted file mode 100644
index e69de29..0000000
--- a/drivers/video/imacfb.c
+++ /dev/null
diff --git a/drivers/watchdog/ib700wdt.c b/drivers/watchdog/ib700wdt.c
index 05a2810..8782ec1 100644
--- a/drivers/watchdog/ib700wdt.c
+++ b/drivers/watchdog/ib700wdt.c
@@ -154,7 +154,7 @@
return -EINVAL;
for (i = 0x0F; i > -1; i--)
- if (wd_times[i] > t)
+ if (wd_times[i] >= t)
break;
wd_margin = i;
return 0;
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index c3290bc..9ce1ab6 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -125,7 +125,7 @@
BUG_ON(irq == -1);
#ifdef CONFIG_SMP
- irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+ irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu);
#endif
__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
@@ -137,10 +137,12 @@
static void init_evtchn_cpu_bindings(void)
{
#ifdef CONFIG_SMP
+ struct irq_desc *desc;
int i;
+
/* By default all event channels notify CPU#0. */
- for (i = 0; i < NR_IRQS; i++)
- irq_desc[i].affinity = cpumask_of_cpu(0);
+ for_each_irq_desc(i, desc)
+ desc->affinity = cpumask_of_cpu(0);
#endif
memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -229,12 +231,12 @@
int irq;
/* Only allocate from dynirq range */
- for (irq = 0; irq < NR_IRQS; irq++)
+ for_each_irq_nr(irq)
if (irq_bindcount[irq] == 0)
break;
- if (irq == NR_IRQS)
- panic("No available IRQ to bind to: increase NR_IRQS!\n");
+ if (irq == nr_irqs)
+ panic("No available IRQ to bind to: increase nr_irqs!\n");
return irq;
}
@@ -790,7 +792,7 @@
mask_evtchn(evtchn);
/* No IRQ <-> event-channel mappings. */
- for (irq = 0; irq < NR_IRQS; irq++)
+ for_each_irq_nr(irq)
irq_info[irq].evtchn = 0; /* zap event-channel binding */
for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -822,7 +824,7 @@
mask_evtchn(i);
/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
- for (i = 0; i < NR_IRQS; i++)
+ for_each_irq_nr(i)
irq_bindcount[i] = 0;
irq_ctx_init(smp_processor_id());
diff --git a/fs/Kconfig b/fs/Kconfig
index 4eca61c..e46297f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -6,61 +6,9 @@
if BLOCK
-config EXT2_FS
- tristate "Second extended fs support"
- help
- Ext2 is a standard Linux file system for hard disks.
-
- To compile this file system support as a module, choose M here: the
- module will be called ext2.
-
- If unsure, say Y.
-
-config EXT2_FS_XATTR
- bool "Ext2 extended attributes"
- depends on EXT2_FS
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
-config EXT2_FS_POSIX_ACL
- bool "Ext2 POSIX Access Control Lists"
- depends on EXT2_FS_XATTR
- select FS_POSIX_ACL
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- To learn more about Access Control Lists, visit the Posix ACLs for
- Linux website <http://acl.bestbits.at/>.
-
- If you don't know what Access Control Lists are, say N
-
-config EXT2_FS_SECURITY
- bool "Ext2 Security Labels"
- depends on EXT2_FS_XATTR
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the ext2 filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
- bool "Ext2 execute in place support"
- depends on EXT2_FS && MMU
- help
- Execute in place can be used on memory-backed block devices. If you
- enable this option, you can select to mount block devices which are
- capable of this feature without using the page cache.
-
- If you do not use a block device that is capable of using this,
- or if unsure, say N.
+source "fs/ext2/Kconfig"
+source "fs/ext3/Kconfig"
+source "fs/ext4/Kconfig"
config FS_XIP
# execute in place
@@ -68,218 +16,8 @@
depends on EXT2_FS_XIP
default y
-config EXT3_FS
- tristate "Ext3 journalling file system support"
- select JBD
- help
- This is the journalling version of the Second extended file system
- (often called ext3), the de facto standard Linux file system
- (method to organize files on a storage device) for hard disks.
-
- The journalling code included in this driver means you do not have
- to run e2fsck (file system checker) on your file systems after a
- crash. The journal keeps track of any changes that were being made
- at the time the system crashed, and can ensure that your file system
- is consistent without the need for a lengthy check.
-
- Other than adding the journal to the file system, the on-disk format
- of ext3 is identical to ext2. It is possible to freely switch
- between using the ext3 driver and the ext2 driver, as long as the
- file system has been cleanly unmounted, or e2fsck is run on the file
- system.
-
- To add a journal on an existing ext2 file system or change the
- behavior of ext3 file systems, you can use the tune2fs utility ("man
- tune2fs"). To modify attributes of files and directories on ext3
- file systems, use chattr ("man chattr"). You need to be using
- e2fsprogs version 1.20 or later in order to create ext3 journals
- (available at <http://sourceforge.net/projects/e2fsprogs/>).
-
- To compile this file system support as a module, choose M here: the
- module will be called ext3.
-
-config EXT3_FS_XATTR
- bool "Ext3 extended attributes"
- depends on EXT3_FS
- default y
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
- You need this for POSIX ACL support on ext3.
-
-config EXT3_FS_POSIX_ACL
- bool "Ext3 POSIX Access Control Lists"
- depends on EXT3_FS_XATTR
- select FS_POSIX_ACL
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- To learn more about Access Control Lists, visit the Posix ACLs for
- Linux website <http://acl.bestbits.at/>.
-
- If you don't know what Access Control Lists are, say N
-
-config EXT3_FS_SECURITY
- bool "Ext3 Security Labels"
- depends on EXT3_FS_XATTR
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the ext3 filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
-
-config EXT4_FS
- tristate "The Extended 4 (ext4) filesystem"
- select JBD2
- select CRC16
- help
- This is the next generation of the ext3 filesystem.
-
- Unlike the change from ext2 filesystem to ext3 filesystem,
- the on-disk format of ext4 is not forwards compatible with
- ext3; it is based on extent maps and it supports 48-bit
- physical block numbers. The ext4 filesystem also supports delayed
- allocation, persistent preallocation, high resolution time stamps,
- and a number of other features to improve performance and speed
- up fsck time. For more information, please see the web pages at
- http://ext4.wiki.kernel.org.
-
- The ext4 filesystem will support mounting an ext3
- filesystem; while there will be some performance gains from
- the delayed allocation and inode table readahead, the best
- performance gains will require enabling ext4 features in the
- filesystem, or formating a new filesystem as an ext4
- filesystem initially.
-
- To compile this file system support as a module, choose M here. The
- module will be called ext4.
-
- If unsure, say N.
-
-config EXT4DEV_COMPAT
- bool "Enable ext4dev compatibility"
- depends on EXT4_FS
- help
- Starting with 2.6.28, the name of the ext4 filesystem was
- renamed from ext4dev to ext4. Unfortunately there are some
- legacy userspace programs (such as klibc's fstype) have
- "ext4dev" hardcoded.
-
- To enable backwards compatibility so that systems that are
- still expecting to mount ext4 filesystems using ext4dev,
- chose Y here. This feature will go away by 2.6.31, so
- please arrange to get your userspace programs fixed!
-
-config EXT4_FS_XATTR
- bool "Ext4 extended attributes"
- depends on EXT4_FS
- default y
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
- You need this for POSIX ACL support on ext4.
-
-config EXT4_FS_POSIX_ACL
- bool "Ext4 POSIX Access Control Lists"
- depends on EXT4_FS_XATTR
- select FS_POSIX_ACL
- help
- POSIX Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- To learn more about Access Control Lists, visit the POSIX ACLs for
- Linux website <http://acl.bestbits.at/>.
-
- If you don't know what Access Control Lists are, say N
-
-config EXT4_FS_SECURITY
- bool "Ext4 Security Labels"
- depends on EXT4_FS_XATTR
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the ext4 filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
-
-config JBD
- tristate
- help
- This is a generic journalling layer for block devices. It is
- currently used by the ext3 file system, but it could also be
- used to add journal support to other file systems or block
- devices such as RAID or LVM.
-
- If you are using the ext3 file system, you need to say Y here.
- If you are not using ext3 then you will probably want to say N.
-
- To compile this device as a module, choose M here: the module will be
- called jbd. If you are compiling ext3 into the kernel, you
- cannot compile this code as a module.
-
-config JBD_DEBUG
- bool "JBD (ext3) debugging support"
- depends on JBD && DEBUG_FS
- help
- If you are using the ext3 journaled file system (or potentially any
- other file system/device using JBD), this option allows you to
- enable debugging output while the system is running, in order to
- help track down any problems you are having. By default the
- debugging output will be turned off.
-
- If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
- number between 1 and 5, the higher the number, the more debugging
- output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
-
-config JBD2
- tristate
- select CRC32
- help
- This is a generic journaling layer for block devices that support
- both 32-bit and 64-bit block numbers. It is currently used by
- the ext4 and OCFS2 filesystems, but it could also be used to add
- journal support to other file systems or block devices such
- as RAID or LVM.
-
- If you are using ext4 or OCFS2, you need to say Y here.
- If you are not using ext4 or OCFS2 then you will
- probably want to say N.
-
- To compile this device as a module, choose M here. The module will be
- called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
- you cannot compile this code as a module.
-
-config JBD2_DEBUG
- bool "JBD2 (ext4) debugging support"
- depends on JBD2 && DEBUG_FS
- help
- If you are using the ext4 journaled file system (or
- potentially any other filesystem/device using JBD2), this option
- allows you to enable debugging output while the system is running,
- in order to help track down any problems you are having.
- By default, the debugging output will be turned off.
-
- If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
- number between 1 and 5. The higher the number, the more debugging
- output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+source "fs/jbd/Kconfig"
+source "fs/jbd2/Kconfig"
config FS_MBCACHE
# Meta block cache for Extended Attributes (ext2/ext3/ext4)
@@ -665,7 +403,7 @@
N here.
config FUSE_FS
- tristate "Filesystem in Userspace support"
+ tristate "FUSE (Filesystem in Userspace) support"
help
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e215906..8fcfa39 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1341,20 +1341,15 @@
prstatus->pr_pgrp = task_pgrp_vnr(p);
prstatus->pr_sid = task_session_vnr(p);
if (thread_group_leader(p)) {
+ struct task_cputime cputime;
+
/*
- * This is the record for the group leader. Add in the
- * cumulative times of previous dead threads. This total
- * won't include the time of each live thread whose state
- * is included in the core dump. The final total reported
- * to our parent process when it calls wait4 will include
- * those sums as well as the little bit more time it takes
- * this and each other thread to finish dying after the
- * core dump synchronization phase.
+ * This is the record for the group leader. It shows the
+ * group-wide total, not its individual thread total.
*/
- cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
- &prstatus->pr_utime);
- cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
- &prstatus->pr_stime);
+ thread_group_cputime(p, &cputime);
+ cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+ cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
cputime_to_timeval(p->utime, &prstatus->pr_utime);
cputime_to_timeval(p->stime, &prstatus->pr_stime);
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
new file mode 100644
index 0000000..14a6780
--- /dev/null
+++ b/fs/ext2/Kconfig
@@ -0,0 +1,55 @@
+config EXT2_FS
+ tristate "Second extended fs support"
+ help
+ Ext2 is a standard Linux file system for hard disks.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called ext2.
+
+ If unsure, say Y.
+
+config EXT2_FS_XATTR
+ bool "Ext2 extended attributes"
+ depends on EXT2_FS
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config EXT2_FS_POSIX_ACL
+ bool "Ext2 POSIX Access Control Lists"
+ depends on EXT2_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the Posix ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config EXT2_FS_SECURITY
+ bool "Ext2 Security Labels"
+ depends on EXT2_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext2 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
+
+config EXT2_FS_XIP
+ bool "Ext2 execute in place support"
+ depends on EXT2_FS && MMU
+ help
+ Execute in place can be used on memory-backed block devices. If you
+ enable this option, you can select to mount block devices which are
+ capable of this feature without using the page cache.
+
+ If you do not use a block device that is capable of using this,
+ or if unsure, say N.
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
new file mode 100644
index 0000000..8e0cfe4
--- /dev/null
+++ b/fs/ext3/Kconfig
@@ -0,0 +1,67 @@
+config EXT3_FS
+ tristate "Ext3 journalling file system support"
+ select JBD
+ help
+ This is the journalling version of the Second extended file system
+ (often called ext3), the de facto standard Linux file system
+ (method to organize files on a storage device) for hard disks.
+
+ The journalling code included in this driver means you do not have
+ to run e2fsck (file system checker) on your file systems after a
+ crash. The journal keeps track of any changes that were being made
+ at the time the system crashed, and can ensure that your file system
+ is consistent without the need for a lengthy check.
+
+ Other than adding the journal to the file system, the on-disk format
+ of ext3 is identical to ext2. It is possible to freely switch
+ between using the ext3 driver and the ext2 driver, as long as the
+ file system has been cleanly unmounted, or e2fsck is run on the file
+ system.
+
+ To add a journal on an existing ext2 file system or change the
+ behavior of ext3 file systems, you can use the tune2fs utility ("man
+ tune2fs"). To modify attributes of files and directories on ext3
+ file systems, use chattr ("man chattr"). You need to be using
+ e2fsprogs version 1.20 or later in order to create ext3 journals
+ (available at <http://sourceforge.net/projects/e2fsprogs/>).
+
+ To compile this file system support as a module, choose M here: the
+ module will be called ext3.
+
+config EXT3_FS_XATTR
+ bool "Ext3 extended attributes"
+ depends on EXT3_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+ You need this for POSIX ACL support on ext3.
+
+config EXT3_FS_POSIX_ACL
+ bool "Ext3 POSIX Access Control Lists"
+ depends on EXT3_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the Posix ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config EXT3_FS_SECURITY
+ bool "Ext3 Security Labels"
+ depends on EXT3_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext3 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
new file mode 100644
index 0000000..7505482
--- /dev/null
+++ b/fs/ext4/Kconfig
@@ -0,0 +1,79 @@
+config EXT4_FS
+ tristate "The Extended 4 (ext4) filesystem"
+ select JBD2
+ select CRC16
+ help
+ This is the next generation of the ext3 filesystem.
+
+ Unlike the change from ext2 filesystem to ext3 filesystem,
+ the on-disk format of ext4 is not forwards compatible with
+ ext3; it is based on extent maps and it supports 48-bit
+ physical block numbers. The ext4 filesystem also supports delayed
+ allocation, persistent preallocation, high resolution time stamps,
+ and a number of other features to improve performance and speed
+ up fsck time. For more information, please see the web pages at
+ http://ext4.wiki.kernel.org.
+
+ The ext4 filesystem will support mounting an ext3
+ filesystem; while there will be some performance gains from
+ the delayed allocation and inode table readahead, the best
+ performance gains will require enabling ext4 features in the
+ filesystem, or formatting a new filesystem as an ext4
+ filesystem initially.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called ext4.
+
+ If unsure, say N.
+
+config EXT4DEV_COMPAT
+ bool "Enable ext4dev compatibility"
+ depends on EXT4_FS
+ help
+ Starting with 2.6.28, the name of the ext4 filesystem was
+ renamed from ext4dev to ext4. Unfortunately, some
+ legacy userspace programs (such as klibc's fstype) have
+ "ext4dev" hardcoded.
+
+ To enable backwards compatibility so that systems that are
+ still expecting to mount ext4 filesystems using ext4dev,
+ choose Y here. This feature will go away by 2.6.31, so
+ please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+ bool "Ext4 extended attributes"
+ depends on EXT4_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+ You need this for POSIX ACL support on ext4.
+
+config EXT4_FS_POSIX_ACL
+ bool "Ext4 POSIX Access Control Lists"
+ depends on EXT4_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config EXT4_FS_SECURITY
+ bool "Ext4 Security Labels"
+ depends on EXT4_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext4 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2bada6b..34930a9 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -101,6 +101,8 @@
file->f_op = &fuse_direct_io_file_operations;
if (!(outarg->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
+ if (outarg->open_flags & FOPEN_NONSEEKABLE)
+ nonseekable_open(inode, file);
ff->fh = outarg->fh;
file->private_data = fuse_file_get(ff);
}
@@ -1448,6 +1450,9 @@
mutex_lock(&inode->i_mutex);
switch (origin) {
case SEEK_END:
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (retval)
+ return retval;
offset += i_size_read(inode);
break;
case SEEK_CUR:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 3a87607..35accfd 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -6,6 +6,9 @@
See the file COPYING.
*/
+#ifndef _FS_FUSE_I_H
+#define _FS_FUSE_I_H
+
#include <linux/fuse.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -655,3 +658,5 @@
void fuse_release_nowrite(struct inode *inode);
u64 fuse_get_attr_version(struct fuse_conn *fc);
+
+#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6a84388..54b1f0e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -865,7 +865,7 @@
if (is_bdev) {
fc->destroy_req = fuse_request_alloc();
if (!fc->destroy_req)
- goto err_put_root;
+ goto err_free_init_req;
}
mutex_lock(&fuse_mutex);
@@ -895,6 +895,7 @@
err_unlock:
mutex_unlock(&fuse_mutex);
+ err_free_init_req:
fuse_request_free(init_req);
err_put_root:
dput(root_dentry);
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
new file mode 100644
index 0000000..4e28bee
--- /dev/null
+++ b/fs/jbd/Kconfig
@@ -0,0 +1,30 @@
+config JBD
+ tristate
+ help
+ This is a generic journalling layer for block devices. It is
+ currently used by the ext3 file system, but it could also be
+ used to add journal support to other file systems or block
+ devices such as RAID or LVM.
+
+ If you are using the ext3 file system, you need to say Y here.
+ If you are not using ext3 then you will probably want to say N.
+
+ To compile this device as a module, choose M here: the module will be
+ called jbd. If you are compiling ext3 into the kernel, you
+ cannot compile this code as a module.
+
+config JBD_DEBUG
+ bool "JBD (ext3) debugging support"
+ depends on JBD && DEBUG_FS
+ help
+ If you are using the ext3 journaled file system (or potentially any
+ other file system/device using JBD), this option allows you to
+ enable debugging output while the system is running, in order to
+ help track down any problems you are having. By default the
+ debugging output will be turned off.
+
+ If you select Y here, then you will be able to turn on debugging
+ with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
+ number between 1 and 5, the higher the number, the more debugging
+ output is generated. To turn debugging off again, do
+ "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
new file mode 100644
index 0000000..f32f346
--- /dev/null
+++ b/fs/jbd2/Kconfig
@@ -0,0 +1,33 @@
+config JBD2
+ tristate
+ select CRC32
+ help
+ This is a generic journaling layer for block devices that support
+ both 32-bit and 64-bit block numbers. It is currently used by
+ the ext4 and OCFS2 filesystems, but it could also be used to add
+ journal support to other file systems or block devices such
+ as RAID or LVM.
+
+ If you are using ext4 or OCFS2, you need to say Y here.
+ If you are not using ext4 or OCFS2 then you will
+ probably want to say N.
+
+ To compile this device as a module, choose M here. The module will be
+ called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
+ you cannot compile this code as a module.
+
+config JBD2_DEBUG
+ bool "JBD2 (ext4) debugging support"
+ depends on JBD2 && DEBUG_FS
+ help
+ If you are using the ext4 journaled file system (or
+ potentially any other filesystem/device using JBD2), this option
+ allows you to enable debugging output while the system is running,
+ in order to help track down any problems you are having.
+ By default, the debugging output will be turned off.
+
+ If you select Y here, then you will be able to turn on debugging
+ with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+ number between 1 and 5. The higher the number, the more debugging
+ output is generated. To turn debugging off again, do
+ "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
diff --git a/fs/proc/array.c b/fs/proc/array.c
index f4bc0e7..bb9f4b0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -388,20 +388,20 @@
/* add up live thread stats at the group level */
if (whole) {
+ struct task_cputime cputime;
struct task_struct *t = task;
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- utime = cputime_add(utime, task_utime(t));
- stime = cputime_add(stime, task_stime(t));
gtime = cputime_add(gtime, task_gtime(t));
t = next_thread(t);
} while (t != task);
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- utime = cputime_add(utime, sig->utime);
- stime = cputime_add(stime, sig->stime);
+ thread_group_cputime(task, &cputime);
+ utime = cputime.utime;
+ stime = cputime.stime;
gtime = cputime_add(gtime, sig->gtime);
}
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 61b25f4..7ea52c7 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -30,6 +30,7 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
+#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/swap.h>
#include <linux/slab.h>
@@ -521,17 +522,13 @@
static int show_stat(struct seq_file *p, void *v)
{
- int i;
+ int i, j;
unsigned long jif;
cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
cputime64_t guest;
u64 sum = 0;
struct timespec boottime;
- unsigned int *per_irq_sum;
-
- per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
- if (!per_irq_sum)
- return -ENOMEM;
+ unsigned int per_irq_sum;
user = nice = system = idle = iowait =
irq = softirq = steal = cputime64_zero;
@@ -540,8 +537,6 @@
jif = boottime.tv_sec;
for_each_possible_cpu(i) {
- int j;
-
user = cputime64_add(user, kstat_cpu(i).cpustat.user);
nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
system = cputime64_add(system, kstat_cpu(i).cpustat.system);
@@ -551,11 +546,10 @@
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
- for (j = 0; j < NR_IRQS; j++) {
- unsigned int temp = kstat_cpu(i).irqs[j];
- sum += temp;
- per_irq_sum[j] += temp;
- }
+
+ for_each_irq_nr(j)
+ sum += kstat_irqs_cpu(j, i);
+
sum += arch_irq_stat_cpu(i);
}
sum += arch_irq_stat();
@@ -597,8 +591,15 @@
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
- for (i = 0; i < NR_IRQS; i++)
- seq_printf(p, " %u", per_irq_sum[i]);
+ /* sum again ? it could be updated? */
+ for_each_irq_nr(j) {
+ per_irq_sum = 0;
+
+ for_each_possible_cpu(i)
+ per_irq_sum += kstat_irqs_cpu(j, i);
+
+ seq_printf(p, " %u", per_irq_sum);
+ }
seq_printf(p,
"\nctxt %llu\n"
@@ -612,7 +613,6 @@
nr_running(),
nr_iowait());
- kfree(per_irq_sum);
return 0;
}
@@ -651,15 +651,14 @@
*/
static void *int_seq_start(struct seq_file *f, loff_t *pos)
{
- return (*pos <= NR_IRQS) ? pos : NULL;
+ return (*pos <= nr_irqs) ? pos : NULL;
}
+
static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
- if (*pos > NR_IRQS)
- return NULL;
- return pos;
+ return (*pos <= nr_irqs) ? pos : NULL;
}
static void int_seq_stop(struct seq_file *f, void *v)
@@ -667,7 +666,6 @@
/* Nothing to do */
}
-
static const struct seq_operations int_seq_ops = {
.start = int_seq_start,
.next = int_seq_next,
diff --git a/include/asm-frv/ide.h b/include/asm-frv/ide.h
index 7ebcc56..3610766 100644
--- a/include/asm-frv/ide.h
+++ b/include/asm-frv/ide.h
@@ -18,15 +18,7 @@
#include <asm/io.h>
#include <asm/irq.h>
-/****************************************************************************/
-/*
- * some bits needed for parts of the IDE subsystem to compile
- */
-#define __ide_mm_insw(port, addr, n) insw((unsigned long) (port), addr, n)
-#define __ide_mm_insl(port, addr, n) insl((unsigned long) (port), addr, n)
-#define __ide_mm_outsw(port, addr, n) outsw((unsigned long) (port), addr, n)
-#define __ide_mm_outsl(port, addr, n) outsl((unsigned long) (port), addr, n)
-
+#include <asm-generic/ide_iops.h>
#endif /* __KERNEL__ */
#endif /* _ASM_IDE_H */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 74c5faf..8074460 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -37,6 +37,13 @@
#define MEM_DISCARD(sec) *(.mem##sec)
#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+#define MCOUNT_REC() VMLINUX_SYMBOL(__start_mcount_loc) = .; \
+ *(__mcount_loc) \
+ VMLINUX_SYMBOL(__stop_mcount_loc) = .;
+#else
+#define MCOUNT_REC()
+#endif
/* .data section */
#define DATA_DATA \
@@ -52,7 +59,10 @@
. = ALIGN(8); \
VMLINUX_SYMBOL(__start___markers) = .; \
*(__markers) \
- VMLINUX_SYMBOL(__stop___markers) = .;
+ VMLINUX_SYMBOL(__stop___markers) = .; \
+ VMLINUX_SYMBOL(__start___tracepoints) = .; \
+ *(__tracepoints) \
+ VMLINUX_SYMBOL(__stop___tracepoints) = .;
#define RO_DATA(align) \
. = ALIGN((align)); \
@@ -61,6 +71,7 @@
*(.rodata) *(.rodata.*) \
*(__vermagic) /* Kernel version magic */ \
*(__markers_strings) /* Markers: strings */ \
+ *(__tracepoints_strings)/* Tracepoints: strings */ \
} \
\
.rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \
@@ -188,6 +199,7 @@
/* __*init sections */ \
__init_rodata : AT(ADDR(__init_rodata) - LOAD_OFFSET) { \
*(.ref.rodata) \
+ MCOUNT_REC() \
DEV_KEEP(init.rodata) \
DEV_KEEP(exit.rodata) \
CPU_KEEP(init.rodata) \
diff --git a/include/asm-m68k/ide.h b/include/asm-m68k/ide.h
index 1daf6cb..b996a3c 100644
--- a/include/asm-m68k/ide.h
+++ b/include/asm-m68k/ide.h
@@ -92,15 +92,6 @@
#define outsw_swapw(port, addr, n) raw_outsw_swapw((u16 *)port, addr, n)
#endif
-
-/* Q40 and Atari have byteswapped IDE busses and since many interesting
- * values in the identification string are text, chars and words they
- * happened to be almost correct without swapping.. However *_capacity
- * is needed for drives over 8 GB. RZ */
-#if defined(CONFIG_Q40) || defined(CONFIG_ATARI)
-#define M68K_IDE_SWAPW (MACH_IS_Q40 || MACH_IS_ATARI)
-#endif
-
#ifdef CONFIG_BLK_DEV_FALCON_IDE
#define IDE_ARCH_LOCK
diff --git a/include/asm-parisc/ide.h b/include/asm-parisc/ide.h
index c246ef7..81700a2 100644
--- a/include/asm-parisc/ide.h
+++ b/include/asm-parisc/ide.h
@@ -13,10 +13,6 @@
#ifdef __KERNEL__
-#define ide_request_irq(irq,hand,flg,dev,id) request_irq((irq),(hand),(flg),(dev),(id))
-#define ide_free_irq(irq,dev_id) free_irq((irq), (dev_id))
-#define ide_request_region(from,extent,name) request_region((from), (extent), (name))
-#define ide_release_region(from,extent) release_region((from), (extent))
/* Generic I/O and MEMIO string operations. */
#define __ide_insw insw
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index d76a083..ef1d72d 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -40,8 +40,6 @@
extern unsigned int apic_verbosity;
extern int local_apic_timer_c2_ok;
-extern int ioapic_force;
-
extern int disable_apic;
/*
* Basic functions accessing APICs.
@@ -100,6 +98,20 @@
extern void enable_x2apic(void);
extern void enable_IR_x2apic(void);
extern void x2apic_icr_write(u32 low, u32 id);
+static inline int x2apic_enabled(void)
+{
+ int msr, msr2;
+
+ if (!cpu_has_x2apic)
+ return 0;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+ if (msr & X2APIC_ENABLE)
+ return 1;
+ return 0;
+}
+#else
+#define x2apic_enabled() 0
#endif
struct apic_ops {
diff --git a/include/asm-x86/bigsmp/apic.h b/include/asm-x86/bigsmp/apic.h
index 0a9cd7c..1d9543b 100644
--- a/include/asm-x86/bigsmp/apic.h
+++ b/include/asm-x86/bigsmp/apic.h
@@ -9,22 +9,17 @@
return (1);
}
-/* Round robin the irqs amoung the online cpus */
static inline cpumask_t target_cpus(void)
{
- static unsigned long cpu = NR_CPUS;
- do {
- if (cpu >= NR_CPUS)
- cpu = first_cpu(cpu_online_map);
- else
- cpu = next_cpu(cpu, cpu_online_map);
- } while (cpu >= NR_CPUS);
- return cpumask_of_cpu(cpu);
+#ifdef CONFIG_SMP
+ return cpu_online_map;
+#else
+ return cpumask_of_cpu(0);
+#endif
}
#undef APIC_DEST_LOGICAL
#define APIC_DEST_LOGICAL 0
-#define TARGET_CPUS (target_cpus())
#define APIC_DFR_VALUE (APIC_DFR_FLAT)
#define INT_DELIVERY_MODE (dest_Fixed)
#define INT_DEST_MODE (0) /* phys delivery to target proc */
diff --git a/include/asm-x86/efi.h b/include/asm-x86/efi.h
index ed2de22..313438e 100644
--- a/include/asm-x86/efi.h
+++ b/include/asm-x86/efi.h
@@ -94,4 +94,17 @@
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
+#ifndef CONFIG_EFI
+/*
+ * If EFI is not configured, have the EFI calls return -ENOSYS.
+ */
+#define efi_call0(_f) (-ENOSYS)
+#define efi_call1(_f, _a1) (-ENOSYS)
+#define efi_call2(_f, _a1, _a2) (-ENOSYS)
+#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS)
+#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
+#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
+#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
+#endif /* CONFIG_EFI */
+
#endif /* ASM_X86__EFI_H */
diff --git a/include/asm-x86/es7000/apic.h b/include/asm-x86/es7000/apic.h
index aae50c2..380f0b4 100644
--- a/include/asm-x86/es7000/apic.h
+++ b/include/asm-x86/es7000/apic.h
@@ -17,7 +17,6 @@
return cpumask_of_cpu(smp_processor_id());
#endif
}
-#define TARGET_CPUS (target_cpus())
#if defined CONFIG_ES7000_CLUSTERED_APIC
#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
@@ -81,7 +80,7 @@
int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
(apic_version[apic] == 0x14) ?
- "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(TARGET_CPUS)[0]);
+ "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
}
static inline int multi_timer_check(int apic, int irq)
diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h
index be0e004..1bb6f9bb 100644
--- a/include/asm-x86/ftrace.h
+++ b/include/asm-x86/ftrace.h
@@ -7,6 +7,16 @@
#ifndef __ASSEMBLY__
extern void mcount(void);
+
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ /*
+ * call mcount is "e8 <4 byte offset>"
+ * The addr points to the 4 byte offset and the caller of this
+ * function wants the pointer to e8. Simply subtract one.
+ */
+ return addr - 1;
+}
#endif
#endif /* CONFIG_FTRACE */
diff --git a/include/asm-x86/genapic_32.h b/include/asm-x86/genapic_32.h
index 34280f0..6fe4f81 100644
--- a/include/asm-x86/genapic_32.h
+++ b/include/asm-x86/genapic_32.h
@@ -57,6 +57,7 @@
unsigned (*get_apic_id)(unsigned long x);
unsigned long apic_id_mask;
unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+ cpumask_t (*vector_allocation_domain)(int cpu);
#ifdef CONFIG_SMP
/* ipi */
@@ -104,6 +105,7 @@
APICFUNC(get_apic_id) \
.apic_id_mask = APIC_ID_MASK, \
APICFUNC(cpu_mask_to_apicid) \
+ APICFUNC(vector_allocation_domain) \
APICFUNC(acpi_madt_oem_check) \
IPIFUNC(send_IPI_mask) \
IPIFUNC(send_IPI_allbutself) \
diff --git a/include/asm-x86/hpet.h b/include/asm-x86/hpet.h
index cbbbb6d..58b273f 100644
--- a/include/asm-x86/hpet.h
+++ b/include/asm-x86/hpet.h
@@ -1,6 +1,8 @@
#ifndef ASM_X86__HPET_H
#define ASM_X86__HPET_H
+#include <linux/msi.h>
+
#ifdef CONFIG_HPET_TIMER
#define HPET_MMAP_SIZE 1024
@@ -10,6 +12,11 @@
#define HPET_CFG 0x010
#define HPET_STATUS 0x020
#define HPET_COUNTER 0x0f0
+
+#define HPET_Tn_CFG(n) (0x100 + 0x20 * n)
+#define HPET_Tn_CMP(n) (0x108 + 0x20 * n)
+#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n)
+
#define HPET_T0_CFG 0x100
#define HPET_T0_CMP 0x108
#define HPET_T0_ROUTE 0x110
@@ -65,6 +72,20 @@
extern unsigned long hpet_readl(unsigned long a);
extern void force_hpet_resume(void);
+extern void hpet_msi_unmask(unsigned int irq);
+extern void hpet_msi_mask(unsigned int irq);
+extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
+extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
+
+#ifdef CONFIG_PCI_MSI
+extern int arch_setup_hpet_msi(unsigned int irq);
+#else
+static inline int arch_setup_hpet_msi(unsigned int irq)
+{
+ return -EINVAL;
+}
+#endif
+
#ifdef CONFIG_HPET_EMULATE_RTC
#include <linux/interrupt.h>
diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h
index 50f6e03..749d042 100644
--- a/include/asm-x86/hw_irq.h
+++ b/include/asm-x86/hw_irq.h
@@ -96,13 +96,8 @@
/* SMP */
extern void smp_apic_timer_interrupt(struct pt_regs *);
-#ifdef CONFIG_X86_32
extern void smp_spurious_interrupt(struct pt_regs *);
extern void smp_error_interrupt(struct pt_regs *);
-#else
-extern asmlinkage void smp_spurious_interrupt(void);
-extern asmlinkage void smp_error_interrupt(void);
-#endif
#ifdef CONFIG_X86_SMP
extern void smp_reschedule_interrupt(struct pt_regs *);
extern void smp_call_function_interrupt(struct pt_regs *);
@@ -115,13 +110,13 @@
#endif
#ifdef CONFIG_X86_32
-extern void (*const interrupt[NR_IRQS])(void);
-#else
-typedef int vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+extern void (*const interrupt[NR_VECTORS])(void);
#endif
-#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_X86_64)
+typedef int vector_irq_t[NR_VECTORS];
+DECLARE_PER_CPU(vector_irq_t, vector_irq);
+
+#ifdef CONFIG_X86_IO_APIC
extern void lock_vector_lock(void);
extern void unlock_vector_lock(void);
extern void __setup_vector_irq(int cpu);
diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h
index 8ec68a5..d35cbd7 100644
--- a/include/asm-x86/io_apic.h
+++ b/include/asm-x86/io_apic.h
@@ -4,6 +4,7 @@
#include <linux/types.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
+#include <asm/irq_vectors.h>
/*
* Intel IO-APIC support for SMP and UP systems.
@@ -87,24 +88,8 @@
mask : 1, /* 0: enabled, 1: disabled */
__reserved_2 : 15;
-#ifdef CONFIG_X86_32
- union {
- struct {
- __u32 __reserved_1 : 24,
- physical_dest : 4,
- __reserved_2 : 4;
- } physical;
-
- struct {
- __u32 __reserved_1 : 24,
- logical_dest : 8;
- } logical;
- } dest;
-#else
__u32 __reserved_3 : 24,
dest : 8;
-#endif
-
} __attribute__ ((packed));
struct IR_IO_APIC_route_entry {
@@ -203,10 +188,17 @@
extern void reinit_intr_remapped_IO_APIC(int);
#endif
+extern int probe_nr_irqs(void);
+
#else /* !CONFIG_X86_IO_APIC */
#define io_apic_assign_pci_irqs 0
static const int timer_through_8259 = 0;
static inline void ioapic_init_mappings(void) { }
+
+static inline int probe_nr_irqs(void)
+{
+ return NR_IRQS;
+}
#endif
#endif /* ASM_X86__IO_APIC_H */
diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h
index c5d2d76..a8d065d 100644
--- a/include/asm-x86/irq_vectors.h
+++ b/include/asm-x86/irq_vectors.h
@@ -19,19 +19,14 @@
/*
* Reserve the lowest usable priority level 0x20 - 0x2f for triggering
- * cleanup after irq migration on 64 bit.
+ * cleanup after irq migration.
*/
#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
/*
- * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit.
- * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit.
+ * Vectors 0x30-0x3f are used for ISA interrupts.
*/
-#ifdef CONFIG_X86_32
-#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR)
-#else
#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
-#endif
#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
@@ -96,11 +91,7 @@
* start at 0x31(0x41) to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
*/
-#ifdef CONFIG_X86_32
-# define FIRST_DEVICE_VECTOR 0x31
-#else
-# define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
-#endif
+#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
#define NR_VECTORS 256
@@ -116,7 +107,6 @@
# else
# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
# endif
-# define NR_IRQ_VECTORS NR_IRQS
#elif !defined(CONFIG_X86_VOYAGER)
@@ -124,23 +114,15 @@
# define NR_IRQS 224
-# if (224 >= 32 * NR_CPUS)
-# define NR_IRQ_VECTORS NR_IRQS
-# else
-# define NR_IRQ_VECTORS (32 * NR_CPUS)
-# endif
-
# else /* IO_APIC || PARAVIRT */
# define NR_IRQS 16
-# define NR_IRQ_VECTORS NR_IRQS
# endif
#else /* !VISWS && !VOYAGER */
# define NR_IRQS 224
-# define NR_IRQ_VECTORS NR_IRQS
#endif /* VISWS */
diff --git a/include/asm-x86/mach-default/entry_arch.h b/include/asm-x86/mach-default/entry_arch.h
index 9283b60..6b1add8 100644
--- a/include/asm-x86/mach-default/entry_arch.h
+++ b/include/asm-x86/mach-default/entry_arch.h
@@ -14,6 +14,7 @@
BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
+BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
#endif
/*
diff --git a/include/asm-x86/mach-default/mach_apic.h b/include/asm-x86/mach-default/mach_apic.h
index 2a330a4..3c66f2c 100644
--- a/include/asm-x86/mach-default/mach_apic.h
+++ b/include/asm-x86/mach-default/mach_apic.h
@@ -85,6 +85,20 @@
return 0;
#endif
}
+
+static inline cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
#endif
static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
@@ -138,6 +152,5 @@
static inline void enable_apic_mode(void)
{
}
-
#endif /* CONFIG_X86_LOCAL_APIC */
#endif /* ASM_X86__MACH_DEFAULT__MACH_APIC_H */
diff --git a/include/asm-x86/mach-generic/irq_vectors_limits.h b/include/asm-x86/mach-generic/irq_vectors_limits.h
deleted file mode 100644
index f7870e1..0000000
--- a/include/asm-x86/mach-generic/irq_vectors_limits.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef ASM_X86__MACH_GENERIC__IRQ_VECTORS_LIMITS_H
-#define ASM_X86__MACH_GENERIC__IRQ_VECTORS_LIMITS_H
-
-/*
- * For Summit or generic (i.e. installer) kernels, we have lots of I/O APICs,
- * even with uni-proc kernels, so use a big array.
- *
- * This value should be the same in both the generic and summit subarches.
- * Change one, change 'em both.
- */
-#define NR_IRQS 224
-#define NR_IRQ_VECTORS 1024
-
-#endif /* ASM_X86__MACH_GENERIC__IRQ_VECTORS_LIMITS_H */
diff --git a/include/asm-x86/mach-generic/mach_apic.h b/include/asm-x86/mach-generic/mach_apic.h
index 5d010c6..5085b52 100644
--- a/include/asm-x86/mach-generic/mach_apic.h
+++ b/include/asm-x86/mach-generic/mach_apic.h
@@ -24,6 +24,7 @@
#define check_phys_apicid_present (genapic->check_phys_apicid_present)
#define check_apicid_used (genapic->check_apicid_used)
#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define vector_allocation_domain (genapic->vector_allocation_domain)
#define enable_apic_mode (genapic->enable_apic_mode)
#define phys_pkg_id (genapic->phys_pkg_id)
diff --git a/include/asm-x86/numaq/apic.h b/include/asm-x86/numaq/apic.h
index a8344ba..0bf2a06 100644
--- a/include/asm-x86/numaq/apic.h
+++ b/include/asm-x86/numaq/apic.h
@@ -12,8 +12,6 @@
return CPU_MASK_ALL;
}
-#define TARGET_CPUS (target_cpus())
-
#define NO_BALANCE_IRQ (1)
#define esr_disable (1)
diff --git a/include/asm-x86/summit/apic.h b/include/asm-x86/summit/apic.h
index 394b00b..9b3070f 100644
--- a/include/asm-x86/summit/apic.h
+++ b/include/asm-x86/summit/apic.h
@@ -22,7 +22,6 @@
*/
return cpumask_of_cpu(0);
}
-#define TARGET_CPUS (target_cpus())
#define INT_DELIVERY_MODE (dest_LowestPrio)
#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
diff --git a/include/asm-x86/summit/irq_vectors_limits.h b/include/asm-x86/summit/irq_vectors_limits.h
deleted file mode 100644
index 890ce3f..0000000
--- a/include/asm-x86/summit/irq_vectors_limits.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _ASM_IRQ_VECTORS_LIMITS_H
-#define _ASM_IRQ_VECTORS_LIMITS_H
-
-/*
- * For Summit or generic (i.e. installer) kernels, we have lots of I/O APICs,
- * even with uni-proc kernels, so use a big array.
- *
- * This value should be the same in both the generic and summit subarches.
- * Change one, change 'em both.
- */
-#define NR_IRQS 224
-#define NR_IRQ_VECTORS 1024
-
-#endif /* _ASM_IRQ_VECTORS_LIMITS_H */
diff --git a/include/asm-x86/uv/bios.h b/include/asm-x86/uv/bios.h
index 7cd6d7e..215f196 100644
--- a/include/asm-x86/uv/bios.h
+++ b/include/asm-x86/uv/bios.h
@@ -2,9 +2,7 @@
#define ASM_X86__UV__BIOS_H
/*
- * BIOS layer definitions.
- *
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * UV BIOS layer definitions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -19,11 +17,43 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
*/
#include <linux/rtc.h>
-#define BIOS_FREQ_BASE 0x01000001
+/*
+ * Values for the BIOS calls. It is passed as the first argument in the
+ * BIOS call. Passing any other value in the first argument will result
+ * in a BIOS_STATUS_UNIMPLEMENTED return status.
+ */
+enum uv_bios_cmd {
+ UV_BIOS_COMMON,
+ UV_BIOS_GET_SN_INFO,
+ UV_BIOS_FREQ_BASE
+};
+
+/*
+ * Status values returned from a BIOS call.
+ */
+enum {
+ BIOS_STATUS_SUCCESS = 0,
+ BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
+ BIOS_STATUS_EINVAL = -EINVAL,
+ BIOS_STATUS_UNAVAIL = -EBUSY
+};
+
+/*
+ * The UV system table describes specific firmware
+ * capabilities available to the Linux kernel at runtime.
+ */
+struct uv_systab {
+ char signature[4]; /* must be "UVST" */
+ u32 revision; /* distinguish different firmware revs */
+ u64 function; /* BIOS runtime callback function ptr */
+};
enum {
BIOS_FREQ_BASE_PLATFORM = 0,
@@ -31,38 +61,34 @@
BIOS_FREQ_BASE_REALTIME_CLOCK = 2
};
-# define BIOS_CALL(result, a0, a1, a2, a3, a4, a5, a6, a7) \
- do { \
- /* XXX - the real call goes here */ \
- result.status = BIOS_STATUS_UNIMPLEMENTED; \
- isrv.v0 = 0; \
- isrv.v1 = 0; \
- } while (0)
-
-enum {
- BIOS_STATUS_SUCCESS = 0,
- BIOS_STATUS_UNIMPLEMENTED = -1,
- BIOS_STATUS_EINVAL = -2,
- BIOS_STATUS_ERROR = -3
+union partition_info_u {
+ u64 val;
+ struct {
+ u64 hub_version : 8,
+ partition_id : 16,
+ coherence_id : 16,
+ region_size : 24;
+ };
};
-struct uv_bios_retval {
- /*
- * A zero status value indicates call completed without error.
- * A negative status value indicates reason of call failure.
- * A positive status value indicates success but an
- * informational value should be printed (e.g., "reboot for
- * change to take effect").
- */
- s64 status;
- u64 v0;
- u64 v1;
- u64 v2;
-};
+/*
+ * bios calls have 6 parameters
+ */
+extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
-extern long
-x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
- unsigned long *drift_info);
-extern const char *x86_bios_strerror(long status);
+extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
+extern s64 uv_bios_freq_base(u64, u64 *);
+
+extern void uv_bios_init(void);
+
+extern int uv_type;
+extern long sn_partition_id;
+extern long uv_coherency_id;
+extern long uv_region_size;
+#define partition_coherence_id() (uv_coherency_id)
+
+extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
#endif /* ASM_X86__UV__BIOS_H */
diff --git a/include/asm-x86/uv/uv_irq.h b/include/asm-x86/uv/uv_irq.h
new file mode 100644
index 0000000..8bf5f32
--- /dev/null
+++ b/include/asm-x86/uv/uv_irq.h
@@ -0,0 +1,36 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ definitions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef ASM_X86__UV__UV_IRQ_H
+#define ASM_X86__UV__UV_IRQ_H
+
+/* If a generic version of this structure gets defined, eliminate this one. */
+struct uv_IO_APIC_route_entry {
+ __u64 vector : 8,
+ delivery_mode : 3,
+ dest_mode : 1,
+ delivery_status : 1,
+ polarity : 1,
+ __reserved_1 : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15,
+ dest : 32;
+};
+
+extern struct irq_chip uv_irq_chip;
+
+extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
+extern void arch_disable_uv_irq(int, unsigned long);
+
+extern int uv_setup_irq(char *, int, int, unsigned long);
+extern void uv_teardown_irq(unsigned int, int, unsigned long);
+
+#endif /* ASM_X86__UV__UV_IRQ_H */
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 55e434f..f88d32f 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -45,7 +45,8 @@
* @read: returns a cycle value
* @mask: bitmask for two's complement
* subtraction of non 64 bit counters
- * @mult: cycle to nanosecond multiplier
+ * @mult: cycle to nanosecond multiplier (adjusted by NTP)
+ * @mult_orig: cycle to nanosecond multiplier (unadjusted by NTP)
* @shift: cycle to nanosecond divisor (power of two)
* @flags: flags describing special properties
* @vread: vsyscall based read
@@ -63,6 +64,7 @@
cycle_t (*read)(void);
cycle_t mask;
u32 mult;
+ u32 mult_orig;
u32 shift;
unsigned long flags;
cycle_t (*vread)(void);
@@ -77,6 +79,7 @@
/* timekeeping specific data, ignore */
cycle_t cycle_interval;
u64 xtime_interval;
+ u32 raw_interval;
/*
* Second part is written at each timer interrupt
* Keep it in a different cache line to dirty no
@@ -85,6 +88,7 @@
cycle_t cycle_last ____cacheline_aligned_in_smp;
u64 xtime_nsec;
s64 error;
+ struct timespec raw_time;
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
/* Watchdog related data, used by the framework */
@@ -201,17 +205,19 @@
{
u64 tmp;
- /* XXX - All of this could use a whole lot of optimization */
+ /* Do the ns -> cycle conversion first, using original mult */
tmp = length_nsec;
tmp <<= c->shift;
- tmp += c->mult/2;
- do_div(tmp, c->mult);
+ tmp += c->mult_orig/2;
+ do_div(tmp, c->mult_orig);
c->cycle_interval = (cycle_t)tmp;
if (c->cycle_interval == 0)
c->cycle_interval = 1;
+ /* Go back from cycles -> shifted ns, this time use ntp adjusted mult */
c->xtime_interval = (u64)c->cycle_interval * c->mult;
+ c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
}
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8322141..98115d9 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -44,6 +44,8 @@
# error Sorry, your compiler is too old/not recognized.
#endif
+#define notrace __attribute__((no_instrument_function))
+
/* Intel compiler defines __GNUC__. So we will overwrite implementations
* coming from above header files here
*/
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index c360c55..f1984fc 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -45,7 +45,6 @@
list_for_each_entry(drhd, &dmar_drhd_units, list)
extern int dmar_table_init(void);
-extern int early_dmar_detect(void);
extern int dmar_dev_scope_init(void);
/* Intel IOMMU detection */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 807373d..bb66feb 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -208,6 +208,9 @@
#define EFI_GLOBAL_VARIABLE_GUID \
EFI_GUID( 0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c )
+#define UV_SYSTEM_TABLE_GUID \
+ EFI_GUID( 0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93 )
+
typedef struct {
efi_guid_t guid;
unsigned long table;
@@ -255,6 +258,7 @@
unsigned long boot_info; /* boot info table */
unsigned long hcdp; /* HCDP table */
unsigned long uga; /* UGA table */
+ unsigned long uv_systab; /* UV system table */
efi_get_time_t *get_time;
efi_set_time_t *set_time;
efi_get_wakeup_time_t *get_wakeup_time;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index bb38406..a3d4615 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1,10 +1,14 @@
#ifndef _LINUX_FTRACE_H
#define _LINUX_FTRACE_H
-#ifdef CONFIG_FTRACE
-
#include <linux/linkage.h>
#include <linux/fs.h>
+#include <linux/ktime.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+#ifdef CONFIG_FTRACE
extern int ftrace_enabled;
extern int
@@ -36,6 +40,7 @@
# define register_ftrace_function(ops) do { } while (0)
# define unregister_ftrace_function(ops) do { } while (0)
# define clear_ftrace_function(ops) do { } while (0)
+static inline void ftrace_kill_atomic(void) { }
#endif /* CONFIG_FTRACE */
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -76,8 +81,10 @@
extern int skip_trace(unsigned long ip);
-void ftrace_disable_daemon(void);
-void ftrace_enable_daemon(void);
+extern void ftrace_release(void *start, unsigned long size);
+
+extern void ftrace_disable_daemon(void);
+extern void ftrace_enable_daemon(void);
#else
# define skip_trace(ip) ({ 0; })
@@ -85,6 +92,7 @@
# define ftrace_set_filter(buf, len, reset) do { } while (0)
# define ftrace_disable_daemon() do { } while (0)
# define ftrace_enable_daemon() do { } while (0)
+static inline void ftrace_release(void *start, unsigned long size) { }
#endif /* CONFIG_DYNAMIC_FTRACE */
/* totally disable ftrace - can not re-enable after this */
@@ -98,9 +106,11 @@
#endif
}
-/* Ftrace disable/restore without lock. Some synchronization mechanism
+/*
+ * Ftrace disable/restore without lock. Some synchronization mechanism
* must be used to prevent ftrace_enabled to be changed between
- * disable/restore. */
+ * disable/restore.
+ */
static inline int __ftrace_enabled_save(void)
{
#ifdef CONFIG_FTRACE
@@ -157,9 +167,71 @@
#ifdef CONFIG_TRACING
extern void
ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+/**
+ * ftrace_printk - printf formatting in the ftrace buffer
+ * @fmt: the printf format for printing
+ *
+ * Note: __ftrace_printk is an internal function for ftrace_printk and
+ * the @ip is passed in via the ftrace_printk macro.
+ *
+ * This function allows a kernel developer to debug fast path sections
+ * that printk is not appropriate for. By scattering in various
+ * printk-like tracing calls in the code, a developer can quickly see
+ * where problems are occurring.
+ *
+ * This is intended as a debugging tool for the developer only.
+ * Please refrain from leaving ftrace_printks scattered around in
+ * your code.
+ */
+# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt)
+extern int
+__ftrace_printk(unsigned long ip, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern void ftrace_dump(void);
#else
static inline void
ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
+static inline int
+ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
+
+static inline int
+ftrace_printk(const char *fmt, ...)
+{
+ return 0;
+}
+static inline void ftrace_dump(void) { }
#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+extern void ftrace_init(void);
+extern void ftrace_init_module(unsigned long *start, unsigned long *end);
+#else
+static inline void ftrace_init(void) { }
+static inline void
+ftrace_init_module(unsigned long *start, unsigned long *end) { }
+#endif
+
+
+struct boot_trace {
+ pid_t caller;
+ char func[KSYM_NAME_LEN];
+ int result;
+ unsigned long long duration; /* usecs */
+ ktime_t calltime;
+ ktime_t rettime;
+};
+
+#ifdef CONFIG_BOOT_TRACER
+extern void trace_boot(struct boot_trace *it, initcall_t fn);
+extern void start_boot_trace(void);
+extern void stop_boot_trace(void);
+#else
+static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
+static inline void start_boot_trace(void) { }
+static inline void stop_boot_trace(void) { }
+#endif
+
+
+
#endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 265635d..350fe97 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -17,8 +17,14 @@
* - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in
* - add blksize field to fuse_attr
* - add file flags field to fuse_read_in and fuse_write_in
+ *
+ * 7.10
+ * - add nonseekable open flag
*/
+#ifndef _LINUX_FUSE_H
+#define _LINUX_FUSE_H
+
#include <asm/types.h>
#include <linux/major.h>
@@ -26,7 +32,7 @@
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 9
+#define FUSE_KERNEL_MINOR_VERSION 10
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -98,9 +104,11 @@
*
* FOPEN_DIRECT_IO: bypass page cache for this open file
* FOPEN_KEEP_CACHE: don't invalidate the data cache on open
+ * FOPEN_NONSEEKABLE: the file is not seekable
*/
#define FOPEN_DIRECT_IO (1 << 0)
#define FOPEN_KEEP_CACHE (1 << 1)
+#define FOPEN_NONSEEKABLE (1 << 2)
/**
* INIT request/reply flags
@@ -409,3 +417,5 @@
#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
#define FUSE_DIRENT_SIZE(d) \
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+
+#endif /* _LINUX_FUSE_H */
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2f245fe..9a4e35c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -125,12 +125,12 @@
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
unsigned long state;
- enum hrtimer_cb_mode cb_mode;
struct list_head cb_entry;
+ enum hrtimer_cb_mode cb_mode;
#ifdef CONFIG_TIMER_STATS
+ int start_pid;
void *start_site;
char start_comm[16];
- int start_pid;
#endif
};
@@ -155,10 +155,8 @@
* @first: pointer to the timer node which expires first
* @resolution: the resolution of the clock, in nanoseconds
* @get_time: function to retrieve the current time of the clock
- * @get_softirq_time: function to retrieve the current time from the softirq
* @softirq_time: the time when running the hrtimer queue in the softirq
* @offset: offset of this clock to the monotonic base
- * @reprogram: function to reprogram the timer event
*/
struct hrtimer_clock_base {
struct hrtimer_cpu_base *cpu_base;
@@ -167,13 +165,9 @@
struct rb_node *first;
ktime_t resolution;
ktime_t (*get_time)(void);
- ktime_t (*get_softirq_time)(void);
ktime_t softirq_time;
#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t offset;
- int (*reprogram)(struct hrtimer *t,
- struct hrtimer_clock_base *b,
- ktime_t n);
#endif
};
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c47e371..89e53cf 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -461,12 +461,26 @@
struct ide_acpi_hwif_link;
#endif
+struct ide_drive_s;
+
+struct ide_disk_ops {
+ int (*check)(struct ide_drive_s *, const char *);
+ int (*get_capacity)(struct ide_drive_s *);
+ void (*setup)(struct ide_drive_s *);
+ void (*flush)(struct ide_drive_s *);
+ int (*init_media)(struct ide_drive_s *, struct gendisk *);
+ int (*set_doorlock)(struct ide_drive_s *, struct gendisk *,
+ int);
+ ide_startstop_t (*do_request)(struct ide_drive_s *, struct request *,
+ sector_t);
+ int (*end_request)(struct ide_drive_s *, int, int);
+ int (*ioctl)(struct ide_drive_s *, struct inode *,
+ struct file *, unsigned int, unsigned long);
+};
+
/* ATAPI device flags */
enum {
IDE_AFLAG_DRQ_INTERRUPT = (1 << 0),
- IDE_AFLAG_MEDIA_CHANGED = (1 << 1),
- /* Drive cannot lock the door. */
- IDE_AFLAG_NO_DOORLOCK = (1 << 2),
/* ide-cd */
/* Drive cannot eject the disc. */
@@ -498,14 +512,10 @@
IDE_AFLAG_LE_SPEED_FIELDS = (1 << 17),
/* ide-floppy */
- /* Format in progress */
- IDE_AFLAG_FORMAT_IN_PROGRESS = (1 << 18),
/* Avoid commands not supported in Clik drive */
IDE_AFLAG_CLIK_DRIVE = (1 << 19),
/* Requires BH algorithm for packets */
IDE_AFLAG_ZIP_DRIVE = (1 << 20),
- /* Write protect */
- IDE_AFLAG_WP = (1 << 21),
/* Supports format progress report */
IDE_AFLAG_SRFP = (1 << 22),
@@ -578,7 +588,11 @@
/* don't unload heads */
IDE_DFLAG_NO_UNLOAD = (1 << 27),
/* heads unloaded, please don't reset port */
- IDE_DFLAG_PARKED = (1 << 28)
+ IDE_DFLAG_PARKED = (1 << 28),
+ IDE_DFLAG_MEDIA_CHANGED = (1 << 29),
+ /* write protect */
+ IDE_DFLAG_WP = (1 << 30),
+ IDE_DFLAG_FORMAT_IN_PROGRESS = (1 << 31),
};
struct ide_drive_s {
@@ -597,6 +611,8 @@
#endif
struct hwif_s *hwif; /* actually (ide_hwif_t *) */
+ const struct ide_disk_ops *disk_ops;
+
unsigned long dev_flags;
unsigned long sleep; /* sleep until this time */
@@ -1123,8 +1139,8 @@
void (*resume)(ide_drive_t *);
void (*shutdown)(ide_drive_t *);
#ifdef CONFIG_IDE_PROC_FS
- ide_proc_entry_t *proc;
- const struct ide_proc_devset *settings;
+ ide_proc_entry_t * (*proc_entries)(ide_drive_t *);
+ const struct ide_proc_devset * (*proc_devsets)(ide_drive_t *);
#endif
};
diff --git a/include/linux/init.h b/include/linux/init.h
index ad63824..0c12646 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -40,7 +40,7 @@
/* These are for everybody (although not all archs will actually
discard it in modules) */
-#define __init __section(.init.text) __cold
+#define __init __section(.init.text) __cold notrace
#define __initdata __section(.init.data)
#define __initconst __section(.init.rodata)
#define __exitdata __section(.exit.data)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 35a61dc..f58a0cf 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -8,6 +8,7 @@
#include <linux/preempt.h>
#include <linux/cpumask.h>
#include <linux/irqreturn.h>
+#include <linux/irqnr.h>
#include <linux/hardirq.h>
#include <linux/sched.h>
#include <linux/irqflags.h>
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8d9411b..d058c57 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -18,6 +18,7 @@
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/irqreturn.h>
+#include <linux/irqnr.h>
#include <linux/errno.h>
#include <asm/irq.h>
@@ -152,6 +153,7 @@
* @name: flow handler name for /proc/interrupts output
*/
struct irq_desc {
+ unsigned int irq;
irq_flow_handler_t handle_irq;
struct irq_chip *chip;
struct msi_desc *msi_desc;
@@ -170,7 +172,7 @@
cpumask_t affinity;
unsigned int cpu;
#endif
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_t pending_mask;
#endif
#ifdef CONFIG_PROC_FS
@@ -179,8 +181,14 @@
const char *name;
} ____cacheline_internodealigned_in_smp;
+
extern struct irq_desc irq_desc[NR_IRQS];
+static inline struct irq_desc *irq_to_desc(unsigned int irq)
+{
+ return (irq < nr_irqs) ? irq_desc + irq : NULL;
+}
+
/*
* Migration helpers for obsolete names, they will go away:
*/
@@ -198,19 +206,15 @@
#ifdef CONFIG_GENERIC_HARDIRQS
-#ifndef handle_dynamic_tick
-# define handle_dynamic_tick(a) do { } while (0)
-#endif
-
#ifdef CONFIG_SMP
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
void set_pending_irq(unsigned int irq, cpumask_t mask);
void move_native_irq(int irq);
void move_masked_irq(int irq);
-#else /* CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE */
+#else /* CONFIG_GENERIC_PENDING_IRQ */
static inline void move_irq(int irq)
{
@@ -237,19 +241,14 @@
#endif /* CONFIG_SMP */
-#ifdef CONFIG_IRQBALANCE
-extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask);
-#else
-static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-}
-#endif
-
extern int no_irq_affinity;
static inline int irq_balancing_disabled(unsigned int irq)
{
- return irq_desc[irq].status & IRQ_NO_BALANCING_MASK;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+ return desc->status & IRQ_NO_BALANCING_MASK;
}
/* Handle irq action chains: */
@@ -279,10 +278,8 @@
* irqchip-style controller then we call the ->handle_irq() handler,
* and it calls __do_IRQ() if it's attached to an irqtype-style controller.
*/
-static inline void generic_handle_irq(unsigned int irq)
+static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
{
- struct irq_desc *desc = irq_desc + irq;
-
#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
desc->handle_irq(irq, desc);
#else
@@ -293,6 +290,11 @@
#endif
}
+static inline void generic_handle_irq(unsigned int irq)
+{
+ generic_handle_irq_desc(irq, irq_to_desc(irq));
+}
+
/* Handling of unhandled and spurious interrupts: */
extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
int action_ret);
@@ -325,7 +327,10 @@
static inline void __set_irq_handler_unlocked(int irq,
irq_flow_handler_t handler)
{
- irq_desc[irq].handle_irq = handler;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+ desc->handle_irq = handler;
}
/*
@@ -353,13 +358,14 @@
extern void set_irq_probe(unsigned int irq);
/* Handle dynamic irq creation and destruction */
+extern unsigned int create_irq_nr(unsigned int irq_want);
extern int create_irq(void);
extern void destroy_irq(unsigned int irq);
/* Test to see if a driver has successfully requested an irq */
static inline int irq_has_action(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
return desc->action != NULL;
}
@@ -374,10 +380,10 @@
extern int set_irq_type(unsigned int irq, unsigned int type);
extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
-#define get_irq_chip(irq) (irq_desc[irq].chip)
-#define get_irq_chip_data(irq) (irq_desc[irq].chip_data)
-#define get_irq_data(irq) (irq_desc[irq].handler_data)
-#define get_irq_msi(irq) (irq_desc[irq].msi_desc)
+#define get_irq_chip(irq) (irq_to_desc(irq)->chip)
+#define get_irq_chip_data(irq) (irq_to_desc(irq)->chip_data)
+#define get_irq_data(irq) (irq_to_desc(irq)->handler_data)
+#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc)
#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
new file mode 100644
index 0000000..3171ddc
--- /dev/null
+++ b/include/linux/irqnr.h
@@ -0,0 +1,24 @@
+#ifndef _LINUX_IRQNR_H
+#define _LINUX_IRQNR_H
+
+#ifndef CONFIG_GENERIC_HARDIRQS
+#include <asm/irq.h>
+# define nr_irqs NR_IRQS
+
+# define for_each_irq_desc(irq, desc) \
+ for (irq = 0; irq < nr_irqs; irq++)
+#else
+extern int nr_irqs;
+
+# define for_each_irq_desc(irq, desc) \
+ for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+
+# define for_each_irq_desc_reverse(irq, desc) \
+ for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 ); \
+ irq > 0; irq--, desc--)
+#endif
+
+#define for_each_irq_nr(irq) \
+ for (irq = 0; irq < nr_irqs; irq++)
+
+#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5a566b7..94d17ff 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -496,4 +496,9 @@
#define NUMA_BUILD 0
#endif
+/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
+#endif
+
#endif
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index cf9f40a..4a145ca 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -39,19 +39,34 @@
extern unsigned long long nr_context_switches(void);
+struct irq_desc;
+
+static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
+ struct irq_desc *desc)
+{
+ kstat_this_cpu.irqs[irq]++;
+}
+
+static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+ return kstat_cpu(cpu).irqs[irq];
+}
+
/*
* Number of interrupts per specific IRQ source, since bootup
*/
-static inline int kstat_irqs(int irq)
+static inline unsigned int kstat_irqs(unsigned int irq)
{
- int cpu, sum = 0;
+ unsigned int sum = 0;
+ int cpu;
for_each_possible_cpu(cpu)
- sum += kstat_cpu(cpu).irqs[irq];
+ sum += kstat_irqs_cpu(irq, cpu);
return sum;
}
+extern unsigned long long task_delta_exec(struct task_struct *);
extern void account_user_time(struct task_struct *, cputime_t);
extern void account_user_time_scaled(struct task_struct *, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0be7795..497b1d1 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -29,6 +29,7 @@
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
*/
+#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/smp.h>
@@ -47,7 +48,7 @@
#define KPROBE_HIT_SSDONE 0x00000008
/* Attach to insert probes on any functions which should be ignored*/
-#define __kprobes __attribute__((__section__(".kprobes.text")))
+#define __kprobes __attribute__((__section__(".kprobes.text"))) notrace
struct kprobe;
struct pt_regs;
@@ -256,7 +257,7 @@
#else /* CONFIG_KPROBES */
-#define __kprobes /**/
+#define __kprobes notrace
struct jprobe;
struct kretprobe;
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 56ba373..9fd1f85 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -4,8 +4,6 @@
#include <linux/compiler.h>
#include <asm/linkage.h>
-#define notrace __attribute__((no_instrument_function))
-
#ifdef __cplusplus
#define CPP_ASMLINKAGE extern "C"
#else
diff --git a/include/linux/marker.h b/include/linux/marker.h
index 1290653..889196c 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -160,4 +160,11 @@
extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
int num);
+/*
+ * marker_synchronize_unregister must be called between the last marker probe
+ * unregistration and the end of module exit to make sure there is no caller
+ * executing a probe when it is freed.
+ */
+#define marker_synchronize_unregister() synchronize_sched()
+
#endif
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 61d19e1..139d7c8 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -34,11 +34,15 @@
/* Called from page fault handler. */
extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
-/* Called from ioremap.c */
#ifdef CONFIG_MMIOTRACE
+/* Called from ioremap.c */
extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
void __iomem *addr);
extern void mmiotrace_iounmap(volatile void __iomem *addr);
+
+/* For anyone to insert markers. Remember trailing newline. */
+extern int mmiotrace_printk(const char *fmt, ...)
+ __attribute__ ((format (printf, 1, 2)));
#else
static inline void mmiotrace_ioremap(resource_size_t offset,
unsigned long size, void __iomem *addr)
@@ -48,15 +52,22 @@
static inline void mmiotrace_iounmap(volatile void __iomem *addr)
{
}
-#endif /* CONFIG_MMIOTRACE_HOOKS */
+
+static inline int mmiotrace_printk(const char *fmt, ...)
+ __attribute__ ((format (printf, 1, 0)));
+
+static inline int mmiotrace_printk(const char *fmt, ...)
+{
+ return 0;
+}
+#endif /* CONFIG_MMIOTRACE */
enum mm_io_opcode {
MMIO_READ = 0x1, /* struct mmiotrace_rw */
MMIO_WRITE = 0x2, /* struct mmiotrace_rw */
MMIO_PROBE = 0x3, /* struct mmiotrace_map */
MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */
- MMIO_MARKER = 0x5, /* raw char data */
- MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */
+ MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */
};
struct mmiotrace_rw {
@@ -81,5 +92,6 @@
extern void disable_mmiotrace(void);
extern void mmio_trace_rw(struct mmiotrace_rw *rw);
extern void mmio_trace_mapping(struct mmiotrace_map *map);
+extern int mmio_trace_printk(const char *fmt, va_list args);
#endif /* MMIOTRACE_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index a41555c..5d2970c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -16,6 +16,7 @@
#include <linux/kobject.h>
#include <linux/moduleparam.h>
#include <linux/marker.h>
+#include <linux/tracepoint.h>
#include <asm/local.h>
#include <asm/module.h>
@@ -331,6 +332,10 @@
struct marker *markers;
unsigned int num_markers;
#endif
+#ifdef CONFIG_TRACEPOINTS
+ struct tracepoint *tracepoints;
+ unsigned int num_tracepoints;
+#endif
#ifdef CONFIG_MODULE_UNLOAD
/* What modules depend on me? */
@@ -453,6 +458,9 @@
extern void module_update_markers(void);
+extern void module_update_tracepoints(void);
+extern int module_get_iter_tracepoints(struct tracepoint_iter *iter);
+
#else /* !CONFIG_MODULES... */
#define EXPORT_SYMBOL(sym)
#define EXPORT_SYMBOL_GPL(sym)
@@ -557,6 +565,15 @@
{
}
+static inline void module_update_tracepoints(void)
+{
+}
+
+static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter)
+{
+ return 0;
+}
+
#endif /* CONFIG_MODULES */
struct device_driver;
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f..a7c7213 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -45,8 +45,6 @@
int it_requeue_pending; /* waiting to requeue this timer */
#define REQUEUE_PENDING 1
int it_sigev_notify; /* notify word of sigevent struct */
- int it_sigev_signo; /* signo word of sigevent struct */
- sigval_t it_sigev_value; /* value word of sigevent struct */
struct task_struct *it_process; /* process to send signal to */
struct sigqueue *sigq; /* signal queue entry. */
union {
@@ -115,4 +113,6 @@
long clock_nanosleep_restart(struct restart_block *restart_block);
+void update_rlimit_cpu(unsigned long rlim_new);
+
#endif
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
new file mode 100644
index 0000000..536b0ca
--- /dev/null
+++ b/include/linux/ring_buffer.h
@@ -0,0 +1,127 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use functions below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+};
+
+/**
+ * enum ring_buffer_type - internal ring buffer types
+ *
+ * @RINGBUF_TYPE_PADDING: Left over page padding
+ * array is ignored
+ * size is variable depending on how much
+ * padding is needed
+ *
+ * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ *
+ * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ *
+ * @RINGBUF_TYPE_DATA: Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4-1] holds data
+ */
+enum ring_buffer_type {
+ RINGBUF_TYPE_PADDING,
+ RINGBUF_TYPE_TIME_EXTEND,
+ /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
+ RINGBUF_TYPE_TIME_STAMP,
+ RINGBUF_TYPE_DATA,
+};
+
+unsigned ring_buffer_event_length(struct ring_buffer_event *event);
+void *ring_buffer_event_data(struct ring_buffer_event *event);
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f52dbd3..5c38db5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -434,6 +434,39 @@
unsigned long ac_minflt, ac_majflt;
};
+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime: time spent in user mode, in &cputime_t units
+ * @stime: time spent in kernel mode, in &cputime_t units
+ * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
+ *
+ * This structure groups together three kinds of CPU time that are
+ * tracked for threads and thread groups. Most things considering
+ * CPU time want to group these counts together and treat all three
+ * of them in parallel.
+ */
+struct task_cputime {
+ cputime_t utime;
+ cputime_t stime;
+ unsigned long long sum_exec_runtime;
+};
+/* Alternate field names when used to cache expirations. */
+#define prof_exp stime
+#define virt_exp utime
+#define sched_exp sum_exec_runtime
+
+/**
+ * struct thread_group_cputime - thread group interval timer counts
+ * @totals: thread group interval timers; substructure for
+ * uniprocessor kernel, per-cpu for SMP kernel.
+ *
+ * This structure contains the version of task_cputime, above, that is
+ * used for thread group CPU clock calculations.
+ */
+struct thread_group_cputime {
+ struct task_cputime *totals;
+};
+
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
@@ -479,6 +512,17 @@
cputime_t it_prof_expires, it_virt_expires;
cputime_t it_prof_incr, it_virt_incr;
+ /*
+ * Thread group totals for process CPU clocks.
+ * See thread_group_cputime(), et al, for details.
+ */
+ struct thread_group_cputime cputime;
+
+ /* Earliest-expiration cache. */
+ struct task_cputime cputime_expires;
+
+ struct list_head cpu_timers[3];
+
/* job control IDs */
/*
@@ -509,7 +553,7 @@
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
*/
- cputime_t utime, stime, cutime, cstime;
+ cputime_t cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -518,14 +562,6 @@
struct task_io_accounting ioac;
/*
- * Cumulative ns of scheduled CPU time for dead threads in the
- * group, not including a zombie group leader. (This only differs
- * from jiffies_to_ns(utime + stime) if sched_clock uses something
- * other than jiffies.)
- */
- unsigned long long sum_sched_runtime;
-
- /*
* We don't bother to synchronize most readers of this at all,
* because there is no reader checking a limit that actually needs
* to get both rlim_cur and rlim_max atomically, and either one
@@ -536,8 +572,6 @@
*/
struct rlimit rlim[RLIM_NLIMITS];
- struct list_head cpu_timers[3];
-
/* keep the process-shared keyrings here so that they do the right
* thing in threads created with CLONE_THREAD */
#ifdef CONFIG_KEYS
@@ -1146,8 +1180,7 @@
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;
- cputime_t it_prof_expires, it_virt_expires;
- unsigned long long it_sched_expires;
+ struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
/* process credentials */
@@ -1597,6 +1630,7 @@
extern unsigned long long
task_sched_runtime(struct task_struct *task);
+extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
@@ -2094,6 +2128,30 @@
}
/*
+ * Thread group CPU time accounting.
+ */
+
+extern int thread_group_cputime_alloc(struct task_struct *);
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+ sig->cputime.totals = NULL;
+}
+
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
+{
+ if (curr->signal->cputime.totals)
+ return 0;
+ return thread_group_cputime_alloc(curr);
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig)
+{
+ free_percpu(sig->cputime.totals);
+}
+
+/*
* Reevaluate whether the task has signals pending delivery.
* Wake the task if so.
* This is required every time the blocked sigset_t changes.
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 98921a3..b6ec818 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -96,9 +96,11 @@
extern void tick_clock_notify(void);
extern int tick_check_oneshot_change(int allow_nohz);
extern struct tick_sched *tick_get_tick_sched(int cpu);
+extern void tick_check_idle(int cpu);
# else
static inline void tick_clock_notify(void) { }
static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline void tick_check_idle(int cpu) { }
# endif
#else /* CONFIG_GENERIC_CLOCKEVENTS */
@@ -106,26 +108,23 @@
static inline void tick_cancel_sched_timer(int cpu) { }
static inline void tick_clock_notify(void) { }
static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline void tick_check_idle(int cpu) { }
#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
# ifdef CONFIG_NO_HZ
extern void tick_nohz_stop_sched_tick(int inidle);
extern void tick_nohz_restart_sched_tick(void);
-extern void tick_nohz_update_jiffies(void);
extern ktime_t tick_nohz_get_sleep_length(void);
-extern void tick_nohz_stop_idle(int cpu);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
# else
static inline void tick_nohz_stop_sched_tick(int inidle) { }
static inline void tick_nohz_restart_sched_tick(void) { }
-static inline void tick_nohz_update_jiffies(void) { }
static inline ktime_t tick_nohz_get_sleep_length(void)
{
ktime_t len = { .tv64 = NSEC_PER_SEC/HZ };
return len;
}
-static inline void tick_nohz_stop_idle(int cpu) { }
static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
# endif /* !NO_HZ */
diff --git a/include/linux/time.h b/include/linux/time.h
index 51e883d..4f1c9db 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -119,6 +119,7 @@
extern unsigned int alarm_setitimer(unsigned int seconds);
extern int do_getitimer(int which, struct itimerval *value);
extern void getnstimeofday(struct timespec *tv);
+extern void getrawmonotonic(struct timespec *ts);
extern void getboottime(struct timespec *ts);
extern void monotonic_to_bootbased(struct timespec *ts);
@@ -127,6 +128,9 @@
extern void update_wall_time(void);
extern void update_xtime_cache(u64 nsec);
+struct tms;
+extern void do_sys_times(struct tms *);
+
/**
* timespec_to_ns - Convert timespec to nanoseconds
* @ts: pointer to the timespec variable to be converted
@@ -216,6 +220,7 @@
#define CLOCK_MONOTONIC 1
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
+#define CLOCK_MONOTONIC_RAW 4
/*
* The IDs of various hardware clocks:
diff --git a/include/linux/timex.h b/include/linux/timex.h
index fc6035d..9007313 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -82,7 +82,7 @@
*/
#define SHIFT_USEC 16 /* frequency offset scale (shift) */
#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
-#define PPM_SCALE_INV_SHIFT 20
+#define PPM_SCALE_INV_SHIFT 19
#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
PPM_SCALE + 1)
@@ -141,8 +141,15 @@
#define ADJ_MICRO 0x1000 /* select microsecond resolution */
#define ADJ_NANO 0x2000 /* select nanosecond resolution */
#define ADJ_TICK 0x4000 /* tick value */
+
+#ifdef __KERNEL__
+#define ADJ_ADJTIME 0x8000 /* switch between adjtime/adjtimex modes */
+#define ADJ_OFFSET_SINGLESHOT 0x0001 /* old-fashioned adjtime */
+#define ADJ_OFFSET_READONLY 0x2000 /* read-only adjtime */
+#else
#define ADJ_OFFSET_SINGLESHOT 0x8001 /* old-fashioned adjtime */
-#define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */
+#define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */
+#endif
/* xntp 3.4 compatibility names */
#define MOD_OFFSET ADJ_OFFSET
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
new file mode 100644
index 0000000..c5bb39c
--- /dev/null
+++ b/include/linux/tracepoint.h
@@ -0,0 +1,137 @@
+#ifndef _LINUX_TRACEPOINT_H
+#define _LINUX_TRACEPOINT_H
+
+/*
+ * Kernel Tracepoint API.
+ *
+ * See Documentation/tracepoint.txt.
+ *
+ * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * Heavily inspired from the Linux Kernel Markers.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+
+struct module;
+struct tracepoint;
+
+struct tracepoint {
+ const char *name; /* Tracepoint name */
+ int state; /* State. */
+ void **funcs;
+} __attribute__((aligned(8)));
+
+
+#define TPPROTO(args...) args
+#define TPARGS(args...) args
+
+#ifdef CONFIG_TRACEPOINTS
+
+/*
+ * it_func[0] is never NULL because there is at least one element in the array
+ * when the array itself is non NULL.
+ */
+#define __DO_TRACE(tp, proto, args) \
+ do { \
+ void **it_func; \
+ \
+ rcu_read_lock_sched(); \
+ it_func = rcu_dereference((tp)->funcs); \
+ if (it_func) { \
+ do { \
+ ((void(*)(proto))(*it_func))(args); \
+ } while (*(++it_func)); \
+ } \
+ rcu_read_unlock_sched(); \
+ } while (0)
+
+/*
+ * Make sure the alignment of the structure in the __tracepoints section will
+ * not add unwanted padding between the beginning of the section and the
+ * structure. Force alignment to the same alignment as the section start.
+ */
+#define DEFINE_TRACE(name, proto, args) \
+ static inline void trace_##name(proto) \
+ { \
+ static const char __tpstrtab_##name[] \
+ __attribute__((section("__tracepoints_strings"))) \
+ = #name ":" #proto; \
+ static struct tracepoint __tracepoint_##name \
+ __attribute__((section("__tracepoints"), aligned(8))) = \
+ { __tpstrtab_##name, 0, NULL }; \
+ if (unlikely(__tracepoint_##name.state)) \
+ __DO_TRACE(&__tracepoint_##name, \
+ TPPROTO(proto), TPARGS(args)); \
+ } \
+ static inline int register_trace_##name(void (*probe)(proto)) \
+ { \
+ return tracepoint_probe_register(#name ":" #proto, \
+ (void *)probe); \
+ } \
+ static inline void unregister_trace_##name(void (*probe)(proto))\
+ { \
+ tracepoint_probe_unregister(#name ":" #proto, \
+ (void *)probe); \
+ }
+
+extern void tracepoint_update_probe_range(struct tracepoint *begin,
+ struct tracepoint *end);
+
+#else /* !CONFIG_TRACEPOINTS */
+#define DEFINE_TRACE(name, proto, args) \
+ static inline void _do_trace_##name(struct tracepoint *tp, proto) \
+ { } \
+ static inline void trace_##name(proto) \
+ { } \
+ static inline int register_trace_##name(void (*probe)(proto)) \
+ { \
+ return -ENOSYS; \
+ } \
+ static inline void unregister_trace_##name(void (*probe)(proto))\
+ { }
+
+static inline void tracepoint_update_probe_range(struct tracepoint *begin,
+ struct tracepoint *end)
+{ }
+#endif /* CONFIG_TRACEPOINTS */
+
+/*
+ * Connect a probe to a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_register(const char *name, void *probe);
+
+/*
+ * Disconnect a probe from a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_unregister(const char *name, void *probe);
+
+struct tracepoint_iter {
+ struct module *module;
+ struct tracepoint *tracepoint;
+};
+
+extern void tracepoint_iter_start(struct tracepoint_iter *iter);
+extern void tracepoint_iter_next(struct tracepoint_iter *iter);
+extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
+extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
+extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
+ struct tracepoint *begin, struct tracepoint *end);
+
+/*
+ * tracepoint_synchronize_unregister must be called between the last tracepoint
+ * probe unregistration and the end of module exit to make sure there is no
+ * caller executing a probe when it is freed.
+ */
+static inline void tracepoint_synchronize_unregister(void)
+{
+ synchronize_sched();
+}
+
+#endif
diff --git a/include/trace/sched.h b/include/trace/sched.h
new file mode 100644
index 0000000..ad47369
--- /dev/null
+++ b/include/trace/sched.h
@@ -0,0 +1,56 @@
+#ifndef _TRACE_SCHED_H
+#define _TRACE_SCHED_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+DEFINE_TRACE(sched_kthread_stop,
+ TPPROTO(struct task_struct *t),
+ TPARGS(t));
+
+DEFINE_TRACE(sched_kthread_stop_ret,
+ TPPROTO(int ret),
+ TPARGS(ret));
+
+DEFINE_TRACE(sched_wait_task,
+ TPPROTO(struct rq *rq, struct task_struct *p),
+ TPARGS(rq, p));
+
+DEFINE_TRACE(sched_wakeup,
+ TPPROTO(struct rq *rq, struct task_struct *p),
+ TPARGS(rq, p));
+
+DEFINE_TRACE(sched_wakeup_new,
+ TPPROTO(struct rq *rq, struct task_struct *p),
+ TPARGS(rq, p));
+
+DEFINE_TRACE(sched_switch,
+ TPPROTO(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next),
+ TPARGS(rq, prev, next));
+
+DEFINE_TRACE(sched_migrate_task,
+ TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu),
+ TPARGS(rq, p, dest_cpu));
+
+DEFINE_TRACE(sched_process_free,
+ TPPROTO(struct task_struct *p),
+ TPARGS(p));
+
+DEFINE_TRACE(sched_process_exit,
+ TPPROTO(struct task_struct *p),
+ TPARGS(p));
+
+DEFINE_TRACE(sched_process_wait,
+ TPPROTO(struct pid *pid),
+ TPARGS(pid));
+
+DEFINE_TRACE(sched_process_fork,
+ TPPROTO(struct task_struct *parent, struct task_struct *child),
+ TPARGS(parent, child));
+
+DEFINE_TRACE(sched_signal_send,
+ TPPROTO(int sig, struct task_struct *p),
+ TPARGS(sig, p));
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 06330a3..113c74c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -794,6 +794,13 @@
Say Y here to enable the extended profiling support mechanisms used
by profilers such as OProfile.
+#
+# Place an empty function call at each tracepoint site. Can be
+# dynamically changed for a probe function.
+#
+config TRACEPOINTS
+ bool
+
config MARKERS
bool "Activate markers"
help
diff --git a/init/main.c b/init/main.c
index 4371d11..3e17a3b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -61,6 +61,7 @@
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/idr.h>
+#include <linux/ftrace.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -689,6 +690,8 @@
acpi_early_init(); /* before LAPIC and SMP init */
+ ftrace_init();
+
/* Do the rest non-__init'ed, we're now alive */
rest_init();
}
@@ -705,30 +708,31 @@
int do_one_initcall(initcall_t fn)
{
int count = preempt_count();
- ktime_t t0, t1, delta;
+ ktime_t delta;
char msgbuf[64];
- int result;
+ struct boot_trace it;
if (initcall_debug) {
- printk("calling %pF @ %i\n", fn, task_pid_nr(current));
- t0 = ktime_get();
+ it.caller = task_pid_nr(current);
+ printk("calling %pF @ %i\n", fn, it.caller);
+ it.calltime = ktime_get();
}
- result = fn();
+ it.result = fn();
if (initcall_debug) {
- t1 = ktime_get();
- delta = ktime_sub(t1, t0);
-
- printk("initcall %pF returned %d after %Ld msecs\n",
- fn, result,
- (unsigned long long) delta.tv64 >> 20);
+ it.rettime = ktime_get();
+ delta = ktime_sub(it.rettime, it.calltime);
+ it.duration = (unsigned long long) delta.tv64 >> 10;
+ printk("initcall %pF returned %d after %Ld usecs\n", fn,
+ it.result, it.duration);
+ trace_boot(&it, fn);
}
msgbuf[0] = 0;
- if (result && result != -ENODEV && initcall_debug)
- sprintf(msgbuf, "error code %d ", result);
+ if (it.result && it.result != -ENODEV && initcall_debug)
+ sprintf(msgbuf, "error code %d ", it.result);
if (preempt_count() != count) {
strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf));
@@ -742,7 +746,7 @@
printk("initcall %pF returned with %s\n", fn, msgbuf);
}
- return result;
+ return it.result;
}
@@ -857,6 +861,7 @@
smp_prepare_cpus(setup_max_cpus);
do_pre_smp_initcalls();
+ start_boot_trace();
smp_init();
sched_init_smp();
@@ -883,6 +888,7 @@
* we're essentially up and running. Get rid of the
* initmem segments and start the user-mode stuff..
*/
+ stop_boot_trace();
init_post();
return 0;
}
diff --git a/kernel/Makefile b/kernel/Makefile
index 066550a..305f11d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -85,6 +85,7 @@
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
obj-$(CONFIG_FTRACE) += trace/
diff --git a/kernel/compat.c b/kernel/compat.c
index 143990e..8eafe3e 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,6 +23,7 @@
#include <linux/timex.h>
#include <linux/migrate.h>
#include <linux/posix-timers.h>
+#include <linux/times.h>
#include <asm/uaccess.h>
@@ -208,49 +209,23 @@
return 0;
}
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+ return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
{
- /*
- * In the SMP world we might just be unlucky and have one of
- * the times increment as we use it. Since the value is an
- * atomically safe type this is just fine. Conceptually its
- * as if the syscall took an instant longer to occur.
- */
if (tbuf) {
+ struct tms tms;
struct compat_tms tmp;
- struct task_struct *tsk = current;
- struct task_struct *t;
- cputime_t utime, stime, cutime, cstime;
- read_lock(&tasklist_lock);
- utime = tsk->signal->utime;
- stime = tsk->signal->stime;
- t = tsk;
- do {
- utime = cputime_add(utime, t->utime);
- stime = cputime_add(stime, t->stime);
- t = next_thread(t);
- } while (t != tsk);
-
- /*
- * While we have tasklist_lock read-locked, no dying thread
- * can be updating current->signal->[us]time. Instead,
- * we got their counts included in the live thread loop.
- * However, another thread can come in right now and
- * do a wait call that updates current->signal->c[us]time.
- * To make sure we always see that pair updated atomically,
- * we take the siglock around fetching them.
- */
- spin_lock_irq(&tsk->sighand->siglock);
- cutime = tsk->signal->cutime;
- cstime = tsk->signal->cstime;
- spin_unlock_irq(&tsk->sighand->siglock);
- read_unlock(&tasklist_lock);
-
- tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
- tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
- tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
- tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
+ do_sys_times(&tms);
+ /* Convert our struct tms to the compat version. */
+ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+ tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+ tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+ tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
return -EFAULT;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 0ef4673..80137a5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,6 +47,7 @@
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
+#include <trace/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -112,8 +113,6 @@
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime = cputime_add(sig->utime, task_utime(tsk));
- sig->stime = cputime_add(sig->stime, task_stime(tsk));
sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
@@ -122,7 +121,6 @@
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
@@ -149,7 +147,10 @@
static void delayed_put_task_struct(struct rcu_head *rhp)
{
- put_task_struct(container_of(rhp, struct task_struct, rcu));
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+ trace_sched_process_free(tsk);
+ put_task_struct(tsk);
}
@@ -1073,6 +1074,8 @@
if (group_dead)
acct_process();
+ trace_sched_process_exit(tsk);
+
exit_sem(tsk);
exit_files(tsk);
exit_fs(tsk);
@@ -1301,6 +1304,7 @@
if (likely(!traced)) {
struct signal_struct *psig;
struct signal_struct *sig;
+ struct task_cputime cputime;
/*
* The resource counters for the group leader are in its
@@ -1316,20 +1320,23 @@
* need to protect the access to p->parent->signal fields,
* as other threads in the parent group can be right
* here reaping other children at the same time.
+ *
+ * We use thread_group_cputime() to get times for the thread
+ * group, which consolidates times for all threads in the
+ * group including the group leader.
*/
spin_lock_irq(&p->parent->sighand->siglock);
psig = p->parent->signal;
sig = p->signal;
+ thread_group_cputime(p, &cputime);
psig->cutime =
cputime_add(psig->cutime,
- cputime_add(p->utime,
- cputime_add(sig->utime,
- sig->cutime)));
+ cputime_add(cputime.utime,
+ sig->cutime));
psig->cstime =
cputime_add(psig->cstime,
- cputime_add(p->stime,
- cputime_add(sig->stime,
- sig->cstime)));
+ cputime_add(cputime.stime,
+ sig->cstime));
psig->cgtime =
cputime_add(psig->cgtime,
cputime_add(p->gtime,
@@ -1674,6 +1681,8 @@
struct task_struct *tsk;
int retval;
+ trace_sched_process_wait(pid);
+
add_wait_queue(¤t->signal->wait_chldexit,&wait);
repeat:
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 30de644..4d09355 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
+#include <trace/sched.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -759,15 +760,44 @@
kmem_cache_free(sighand_cachep, sighand);
}
+
+/*
+ * Initialize POSIX timer handling for a thread group.
+ */
+static void posix_cpu_timers_init_group(struct signal_struct *sig)
+{
+ /* Thread group counters. */
+ thread_group_cputime_init(sig);
+
+ /* Expiration times and increments. */
+ sig->it_virt_expires = cputime_zero;
+ sig->it_virt_incr = cputime_zero;
+ sig->it_prof_expires = cputime_zero;
+ sig->it_prof_incr = cputime_zero;
+
+ /* Cached expiration times. */
+ sig->cputime_expires.prof_exp = cputime_zero;
+ sig->cputime_expires.virt_exp = cputime_zero;
+ sig->cputime_expires.sched_exp = 0;
+
+ /* The timer lists. */
+ INIT_LIST_HEAD(&sig->cpu_timers[0]);
+ INIT_LIST_HEAD(&sig->cpu_timers[1]);
+ INIT_LIST_HEAD(&sig->cpu_timers[2]);
+}
+
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
int ret;
if (clone_flags & CLONE_THREAD) {
- atomic_inc(¤t->signal->count);
- atomic_inc(¤t->signal->live);
- return 0;
+ ret = thread_group_cputime_clone_thread(current);
+ if (likely(!ret)) {
+ atomic_inc(¤t->signal->count);
+ atomic_inc(¤t->signal->live);
+ }
+ return ret;
}
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
@@ -795,40 +825,25 @@
sig->it_real_incr.tv64 = 0;
sig->real_timer.function = it_real_fn;
- sig->it_virt_expires = cputime_zero;
- sig->it_virt_incr = cputime_zero;
- sig->it_prof_expires = cputime_zero;
- sig->it_prof_incr = cputime_zero;
-
sig->leader = 0; /* session leadership doesn't inherit */
sig->tty_old_pgrp = NULL;
sig->tty = NULL;
- sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+ sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
sig->cgtime = cputime_zero;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
task_io_accounting_init(&sig->ioac);
- sig->sum_sched_runtime = 0;
- INIT_LIST_HEAD(&sig->cpu_timers[0]);
- INIT_LIST_HEAD(&sig->cpu_timers[1]);
- INIT_LIST_HEAD(&sig->cpu_timers[2]);
taskstats_tgid_init(sig);
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
task_unlock(current->group_leader);
- if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
- /*
- * New sole thread in the process gets an expiry time
- * of the whole CPU time limit.
- */
- tsk->it_prof_expires =
- secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
- }
+ posix_cpu_timers_init_group(sig);
+
acct_init_pacct(&sig->pacct);
tty_audit_fork(sig);
@@ -838,6 +853,7 @@
void __cleanup_signal(struct signal_struct *sig)
{
+ thread_group_cputime_free(sig);
exit_thread_group_keys(sig);
tty_kref_put(sig->tty);
kmem_cache_free(signal_cachep, sig);
@@ -888,6 +904,19 @@
#endif /* CONFIG_MM_OWNER */
/*
+ * Initialize POSIX timer handling for a single task.
+ */
+static void posix_cpu_timers_init(struct task_struct *tsk)
+{
+ tsk->cputime_expires.prof_exp = cputime_zero;
+ tsk->cputime_expires.virt_exp = cputime_zero;
+ tsk->cputime_expires.sched_exp = 0;
+ INIT_LIST_HEAD(&tsk->cpu_timers[0]);
+ INIT_LIST_HEAD(&tsk->cpu_timers[1]);
+ INIT_LIST_HEAD(&tsk->cpu_timers[2]);
+}
+
+/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
@@ -997,12 +1026,7 @@
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
- p->it_virt_expires = cputime_zero;
- p->it_prof_expires = cputime_zero;
- p->it_sched_expires = 0;
- INIT_LIST_HEAD(&p->cpu_timers[0]);
- INIT_LIST_HEAD(&p->cpu_timers[1]);
- INIT_LIST_HEAD(&p->cpu_timers[2]);
+ posix_cpu_timers_init(p);
p->lock_depth = -1; /* -1 = no lock */
do_posix_clock_monotonic_gettime(&p->start_time);
@@ -1203,21 +1227,6 @@
if (clone_flags & CLONE_THREAD) {
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
-
- if (!cputime_eq(current->signal->it_virt_expires,
- cputime_zero) ||
- !cputime_eq(current->signal->it_prof_expires,
- cputime_zero) ||
- current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
- !list_empty(¤t->signal->cpu_timers[0]) ||
- !list_empty(¤t->signal->cpu_timers[1]) ||
- !list_empty(¤t->signal->cpu_timers[2])) {
- /*
- * Have child wake up on its first tick to check
- * for process CPU timers.
- */
- p->it_prof_expires = jiffies_to_cputime(1);
- }
}
if (likely(p->pid)) {
@@ -1364,6 +1373,8 @@
if (!IS_ERR(p)) {
struct completion vfork;
+ trace_sched_process_fork(current, p);
+
nr = task_pid_vnr(p);
if (clone_flags & CLONE_PARENT_SETTID)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cdec83e..95978f4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1403,9 +1403,7 @@
if (!base->first)
continue;
- if (base->get_softirq_time)
- base->softirq_time = base->get_softirq_time();
- else if (gettime) {
+ if (gettime) {
hrtimer_get_softirq_time(cpu_base);
gettime = 0;
}
@@ -1688,9 +1686,11 @@
new_base = &get_cpu_var(hrtimer_bases);
tick_cancel_sched_timer(cpu);
-
- local_irq_disable();
- spin_lock(&new_base->lock);
+ /*
+ * The caller is globally serialized and nobody else
+ * takes two locks at once, deadlock is not possible.
+ */
+ spin_lock_irq(&new_base->lock);
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
@@ -1703,8 +1703,7 @@
raise = 1;
spin_unlock(&old_base->lock);
- spin_unlock(&new_base->lock);
- local_irq_enable();
+ spin_unlock_irq(&new_base->lock);
put_cpu_var(hrtimer_bases);
if (raise)
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 533068c..cc0f732 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -30,17 +30,16 @@
unsigned long probe_irq_on(void)
{
struct irq_desc *desc;
- unsigned long mask;
- unsigned int i;
+ unsigned long mask = 0;
+ unsigned int status;
+ int i;
mutex_lock(&probing_active);
/*
* something may have generated an irq long ago and we want to
* flush such a longstanding irq before considering it as spurious.
*/
- for (i = NR_IRQS-1; i > 0; i--) {
- desc = irq_desc + i;
-
+ for_each_irq_desc_reverse(i, desc) {
spin_lock_irq(&desc->lock);
if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
/*
@@ -68,9 +67,7 @@
* (we must startup again here because if a longstanding irq
* happened in the previous stage, it may have masked itself)
*/
- for (i = NR_IRQS-1; i > 0; i--) {
- desc = irq_desc + i;
-
+ for_each_irq_desc_reverse(i, desc) {
spin_lock_irq(&desc->lock);
if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -88,11 +85,7 @@
/*
* Now filter out any obviously spurious interrupts
*/
- mask = 0;
- for (i = 0; i < NR_IRQS; i++) {
- unsigned int status;
-
- desc = irq_desc + i;
+ for_each_irq_desc(i, desc) {
spin_lock_irq(&desc->lock);
status = desc->status;
@@ -126,14 +119,11 @@
*/
unsigned int probe_irq_mask(unsigned long val)
{
- unsigned int mask;
+ unsigned int status, mask = 0;
+ struct irq_desc *desc;
int i;
- mask = 0;
- for (i = 0; i < NR_IRQS; i++) {
- struct irq_desc *desc = irq_desc + i;
- unsigned int status;
-
+ for_each_irq_desc(i, desc) {
spin_lock_irq(&desc->lock);
status = desc->status;
@@ -171,20 +161,19 @@
*/
int probe_irq_off(unsigned long val)
{
- int i, irq_found = 0, nr_irqs = 0;
+ int i, irq_found = 0, nr_of_irqs = 0;
+ struct irq_desc *desc;
+ unsigned int status;
- for (i = 0; i < NR_IRQS; i++) {
- struct irq_desc *desc = irq_desc + i;
- unsigned int status;
-
+ for_each_irq_desc(i, desc) {
spin_lock_irq(&desc->lock);
status = desc->status;
if (status & IRQ_AUTODETECT) {
if (!(status & IRQ_WAITING)) {
- if (!nr_irqs)
+ if (!nr_of_irqs)
irq_found = i;
- nr_irqs++;
+ nr_of_irqs++;
}
desc->status = status & ~IRQ_AUTODETECT;
desc->chip->shutdown(i);
@@ -193,7 +182,7 @@
}
mutex_unlock(&probing_active);
- if (nr_irqs > 1)
+ if (nr_of_irqs > 1)
irq_found = -irq_found;
return irq_found;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3cd441e..4895fde 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,16 +24,15 @@
*/
void dynamic_irq_init(unsigned int irq)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS) {
+ if (!desc) {
WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
return;
}
/* Ensure we don't have left over values from a previous use of this irq */
- desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock, flags);
desc->status = IRQ_DISABLED;
desc->chip = &no_irq_chip;
@@ -57,15 +56,14 @@
*/
void dynamic_irq_cleanup(unsigned int irq)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS) {
+ if (!desc) {
WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
return;
}
- desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock, flags);
if (desc->action) {
spin_unlock_irqrestore(&desc->lock, flags);
@@ -89,10 +87,10 @@
*/
int set_irq_chip(unsigned int irq, struct irq_chip *chip)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS) {
+ if (!desc) {
WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
return -EINVAL;
}
@@ -100,7 +98,6 @@
if (!chip)
chip = &no_irq_chip;
- desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock, flags);
irq_chip_set_defaults(chip);
desc->chip = chip;
@@ -111,27 +108,27 @@
EXPORT_SYMBOL(set_irq_chip);
/**
- * set_irq_type - set the irq type for an irq
+ * set_irq_type - set the irq trigger type for an irq
* @irq: irq number
- * @type: interrupt type - see include/linux/interrupt.h
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
int set_irq_type(unsigned int irq, unsigned int type)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
int ret = -ENXIO;
- if (irq >= NR_IRQS) {
+ if (!desc) {
printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
return -ENODEV;
}
- desc = irq_desc + irq;
- if (desc->chip->set_type) {
- spin_lock_irqsave(&desc->lock, flags);
- ret = desc->chip->set_type(irq, type);
- spin_unlock_irqrestore(&desc->lock, flags);
- }
+ if (type == IRQ_TYPE_NONE)
+ return 0;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ ret = __irq_set_trigger(desc, irq, type);
+ spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
EXPORT_SYMBOL(set_irq_type);
@@ -145,16 +142,15 @@
*/
int set_irq_data(unsigned int irq, void *data)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS) {
+ if (!desc) {
printk(KERN_ERR
"Trying to install controller data for IRQ%d\n", irq);
return -EINVAL;
}
- desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock, flags);
desc->handler_data = data;
spin_unlock_irqrestore(&desc->lock, flags);
@@ -171,15 +167,15 @@
*/
int set_irq_msi(unsigned int irq, struct msi_desc *entry)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS) {
+ if (!desc) {
printk(KERN_ERR
"Trying to install msi data for IRQ%d\n", irq);
return -EINVAL;
}
- desc = irq_desc + irq;
+
spin_lock_irqsave(&desc->lock, flags);
desc->msi_desc = entry;
if (entry)
@@ -197,10 +193,16 @@
*/
int set_irq_chip_data(unsigned int irq, void *data)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS || !desc->chip) {
+ if (!desc) {
+ printk(KERN_ERR
+ "Trying to install chip data for IRQ%d\n", irq);
+ return -EINVAL;
+ }
+
+ if (!desc->chip) {
printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
return -EINVAL;
}
@@ -218,7 +220,7 @@
*/
static void default_enable(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
desc->chip->unmask(irq);
desc->status &= ~IRQ_MASKED;
@@ -236,8 +238,9 @@
*/
static unsigned int default_startup(unsigned int irq)
{
- irq_desc[irq].chip->enable(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ desc->chip->enable(irq);
return 0;
}
@@ -246,7 +249,7 @@
*/
static void default_shutdown(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
desc->chip->mask(irq);
desc->status |= IRQ_MASKED;
@@ -305,14 +308,13 @@
{
struct irqaction *action;
irqreturn_t action_ret;
- const unsigned int cpu = smp_processor_id();
spin_lock(&desc->lock);
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out_unlock;
desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
- kstat_cpu(cpu).irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
if (unlikely(!action || (desc->status & IRQ_DISABLED)))
@@ -344,7 +346,6 @@
void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
- unsigned int cpu = smp_processor_id();
struct irqaction *action;
irqreturn_t action_ret;
@@ -354,7 +355,7 @@
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out_unlock;
desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
- kstat_cpu(cpu).irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, desc);
/*
* If its disabled or no action available
@@ -392,7 +393,6 @@
void
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
- unsigned int cpu = smp_processor_id();
struct irqaction *action;
irqreturn_t action_ret;
@@ -402,7 +402,7 @@
goto out;
desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
- kstat_cpu(cpu).irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, desc);
/*
* If its disabled or no action available
@@ -451,8 +451,6 @@
void
handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
- const unsigned int cpu = smp_processor_id();
-
spin_lock(&desc->lock);
desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
@@ -468,8 +466,7 @@
mask_ack_irq(desc, irq);
goto out_unlock;
}
-
- kstat_cpu(cpu).irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
desc->chip->ack(irq);
@@ -524,7 +521,7 @@
{
irqreturn_t action_ret;
- kstat_this_cpu.irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, desc);
if (desc->chip->ack)
desc->chip->ack(irq);
@@ -541,17 +538,15 @@
__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
const char *name)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS) {
+ if (!desc) {
printk(KERN_ERR
"Trying to install type control for IRQ%d\n", irq);
return;
}
- desc = irq_desc + irq;
-
if (!handle)
handle = handle_bad_irq;
else if (desc->chip == &no_irq_chip) {
@@ -583,7 +578,7 @@
desc->status &= ~IRQ_DISABLED;
desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
desc->depth = 0;
- desc->chip->unmask(irq);
+ desc->chip->startup(irq);
}
spin_unlock_irqrestore(&desc->lock, flags);
}
@@ -606,17 +601,14 @@
void __init set_irq_noprobe(unsigned int irq)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS) {
+ if (!desc) {
printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
-
return;
}
- desc = irq_desc + irq;
-
spin_lock_irqsave(&desc->lock, flags);
desc->status |= IRQ_NOPROBE;
spin_unlock_irqrestore(&desc->lock, flags);
@@ -624,17 +616,14 @@
void __init set_irq_probe(unsigned int irq)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS) {
+ if (!desc) {
printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
-
return;
}
- desc = irq_desc + irq;
-
spin_lock_irqsave(&desc->lock, flags);
desc->status &= ~IRQ_NOPROBE;
spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 5fa6198..c815b42 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -25,11 +25,10 @@
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
-void
-handle_bad_irq(unsigned int irq, struct irq_desc *desc)
+void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
{
print_irq_desc(irq, desc);
- kstat_this_cpu.irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, desc);
ack_bad_irq(irq);
}
@@ -47,6 +46,9 @@
*
* Controller mappings for all interrupt sources:
*/
+int nr_irqs = NR_IRQS;
+EXPORT_SYMBOL_GPL(nr_irqs);
+
struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS-1] = {
.status = IRQ_DISABLED,
@@ -66,7 +68,9 @@
*/
static void ack_bad(unsigned int irq)
{
- print_irq_desc(irq, irq_desc + irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ print_irq_desc(irq, desc);
ack_bad_irq(irq);
}
@@ -131,8 +135,6 @@
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
- handle_dynamic_tick(action);
-
if (!(action->flags & IRQF_DISABLED))
local_irq_enable_in_hardirq();
@@ -165,11 +167,12 @@
*/
unsigned int __do_IRQ(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
unsigned int status;
- kstat_this_cpu.irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, desc);
+
if (CHECK_IRQ_PER_CPU(desc->status)) {
irqreturn_t action_ret;
@@ -256,8 +259,8 @@
}
#endif
-#ifdef CONFIG_TRACE_IRQFLAGS
+#ifdef CONFIG_TRACE_IRQFLAGS
/*
* lockdep: we want to handle all irq_desc locks as a single lock-class:
*/
@@ -265,10 +268,10 @@
void early_init_irq_lock_class(void)
{
+ struct irq_desc *desc;
int i;
- for (i = 0; i < NR_IRQS; i++)
- lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
+ for_each_irq_desc(i, desc)
+ lockdep_set_class(&desc->lock, &irq_desc_lock_class);
}
-
#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 08a849a..c9767e6 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -10,12 +10,15 @@
/* Set default handler: */
extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
+extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
+ unsigned long flags);
+
#ifdef CONFIG_PROC_FS
-extern void register_irq_proc(unsigned int irq);
+extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
#else
-static inline void register_irq_proc(unsigned int irq) { }
+static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
static inline void register_handler_proc(unsigned int irq,
struct irqaction *action) { }
static inline void unregister_handler_proc(unsigned int irq,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 60c49e3..c498a1b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -31,10 +31,10 @@
*/
void synchronize_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned int status;
- if (irq >= NR_IRQS)
+ if (!desc)
return;
do {
@@ -64,7 +64,7 @@
*/
int irq_can_set_affinity(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
!desc->chip->set_affinity)
@@ -81,18 +81,17 @@
*/
int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
if (!desc->chip->set_affinity)
return -EINVAL;
- set_balance_irq_affinity(irq, cpumask);
-
#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (desc->status & IRQ_MOVE_PCNTXT) {
+ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
+ desc->affinity = cpumask;
desc->chip->set_affinity(irq, cpumask);
spin_unlock_irqrestore(&desc->lock, flags);
} else
@@ -111,16 +110,17 @@
int irq_select_affinity(unsigned int irq)
{
cpumask_t mask;
+ struct irq_desc *desc;
if (!irq_can_set_affinity(irq))
return 0;
cpus_and(mask, cpu_online_map, irq_default_affinity);
- irq_desc[irq].affinity = mask;
- irq_desc[irq].chip->set_affinity(irq, mask);
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
+ desc->chip->set_affinity(irq, mask);
- set_balance_irq_affinity(irq, mask);
return 0;
}
#endif
@@ -140,10 +140,10 @@
*/
void disable_irq_nosync(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS)
+ if (!desc)
return;
spin_lock_irqsave(&desc->lock, flags);
@@ -169,9 +169,9 @@
*/
void disable_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
- if (irq >= NR_IRQS)
+ if (!desc)
return;
disable_irq_nosync(irq);
@@ -211,10 +211,10 @@
*/
void enable_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (irq >= NR_IRQS)
+ if (!desc)
return;
spin_lock_irqsave(&desc->lock, flags);
@@ -223,9 +223,9 @@
}
EXPORT_SYMBOL(enable_irq);
-int set_irq_wake_real(unsigned int irq, unsigned int on)
+static int set_irq_wake_real(unsigned int irq, unsigned int on)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
int ret = -ENXIO;
if (desc->chip->set_wake)
@@ -248,7 +248,7 @@
*/
int set_irq_wake(unsigned int irq, unsigned int on)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
int ret = 0;
@@ -288,12 +288,16 @@
*/
int can_request_irq(unsigned int irq, unsigned long irqflags)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
+ if (!desc)
return 0;
- action = irq_desc[irq].action;
+ if (desc->status & IRQ_NOREQUEST)
+ return 0;
+
+ action = desc->action;
if (action)
if (irqflags & action->flags & IRQF_SHARED)
action = NULL;
@@ -312,10 +316,11 @@
desc->handle_irq = NULL;
}
-static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
+int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags)
{
int ret;
+ struct irq_chip *chip = desc->chip;
if (!chip || !chip->set_type) {
/*
@@ -333,6 +338,11 @@
pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
(int)(flags & IRQF_TRIGGER_MASK),
irq, chip->set_type);
+ else {
+ /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
+ desc->status &= ~IRQ_TYPE_SENSE_MASK;
+ desc->status |= flags & IRQ_TYPE_SENSE_MASK;
+ }
return ret;
}
@@ -341,16 +351,16 @@
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
*/
-int setup_irq(unsigned int irq, struct irqaction *new)
+static int
+__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
{
- struct irq_desc *desc = irq_desc + irq;
struct irqaction *old, **p;
const char *old_name = NULL;
unsigned long flags;
int shared = 0;
int ret;
- if (irq >= NR_IRQS)
+ if (!desc)
return -EINVAL;
if (desc->chip == &no_irq_chip)
@@ -411,7 +421,7 @@
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
- ret = __irq_set_trigger(desc->chip, irq, new->flags);
+ ret = __irq_set_trigger(desc, irq, new->flags);
if (ret) {
spin_unlock_irqrestore(&desc->lock, flags);
@@ -430,16 +440,21 @@
if (!(desc->status & IRQ_NOAUTOEN)) {
desc->depth = 0;
desc->status &= ~IRQ_DISABLED;
- if (desc->chip->startup)
- desc->chip->startup(irq);
- else
- desc->chip->enable(irq);
+ desc->chip->startup(irq);
} else
/* Undo nested disables: */
desc->depth = 1;
/* Set default affinity mask once everything is setup */
irq_select_affinity(irq);
+
+ } else if ((new->flags & IRQF_TRIGGER_MASK)
+ && (new->flags & IRQF_TRIGGER_MASK)
+ != (desc->status & IRQ_TYPE_SENSE_MASK)) {
+ /* hope the handler works with the actual trigger mode... */
+ pr_warning("IRQ %d uses trigger mode %d; requested %d\n",
+ irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK),
+ (int)(new->flags & IRQF_TRIGGER_MASK));
}
*p = new;
@@ -464,7 +479,7 @@
spin_unlock_irqrestore(&desc->lock, flags);
new->irq = irq;
- register_irq_proc(irq);
+ register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
@@ -484,6 +499,20 @@
}
/**
+ * setup_irq - setup an interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
+ *
+ * Used to statically setup interrupts in the early boot process.
+ */
+int setup_irq(unsigned int irq, struct irqaction *act)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return __setup_irq(irq, desc, act);
+}
+
+/**
* free_irq - free an interrupt
* @irq: Interrupt line to free
* @dev_id: Device identity to free
@@ -499,15 +528,15 @@
*/
void free_irq(unsigned int irq, void *dev_id)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
struct irqaction **p;
unsigned long flags;
WARN_ON(in_interrupt());
- if (irq >= NR_IRQS)
+
+ if (!desc)
return;
- desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock, flags);
p = &desc->action;
for (;;) {
@@ -596,12 +625,14 @@
* IRQF_SHARED Interrupt is shared
* IRQF_DISABLED Disable local interrupts while processing
* IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
+ * IRQF_TRIGGER_* Specify active edge(s) or level
*
*/
int request_irq(unsigned int irq, irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
{
struct irqaction *action;
+ struct irq_desc *desc;
int retval;
#ifdef CONFIG_LOCKDEP
@@ -618,9 +649,12 @@
*/
if ((irqflags & IRQF_SHARED) && !dev_id)
return -EINVAL;
- if (irq >= NR_IRQS)
+
+ desc = irq_to_desc(irq);
+ if (!desc)
return -EINVAL;
- if (irq_desc[irq].status & IRQ_NOREQUEST)
+
+ if (desc->status & IRQ_NOREQUEST)
return -EINVAL;
if (!handler)
return -EINVAL;
@@ -636,26 +670,29 @@
action->next = NULL;
action->dev_id = dev_id;
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ kfree(action);
+
#ifdef CONFIG_DEBUG_SHIRQ
if (irqflags & IRQF_SHARED) {
/*
* It's a shared IRQ -- the driver ought to be prepared for it
* to happen immediately, so let's make sure....
- * We do this before actually registering it, to make sure that
- * a 'real' IRQ doesn't run in parallel with our fake
+ * We disable the irq to make sure that a 'real' IRQ doesn't
+ * run in parallel with our fake.
*/
unsigned long flags;
+ disable_irq(irq);
local_irq_save(flags);
+
handler(irq, dev_id);
+
local_irq_restore(flags);
+ enable_irq(irq);
}
#endif
-
- retval = setup_irq(irq, action);
- if (retval)
- kfree(action);
-
return retval;
}
EXPORT_SYMBOL(request_irq);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 77b7acc..90b920d 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,18 +3,18 @@
void set_pending_irq(unsigned int irq, cpumask_t mask)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
desc->status |= IRQ_MOVE_PENDING;
- irq_desc[irq].pending_mask = mask;
+ desc->pending_mask = mask;
spin_unlock_irqrestore(&desc->lock, flags);
}
void move_masked_irq(int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
cpumask_t tmp;
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -30,7 +30,7 @@
desc->status &= ~IRQ_MOVE_PENDING;
- if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
+ if (unlikely(cpus_empty(desc->pending_mask)))
return;
if (!desc->chip->set_affinity)
@@ -38,7 +38,7 @@
assert_spin_locked(&desc->lock);
- cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
+ cpus_and(tmp, desc->pending_mask, cpu_online_map);
/*
* If there was a valid mask to work with, please
@@ -55,12 +55,12 @@
if (likely(!cpus_empty(tmp))) {
desc->chip->set_affinity(irq,tmp);
}
- cpus_clear(irq_desc[irq].pending_mask);
+ cpus_clear(desc->pending_mask);
}
void move_native_irq(int irq)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a09dd29..fac014a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
- struct irq_desc *desc = irq_desc + (long)m->private;
+ struct irq_desc *desc = irq_to_desc((long)m->private);
cpumask_t *mask = &desc->affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -43,7 +43,7 @@
cpumask_t new_value;
int err;
- if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
+ if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
irq_balancing_disabled(irq))
return -EIO;
@@ -132,20 +132,20 @@
static int irq_spurious_read(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- struct irq_desc *d = &irq_desc[(long) data];
+ struct irq_desc *desc = irq_to_desc((long) data);
return sprintf(page, "count %u\n"
"unhandled %u\n"
"last_unhandled %u ms\n",
- d->irq_count,
- d->irqs_unhandled,
- jiffies_to_msecs(d->last_unhandled));
+ desc->irq_count,
+ desc->irqs_unhandled,
+ jiffies_to_msecs(desc->last_unhandled));
}
#define MAX_NAMELEN 128
static int name_unique(unsigned int irq, struct irqaction *new_action)
{
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
unsigned long flags;
int ret = 1;
@@ -165,8 +165,9 @@
void register_handler_proc(unsigned int irq, struct irqaction *action)
{
char name [MAX_NAMELEN];
+ struct irq_desc *desc = irq_to_desc(irq);
- if (!irq_desc[irq].dir || action->dir || !action->name ||
+ if (!desc->dir || action->dir || !action->name ||
!name_unique(irq, action))
return;
@@ -174,36 +175,34 @@
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
- action->dir = proc_mkdir(name, irq_desc[irq].dir);
+ action->dir = proc_mkdir(name, desc->dir);
}
#undef MAX_NAMELEN
#define MAX_NAMELEN 10
-void register_irq_proc(unsigned int irq)
+void register_irq_proc(unsigned int irq, struct irq_desc *desc)
{
char name [MAX_NAMELEN];
struct proc_dir_entry *entry;
- if (!root_irq_dir ||
- (irq_desc[irq].chip == &no_irq_chip) ||
- irq_desc[irq].dir)
+ if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
return;
memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
- irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
+ desc->dir = proc_mkdir(name, root_irq_dir);
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
- proc_create_data("smp_affinity", 0600, irq_desc[irq].dir,
+ proc_create_data("smp_affinity", 0600, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
#endif
- entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
+ entry = create_proc_entry("spurious", 0444, desc->dir);
if (entry) {
entry->data = (void *)(long)irq;
entry->read_proc = irq_spurious_read;
@@ -214,8 +213,11 @@
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
{
- if (action->dir)
- remove_proc_entry(action->dir->name, irq_desc[irq].dir);
+ if (action->dir) {
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ remove_proc_entry(action->dir->name, desc->dir);
+ }
}
void register_default_affinity_proc(void)
@@ -228,7 +230,8 @@
void init_irq_proc(void)
{
- int i;
+ unsigned int irq;
+ struct irq_desc *desc;
/* create /proc/irq */
root_irq_dir = proc_mkdir("irq", NULL);
@@ -240,7 +243,7 @@
/*
* Create entries for all existing IRQs.
*/
- for (i = 0; i < NR_IRQS; i++)
- register_irq_proc(i);
+ for_each_irq_desc(irq, desc)
+ register_irq_proc(irq, desc);
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index a804679..89c7117 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -33,10 +33,10 @@
struct irq_desc *desc;
int irq;
- while (!bitmap_empty(irqs_resend, NR_IRQS)) {
- irq = find_first_bit(irqs_resend, NR_IRQS);
+ while (!bitmap_empty(irqs_resend, nr_irqs)) {
+ irq = find_first_bit(irqs_resend, nr_irqs);
clear_bit(irq, irqs_resend);
- desc = irq_desc + irq;
+ desc = irq_to_desc(irq);
local_irq_disable();
desc->handle_irq(irq, desc);
local_irq_enable();
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c66d3f1..dd364c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -12,83 +12,122 @@
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/moduleparam.h>
+#include <linux/timer.h>
static int irqfixup __read_mostly;
+#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
+static void poll_spurious_irqs(unsigned long dummy);
+static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+
/*
* Recovery handler for misrouted interrupts.
*/
+static int try_one_irq(int irq, struct irq_desc *desc)
+{
+ struct irqaction *action;
+ int ok = 0, work = 0;
+
+ spin_lock(&desc->lock);
+ /* Already running on another processor */
+ if (desc->status & IRQ_INPROGRESS) {
+ /*
+ * Already running: If it is shared get the other
+ * CPU to go looking for our mystery interrupt too
+ */
+ if (desc->action && (desc->action->flags & IRQF_SHARED))
+ desc->status |= IRQ_PENDING;
+ spin_unlock(&desc->lock);
+ return ok;
+ }
+ /* Honour the normal IRQ locking */
+ desc->status |= IRQ_INPROGRESS;
+ action = desc->action;
+ spin_unlock(&desc->lock);
+
+ while (action) {
+ /* Only shared IRQ handlers are safe to call */
+ if (action->flags & IRQF_SHARED) {
+ if (action->handler(irq, action->dev_id) ==
+ IRQ_HANDLED)
+ ok = 1;
+ }
+ action = action->next;
+ }
+ local_irq_disable();
+ /* Now clean up the flags */
+ spin_lock(&desc->lock);
+ action = desc->action;
+
+ /*
+ * While we were looking for a fixup someone queued a real
+ * IRQ clashing with our walk:
+ */
+ while ((desc->status & IRQ_PENDING) && action) {
+ /*
+ * Perform real IRQ processing for the IRQ we deferred
+ */
+ work = 1;
+ spin_unlock(&desc->lock);
+ handle_IRQ_event(irq, action);
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_PENDING;
+ }
+ desc->status &= ~IRQ_INPROGRESS;
+ /*
+ * If we did actual work for the real IRQ line we must let the
+ * IRQ controller clean up too
+ */
+ if (work && desc->chip && desc->chip->end)
+ desc->chip->end(irq);
+ spin_unlock(&desc->lock);
+
+ return ok;
+}
+
static int misrouted_irq(int irq)
{
- int i;
- int ok = 0;
- int work = 0; /* Did we do work for a real IRQ */
+ struct irq_desc *desc;
+ int i, ok = 0;
- for (i = 1; i < NR_IRQS; i++) {
- struct irq_desc *desc = irq_desc + i;
- struct irqaction *action;
+ for_each_irq_desc(i, desc) {
+ if (!i)
+ continue;
if (i == irq) /* Already tried */
continue;
- spin_lock(&desc->lock);
- /* Already running on another processor */
- if (desc->status & IRQ_INPROGRESS) {
- /*
- * Already running: If it is shared get the other
- * CPU to go looking for our mystery interrupt too
- */
- if (desc->action && (desc->action->flags & IRQF_SHARED))
- desc->status |= IRQ_PENDING;
- spin_unlock(&desc->lock);
- continue;
- }
- /* Honour the normal IRQ locking */
- desc->status |= IRQ_INPROGRESS;
- action = desc->action;
- spin_unlock(&desc->lock);
-
- while (action) {
- /* Only shared IRQ handlers are safe to call */
- if (action->flags & IRQF_SHARED) {
- if (action->handler(i, action->dev_id) ==
- IRQ_HANDLED)
- ok = 1;
- }
- action = action->next;
- }
- local_irq_disable();
- /* Now clean up the flags */
- spin_lock(&desc->lock);
- action = desc->action;
-
- /*
- * While we were looking for a fixup someone queued a real
- * IRQ clashing with our walk:
- */
- while ((desc->status & IRQ_PENDING) && action) {
- /*
- * Perform real IRQ processing for the IRQ we deferred
- */
- work = 1;
- spin_unlock(&desc->lock);
- handle_IRQ_event(i, action);
- spin_lock(&desc->lock);
- desc->status &= ~IRQ_PENDING;
- }
- desc->status &= ~IRQ_INPROGRESS;
- /*
- * If we did actual work for the real IRQ line we must let the
- * IRQ controller clean up too
- */
- if (work && desc->chip && desc->chip->end)
- desc->chip->end(i);
- spin_unlock(&desc->lock);
+ if (try_one_irq(i, desc))
+ ok = 1;
}
/* So the caller can adjust the irq error counts */
return ok;
}
+static void poll_spurious_irqs(unsigned long dummy)
+{
+ struct irq_desc *desc;
+ int i;
+
+ for_each_irq_desc(i, desc) {
+ unsigned int status;
+
+ if (!i)
+ continue;
+
+ /* Racy but it doesn't matter */
+ status = desc->status;
+ barrier();
+ if (!(status & IRQ_SPURIOUS_DISABLED))
+ continue;
+
+ try_one_irq(i, desc);
+ }
+
+ mod_timer(&poll_spurious_irq_timer,
+ jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+}
+
/*
* If 99,900 of the previous 100,000 interrupts have not been handled
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -137,7 +176,9 @@
}
}
-static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+static inline int
+try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
{
struct irqaction *action;
@@ -212,6 +253,9 @@
desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
desc->depth++;
desc->chip->disable(irq);
+
+ mod_timer(&poll_spurious_irq_timer,
+ jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
desc->irqs_unhandled = 0;
}
@@ -241,7 +285,7 @@
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
-MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode");
+MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
static int __init irqpoll_setup(char *str)
{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index ab98274..db7c358 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -55,17 +55,15 @@
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
- read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_virt_expires;
cinterval = tsk->signal->it_virt_incr;
if (!cputime_eq(cval, cputime_zero)) {
- struct task_struct *t = tsk;
- cputime_t utime = tsk->signal->utime;
- do {
- utime = cputime_add(utime, t->utime);
- t = next_thread(t);
- } while (t != tsk);
+ struct task_cputime cputime;
+ cputime_t utime;
+
+ thread_group_cputime(tsk, &cputime);
+ utime = cputime.utime;
if (cputime_le(cval, utime)) { /* about to fire */
cval = jiffies_to_cputime(1);
} else {
@@ -73,25 +71,19 @@
}
}
spin_unlock_irq(&tsk->sighand->siglock);
- read_unlock(&tasklist_lock);
cputime_to_timeval(cval, &value->it_value);
cputime_to_timeval(cinterval, &value->it_interval);
break;
case ITIMER_PROF:
- read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_prof_expires;
cinterval = tsk->signal->it_prof_incr;
if (!cputime_eq(cval, cputime_zero)) {
- struct task_struct *t = tsk;
- cputime_t ptime = cputime_add(tsk->signal->utime,
- tsk->signal->stime);
- do {
- ptime = cputime_add(ptime,
- cputime_add(t->utime,
- t->stime));
- t = next_thread(t);
- } while (t != tsk);
+ struct task_cputime times;
+ cputime_t ptime;
+
+ thread_group_cputime(tsk, &times);
+ ptime = cputime_add(times.utime, times.stime);
if (cputime_le(cval, ptime)) { /* about to fire */
cval = jiffies_to_cputime(1);
} else {
@@ -99,7 +91,6 @@
}
}
spin_unlock_irq(&tsk->sighand->siglock);
- read_unlock(&tasklist_lock);
cputime_to_timeval(cval, &value->it_value);
cputime_to_timeval(cinterval, &value->it_interval);
break;
@@ -185,7 +176,6 @@
case ITIMER_VIRTUAL:
nval = timeval_to_cputime(&value->it_value);
ninterval = timeval_to_cputime(&value->it_interval);
- read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_virt_expires;
cinterval = tsk->signal->it_virt_incr;
@@ -200,7 +190,6 @@
tsk->signal->it_virt_expires = nval;
tsk->signal->it_virt_incr = ninterval;
spin_unlock_irq(&tsk->sighand->siglock);
- read_unlock(&tasklist_lock);
if (ovalue) {
cputime_to_timeval(cval, &ovalue->it_value);
cputime_to_timeval(cinterval, &ovalue->it_interval);
@@ -209,7 +198,6 @@
case ITIMER_PROF:
nval = timeval_to_cputime(&value->it_value);
ninterval = timeval_to_cputime(&value->it_interval);
- read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_prof_expires;
cinterval = tsk->signal->it_prof_incr;
@@ -224,7 +212,6 @@
tsk->signal->it_prof_expires = nval;
tsk->signal->it_prof_incr = ninterval;
spin_unlock_irq(&tsk->sighand->siglock);
- read_unlock(&tasklist_lock);
if (ovalue) {
cputime_to_timeval(cval, &ovalue->it_value);
cputime_to_timeval(cinterval, &ovalue->it_interval);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 14ec64f..8e7a7ce 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,6 +13,7 @@
#include <linux/file.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <trace/sched.h>
#define KTHREAD_NICE_LEVEL (-5)
@@ -205,6 +206,8 @@
/* It could exit after stop_info.k set, but before wake_up_process. */
get_task_struct(k);
+ trace_sched_kthread_stop(k);
+
/* Must init completion *before* thread sees kthread_stop_info.k */
init_completion(&kthread_stop_info.done);
smp_wmb();
@@ -220,6 +223,8 @@
ret = kthread_stop_info.err;
mutex_unlock(&kthread_stop_lock);
+ trace_sched_kthread_stop_ret(ret);
+
return ret;
}
EXPORT_SYMBOL(kthread_stop);
diff --git a/kernel/marker.c b/kernel/marker.c
index 7d1faec..e9c6b2b 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -62,7 +62,7 @@
int refcount; /* Number of times armed. 0 if disarmed. */
struct rcu_head rcu;
void *oldptr;
- unsigned char rcu_pending:1;
+ int rcu_pending;
unsigned char ptype:1;
char name[0]; /* Contains name'\0'format'\0' */
};
@@ -103,11 +103,11 @@
char ptype;
/*
- * preempt_disable does two things : disabling preemption to make sure
- * the teardown of the callbacks can be done correctly when they are in
- * modules and they insure RCU read coherency.
+ * rcu_read_lock_sched does two things : disabling preemption to make
+ * sure the teardown of the callbacks can be done correctly when they
+ * are in modules and they insure RCU read coherency.
*/
- preempt_disable();
+ rcu_read_lock_sched();
ptype = mdata->ptype;
if (likely(!ptype)) {
marker_probe_func *func;
@@ -145,7 +145,7 @@
va_end(args);
}
}
- preempt_enable();
+ rcu_read_unlock_sched();
}
EXPORT_SYMBOL_GPL(marker_probe_cb);
@@ -162,7 +162,7 @@
va_list args; /* not initialized */
char ptype;
- preempt_disable();
+ rcu_read_lock_sched();
ptype = mdata->ptype;
if (likely(!ptype)) {
marker_probe_func *func;
@@ -195,7 +195,7 @@
multi[i].func(multi[i].probe_private, call_private,
mdata->format, &args);
}
- preempt_enable();
+ rcu_read_unlock_sched();
}
EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
@@ -560,7 +560,7 @@
* Disable a marker and its probe callback.
* Note: only waiting an RCU period after setting elem->call to the empty
* function insures that the original callback is not used anymore. This insured
- * by preempt_disable around the call site.
+ * by rcu_read_lock_sched around the call site.
*/
static void disable_marker(struct marker *elem)
{
@@ -653,11 +653,17 @@
entry = get_marker(name);
if (!entry) {
entry = add_marker(name, format);
- if (IS_ERR(entry)) {
+ if (IS_ERR(entry))
ret = PTR_ERR(entry);
- goto end;
- }
+ } else if (format) {
+ if (!entry->format)
+ ret = marker_set_format(&entry, format);
+ else if (strcmp(entry->format, format))
+ ret = -EPERM;
}
+ if (ret)
+ goto end;
+
/*
* If we detect that a call_rcu is pending for this marker,
* make sure it's executed now.
@@ -674,6 +680,8 @@
mutex_lock(&markers_mutex);
entry = get_marker(name);
WARN_ON(!entry);
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
entry->oldptr = old;
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
@@ -717,6 +725,8 @@
entry = get_marker(name);
if (!entry)
goto end;
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
entry->oldptr = old;
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
@@ -795,6 +805,8 @@
mutex_lock(&markers_mutex);
entry = get_marker_from_private_data(probe, probe_private);
WARN_ON(!entry);
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
entry->oldptr = old;
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
diff --git a/kernel/module.c b/kernel/module.c
index 25bc9ac..0d8d21e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -46,6 +46,8 @@
#include <asm/cacheflush.h>
#include <linux/license.h>
#include <asm/sections.h>
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
#if 0
#define DEBUGP printk
@@ -1430,6 +1432,9 @@
/* Module unload stuff */
module_unload_free(mod);
+ /* release any pointers to mcount in this module */
+ ftrace_release(mod->module_core, mod->core_size);
+
/* This may be NULL, but that's OK */
module_free(mod, mod->module_init);
kfree(mod->args);
@@ -1861,9 +1866,13 @@
unsigned int markersindex;
unsigned int markersstringsindex;
unsigned int verboseindex;
+ unsigned int tracepointsindex;
+ unsigned int tracepointsstringsindex;
+ unsigned int mcountindex;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+ void *mseg;
struct exception_table_entry *extable;
mm_segment_t old_fs;
@@ -2156,6 +2165,12 @@
markersstringsindex = find_sec(hdr, sechdrs, secstrings,
"__markers_strings");
verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose");
+ tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints");
+ tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings,
+ "__tracepoints_strings");
+
+ mcountindex = find_sec(hdr, sechdrs, secstrings,
+ "__mcount_loc");
/* Now do relocations. */
for (i = 1; i < hdr->e_shnum; i++) {
@@ -2183,6 +2198,12 @@
mod->num_markers =
sechdrs[markersindex].sh_size / sizeof(*mod->markers);
#endif
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr;
+ mod->num_tracepoints =
+ sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints);
+#endif
+
/* Find duplicate symbols */
err = verify_export_symbols(mod);
@@ -2201,12 +2222,22 @@
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
+ if (!mod->taints) {
#ifdef CONFIG_MARKERS
- if (!mod->taints)
marker_update_probe_range(mod->markers,
mod->markers + mod->num_markers);
#endif
dynamic_printk_setup(sechdrs, verboseindex);
+#ifdef CONFIG_TRACEPOINTS
+ tracepoint_update_probe_range(mod->tracepoints,
+ mod->tracepoints + mod->num_tracepoints);
+#endif
+ }
+
+ /* sechdrs[0].sh_size is always zero */
+ mseg = (void *)sechdrs[mcountindex].sh_addr;
+ ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size);
+
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
goto cleanup;
@@ -2276,6 +2307,7 @@
cleanup:
kobject_del(&mod->mkobj.kobj);
kobject_put(&mod->mkobj.kobj);
+ ftrace_release(mod->module_core, mod->core_size);
free_unload:
module_unload_free(mod);
module_free(mod, mod->module_init);
@@ -2759,3 +2791,50 @@
mutex_unlock(&module_mutex);
}
#endif
+
+#ifdef CONFIG_TRACEPOINTS
+void module_update_tracepoints(void)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list)
+ if (!mod->taints)
+ tracepoint_update_probe_range(mod->tracepoints,
+ mod->tracepoints + mod->num_tracepoints);
+ mutex_unlock(&module_mutex);
+}
+
+/*
+ * Returns 0 if current not found.
+ * Returns 1 if current found.
+ */
+int module_get_iter_tracepoints(struct tracepoint_iter *iter)
+{
+ struct module *iter_mod;
+ int found = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(iter_mod, &modules, list) {
+ if (!iter_mod->taints) {
+ /*
+ * Sorted module list
+ */
+ if (iter_mod < iter->module)
+ continue;
+ else if (iter_mod > iter->module)
+ iter->tracepoint = NULL;
+ found = tracepoint_get_iter_range(&iter->tracepoint,
+ iter_mod->tracepoints,
+ iter_mod->tracepoints
+ + iter_mod->num_tracepoints);
+ if (found) {
+ iter->module = iter_mod;
+ break;
+ }
+ }
+ }
+ mutex_unlock(&module_mutex);
+ return found;
+}
+#endif
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 823be11..4282c0a 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -550,7 +550,7 @@
static ATOMIC_NOTIFIER_HEAD(die_chain);
-int notify_die(enum die_val val, const char *str,
+int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c42a03a..153dcb2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,6 +7,93 @@
#include <linux/errno.h>
#include <linux/math64.h>
#include <asm/uaccess.h>
+#include <linux/kernel_stat.h>
+
+/*
+ * Allocate the thread_group_cputime structure appropriately and fill in the
+ * current values of the fields. Called from copy_signal() via
+ * thread_group_cputime_clone_thread() when adding a second or subsequent
+ * thread to a thread group. Assumes interrupts are enabled when called.
+ */
+int thread_group_cputime_alloc(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+ struct task_cputime *cputime;
+
+ /*
+ * If we have multiple threads and we don't already have a
+ * per-CPU task_cputime struct (checked in the caller), allocate
+ * one and fill it in with the times accumulated so far. We may
+ * race with another thread so recheck after we pick up the sighand
+ * lock.
+ */
+ cputime = alloc_percpu(struct task_cputime);
+ if (cputime == NULL)
+ return -ENOMEM;
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (sig->cputime.totals) {
+ spin_unlock_irq(&tsk->sighand->siglock);
+ free_percpu(cputime);
+ return 0;
+ }
+ sig->cputime.totals = cputime;
+ cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
+ cputime->utime = tsk->utime;
+ cputime->stime = tsk->stime;
+ cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ return 0;
+}
+
+/**
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
+ *
+ * @tsk: The task we use to identify the thread group.
+ * @times: task_cputime structure in which we return the summed fields.
+ *
+ * Walk the list of CPUs to sum the per-CPU time fields in the thread group
+ * time structure.
+ */
+void thread_group_cputime(
+ struct task_struct *tsk,
+ struct task_cputime *times)
+{
+ struct signal_struct *sig;
+ int i;
+ struct task_cputime *tot;
+
+ sig = tsk->signal;
+ if (unlikely(!sig) || !sig->cputime.totals) {
+ times->utime = tsk->utime;
+ times->stime = tsk->stime;
+ times->sum_exec_runtime = tsk->se.sum_exec_runtime;
+ return;
+ }
+ times->stime = times->utime = cputime_zero;
+ times->sum_exec_runtime = 0;
+ for_each_possible_cpu(i) {
+ tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
+ times->utime = cputime_add(times->utime, tot->utime);
+ times->stime = cputime_add(times->stime, tot->stime);
+ times->sum_exec_runtime += tot->sum_exec_runtime;
+ }
+}
+
+/*
+ * Called after updating RLIMIT_CPU to set timer expiration if necessary.
+ */
+void update_rlimit_cpu(unsigned long rlim_new)
+{
+ cputime_t cputime;
+
+ cputime = secs_to_cputime(rlim_new);
+ if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
+ cputime_lt(current->signal->it_prof_expires, cputime)) {
+ spin_lock_irq(&current->sighand->siglock);
+ set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
+ spin_unlock_irq(&current->sighand->siglock);
+ }
+}
static int check_clock(const clockid_t which_clock)
{
@@ -158,10 +245,6 @@
{
return p->utime;
}
-static inline unsigned long long sched_ns(struct task_struct *p)
-{
- return task_sched_runtime(p);
-}
int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
@@ -211,46 +294,7 @@
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = sched_ns(p);
- break;
- }
- return 0;
-}
-
-/*
- * Sample a process (thread group) clock for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
- * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
- */
-static int cpu_clock_sample_group_locked(unsigned int clock_idx,
- struct task_struct *p,
- union cpu_time_count *cpu)
-{
- struct task_struct *t = p;
- switch (clock_idx) {
- default:
- return -EINVAL;
- case CPUCLOCK_PROF:
- cpu->cpu = cputime_add(p->signal->utime, p->signal->stime);
- do {
- cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
- t = next_thread(t);
- } while (t != p);
- break;
- case CPUCLOCK_VIRT:
- cpu->cpu = p->signal->utime;
- do {
- cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
- t = next_thread(t);
- } while (t != p);
- break;
- case CPUCLOCK_SCHED:
- cpu->sched = p->signal->sum_sched_runtime;
- /* Add in each other live thread. */
- while ((t = next_thread(t)) != p) {
- cpu->sched += t->se.sum_exec_runtime;
- }
- cpu->sched += sched_ns(p);
+ cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
break;
}
return 0;
@@ -264,13 +308,23 @@
struct task_struct *p,
union cpu_time_count *cpu)
{
- int ret;
- unsigned long flags;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
- cpu);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- return ret;
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+ switch (which_clock) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+ break;
+ case CPUCLOCK_VIRT:
+ cpu->cpu = cputime.utime;
+ break;
+ case CPUCLOCK_SCHED:
+ cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ break;
+ }
+ return 0;
}
@@ -471,80 +525,11 @@
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
+ struct task_cputime cputime;
+
+ thread_group_cputime(tsk, &cputime);
cleanup_timers(tsk->signal->cpu_timers,
- cputime_add(tsk->utime, tsk->signal->utime),
- cputime_add(tsk->stime, tsk->signal->stime),
- tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
-}
-
-
-/*
- * Set the expiry times of all the threads in the process so one of them
- * will go off before the process cumulative expiry total is reached.
- */
-static void process_timer_rebalance(struct task_struct *p,
- unsigned int clock_idx,
- union cpu_time_count expires,
- union cpu_time_count val)
-{
- cputime_t ticks, left;
- unsigned long long ns, nsleft;
- struct task_struct *t = p;
- unsigned int nthreads = atomic_read(&p->signal->live);
-
- if (!nthreads)
- return;
-
- switch (clock_idx) {
- default:
- BUG();
- break;
- case CPUCLOCK_PROF:
- left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
- nthreads);
- do {
- if (likely(!(t->flags & PF_EXITING))) {
- ticks = cputime_add(prof_ticks(t), left);
- if (cputime_eq(t->it_prof_expires,
- cputime_zero) ||
- cputime_gt(t->it_prof_expires, ticks)) {
- t->it_prof_expires = ticks;
- }
- }
- t = next_thread(t);
- } while (t != p);
- break;
- case CPUCLOCK_VIRT:
- left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
- nthreads);
- do {
- if (likely(!(t->flags & PF_EXITING))) {
- ticks = cputime_add(virt_ticks(t), left);
- if (cputime_eq(t->it_virt_expires,
- cputime_zero) ||
- cputime_gt(t->it_virt_expires, ticks)) {
- t->it_virt_expires = ticks;
- }
- }
- t = next_thread(t);
- } while (t != p);
- break;
- case CPUCLOCK_SCHED:
- nsleft = expires.sched - val.sched;
- do_div(nsleft, nthreads);
- nsleft = max_t(unsigned long long, nsleft, 1);
- do {
- if (likely(!(t->flags & PF_EXITING))) {
- ns = t->se.sum_exec_runtime + nsleft;
- if (t->it_sched_expires == 0 ||
- t->it_sched_expires > ns) {
- t->it_sched_expires = ns;
- }
- }
- t = next_thread(t);
- } while (t != p);
- break;
- }
+ cputime.utime, cputime.stime, cputime.sum_exec_runtime);
}
static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -608,29 +593,32 @@
default:
BUG();
case CPUCLOCK_PROF:
- if (cputime_eq(p->it_prof_expires,
+ if (cputime_eq(p->cputime_expires.prof_exp,
cputime_zero) ||
- cputime_gt(p->it_prof_expires,
+ cputime_gt(p->cputime_expires.prof_exp,
nt->expires.cpu))
- p->it_prof_expires = nt->expires.cpu;
+ p->cputime_expires.prof_exp =
+ nt->expires.cpu;
break;
case CPUCLOCK_VIRT:
- if (cputime_eq(p->it_virt_expires,
+ if (cputime_eq(p->cputime_expires.virt_exp,
cputime_zero) ||
- cputime_gt(p->it_virt_expires,
+ cputime_gt(p->cputime_expires.virt_exp,
nt->expires.cpu))
- p->it_virt_expires = nt->expires.cpu;
+ p->cputime_expires.virt_exp =
+ nt->expires.cpu;
break;
case CPUCLOCK_SCHED:
- if (p->it_sched_expires == 0 ||
- p->it_sched_expires > nt->expires.sched)
- p->it_sched_expires = nt->expires.sched;
+ if (p->cputime_expires.sched_exp == 0 ||
+ p->cputime_expires.sched_exp >
+ nt->expires.sched)
+ p->cputime_expires.sched_exp =
+ nt->expires.sched;
break;
}
} else {
/*
- * For a process timer, we must balance
- * all the live threads' expirations.
+ * For a process timer, set the cached expiration time.
*/
switch (CPUCLOCK_WHICH(timer->it_clock)) {
default:
@@ -641,7 +629,9 @@
cputime_lt(p->signal->it_virt_expires,
timer->it.cpu.expires.cpu))
break;
- goto rebalance;
+ p->signal->cputime_expires.virt_exp =
+ timer->it.cpu.expires.cpu;
+ break;
case CPUCLOCK_PROF:
if (!cputime_eq(p->signal->it_prof_expires,
cputime_zero) &&
@@ -652,13 +642,12 @@
if (i != RLIM_INFINITY &&
i <= cputime_to_secs(timer->it.cpu.expires.cpu))
break;
- goto rebalance;
+ p->signal->cputime_expires.prof_exp =
+ timer->it.cpu.expires.cpu;
+ break;
case CPUCLOCK_SCHED:
- rebalance:
- process_timer_rebalance(
- timer->it.cpu.task,
- CPUCLOCK_WHICH(timer->it_clock),
- timer->it.cpu.expires, now);
+ p->signal->cputime_expires.sched_exp =
+ timer->it.cpu.expires.sched;
break;
}
}
@@ -969,13 +958,13 @@
struct signal_struct *const sig = tsk->signal;
maxfire = 20;
- tsk->it_prof_expires = cputime_zero;
+ tsk->cputime_expires.prof_exp = cputime_zero;
while (!list_empty(timers)) {
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
- tsk->it_prof_expires = t->expires.cpu;
+ tsk->cputime_expires.prof_exp = t->expires.cpu;
break;
}
t->firing = 1;
@@ -984,13 +973,13 @@
++timers;
maxfire = 20;
- tsk->it_virt_expires = cputime_zero;
+ tsk->cputime_expires.virt_exp = cputime_zero;
while (!list_empty(timers)) {
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
- tsk->it_virt_expires = t->expires.cpu;
+ tsk->cputime_expires.virt_exp = t->expires.cpu;
break;
}
t->firing = 1;
@@ -999,13 +988,13 @@
++timers;
maxfire = 20;
- tsk->it_sched_expires = 0;
+ tsk->cputime_expires.sched_exp = 0;
while (!list_empty(timers)) {
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
- tsk->it_sched_expires = t->expires.sched;
+ tsk->cputime_expires.sched_exp = t->expires.sched;
break;
}
t->firing = 1;
@@ -1055,10 +1044,10 @@
{
int maxfire;
struct signal_struct *const sig = tsk->signal;
- cputime_t utime, stime, ptime, virt_expires, prof_expires;
+ cputime_t utime, ptime, virt_expires, prof_expires;
unsigned long long sum_sched_runtime, sched_expires;
- struct task_struct *t;
struct list_head *timers = sig->cpu_timers;
+ struct task_cputime cputime;
/*
* Don't sample the current process CPU clocks if there are no timers.
@@ -1074,18 +1063,10 @@
/*
* Collect the current process totals.
*/
- utime = sig->utime;
- stime = sig->stime;
- sum_sched_runtime = sig->sum_sched_runtime;
- t = tsk;
- do {
- utime = cputime_add(utime, t->utime);
- stime = cputime_add(stime, t->stime);
- sum_sched_runtime += t->se.sum_exec_runtime;
- t = next_thread(t);
- } while (t != tsk);
- ptime = cputime_add(utime, stime);
-
+ thread_group_cputime(tsk, &cputime);
+ utime = cputime.utime;
+ ptime = cputime_add(utime, cputime.stime);
+ sum_sched_runtime = cputime.sum_exec_runtime;
maxfire = 20;
prof_expires = cputime_zero;
while (!list_empty(timers)) {
@@ -1193,60 +1174,18 @@
}
}
- if (!cputime_eq(prof_expires, cputime_zero) ||
- !cputime_eq(virt_expires, cputime_zero) ||
- sched_expires != 0) {
- /*
- * Rebalance the threads' expiry times for the remaining
- * process CPU timers.
- */
-
- cputime_t prof_left, virt_left, ticks;
- unsigned long long sched_left, sched;
- const unsigned int nthreads = atomic_read(&sig->live);
-
- if (!nthreads)
- return;
-
- prof_left = cputime_sub(prof_expires, utime);
- prof_left = cputime_sub(prof_left, stime);
- prof_left = cputime_div_non_zero(prof_left, nthreads);
- virt_left = cputime_sub(virt_expires, utime);
- virt_left = cputime_div_non_zero(virt_left, nthreads);
- if (sched_expires) {
- sched_left = sched_expires - sum_sched_runtime;
- do_div(sched_left, nthreads);
- sched_left = max_t(unsigned long long, sched_left, 1);
- } else {
- sched_left = 0;
- }
- t = tsk;
- do {
- if (unlikely(t->flags & PF_EXITING))
- continue;
-
- ticks = cputime_add(cputime_add(t->utime, t->stime),
- prof_left);
- if (!cputime_eq(prof_expires, cputime_zero) &&
- (cputime_eq(t->it_prof_expires, cputime_zero) ||
- cputime_gt(t->it_prof_expires, ticks))) {
- t->it_prof_expires = ticks;
- }
-
- ticks = cputime_add(t->utime, virt_left);
- if (!cputime_eq(virt_expires, cputime_zero) &&
- (cputime_eq(t->it_virt_expires, cputime_zero) ||
- cputime_gt(t->it_virt_expires, ticks))) {
- t->it_virt_expires = ticks;
- }
-
- sched = t->se.sum_exec_runtime + sched_left;
- if (sched_expires && (t->it_sched_expires == 0 ||
- t->it_sched_expires > sched)) {
- t->it_sched_expires = sched;
- }
- } while ((t = next_thread(t)) != tsk);
- }
+ if (!cputime_eq(prof_expires, cputime_zero) &&
+ (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
+ cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
+ sig->cputime_expires.prof_exp = prof_expires;
+ if (!cputime_eq(virt_expires, cputime_zero) &&
+ (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
+ cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
+ sig->cputime_expires.virt_exp = virt_expires;
+ if (sched_expires != 0 &&
+ (sig->cputime_expires.sched_exp == 0 ||
+ sig->cputime_expires.sched_exp > sched_expires))
+ sig->cputime_expires.sched_exp = sched_expires;
}
/*
@@ -1314,6 +1253,86 @@
++timer->it_requeue_pending;
}
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime: The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero. Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+ if (cputime_eq(cputime->utime, cputime_zero) &&
+ cputime_eq(cputime->stime, cputime_zero) &&
+ cputime->sum_exec_runtime == 0)
+ return 1;
+ return 0;
+}
+
+/**
+ * task_cputime_expired - Compare two task_cputime entities.
+ *
+ * @sample: The task_cputime structure to be checked for expiration.
+ * @expires: Expiration times, against which @sample will be checked.
+ *
+ * Checks @sample against @expires to see if any field of @sample has expired.
+ * Returns true if any field of the former is greater than the corresponding
+ * field of the latter if the latter field is set. Otherwise returns false.
+ */
+static inline int task_cputime_expired(const struct task_cputime *sample,
+ const struct task_cputime *expires)
+{
+ if (!cputime_eq(expires->utime, cputime_zero) &&
+ cputime_ge(sample->utime, expires->utime))
+ return 1;
+ if (!cputime_eq(expires->stime, cputime_zero) &&
+ cputime_ge(cputime_add(sample->utime, sample->stime),
+ expires->stime))
+ return 1;
+ if (expires->sum_exec_runtime != 0 &&
+ sample->sum_exec_runtime >= expires->sum_exec_runtime)
+ return 1;
+ return 0;
+}
+
+/**
+ * fastpath_timer_check - POSIX CPU timers fast path.
+ *
+ * @tsk: The task (thread) being checked.
+ *
+ * Check the task and thread group timers. If both are zero (there are no
+ * timers set) return false. Otherwise snapshot the task and thread group
+ * timers and compare them with the corresponding expiration times. Return
+ * true if a timer has expired, else return false.
+ */
+static inline int fastpath_timer_check(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+
+ if (unlikely(!sig))
+ return 0;
+
+ if (!task_cputime_zero(&tsk->cputime_expires)) {
+ struct task_cputime task_sample = {
+ .utime = tsk->utime,
+ .stime = tsk->stime,
+ .sum_exec_runtime = tsk->se.sum_exec_runtime
+ };
+
+ if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
+ return 1;
+ }
+ if (!task_cputime_zero(&sig->cputime_expires)) {
+ struct task_cputime group_sample;
+
+ thread_group_cputime(tsk, &group_sample);
+ if (task_cputime_expired(&group_sample, &sig->cputime_expires))
+ return 1;
+ }
+ return 0;
+}
+
/*
* This is called from the timer interrupt handler. The irq handler has
* already updated our counts. We need to check if any timers fire now.
@@ -1326,42 +1345,31 @@
BUG_ON(!irqs_disabled());
-#define UNEXPIRED(clock) \
- (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
- cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
-
- if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
- (tsk->it_sched_expires == 0 ||
- tsk->se.sum_exec_runtime < tsk->it_sched_expires))
+ /*
+ * The fast path checks that there are no expired thread or thread
+ * group timers. If that's so, just return.
+ */
+ if (!fastpath_timer_check(tsk))
return;
-#undef UNEXPIRED
+ spin_lock(&tsk->sighand->siglock);
+ /*
+ * Here we take off tsk->signal->cpu_timers[N] and
+ * tsk->cpu_timers[N] all the timers that are firing, and
+ * put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+ check_process_timers(tsk, &firing);
/*
- * Double-check with locks held.
+ * We must release these locks before taking any timer's lock.
+ * There is a potential race with timer deletion here, as the
+ * siglock now protects our private firing list. We have set
+ * the firing flag in each timer, so that a deletion attempt
+ * that gets the timer lock before we do will give it up and
+ * spin until we've taken care of that timer below.
*/
- read_lock(&tasklist_lock);
- if (likely(tsk->signal != NULL)) {
- spin_lock(&tsk->sighand->siglock);
-
- /*
- * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
- * all the timers that are firing, and put them on the firing list.
- */
- check_thread_timers(tsk, &firing);
- check_process_timers(tsk, &firing);
-
- /*
- * We must release these locks before taking any timer's lock.
- * There is a potential race with timer deletion here, as the
- * siglock now protects our private firing list. We have set
- * the firing flag in each timer, so that a deletion attempt
- * that gets the timer lock before we do will give it up and
- * spin until we've taken care of that timer below.
- */
- spin_unlock(&tsk->sighand->siglock);
- }
- read_unlock(&tasklist_lock);
+ spin_unlock(&tsk->sighand->siglock);
/*
* Now that all the timers on our list have the firing flag,
@@ -1389,10 +1397,9 @@
/*
* Set one of the process-wide special case CPU timers.
- * The tasklist_lock and tsk->sighand->siglock must be held by the caller.
- * The oldval argument is null for the RLIMIT_CPU timer, where *newval is
- * absolute; non-null for ITIMER_*, where *newval is relative and we update
- * it to be absolute, *oldval is absolute and we update it to be relative.
+ * The tsk->sighand->siglock must be held by the caller.
+ * The *newval argument is relative and we update it to be absolute, *oldval
+ * is absolute and we update it to be relative.
*/
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
cputime_t *newval, cputime_t *oldval)
@@ -1401,7 +1408,7 @@
struct list_head *head;
BUG_ON(clock_idx == CPUCLOCK_SCHED);
- cpu_clock_sample_group_locked(clock_idx, tsk, &now);
+ cpu_clock_sample_group(clock_idx, tsk, &now);
if (oldval) {
if (!cputime_eq(*oldval, cputime_zero)) {
@@ -1435,13 +1442,14 @@
cputime_ge(list_first_entry(head,
struct cpu_timer_list, entry)->expires.cpu,
*newval)) {
- /*
- * Rejigger each thread's expiry time so that one will
- * notice before we hit the process-cumulative expiry time.
- */
- union cpu_time_count expires = { .sched = 0 };
- expires.cpu = *newval;
- process_timer_rebalance(tsk, clock_idx, expires, now);
+ switch (clock_idx) {
+ case CPUCLOCK_PROF:
+ tsk->signal->cputime_expires.prof_exp = *newval;
+ break;
+ case CPUCLOCK_VIRT:
+ tsk->signal->cputime_expires.virt_exp = *newval;
+ break;
+ }
}
}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5131e54..b931d7c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -223,6 +223,15 @@
}
/*
+ * Get monotonic time for posix timers
+ */
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+{
+ getrawmonotonic(tp);
+ return 0;
+}
+
+/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
static __init int init_posix_timers(void)
@@ -235,9 +244,15 @@
.clock_get = posix_ktime_get_ts,
.clock_set = do_posix_clock_nosettime,
};
+ struct k_clock clock_monotonic_raw = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_monotonic_raw,
+ .clock_set = do_posix_clock_nosettime,
+ };
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+ register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -298,6 +313,7 @@
int posix_timer_event(struct k_itimer *timr, int si_private)
{
+ int shared, ret;
/*
* FIXME: if ->sigq is queued we can race with
* dequeue_signal()->do_schedule_next_timer().
@@ -311,25 +327,10 @@
*/
timr->sigq->info.si_sys_private = si_private;
- timr->sigq->info.si_signo = timr->it_sigev_signo;
- timr->sigq->info.si_code = SI_TIMER;
- timr->sigq->info.si_tid = timr->it_id;
- timr->sigq->info.si_value = timr->it_sigev_value;
-
- if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
- struct task_struct *leader;
- int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
-
- if (likely(ret >= 0))
- return ret;
-
- timr->it_sigev_notify = SIGEV_SIGNAL;
- leader = timr->it_process->group_leader;
- put_task_struct(timr->it_process);
- timr->it_process = leader;
- }
-
- return send_sigqueue(timr->sigq, timr->it_process, 1);
+ shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
+ ret = send_sigqueue(timr->sigq, timr->it_process, shared);
+ /* If we failed to send the signal the timer stops. */
+ return ret > 0;
}
EXPORT_SYMBOL_GPL(posix_timer_event);
@@ -468,11 +469,9 @@
struct sigevent __user *timer_event_spec,
timer_t __user * created_timer_id)
{
- int error = 0;
- struct k_itimer *new_timer = NULL;
- int new_timer_id;
- struct task_struct *process = NULL;
- unsigned long flags;
+ struct k_itimer *new_timer;
+ int error, new_timer_id;
+ struct task_struct *process;
sigevent_t event;
int it_id_set = IT_ID_NOT_SET;
@@ -490,12 +489,11 @@
goto out;
}
spin_lock_irq(&idr_lock);
- error = idr_get_new(&posix_timers_id, (void *) new_timer,
- &new_timer_id);
+ error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
spin_unlock_irq(&idr_lock);
- if (error == -EAGAIN)
- goto retry;
- else if (error) {
+ if (error) {
+ if (error == -EAGAIN)
+ goto retry;
/*
* Weird looking, but we return EAGAIN if the IDR is
* full (proper POSIX return value for this)
@@ -526,67 +524,43 @@
error = -EFAULT;
goto out;
}
- new_timer->it_sigev_notify = event.sigev_notify;
- new_timer->it_sigev_signo = event.sigev_signo;
- new_timer->it_sigev_value = event.sigev_value;
-
- read_lock(&tasklist_lock);
- if ((process = good_sigevent(&event))) {
- /*
- * We may be setting up this process for another
- * thread. It may be exiting. To catch this
- * case the we check the PF_EXITING flag. If
- * the flag is not set, the siglock will catch
- * him before it is too late (in exit_itimers).
- *
- * The exec case is a bit more invloved but easy
- * to code. If the process is in our thread
- * group (and it must be or we would not allow
- * it here) and is doing an exec, it will cause
- * us to be killed. In this case it will wait
- * for us to die which means we can finish this
- * linkage with our last gasp. I.e. no code :)
- */
- spin_lock_irqsave(&process->sighand->siglock, flags);
- if (!(process->flags & PF_EXITING)) {
- new_timer->it_process = process;
- list_add(&new_timer->list,
- &process->signal->posix_timers);
- if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
- get_task_struct(process);
- spin_unlock_irqrestore(&process->sighand->siglock, flags);
- } else {
- spin_unlock_irqrestore(&process->sighand->siglock, flags);
- process = NULL;
- }
- }
- read_unlock(&tasklist_lock);
+ rcu_read_lock();
+ process = good_sigevent(&event);
+ if (process)
+ get_task_struct(process);
+ rcu_read_unlock();
if (!process) {
error = -EINVAL;
goto out;
}
} else {
- new_timer->it_sigev_notify = SIGEV_SIGNAL;
- new_timer->it_sigev_signo = SIGALRM;
- new_timer->it_sigev_value.sival_int = new_timer->it_id;
+ event.sigev_notify = SIGEV_SIGNAL;
+ event.sigev_signo = SIGALRM;
+ event.sigev_value.sival_int = new_timer->it_id;
process = current->group_leader;
- spin_lock_irqsave(&process->sighand->siglock, flags);
- new_timer->it_process = process;
- list_add(&new_timer->list, &process->signal->posix_timers);
- spin_unlock_irqrestore(&process->sighand->siglock, flags);
+ get_task_struct(process);
}
+ new_timer->it_sigev_notify = event.sigev_notify;
+ new_timer->sigq->info.si_signo = event.sigev_signo;
+ new_timer->sigq->info.si_value = event.sigev_value;
+ new_timer->sigq->info.si_tid = new_timer->it_id;
+ new_timer->sigq->info.si_code = SI_TIMER;
+
+ spin_lock_irq(&current->sighand->siglock);
+ new_timer->it_process = process;
+ list_add(&new_timer->list, &current->signal->posix_timers);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return 0;
/*
* In the case of the timer belonging to another task, after
* the task is unlocked, the timer is owned by the other task
* and may cease to exist at any time. Don't use or modify
* new_timer after the unlock call.
*/
-
out:
- if (error)
- release_posix_timer(new_timer, it_id_set);
-
+ release_posix_timer(new_timer, it_id_set);
return error;
}
@@ -597,7 +571,7 @@
* the find to the timer lock. To avoid a dead lock, the timer id MUST
* be release with out holding the timer lock.
*/
-static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
/*
@@ -605,23 +579,20 @@
* flags part over to the timer lock. Must not let interrupts in
* while we are moving the lock.
*/
-
spin_lock_irqsave(&idr_lock, *flags);
- timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
+ timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
spin_lock(&timr->it_lock);
-
- if ((timr->it_id != timer_id) || !(timr->it_process) ||
- !same_thread_group(timr->it_process, current)) {
- spin_unlock(&timr->it_lock);
- spin_unlock_irqrestore(&idr_lock, *flags);
- timr = NULL;
- } else
+ if (timr->it_process &&
+ same_thread_group(timr->it_process, current)) {
spin_unlock(&idr_lock);
- } else
- spin_unlock_irqrestore(&idr_lock, *flags);
+ return timr;
+ }
+ spin_unlock(&timr->it_lock);
+ }
+ spin_unlock_irqrestore(&idr_lock, *flags);
- return timr;
+ return NULL;
}
/*
@@ -862,8 +833,7 @@
* This keeps any tasks waiting on the spin lock from thinking
* they got something (see the lock code above).
*/
- if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
- put_task_struct(timer->it_process);
+ put_task_struct(timer->it_process);
timer->it_process = NULL;
unlock_timer(timer, flags);
@@ -890,8 +860,7 @@
* This keeps any tasks waiting on the spin lock from thinking
* they got something (see the lock code above).
*/
- if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
- put_task_struct(timer->it_process);
+ put_task_struct(timer->it_process);
timer->it_process = NULL;
unlock_timer(timer, flags);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 90b5b12..85cb905 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -42,10 +42,10 @@
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/delay.h>
-#include <linux/byteorder/swabb.h>
#include <linux/stat.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <asm/byteorder.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f23059..d906f72 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
+#include <trace/sched.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -1936,6 +1937,7 @@
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
+ trace_sched_wait_task(rq, p);
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
@@ -2297,9 +2299,7 @@
success = 1;
out_running:
- trace_mark(kernel_sched_wakeup,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
+ trace_sched_wakeup(rq, p);
check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
@@ -2432,9 +2432,7 @@
p->sched_class->task_new(rq, p);
inc_nr_running(rq);
}
- trace_mark(kernel_sched_wakeup_new,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
+ trace_sched_wakeup_new(rq, p);
check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -2607,11 +2605,7 @@
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_mark(kernel_sched_schedule,
- "prev_pid %d next_pid %d prev_state %ld "
- "## rq %p prev %p next %p",
- prev->pid, next->pid, prev->state,
- rq, prev, next);
+ trace_sched_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -2851,6 +2845,7 @@
|| unlikely(!cpu_active(dest_cpu)))
goto out;
+ trace_sched_migrate_task(rq, p, dest_cpu);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
@@ -4052,23 +4047,26 @@
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
*/
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
- u64 ns, delta_exec;
struct rq *rq;
+ u64 ns = 0;
rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime;
+
if (task_current(rq, p)) {
+ u64 delta_exec;
+
update_rq_clock(rq);
delta_exec = rq->clock - p->se.exec_start;
if ((s64)delta_exec > 0)
- ns += delta_exec;
+ ns = delta_exec;
}
+
task_rq_unlock(rq, &flags);
return ns;
@@ -4085,6 +4083,7 @@
cputime64_t tmp;
p->utime = cputime_add(p->utime, cputime);
+ account_group_user_time(p, cputime);
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
@@ -4109,6 +4108,7 @@
tmp = cputime_to_cputime64(cputime);
p->utime = cputime_add(p->utime, cputime);
+ account_group_user_time(p, cputime);
p->gtime = cputime_add(p->gtime, cputime);
cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4144,6 +4144,7 @@
}
p->stime = cputime_add(p->stime, cputime);
+ account_group_system_time(p, cputime);
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
@@ -4185,6 +4186,7 @@
if (p == rq->idle) {
p->stime = cputime_add(p->stime, steal);
+ account_group_system_time(p, steal);
if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
else
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 18fd171..f604dae 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -449,6 +449,7 @@
struct task_struct *curtask = task_of(curr);
cpuacct_charge(curtask, delta_exec);
+ account_group_exec_runtime(curtask, delta_exec);
}
}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index cdf5740..b446dc8 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -526,6 +526,8 @@
schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
@@ -1458,7 +1460,7 @@
p->rt.timeout++;
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
if (p->rt.timeout > next)
- p->it_sched_expires = p->se.sum_exec_runtime;
+ p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
}
}
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8385d43..b8c1569 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -270,3 +270,89 @@
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick. None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the utime field of the
+ * thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+ cputime_t cputime)
+{
+ struct signal_struct *sig;
+
+ sig = tsk->signal;
+ if (unlikely(!sig))
+ return;
+ if (sig->cputime.totals) {
+ struct task_cputime *times;
+
+ times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+ times->utime = cputime_add(times->utime, cputime);
+ put_cpu_no_resched();
+ }
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the stime field of the
+ * thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+ cputime_t cputime)
+{
+ struct signal_struct *sig;
+
+ sig = tsk->signal;
+ if (unlikely(!sig))
+ return;
+ if (sig->cputime.totals) {
+ struct task_cputime *times;
+
+ times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+ times->stime = cputime_add(times->stime, cputime);
+ put_cpu_no_resched();
+ }
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @ns: Time value by which to increment the sum_exec_runtime field
+ * of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+ unsigned long long ns)
+{
+ struct signal_struct *sig;
+
+ sig = tsk->signal;
+ if (unlikely(!sig))
+ return;
+ if (sig->cputime.totals) {
+ struct task_cputime *times;
+
+ times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+ times->sum_exec_runtime += ns;
+ put_cpu_no_resched();
+ }
+}
diff --git a/kernel/signal.c b/kernel/signal.c
index e661b01..105217d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,6 +27,7 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
+#include <trace/sched.h>
#include <asm/param.h>
#include <asm/uaccess.h>
@@ -803,6 +804,8 @@
struct sigpending *pending;
struct sigqueue *q;
+ trace_sched_signal_send(sig, t);
+
assert_spin_locked(&t->sighand->siglock);
if (!prepare_signal(sig, t))
return 0;
@@ -1338,6 +1341,7 @@
struct siginfo info;
unsigned long flags;
struct sighand_struct *psig;
+ struct task_cputime cputime;
int ret = sig;
BUG_ON(sig == -1);
@@ -1368,10 +1372,9 @@
info.si_uid = tsk->uid;
- info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
- tsk->signal->utime));
- info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
- tsk->signal->stime));
+ thread_group_cputime(tsk, &cputime);
+ info.si_utime = cputime_to_jiffies(cputime.utime);
+ info.si_stime = cputime_to_jiffies(cputime.stime);
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 83ba21a..7110dae 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -267,16 +267,12 @@
*/
void irq_enter(void)
{
-#ifdef CONFIG_NO_HZ
int cpu = smp_processor_id();
+
if (idle_cpu(cpu) && !in_interrupt())
- tick_nohz_stop_idle(cpu);
-#endif
+ tick_check_idle(cpu);
+
__irq_enter();
-#ifdef CONFIG_NO_HZ
- if (idle_cpu(cpu))
- tick_nohz_update_jiffies();
-#endif
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/sys.c b/kernel/sys.c
index 0bc8fa3..53879cd 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -853,38 +853,28 @@
return old_fsgid;
}
+void do_sys_times(struct tms *tms)
+{
+ struct task_cputime cputime;
+ cputime_t cutime, cstime;
+
+ spin_lock_irq(&current->sighand->siglock);
+ thread_group_cputime(current, &cputime);
+ cutime = current->signal->cutime;
+ cstime = current->signal->cstime;
+ spin_unlock_irq(&current->sighand->siglock);
+ tms->tms_utime = cputime_to_clock_t(cputime.utime);
+ tms->tms_stime = cputime_to_clock_t(cputime.stime);
+ tms->tms_cutime = cputime_to_clock_t(cutime);
+ tms->tms_cstime = cputime_to_clock_t(cstime);
+}
+
asmlinkage long sys_times(struct tms __user * tbuf)
{
- /*
- * In the SMP world we might just be unlucky and have one of
- * the times increment as we use it. Since the value is an
- * atomically safe type this is just fine. Conceptually its
- * as if the syscall took an instant longer to occur.
- */
if (tbuf) {
struct tms tmp;
- struct task_struct *tsk = current;
- struct task_struct *t;
- cputime_t utime, stime, cutime, cstime;
- spin_lock_irq(&tsk->sighand->siglock);
- utime = tsk->signal->utime;
- stime = tsk->signal->stime;
- t = tsk;
- do {
- utime = cputime_add(utime, t->utime);
- stime = cputime_add(stime, t->stime);
- t = next_thread(t);
- } while (t != tsk);
-
- cutime = tsk->signal->cutime;
- cstime = tsk->signal->cstime;
- spin_unlock_irq(&tsk->sighand->siglock);
-
- tmp.tms_utime = cputime_to_clock_t(utime);
- tmp.tms_stime = cputime_to_clock_t(stime);
- tmp.tms_cutime = cputime_to_clock_t(cutime);
- tmp.tms_cstime = cputime_to_clock_t(cstime);
+ do_sys_times(&tmp);
if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
return -EFAULT;
}
@@ -1449,7 +1439,6 @@
asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
{
struct rlimit new_rlim, *old_rlim;
- unsigned long it_prof_secs;
int retval;
if (resource >= RLIM_NLIMITS)
@@ -1503,18 +1492,7 @@
if (new_rlim.rlim_cur == RLIM_INFINITY)
goto out;
- it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
- if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
- unsigned long rlim_cur = new_rlim.rlim_cur;
- cputime_t cputime;
-
- cputime = secs_to_cputime(rlim_cur);
- read_lock(&tasklist_lock);
- spin_lock_irq(&current->sighand->siglock);
- set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
- spin_unlock_irq(&current->sighand->siglock);
- read_unlock(&tasklist_lock);
- }
+ update_rlimit_cpu(new_rlim.rlim_cur);
out:
return 0;
}
@@ -1552,11 +1530,8 @@
*
*/
-static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
- cputime_t *utimep, cputime_t *stimep)
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
{
- *utimep = cputime_add(*utimep, t->utime);
- *stimep = cputime_add(*stimep, t->stime);
r->ru_nvcsw += t->nvcsw;
r->ru_nivcsw += t->nivcsw;
r->ru_minflt += t->min_flt;
@@ -1570,12 +1545,13 @@
struct task_struct *t;
unsigned long flags;
cputime_t utime, stime;
+ struct task_cputime cputime;
memset((char *) r, 0, sizeof *r);
utime = stime = cputime_zero;
if (who == RUSAGE_THREAD) {
- accumulate_thread_rusage(p, r, &utime, &stime);
+ accumulate_thread_rusage(p, r);
goto out;
}
@@ -1598,8 +1574,9 @@
break;
case RUSAGE_SELF:
- utime = cputime_add(utime, p->signal->utime);
- stime = cputime_add(stime, p->signal->stime);
+ thread_group_cputime(p, &cputime);
+ utime = cputime_add(utime, cputime.utime);
+ stime = cputime_add(stime, cputime.stime);
r->ru_nvcsw += p->signal->nvcsw;
r->ru_nivcsw += p->signal->nivcsw;
r->ru_minflt += p->signal->min_flt;
@@ -1608,7 +1585,7 @@
r->ru_oublock += p->signal->oublock;
t = p;
do {
- accumulate_thread_rusage(t, r, &utime, &stime);
+ accumulate_thread_rusage(t, r);
t = next_thread(t);
} while (t != p);
break;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 093d4ac..9ed2eec 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -325,6 +325,9 @@
unsigned long flags;
int ret;
+ /* save mult_orig on registration */
+ c->mult_orig = c->mult;
+
spin_lock_irqsave(&clocksource_lock, flags);
ret = clocksource_enqueue(c);
if (!ret)
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 4c256fd..1ca9955 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,6 +61,7 @@
.read = jiffies_read,
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+ .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
.shift = JIFFIES_SHIFT,
};
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1ad46f3..1a20715 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,13 +10,13 @@
#include <linux/mm.h>
#include <linux/time.h>
-#include <linux/timer.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/hrtimer.h>
#include <linux/capability.h>
#include <linux/math64.h>
#include <linux/clocksource.h>
+#include <linux/workqueue.h>
#include <asm/timex.h>
/*
@@ -218,11 +218,11 @@
/* Disable the cmos update - used by virtualization and embedded */
int no_sync_cmos_clock __read_mostly;
-static void sync_cmos_clock(unsigned long dummy);
+static void sync_cmos_clock(struct work_struct *work);
-static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
-static void sync_cmos_clock(unsigned long dummy)
+static void sync_cmos_clock(struct work_struct *work)
{
struct timespec now, next;
int fail = 1;
@@ -258,13 +258,13 @@
next.tv_sec++;
next.tv_nsec -= NSEC_PER_SEC;
}
- mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
+ schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
}
static void notify_cmos_timer(void)
{
if (!no_sync_cmos_clock)
- mod_timer(&sync_cmos_timer, jiffies + 1);
+ schedule_delayed_work(&sync_cmos_work, 0);
}
#else
@@ -277,38 +277,50 @@
int do_adjtimex(struct timex *txc)
{
struct timespec ts;
- long save_adjust, sec;
int result;
- /* In order to modify anything, you gotta be super-user! */
- if (txc->modes && !capable(CAP_SYS_TIME))
- return -EPERM;
-
- /* Now we validate the data before disabling interrupts */
-
- if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
+ /* Validate the data before disabling interrupts */
+ if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
- if (txc->modes & ~ADJ_OFFSET_SS_READ)
+ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
return -EINVAL;
+ if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+ !capable(CAP_SYS_TIME))
+ return -EPERM;
+ } else {
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ /* if the quartz is off by more than 10% something is VERY wrong! */
+ if (txc->modes & ADJ_TICK &&
+ (txc->tick < 900000/USER_HZ ||
+ txc->tick > 1100000/USER_HZ))
+ return -EINVAL;
+
+ if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
+ hrtimer_cancel(&leap_timer);
}
- /* if the quartz is off by more than 10% something is VERY wrong ! */
- if (txc->modes & ADJ_TICK)
- if (txc->tick < 900000/USER_HZ ||
- txc->tick > 1100000/USER_HZ)
- return -EINVAL;
-
- if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
- hrtimer_cancel(&leap_timer);
getnstimeofday(&ts);
write_seqlock_irq(&xtime_lock);
- /* Save for later - semantics of adjtime is to return old value */
- save_adjust = time_adjust;
-
/* If there are input parameters, then process them */
+ if (txc->modes & ADJ_ADJTIME) {
+ long save_adjust = time_adjust;
+
+ if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+ /* adjtime() is independent from ntp_adjtime() */
+ time_adjust = txc->offset;
+ ntp_update_frequency();
+ }
+ txc->offset = save_adjust;
+ goto adj_done;
+ }
if (txc->modes) {
+ long sec;
+
if (txc->modes & ADJ_STATUS) {
if ((time_status & STA_PLL) &&
!(txc->status & STA_PLL)) {
@@ -375,13 +387,8 @@
if (txc->modes & ADJ_TAI && txc->constant > 0)
time_tai = txc->constant;
- if (txc->modes & ADJ_OFFSET) {
- if (txc->modes == ADJ_OFFSET_SINGLESHOT)
- /* adjtime() is independent from ntp_adjtime() */
- time_adjust = txc->offset;
- else
- ntp_update_offset(txc->offset);
- }
+ if (txc->modes & ADJ_OFFSET)
+ ntp_update_offset(txc->offset);
if (txc->modes & ADJ_TICK)
tick_usec = txc->tick;
@@ -389,22 +396,18 @@
ntp_update_frequency();
}
+ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+ NTP_SCALE_SHIFT);
+ if (!(time_status & STA_NANO))
+ txc->offset /= NSEC_PER_USEC;
+
+adj_done:
result = time_state; /* mostly `TIME_OK' */
if (time_status & (STA_UNSYNC|STA_CLOCKERR))
result = TIME_ERROR;
- if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
- (txc->modes == ADJ_OFFSET_SS_READ))
- txc->offset = save_adjust;
- else {
- txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
- NTP_SCALE_SHIFT);
- if (!(time_status & STA_NANO))
- txc->offset /= NSEC_PER_USEC;
- }
- txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
- (s64)PPM_SCALE_INV,
- NTP_SCALE_SHIFT);
+ txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+ (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index cb01cd8..f98a1b7 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -384,6 +384,19 @@
}
/*
+ * Called from irq_enter() when idle was interrupted to reenable the
+ * per cpu device.
+ */
+void tick_check_oneshot_broadcast(int cpu)
+{
+ if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+ struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+
+ clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
+ }
+}
+
+/*
* Handle oneshot mode broadcasting
*/
static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4692487..b1c05bf 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,6 +36,7 @@
extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
extern int tick_broadcast_oneshot_active(void);
+extern void tick_check_oneshot_broadcast(int cpu);
# else /* BROADCAST */
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
@@ -45,6 +46,7 @@
static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline void tick_check_oneshot_broadcast(int cpu) { }
# endif /* !BROADCAST */
#else /* !ONESHOT */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b711ffc..0581c11 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -155,7 +155,7 @@
touch_softlockup_watchdog();
}
-void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -377,6 +377,32 @@
return ts->sleep_length;
}
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_cancel(&ts->sched_timer);
+ ts->sched_timer.expires = ts->idle_tick;
+
+ while (1) {
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start(&ts->sched_timer,
+ ts->sched_timer.expires,
+ HRTIMER_MODE_ABS);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ } else {
+ if (!tick_program_event(ts->sched_timer.expires, 0))
+ break;
+ }
+ /* Update jiffies and reread time */
+ tick_do_update_jiffies64(now);
+ now = ktime_get();
+ }
+}
+
/**
* tick_nohz_restart_sched_tick - restart the idle tick from the idle task
*
@@ -430,28 +456,7 @@
*/
ts->tick_stopped = 0;
ts->idle_exittime = now;
- hrtimer_cancel(&ts->sched_timer);
- ts->sched_timer.expires = ts->idle_tick;
-
- while (1) {
- /* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, tick_period);
-
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer,
- ts->sched_timer.expires,
- HRTIMER_MODE_ABS);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- break;
- } else {
- if (!tick_program_event(ts->sched_timer.expires, 0))
- break;
- }
- /* Update jiffies and reread time */
- tick_do_update_jiffies64(now);
- now = ktime_get();
- }
+ tick_nohz_restart(ts, now);
local_irq_enable();
}
@@ -503,10 +508,6 @@
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
- /* Do not restart, when we are in the idle loop */
- if (ts->tick_stopped)
- return;
-
while (tick_nohz_reprogram(ts, now)) {
now = ktime_get();
tick_do_update_jiffies64(now);
@@ -552,6 +553,27 @@
smp_processor_id());
}
+/*
+ * When NOHZ is enabled and the tick is stopped, we need to kick the
+ * tick timer from irq_enter() so that the jiffies update is kept
+ * alive during long running softirqs. That's ugly as hell, but
+ * correctness is key even if we need to fix the offending softirq in
+ * the first place.
+ *
+ * Note, this is different to tick_nohz_restart. We just kick the
+ * timer and do not touch the other magic bits which need to be done
+ * when idle is left.
+ */
+static void tick_nohz_kick_tick(int cpu)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ if (!ts->tick_stopped)
+ return;
+
+ tick_nohz_restart(ts, ktime_get());
+}
+
#else
static inline void tick_nohz_switch_to_nohz(void) { }
@@ -559,6 +581,19 @@
#endif /* NO_HZ */
/*
+ * Called from irq_enter to notify about the possible interruption of idle()
+ */
+void tick_check_idle(int cpu)
+{
+ tick_check_oneshot_broadcast(cpu);
+#ifdef CONFIG_NO_HZ
+ tick_nohz_stop_idle(cpu);
+ tick_nohz_update_jiffies();
+ tick_nohz_kick_tick(cpu);
+#endif
+}
+
+/*
* High resolution timer specific code
*/
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -611,10 +646,6 @@
profile_tick(CPU_PROFILING);
}
- /* Do not restart, when we are in the idle loop */
- if (ts->tick_stopped)
- return HRTIMER_NORESTART;
-
hrtimer_forward(timer, now, tick_period);
return HRTIMER_RESTART;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e91c29f..e7acfb4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -58,27 +58,26 @@
#ifdef CONFIG_GENERIC_TIME
/**
- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+ * clocksource_forward_now - update clock to the current time
*
- * private function, must hold xtime_lock lock when being
- * called. Returns the number of nanoseconds since the
- * last call to update_wall_time() (adjusted by NTP scaling)
+ * Forward the current clock to update its state since the last call to
+ * update_wall_time(). This is useful before significant clock changes,
+ * as it avoids having to deal with this time offset explicitly.
*/
-static inline s64 __get_nsec_offset(void)
+static void clocksource_forward_now(void)
{
cycle_t cycle_now, cycle_delta;
- s64 ns_offset;
+ s64 nsec;
- /* read clocksource: */
cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ clock->cycle_last = cycle_now;
- /* convert to nanoseconds: */
- ns_offset = cyc2ns(clock, cycle_delta);
+ nsec = cyc2ns(clock, cycle_delta);
+ timespec_add_ns(&xtime, nsec);
- return ns_offset;
+ nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+ clock->raw_time.tv_nsec += nsec;
}
/**
@@ -89,6 +88,7 @@
*/
void getnstimeofday(struct timespec *ts)
{
+ cycle_t cycle_now, cycle_delta;
unsigned long seq;
s64 nsecs;
@@ -96,7 +96,15 @@
seq = read_seqbegin(&xtime_lock);
*ts = xtime;
- nsecs = __get_nsec_offset();
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ nsecs = cyc2ns(clock, cycle_delta);
} while (read_seqretry(&xtime_lock, seq));
@@ -129,22 +137,22 @@
*/
int do_settimeofday(struct timespec *tv)
{
+ struct timespec ts_delta;
unsigned long flags;
- time_t wtm_sec, sec = tv->tv_sec;
- long wtm_nsec, nsec = tv->tv_nsec;
if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
write_seqlock_irqsave(&xtime_lock, flags);
- nsec -= __get_nsec_offset();
+ clocksource_forward_now();
- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+ ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
+ ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
+ wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
- set_normalized_timespec(&xtime, sec, nsec);
- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+ xtime = *tv;
+
update_xtime_cache(0);
clock->error = 0;
@@ -170,22 +178,19 @@
static void change_clocksource(void)
{
struct clocksource *new;
- cycle_t now;
- u64 nsec;
new = clocksource_get_next();
if (clock == new)
return;
- new->cycle_last = 0;
- now = clocksource_read(new);
- nsec = __get_nsec_offset();
- timespec_add_ns(&xtime, nsec);
+ clocksource_forward_now();
+
+ new->raw_time = clock->raw_time;
clock = new;
- clock->cycle_last = now;
-
+ clock->cycle_last = 0;
+ clock->cycle_last = clocksource_read(new);
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -200,11 +205,44 @@
*/
}
#else
+static inline void clocksource_forward_now(void) { }
static inline void change_clocksource(void) { }
-static inline s64 __get_nsec_offset(void) { return 0; }
#endif
/**
+ * getrawmonotonic - Returns the raw monotonic time in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the raw monotonic time (completely un-modified by ntp)
+ */
+void getrawmonotonic(struct timespec *ts)
+{
+ unsigned long seq;
+ s64 nsecs;
+ cycle_t cycle_now, cycle_delta;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+
+ *ts = clock->raw_time;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ timespec_add_ns(ts, nsecs);
+}
+EXPORT_SYMBOL(getrawmonotonic);
+
+
+/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
*/
int timekeeping_valid_for_hres(void)
@@ -265,8 +303,6 @@
static int timekeeping_suspended;
/* time in seconds when suspend began */
static unsigned long timekeeping_suspend_time;
-/* xtime offset when we went into suspend */
-static s64 timekeeping_suspend_nsecs;
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -292,8 +328,6 @@
wall_to_monotonic.tv_sec -= sleep_length;
total_sleep_time += sleep_length;
}
- /* Make sure that we have the correct xtime reference */
- timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
update_xtime_cache(0);
/* re-base the last cycle value */
clock->cycle_last = 0;
@@ -319,8 +353,7 @@
timekeeping_suspend_time = read_persistent_clock();
write_seqlock_irqsave(&xtime_lock, flags);
- /* Get the current xtime offset */
- timekeeping_suspend_nsecs = __get_nsec_offset();
+ clocksource_forward_now();
timekeeping_suspended = 1;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -454,23 +487,29 @@
#else
offset = clock->cycle_interval;
#endif
- clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+ clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
while (offset >= clock->cycle_interval) {
/* accumulate one interval */
- clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
offset -= clock->cycle_interval;
+ clock->cycle_last += clock->cycle_interval;
+ clock->xtime_nsec += clock->xtime_interval;
if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
xtime.tv_sec++;
second_overflow();
}
+ clock->raw_time.tv_nsec += clock->raw_interval;
+ if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+ clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+ clock->raw_time.tv_sec++;
+ }
+
/* accumulate error between NTP and clock interval */
clock->error += tick_length;
clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
@@ -479,9 +518,12 @@
/* correct the clock when NTP error is too big */
clocksource_adjust(offset);
- /* store full nanoseconds into xtime */
- xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
+ /* store full nanoseconds into xtime after rounding it up and
+ * add the remainder to the error difference.
+ */
+ xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+ clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
update_xtime_cache(cyc2ns(clock, offset));
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a40e20f..f642691 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,13 +47,14 @@
}
static void
-print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
+print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
+ int idx, u64 now)
{
#ifdef CONFIG_TIMER_STATS
char tmp[TASK_COMM_LEN + 1];
#endif
SEQ_printf(m, " #%d: ", idx);
- print_name_offset(m, timer);
+ print_name_offset(m, taddr);
SEQ_printf(m, ", ");
print_name_offset(m, timer->function);
SEQ_printf(m, ", S:%02lx", timer->state);
@@ -99,7 +100,7 @@
tmp = *timer;
spin_unlock_irqrestore(&base->cpu_base->lock, flags);
- print_timer(m, &tmp, i, now);
+ print_timer(m, timer, &tmp, i, now);
next++;
goto next_one;
}
@@ -109,6 +110,7 @@
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
+ SEQ_printf(m, " .base: %p\n", base);
SEQ_printf(m, " .index: %d\n",
base->index);
SEQ_printf(m, " .resolution: %Lu nsecs\n",
@@ -183,12 +185,16 @@
#ifdef CONFIG_GENERIC_CLOCKEVENTS
static void
-print_tickdevice(struct seq_file *m, struct tick_device *td)
+print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;
SEQ_printf(m, "\n");
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
+ if (cpu < 0)
+ SEQ_printf(m, "Broadcast device\n");
+ else
+ SEQ_printf(m, "Per CPU device: %d\n", cpu);
SEQ_printf(m, "Clock Event Device: ");
if (!dev) {
@@ -222,7 +228,7 @@
int cpu;
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
- print_tickdevice(m, tick_get_broadcast_device());
+ print_tickdevice(m, tick_get_broadcast_device(), -1);
SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
tick_get_broadcast_mask()->bits[0]);
#ifdef CONFIG_TICK_ONESHOT
@@ -232,7 +238,7 @@
SEQ_printf(m, "\n");
#endif
for_each_online_cpu(cpu)
- print_tickdevice(m, tick_get_device(cpu));
+ print_tickdevice(m, tick_get_device(cpu), cpu);
SEQ_printf(m, "\n");
}
#else
@@ -244,7 +250,7 @@
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Timer List Version: v0.3\n");
+ SEQ_printf(m, "Timer List Version: v0.4\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/timer.c b/kernel/timer.c
index 510fe69..56becf3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1436,9 +1436,11 @@
BUG_ON(cpu_online(cpu));
old_base = per_cpu(tvec_bases, cpu);
new_base = get_cpu_var(tvec_bases);
-
- local_irq_disable();
- spin_lock(&new_base->lock);
+ /*
+ * The caller is globally serialized and nobody else
+ * takes two locks at once, deadlock is not possible.
+ */
+ spin_lock_irq(&new_base->lock);
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
BUG_ON(old_base->running_timer);
@@ -1453,8 +1455,7 @@
}
spin_unlock(&old_base->lock);
- spin_unlock(&new_base->lock);
- local_irq_enable();
+ spin_unlock_irq(&new_base->lock);
put_cpu_var(tvec_bases);
}
#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 263e9e6..1cb3e1f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,23 +1,37 @@
#
# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
#
-config HAVE_FTRACE
+
+config NOP_TRACER
bool
+config HAVE_FTRACE
+ bool
+ select NOP_TRACER
+
config HAVE_DYNAMIC_FTRACE
bool
+config HAVE_FTRACE_MCOUNT_RECORD
+ bool
+
config TRACER_MAX_TRACE
bool
+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
+ select RING_BUFFER
select STACKTRACE
+ select TRACEPOINTS
config FTRACE
bool "Kernel Function Tracer"
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select FRAME_POINTER
select TRACING
select CONTEXT_SWITCH_TRACER
@@ -36,6 +50,7 @@
depends on TRACE_IRQFLAGS_SUPPORT
depends on GENERIC_TIME
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select TRACE_IRQFLAGS
select TRACING
select TRACER_MAX_TRACE
@@ -59,6 +74,7 @@
depends on GENERIC_TIME
depends on PREEMPT
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select TRACING
select TRACER_MAX_TRACE
help
@@ -86,6 +102,7 @@
config SCHED_TRACER
bool "Scheduling Latency Tracer"
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select TRACING
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
@@ -96,16 +113,56 @@
config CONTEXT_SWITCH_TRACER
bool "Trace process context switches"
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select TRACING
select MARKERS
help
This tracer gets called from the context switch and records
all switching of tasks.
+config BOOT_TRACER
+ bool "Trace boot initcalls"
+ depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
+ select TRACING
+ help
+ This tracer helps developers to optimize boot times: it records
+ the timings of the initcalls and traces key events and the identity
+ of tasks that can cause boot delays, such as context-switches.
+
+ Its aim is to be parsed by the /scripts/bootgraph.pl tool to
+ produce pretty graphics about boot inefficiencies, giving a visual
+ representation of the delays during initcalls - but the raw
+ /debug/tracing/trace text output is readable too.
+
+ ( Note that tracing self tests can't be enabled if this tracer is
+ selected, because the self-tests are an initcall as well and that
+ would invalidate the boot trace. )
+
+config STACK_TRACER
+ bool "Trace max stack"
+ depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
+ select FTRACE
+ select STACKTRACE
+ help
+ This special tracer records the maximum stack footprint of the
+ kernel and displays it in debugfs/tracing/stack_trace.
+
+ This tracer works by hooking into every function call that the
+ kernel executes, and keeping a maximum stack depth value and
+ stack-trace saved. Because this logic has to execute in every
+ kernel function, all the time, this option can slow down the
+ kernel measurably and is generally intended for kernel
+ developers only.
+
+ Say N if unsure.
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FTRACE
depends on HAVE_DYNAMIC_FTRACE
+ depends on DEBUG_KERNEL
default y
help
This option will modify all the calls to ftrace dynamically
@@ -121,12 +178,17 @@
were made. If so, it runs stop_machine (stops all CPUS)
and modifies the code to jump over the call to ftrace.
+config FTRACE_MCOUNT_RECORD
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_FTRACE_MCOUNT_RECORD
+
config FTRACE_SELFTEST
bool
config FTRACE_STARTUP_TEST
bool "Perform a startup test on ftrace"
- depends on TRACING
+ depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
select FTRACE_SELFTEST
help
This option performs a series of startup tests on ftrace. On bootup
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 71d17de..a85dfba 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -11,6 +11,7 @@
endif
obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
@@ -19,6 +20,9 @@
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_NOP_TRACER) += trace_nop.o
+obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
+obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f6e3af3..4dda4f6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -81,7 +81,7 @@
static int __register_ftrace_function(struct ftrace_ops *ops)
{
- /* Should never be called by interrupts */
+ /* should not be called from interrupt context */
spin_lock(&ftrace_lock);
ops->next = ftrace_list;
@@ -115,6 +115,7 @@
struct ftrace_ops **p;
int ret = 0;
+ /* should not be called from interrupt context */
spin_lock(&ftrace_lock);
/*
@@ -153,6 +154,30 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+#ifndef CONFIG_FTRACE_MCOUNT_RECORD
+/*
+ * The hash lock is only needed when the recording of the mcount
+ * callers are dynamic. That is, by the caller themselves and
+ * not recorded via the compilation.
+ */
+static DEFINE_SPINLOCK(ftrace_hash_lock);
+#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags)
+#define ftrace_hash_unlock(flags) \
+ spin_unlock_irqrestore(&ftrace_hash_lock, flags)
+#else
+/* This is protected via the ftrace_lock with MCOUNT_RECORD. */
+#define ftrace_hash_lock(flags) do { (void)(flags); } while (0)
+#define ftrace_hash_unlock(flags) do { } while(0)
+#endif
+
+/*
+ * Since MCOUNT_ADDR may point to mcount itself, we do not want
+ * to get it confused by reading a reference in the code as we
+ * are parsing on objcopy output of text. Use a variable for
+ * it instead.
+ */
+static unsigned long mcount_addr = MCOUNT_ADDR;
+
static struct task_struct *ftraced_task;
enum {
@@ -171,7 +196,6 @@
static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
-static DEFINE_SPINLOCK(ftrace_shutdown_lock);
static DEFINE_MUTEX(ftraced_lock);
static DEFINE_MUTEX(ftrace_regex_lock);
@@ -294,13 +318,37 @@
static void ftrace_free_rec(struct dyn_ftrace *rec)
{
- /* no locking, only called from kstop_machine */
-
rec->ip = (unsigned long)ftrace_free_records;
ftrace_free_records = rec;
rec->flags |= FTRACE_FL_FREE;
}
+/*
+ * ftrace_release - mark as free every record whose ip lies in a given range
+ * @start: first address of the range (nothing is done when NULL)
+ * @size: length of the range in bytes
+ *
+ * Walks all record pages under ftrace_lock and hands each record with
+ * start <= ip < start + size to ftrace_free_rec(). Does nothing when
+ * ftrace is disabled.
+ */
+void ftrace_release(void *start, unsigned long size)
+{
+	const unsigned long lo = (unsigned long)start;
+	const unsigned long hi = lo + size;
+	struct ftrace_page *page;
+	int idx;
+
+	if (!start || ftrace_disabled)
+		return;
+
+	/* should not be called from interrupt context */
+	spin_lock(&ftrace_lock);
+
+	page = ftrace_pages_start;
+	while (page) {
+		for (idx = 0; idx < page->index; idx++) {
+			struct dyn_ftrace *entry = &page->records[idx];
+
+			if (entry->ip < lo || entry->ip >= hi)
+				continue;
+			ftrace_free_rec(entry);
+		}
+		page = page->next;
+	}
+
+	spin_unlock(&ftrace_lock);
+}
+
static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
{
struct dyn_ftrace *rec;
@@ -338,7 +386,6 @@
unsigned long flags;
unsigned long key;
int resched;
- int atomic;
int cpu;
if (!ftrace_enabled || ftrace_disabled)
@@ -368,9 +415,7 @@
if (ftrace_ip_in_hash(ip, key))
goto out;
- atomic = irqs_disabled();
-
- spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+ ftrace_hash_lock(flags);
/* This ip may have hit the hash before the lock */
if (ftrace_ip_in_hash(ip, key))
@@ -387,7 +432,7 @@
ftraced_trigger = 1;
out_unlock:
- spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ ftrace_hash_unlock(flags);
out:
per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
@@ -531,6 +576,16 @@
ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
}
+/*
+ * print_ip_ins - continue the current printk line with an instruction dump
+ * @fmt: label printed verbatim before the bytes
+ * @p: start of the MCOUNT_INSN_SIZE bytes to print
+ *
+ * Bytes are printed as two-digit hex values separated by ':'.
+ */
+static void print_ip_ins(const char *fmt, unsigned char *p)
+{
+	const char *sep = "";
+	int n;
+
+	printk(KERN_CONT "%s", fmt);
+
+	for (n = 0; n < MCOUNT_INSN_SIZE; n++) {
+		printk(KERN_CONT "%s%02x", sep, p[n]);
+		/* every byte after the first is preceded by a colon */
+		sep = ":";
+	}
+}
+
static int
ftrace_code_disable(struct dyn_ftrace *rec)
{
@@ -541,10 +596,27 @@
ip = rec->ip;
nop = ftrace_nop_replace();
- call = ftrace_call_replace(ip, MCOUNT_ADDR);
+ call = ftrace_call_replace(ip, mcount_addr);
failed = ftrace_modify_code(ip, call, nop);
if (failed) {
+ switch (failed) {
+ case 1:
+ WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on modifying ");
+ print_ip_sym(ip);
+ break;
+ case 2:
+ WARN_ON_ONCE(1);
+ pr_info("ftrace failed to modify ");
+ print_ip_sym(ip);
+ print_ip_ins(" expected: ", call);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
+ print_ip_ins(" replace: ", nop);
+ printk(KERN_CONT "\n");
+ break;
+ }
+
rec->flags |= FTRACE_FL_FAILED;
return 0;
}
@@ -792,47 +864,7 @@
return 1;
}
-static int ftraced(void *ignore)
-{
- unsigned long usecs;
-
- while (!kthread_should_stop()) {
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- /* check once a second */
- schedule_timeout(HZ);
-
- if (unlikely(ftrace_disabled))
- continue;
-
- mutex_lock(&ftrace_sysctl_lock);
- mutex_lock(&ftraced_lock);
- if (!ftraced_suspend && !ftraced_stop &&
- ftrace_update_code()) {
- usecs = nsecs_to_usecs(ftrace_update_time);
- if (ftrace_update_tot_cnt > 100000) {
- ftrace_update_tot_cnt = 0;
- pr_info("hm, dftrace overflow: %lu change%s"
- " (%lu total) in %lu usec%s\n",
- ftrace_update_cnt,
- ftrace_update_cnt != 1 ? "s" : "",
- ftrace_update_tot_cnt,
- usecs, usecs != 1 ? "s" : "");
- ftrace_disabled = 1;
- WARN_ON_ONCE(1);
- }
- }
- mutex_unlock(&ftraced_lock);
- mutex_unlock(&ftrace_sysctl_lock);
-
- ftrace_shutdown_replenish();
- }
- __set_current_state(TASK_RUNNING);
- return 0;
-}
-
-static int __init ftrace_dyn_table_alloc(void)
+static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
{
struct ftrace_page *pg;
int cnt;
@@ -859,7 +891,9 @@
pg = ftrace_pages = ftrace_pages_start;
- cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+ cnt = num_to_init / ENTRIES_PER_PAGE;
+ pr_info("ftrace: allocating %ld hash entries in %d pages\n",
+ num_to_init, cnt);
for (i = 0; i < cnt; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -901,6 +935,8 @@
(*pos)++;
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
retry:
if (iter->idx >= iter->pg->index) {
if (iter->pg->next) {
@@ -910,15 +946,13 @@
}
} else {
rec = &iter->pg->records[iter->idx++];
- if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
+ if ((rec->flags & FTRACE_FL_FREE) ||
+
+ (!(iter->flags & FTRACE_ITER_FAILURES) &&
(rec->flags & FTRACE_FL_FAILED)) ||
((iter->flags & FTRACE_ITER_FAILURES) &&
- (!(rec->flags & FTRACE_FL_FAILED) ||
- (rec->flags & FTRACE_FL_FREE))) ||
-
- ((iter->flags & FTRACE_ITER_FILTER) &&
- !(rec->flags & FTRACE_FL_FILTER)) ||
+ !(rec->flags & FTRACE_FL_FAILED)) ||
((iter->flags & FTRACE_ITER_NOTRACE) &&
!(rec->flags & FTRACE_FL_NOTRACE))) {
@@ -926,6 +960,7 @@
goto retry;
}
}
+ spin_unlock(&ftrace_lock);
iter->pos = *pos;
@@ -1039,8 +1074,8 @@
unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
unsigned i;
- /* keep kstop machine from running */
- preempt_disable();
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
if (enable)
ftrace_filtered = 0;
pg = ftrace_pages_start;
@@ -1053,7 +1088,7 @@
}
pg = pg->next;
}
- preempt_enable();
+ spin_unlock(&ftrace_lock);
}
static int
@@ -1165,8 +1200,8 @@
}
}
- /* keep kstop machine from running */
- preempt_disable();
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
if (enable)
ftrace_filtered = 1;
pg = ftrace_pages_start;
@@ -1203,7 +1238,7 @@
}
pg = pg->next;
}
- preempt_enable();
+ spin_unlock(&ftrace_lock);
}
static ssize_t
@@ -1556,6 +1591,114 @@
fs_initcall(ftrace_init_debugfs);
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+/*
+ * ftrace_convert_nops - record a table of mcount call sites and update them
+ * @start: first entry of the call-site address table
+ * @end: one past the last entry of the table
+ *
+ * Each table entry is passed through ftrace_call_adjust() and recorded with
+ * ftrace_record_ip() under ftrace_lock; ftrace_shutdown_replenish() is
+ * called after every entry. Once all entries are recorded,
+ * __ftrace_update_code() runs with local interrupts disabled.
+ *
+ * Always returns 0.
+ */
+static int ftrace_convert_nops(unsigned long *start,
+ unsigned long *end)
+{
+ unsigned long *p;
+ unsigned long addr;
+ unsigned long flags;
+
+ p = start;
+ while (p < end) {
+ addr = ftrace_call_adjust(*p++);
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+ ftrace_record_ip(addr);
+ spin_unlock(&ftrace_lock);
+ /* called outside the lock, once per recorded address */
+ ftrace_shutdown_replenish();
+ }
+
+ /* p is ignored */
+ local_irq_save(flags);
+ __ftrace_update_code(p);
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+/*
+ * ftrace_init_module - convert the mcount call sites listed in a table
+ * @start: first entry of the call-site table
+ * @end: one past the last entry
+ *
+ * Does nothing for an empty table or when ftrace is disabled.
+ */
+void ftrace_init_module(unsigned long *start, unsigned long *end)
+{
+	if (start != end && !ftrace_disabled)
+		ftrace_convert_nops(start, end);
+}
+
+extern unsigned long __start_mcount_loc[];
+extern unsigned long __stop_mcount_loc[];
+
+/*
+ * ftrace_init - boot-time setup of dynamic ftrace from the mcount_loc table
+ *
+ * Runs the architecture init hook with interrupts disabled, sizes the record
+ * table from the number of entries between __start_mcount_loc and
+ * __stop_mcount_loc, enables ftrace and converts all listed call sites.
+ * If the arch hook or the table allocation fails, ftrace_disabled is set.
+ */
+void __init ftrace_init(void)
+{
+ unsigned long count, addr, flags;
+ int ret;
+
+ /* Keep the ftrace pointer to the stub */
+ addr = (unsigned long)ftrace_stub;
+
+ local_irq_save(flags);
+ ftrace_dyn_arch_init(&addr);
+ local_irq_restore(flags);
+
+ /* ftrace_dyn_arch_init places the return code in addr */
+ if (addr)
+ goto failed;
+
+ /* pointer difference: number of recorded call sites */
+ count = __stop_mcount_loc - __start_mcount_loc;
+
+ ret = ftrace_dyn_table_alloc(count);
+ if (ret)
+ goto failed;
+
+ last_ftrace_enabled = ftrace_enabled = 1;
+
+ /* NOTE(review): the return value is stored but never checked */
+ ret = ftrace_convert_nops(__start_mcount_loc,
+ __stop_mcount_loc);
+
+ return;
+ failed:
+ ftrace_disabled = 1;
+}
+#else /* CONFIG_FTRACE_MCOUNT_RECORD */
+/*
+ * ftraced - thread body that periodically runs ftrace_update_code()
+ * @ignore: unused thread argument
+ *
+ * Wakes roughly once a second until kthread_should_stop(). With both
+ * ftrace_sysctl_lock and ftraced_lock held it runs ftrace_update_code()
+ * unless the daemon is suspended or stopped. If the running total of
+ * updates exceeds 100000, ftrace is disabled and a diagnostic is printed.
+ * ftrace_shutdown_replenish() is called after every pass.
+ *
+ * Always returns 0.
+ */
+static int ftraced(void *ignore)
+{
+	unsigned long usecs;
+
+	while (!kthread_should_stop()) {
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* check once a second */
+		schedule_timeout(HZ);
+
+		if (unlikely(ftrace_disabled))
+			continue;
+
+		mutex_lock(&ftrace_sysctl_lock);
+		mutex_lock(&ftraced_lock);
+		if (!ftraced_suspend && !ftraced_stop &&
+		    ftrace_update_code()) {
+			usecs = nsecs_to_usecs(ftrace_update_time);
+			if (ftrace_update_tot_cnt > 100000) {
+				/*
+				 * Report before resetting the counter so the
+				 * "total" in the message is the real value
+				 * that tripped the limit, not zero.
+				 */
+				pr_info("hm, dftrace overflow: %lu change%s"
+					" (%lu total) in %lu usec%s\n",
+					ftrace_update_cnt,
+					ftrace_update_cnt != 1 ? "s" : "",
+					ftrace_update_tot_cnt,
+					usecs, usecs != 1 ? "s" : "");
+				ftrace_update_tot_cnt = 0;
+				ftrace_disabled = 1;
+				WARN_ON_ONCE(1);
+			}
+		}
+		mutex_unlock(&ftraced_lock);
+		mutex_unlock(&ftrace_sysctl_lock);
+
+		ftrace_shutdown_replenish();
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
static int __init ftrace_dynamic_init(void)
{
struct task_struct *p;
@@ -1572,7 +1715,7 @@
goto failed;
}
- ret = ftrace_dyn_table_alloc();
+ ret = ftrace_dyn_table_alloc(NR_TO_INIT);
if (ret)
goto failed;
@@ -1593,6 +1736,8 @@
}
core_initcall(ftrace_dynamic_init);
+#endif /* CONFIG_FTRACE_MCOUNT_RECORD */
+
#else
# define ftrace_startup() do { } while (0)
# define ftrace_shutdown() do { } while (0)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
new file mode 100644
index 0000000..94af1fe
--- /dev/null
+++ b/kernel/trace/ring_buffer.c
@@ -0,0 +1,2014 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h> /* used for sched_clock() (for now) */
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+/*
+ * FIXME!!!
+ *
+ * ring_buffer_time_stamp - current time stamp used for ring buffer events
+ * @cpu: not used by this implementation
+ *
+ * Returns sched_clock() shifted left by DEBUG_SHIFT.
+ */
+u64 ring_buffer_time_stamp(int cpu)
+{
+	u64 now = sched_clock();
+
+	/* shift to debug/test normalization and TIME_EXTENTS */
+	return now << DEBUG_SHIFT;
+}
+
+/*
+ * ring_buffer_normalize_time_stamp - undo the DEBUG_SHIFT scaling in place
+ * @cpu: not used by this implementation
+ * @ts: time stamp to normalize; overwritten with the result
+ */
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+	/* Just stupid testing the normalize function and deltas */
+	*ts = *ts >> DEBUG_SHIFT;
+}
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA 28
+
+enum {
+ RB_LEN_TIME_EXTEND = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/*
+ * rb_event_length - total size in bytes of an event, header included
+ *
+ * Time-extend and time-stamp events have fixed sizes. A data event keeps
+ * its payload size in ->len (in RB_ALIGNMENT units) or, when ->len is
+ * zero, in array[0] (in bytes). Padding has no defined length and yields
+ * (unsigned)-1. Any other type is a BUG().
+ *
+ * inline for ring buffer fast paths
+ */
+static inline unsigned
+rb_event_length(struct ring_buffer_event *event)
+{
+	unsigned payload;
+
+	if (event->type == RINGBUF_TYPE_PADDING)
+		return -1;	/* undefined */
+
+	if (event->type == RINGBUF_TYPE_TIME_EXTEND)
+		return RB_LEN_TIME_EXTEND;
+
+	if (event->type == RINGBUF_TYPE_TIME_STAMP)
+		return RB_LEN_TIME_STAMP;
+
+	if (event->type != RINGBUF_TYPE_DATA) {
+		BUG();
+		/* not hit */
+		return 0;
+	}
+
+	if (event->len)
+		payload = event->len << RB_ALIGNMENT_SHIFT;
+	else
+		payload = event->array[0];
+
+	return payload + RB_EVNT_HDR_SIZE;
+}
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ *
+ * Exported wrapper around rb_event_length(); the result includes the
+ * event header.
+ */
+unsigned ring_buffer_event_length(struct ring_buffer_event *event)
+{
+	unsigned len = rb_event_length(event);
+
+	return len;
+}
+
+/*
+ * rb_event_data - locate the payload of a data event
+ *
+ * Only valid for RINGBUF_TYPE_DATA events (enforced with BUG_ON).
+ *
+ * inline for ring buffer fast paths
+ */
+static inline void *
+rb_event_data(struct ring_buffer_event *event)
+{
+	BUG_ON(event->type != RINGBUF_TYPE_DATA);
+
+	/*
+	 * If the length is in the len field, the data starts at array[0];
+	 * otherwise array[0] holds the length and the data starts at
+	 * array[1].
+	 */
+	return (void *)&event->array[event->len ? 0 : 1];
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ *
+ * Exported wrapper around rb_event_data().
+ */
+void *ring_buffer_event_data(struct ring_buffer_event *event)
+{
+	void *payload = rb_event_data(event);
+
+	return payload;
+}
+
+#define for_each_buffer_cpu(buffer, cpu) \
+ for_each_cpu_mask(cpu, buffer->cpumask)
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * This hack stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ *
+ * NOTE(review): in this file the descriptor is allocated on its own with
+ * kzalloc_node() and refers to its data page through ->page, so the
+ * "page frame" wording above looks stale - confirm.
+ */
+struct buffer_page {
+ u64 time_stamp; /* page time stamp */
+ local_t write; /* index for next write */
+ local_t commit; /* write committed index */
+ unsigned read; /* index for next read */
+ struct list_head list; /* list of free pages */
+ void *page; /* Actual data page */
+};
+
+/*
+ * free_buffer_page - release a page descriptor and its data page
+ * @bpage: descriptor to free; must not be used afterwards
+ *
+ * ->page holds the kernel virtual address returned by __get_free_page(),
+ * not a struct page pointer, so it must be released with free_page()
+ * rather than __free_page(). A descriptor whose data page was never
+ * allocated (->page == NULL) is handled as well.
+ *
+ * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
+ * this issue out.
+ */
+static inline void free_buffer_page(struct buffer_page *bpage)
+{
+	if (bpage->page)
+		free_page((unsigned long)bpage->page);
+	kfree(bpage);
+}
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ *
+ * Returns 1 when @delta has any bit set above the low TS_SHIFT bits
+ * (i.e. it does not fit), 0 otherwise.
+ */
+static inline int test_time_stamp(u64 delta)
+{
+	return (delta & TS_DELTA_TEST) ? 1 : 0;
+}
+
+#define BUF_PAGE_SIZE PAGE_SIZE
+
+/*
+ * Per-CPU state of a ring buffer.
+ *
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ struct buffer_page *head_page; /* read from head */
+ struct buffer_page *tail_page; /* write to tail */
+ struct buffer_page *commit_page; /* committed pages */
+ struct buffer_page *reader_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+/*
+ * Top-level ring buffer: one ring_buffer_per_cpu per CPU in @cpumask,
+ * reachable through @buffers indexed by CPU number.
+ */
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages; /* data pages per CPU buffer */
+ unsigned flags; /* flags passed to ring_buffer_alloc() */
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex; /* taken by ring_buffer_resize() */
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+/*
+ * Iterator over one per-CPU buffer; rb_iter_head_event() returns the
+ * event at offset @head within @head_page.
+ */
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
+};
+
+/*
+ * Sanity-check helpers. When @cond is true each of them increments
+ * @buffer->record_disabled and issues WARN_ON(1).
+ *
+ * RB_WARN_ON - warn only
+ * RB_WARN_ON_RET - warn, then return -1 from the calling function
+ * RB_WARN_ON_ONCE - warn at most once per expansion site
+ */
+#define RB_WARN_ON(buffer, cond) \
+ do { \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+#define RB_WARN_ON_RET(buffer, cond) \
+ do { \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ } \
+ } while (0)
+
+#define RB_WARN_ON_ONCE(buffer, cond) \
+ do { \
+ static int once; \
+ if (unlikely(cond) && !once) { \
+ once++; \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+/**
+ * rb_check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safety measure we check that the links of the page list are
+ * still consistent, i.e. that the data pages have not been corrupted.
+ *
+ * Returns 0 if the list is intact. On the first inconsistency
+ * RB_WARN_ON_RET() increments record_disabled, warns and makes this
+ * function return -1.
+ */
+static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct list_head *anchor = &cpu_buffer->pages;
+	struct buffer_page *bpage, *next;
+
+	RB_WARN_ON_RET(cpu_buffer, anchor->next->prev != anchor);
+	RB_WARN_ON_RET(cpu_buffer, anchor->prev->next != anchor);
+
+	list_for_each_entry_safe(bpage, next, anchor, list) {
+		struct list_head *node = &bpage->list;
+
+		RB_WARN_ON_RET(cpu_buffer, node->next->prev != node);
+		RB_WARN_ON_RET(cpu_buffer, node->prev->next != node);
+	}
+
+	return 0;
+}
+
+/*
+ * rb_allocate_pages - allocate the data pages of a per-CPU buffer
+ * @cpu_buffer: buffer that receives the pages
+ * @nr_pages: number of pages to allocate
+ *
+ * Descriptors are allocated on the node of the buffer's CPU. The pages
+ * are collected on a private list and spliced onto cpu_buffer->pages
+ * only after every allocation succeeded.
+ *
+ * Returns 0 on success, or -ENOMEM after freeing everything that was
+ * allocated by this call.
+ */
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+			     unsigned nr_pages)
+{
+	struct buffer_page *bpage, *next;
+	unsigned remaining = nr_pages;
+	LIST_HEAD(fresh);
+
+	while (remaining--) {
+		bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
+				     GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
+		if (!bpage)
+			goto free_pages;
+		/* queue it first so the error path frees this descriptor too */
+		list_add(&bpage->list, &fresh);
+
+		bpage->page = (void *)__get_free_page(GFP_KERNEL);
+		if (!bpage->page)
+			goto free_pages;
+	}
+
+	list_splice(&fresh, &cpu_buffer->pages);
+
+	rb_check_pages(cpu_buffer);
+
+	return 0;
+
+ free_pages:
+	list_for_each_entry_safe(bpage, next, &fresh, list) {
+		list_del_init(&bpage->list);
+		free_buffer_page(bpage);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * rb_allocate_cpu_buffer - allocate and initialise one per-CPU buffer
+ * @buffer: owning ring buffer; buffer->pages gives the number of data pages
+ * @cpu: CPU the buffer is for (also selects the allocation node)
+ *
+ * Allocates the per-CPU structure, a separate reader page and the data
+ * pages, then points head, tail and commit at the first data page.
+ *
+ * Returns the new buffer, or NULL if any allocation failed (everything
+ * allocated so far is released).
+ */
+static struct ring_buffer_per_cpu *
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *page;
+ unsigned long addr;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ spin_lock_init(&cpu_buffer->lock);
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto fail_free_buffer;
+
+ /* publish the reader page before its data page so the error path can free it */
+ cpu_buffer->reader_page = page;
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto fail_free_reader;
+ page->page = (void *)addr;
+
+ INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+
+ /* on failure rb_allocate_pages() has already freed its own pages */
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_reader;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ return cpu_buffer;
+
+ fail_free_reader:
+ free_buffer_page(cpu_buffer->reader_page);
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+/*
+ * rb_free_cpu_buffer - release a per-CPU buffer and all of its pages
+ * @cpu_buffer: buffer to free; must not be used afterwards
+ *
+ * Frees the reader page, every page on the pages list and finally the
+ * per-CPU structure itself.
+ */
+static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *reader = cpu_buffer->reader_page;
+	struct buffer_page *bpage, *next;
+
+	list_del_init(&reader->list);
+	free_buffer_page(reader);
+
+	list_for_each_entry_safe(bpage, next, &cpu_buffer->pages, list) {
+		list_del_init(&bpage->list);
+		free_buffer_page(bpage);
+	}
+
+	kfree(cpu_buffer);
+}
+
+/*
+ * Causes compile errors if the struct buffer_page gets bigger
+ * than the struct page.
+ */
+extern int ring_buffer_page_too_big(void);
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ *
+ * @size is rounded up to whole pages and every per-CPU buffer gets at
+ * least two pages, even when @size is zero.
+ *
+ * Returns the new buffer, or NULL if an allocation failed.
+ */
+struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+	struct ring_buffer *buffer;
+	int bsize;
+	int cpu;
+
+	/* Paranoid! Optimizes out when all is well */
+	if (sizeof(struct buffer_page) > sizeof(struct page))
+		ring_buffer_page_too_big();
+
+
+	/* keep it in its own cache line */
+	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+			 GFP_KERNEL);
+	if (!buffer)
+		return NULL;
+
+	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	buffer->flags = flags;
+
+	/*
+	 * need at least two pages; this also covers size == 0, which
+	 * would otherwise leave the page list empty and make head_page
+	 * point at the list head instead of a real page
+	 */
+	if (buffer->pages < 2)
+		buffer->pages = 2;
+
+	buffer->cpumask = cpu_possible_map;
+	buffer->cpus = nr_cpu_ids;
+
+	bsize = sizeof(void *) * nr_cpu_ids;
+	buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+				  GFP_KERNEL);
+	if (!buffer->buffers)
+		goto fail_free_buffer;
+
+	for_each_buffer_cpu(buffer, cpu) {
+		buffer->buffers[cpu] =
+			rb_allocate_cpu_buffer(buffer, cpu);
+		if (!buffer->buffers[cpu])
+			goto fail_free_buffers;
+	}
+
+	mutex_init(&buffer->mutex);
+
+	return buffer;
+
+ fail_free_buffers:
+	/* slots never reached are still NULL from kzalloc() */
+	for_each_buffer_cpu(buffer, cpu) {
+		if (buffer->buffers[cpu])
+			rb_free_cpu_buffer(buffer->buffers[cpu]);
+	}
+	kfree(buffer->buffers);
+
+ fail_free_buffer:
+	kfree(buffer);
+	return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ *
+ * Releases every per-CPU buffer, the array of per-CPU buffer pointers
+ * and the ring buffer structure itself.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+	int cpu;
+
+	for_each_buffer_cpu(buffer, cpu)
+		rb_free_cpu_buffer(buffer->buffers[cpu]);
+
+	/* the pointer array is a separate allocation from ring_buffer_alloc() */
+	kfree(buffer->buffers);
+	kfree(buffer);
+}
+
+static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+/*
+ * rb_remove_pages - drop pages from the front of a per-CPU buffer
+ * @cpu_buffer: buffer to shrink
+ * @nr_pages: number of pages to remove
+ *
+ * Recording is disabled for the duration and the buffer is reset
+ * afterwards. At least one page must remain on the list (BUG otherwise).
+ */
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+	unsigned left;
+
+	atomic_inc(&cpu_buffer->record_disabled);
+	synchronize_sched();
+
+	for (left = nr_pages; left; left--) {
+		struct buffer_page *victim;
+
+		BUG_ON(list_empty(&cpu_buffer->pages));
+		victim = list_entry(cpu_buffer->pages.next,
+				    struct buffer_page, list);
+		list_del_init(&victim->list);
+		free_buffer_page(victim);
+	}
+	BUG_ON(list_empty(&cpu_buffer->pages));
+
+	rb_reset_cpu(cpu_buffer);
+
+	rb_check_pages(cpu_buffer);
+
+	atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/*
+ * rb_insert_pages - move pre-allocated pages into a per-CPU buffer
+ * @cpu_buffer: buffer to grow
+ * @pages: list holding at least @nr_pages pages (BUG otherwise)
+ * @nr_pages: number of pages to take from the front of @pages
+ *
+ * Recording is disabled for the duration and the buffer is reset
+ * afterwards.
+ */
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+		struct list_head *pages, unsigned nr_pages)
+{
+	unsigned left;
+
+	atomic_inc(&cpu_buffer->record_disabled);
+	synchronize_sched();
+
+	for (left = nr_pages; left; left--) {
+		struct buffer_page *fresh;
+
+		BUG_ON(list_empty(pages));
+		fresh = list_entry(pages->next, struct buffer_page, list);
+		list_del_init(&fresh->list);
+		list_add_tail(&fresh->list, &cpu_buffer->pages);
+	}
+
+	rb_reset_cpu(cpu_buffer);
+
+	rb_check_pages(cpu_buffer);
+
+	atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns the new size (rounded up to whole pages) on success and
+ * -ENOMEM if the additional pages could not be allocated; in that
+ * case the buffer is left unchanged.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned nr_pages, rm_pages, new_pages;
+	struct buffer_page *page, *tmp;
+	unsigned long buffer_size;
+	unsigned long addr;
+	LIST_HEAD(pages);
+	int i, cpu;
+
+	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	size *= BUF_PAGE_SIZE;
+	buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+	/* we need a minimum of two pages */
+	if (size < BUF_PAGE_SIZE * 2)
+		size = BUF_PAGE_SIZE * 2;
+
+	if (size == buffer_size)
+		return size;
+
+	mutex_lock(&buffer->mutex);
+
+	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+
+	if (size < buffer_size) {
+
+		/* easy case, just free pages */
+		BUG_ON(nr_pages >= buffer->pages);
+
+		rm_pages = buffer->pages - nr_pages;
+
+		for_each_buffer_cpu(buffer, cpu) {
+			cpu_buffer = buffer->buffers[cpu];
+			rb_remove_pages(cpu_buffer, rm_pages);
+		}
+		goto out;
+	}
+
+	/*
+	 * This is a bit more difficult. We only want to add pages
+	 * when we can allocate enough for all CPUs. We do this
+	 * by allocating all the pages and storing them on a local
+	 * link list. If we succeed in our allocation, then we
+	 * add these pages to the cpu_buffers. Otherwise we just free
+	 * them all and return -ENOMEM;
+	 */
+	BUG_ON(nr_pages <= buffer->pages);
+	new_pages = nr_pages - buffer->pages;
+
+	for_each_buffer_cpu(buffer, cpu) {
+		for (i = 0; i < new_pages; i++) {
+			page = kzalloc_node(ALIGN(sizeof(*page),
+						  cache_line_size()),
+					    GFP_KERNEL, cpu_to_node(cpu));
+			if (!page)
+				goto free_pages;
+			/* queue first so the error path frees it too */
+			list_add(&page->list, &pages);
+			addr = __get_free_page(GFP_KERNEL);
+			if (!addr)
+				goto free_pages;
+			page->page = (void *)addr;
+		}
+	}
+
+	for_each_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+		rb_insert_pages(cpu_buffer, &pages, new_pages);
+	}
+
+	BUG_ON(!list_empty(&pages));
+
+ out:
+	buffer->pages = nr_pages;
+	mutex_unlock(&buffer->mutex);
+
+	return size;
+
+ free_pages:
+	list_for_each_entry_safe(page, tmp, &pages, list) {
+		list_del_init(&page->list);
+		free_buffer_page(page);
+	}
+	/* do not leave with the mutex held on the failure path */
+	mutex_unlock(&buffer->mutex);
+	return -ENOMEM;
+}
+
+/* Returns non-zero when @event is a padding event. */
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+	int is_padding = (event->type == RINGBUF_TYPE_PADDING);
+
+	return is_padding;
+}
+
+/* Returns the address @index bytes into the data page of @page. */
+static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
+{
+	char *base = page->page;
+
+	return base + index;
+}
+
+/* Returns the event at the read index of the reader page. */
+static inline struct ring_buffer_event *
+rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *reader = cpu_buffer->reader_page;
+
+	return __rb_page_index(reader, reader->read);
+}
+
+/* Returns the event at the read index of the head page. */
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *head = cpu_buffer->head_page;
+
+	return __rb_page_index(head, head->read);
+}
+
+/* Returns the event the iterator currently points at. */
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+	struct buffer_page *bpage = iter->head_page;
+
+	return __rb_page_index(bpage, iter->head);
+}
+
+/* Returns the write index of @bpage. */
+static inline unsigned rb_page_write(struct buffer_page *bpage)
+{
+	unsigned idx = local_read(&bpage->write);
+
+	return idx;
+}
+
+/* Returns the commit index of @bpage. */
+static inline unsigned rb_page_commit(struct buffer_page *bpage)
+{
+	unsigned idx = local_read(&bpage->commit);
+
+	return idx;
+}
+
+/* Size is determined by what has been committed */
+static inline unsigned rb_page_size(struct buffer_page *bpage)
+{
+	unsigned committed = rb_page_commit(bpage);
+
+	return committed;
+}
+
+/* Returns the commit index of the buffer's current commit page. */
+static inline unsigned
+rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *bpage = cpu_buffer->commit_page;
+
+	return rb_page_commit(bpage);
+}
+
+/* Returns the commit index of the head page. */
+static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *bpage = cpu_buffer->head_page;
+
+	return rb_page_commit(bpage);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ *
+ * Every data event on the head page adds one to ->overrun and takes
+ * one from ->entries; events of other types are only stepped over.
+ */
+static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned long off = 0;
+
+	while (off < rb_head_size(cpu_buffer)) {
+		struct ring_buffer_event *ev;
+
+		ev = __rb_page_index(cpu_buffer->head_page, off);
+		BUG_ON(rb_null_event(ev));
+
+		/* Only count data entries */
+		if (ev->type == RINGBUF_TYPE_DATA) {
+			cpu_buffer->overrun++;
+			cpu_buffer->entries--;
+		}
+
+		off += rb_event_length(ev);
+	}
+}
+
+/*
+ * rb_inc_page - advance *@page to the next page of the buffer
+ *
+ * cpu_buffer->pages is the list head, not a page, so it is stepped
+ * over when reached.
+ */
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+			       struct buffer_page **page)
+{
+	struct list_head *next = (*page)->list.next;
+
+	*page = list_entry(next == &cpu_buffer->pages ? next->next : next,
+			   struct buffer_page, list);
+}
+
+/*
+ * rb_event_index - index of @event within its buffer page, computed from
+ * the offset of its address inside the page.
+ */
+static inline unsigned
+rb_event_index(struct ring_buffer_event *event)
+{
+	unsigned long offset = (unsigned long)event & ~PAGE_MASK;
+
+	return offset - (PAGE_SIZE - BUF_PAGE_SIZE);
+}
+
+/*
+ * rb_is_commit - test whether @event sits exactly at the commit position,
+ * i.e. on the commit page and at that page's commit index.
+ */
+static inline int
+rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+	     struct ring_buffer_event *event)
+{
+	unsigned long page_addr = (unsigned long)event & PAGE_MASK;
+
+	if (cpu_buffer->commit_page->page != (void *)page_addr)
+		return 0;
+
+	return rb_commit_index(cpu_buffer) == rb_event_index(event);
+}
+
+/*
+ * Move the commit position of @cpu_buffer onto @event.
+ *
+ * The commit page is stepped forward until it is the page that holds
+ * @event; each page left behind gets its commit counter set to its write
+ * counter, and write_stamp is reloaded from every page stepped onto.
+ * Finally the commit counter of the resulting page is set to the index
+ * of @event.
+ */
+static inline void
+rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
+
+ index = rb_event_index(event);
+ /* page-aligned address, compared against commit_page->page below */
+ addr &= PAGE_MASK;
+
+ while (cpu_buffer->commit_page->page != (void *)addr) {
+ /* warn if the walk reaches the tail page without finding the event's page */
+ RB_WARN_ON(cpu_buffer,
+ cpu_buffer->commit_page == cpu_buffer->tail_page);
+ cpu_buffer->commit_page->commit =
+ cpu_buffer->commit_page->write;
+ rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+ }
+
+ /* Now set the commit to the event's index */
+ local_set(&cpu_buffer->commit_page->commit, index);
+}
+
+/*
+ * Bring the commit position of @cpu_buffer up to the current write
+ * position: step the commit page forward until it is the tail page,
+ * then set that page's commit counter to its write counter.
+ */
+static inline void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ /*
+ * We only race with interrupts and NMIs on this CPU.
+ * If we own the commit event, then we can commit
+ * all others that interrupted us, since the interruptions
+ * are in stack format (they finish before they come
+ * back to us). This allows us to do a simple loop to
+ * assign the commit to the tail.
+ */
+ while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ cpu_buffer->commit_page->commit =
+ cpu_buffer->commit_page->write;
+ rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+ /* add barrier to keep gcc from optimizing too much */
+ barrier();
+ }
+ /*
+ * Repeated until commit equals write rather than assigned once;
+ * presumably because an interrupt may bump write in between (see the
+ * race note above).
+ */
+ while (rb_commit_index(cpu_buffer) !=
+ rb_page_write(cpu_buffer->commit_page)) {
+ cpu_buffer->commit_page->commit =
+ cpu_buffer->commit_page->write;
+ barrier();
+ }
+}
+
+/*
+ * Restart reading on the current reader page: the read offset goes back
+ * to zero and read_stamp is taken from the page's time stamp.
+ */
+static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *reader = cpu_buffer->reader_page;
+
+	cpu_buffer->read_stamp = reader->time_stamp;
+	reader->read = 0;
+}
+
+/*
+ * Move @iter to the start of its next page and reload the iterator's
+ * read_stamp from that page's time stamp.
+ */
+static inline void rb_inc_iter(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+	/*
+	 * An iterator starts out on the reader page, and the head may
+	 * have moved since the reader was found.  So when leaving the
+	 * reader page jump to the head page; in every other case simply
+	 * follow the page list.
+	 */
+	if (iter->head_page != cpu_buffer->reader_page)
+		rb_inc_page(cpu_buffer, &iter->head_page);
+	else
+		iter->head_page = cpu_buffer->head_page;
+
+	iter->read_stamp = iter->head_page->time_stamp;
+	iter->head = 0;
+}
+
+/**
+ * rb_update_event - update event type and data
+ * @event: the event to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ *
+ * Any @type other than the four RINGBUF_TYPE_* values handled
+ * below hits BUG().
+ */
+static inline void
+rb_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RINGBUF_TYPE_PADDING:
+ /* only the type is recorded for padding */
+ break;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* fixed size: len is RB_LEN_TIME_EXTEND rounded up, in RB_ALIGNMENT units */
+ event->len =
+ (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* fixed size: len is RB_LEN_TIME_STAMP rounded up, in RB_ALIGNMENT units */
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RINGBUF_TYPE_DATA:
+ /* @length includes the event header; len describes the payload only */
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ /* large payload: len is zero and the byte length goes in array[0] */
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Turn a payload length into the number of bytes the whole event takes
+ * in the buffer: payload (at least one byte), plus one array slot when
+ * the payload is larger than RB_MAX_SMALL_DATA, plus the event header,
+ * all rounded up to RB_ALIGNMENT.
+ */
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+	struct ring_buffer_event event; /* Used only for sizeof array */
+
+	/* a zero length payload can cause confusion, so reserve one byte */
+	if (length == 0)
+		length = 1;
+
+	/* large payloads keep their length in array[0] */
+	if (length > RB_MAX_SMALL_DATA)
+		length += sizeof(event.array[0]);
+
+	length += RB_EVNT_HDR_SIZE;
+
+	return ALIGN(length, RB_ALIGNMENT);
+}
+
+/*
+ * Reserve @length bytes for an event of @type on the tail page of
+ * @cpu_buffer.
+ *
+ * Returns the reserved event on success.  Returns ERR_PTR(-EAGAIN) when
+ * the reservation ran past the end of the tail page; in that case the
+ * tail is moved on to the next page (if nobody else did it already) and
+ * the caller is expected to try again.  Returns NULL when the next page
+ * is the head page and the buffer is not in overwrite mode, or when the
+ * next page is the commit page.
+ *
+ * *@ts is re-read when this call moves the tail page, and is stored as
+ * the commit page's time stamp when the reserved event is the first on
+ * its page and is the commit.
+ */
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct buffer_page *tail_page, *head_page, *reader_page;
+ unsigned long tail, write;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+ unsigned long flags;
+
+ tail_page = cpu_buffer->tail_page;
+ /* claim the space first; 'tail' is where our event would start */
+ write = local_add_return(length, &tail_page->write);
+ tail = write - length;
+
+ /* See if we shot past the end of this buffer page */
+ if (write > BUF_PAGE_SIZE) {
+ struct buffer_page *next_page = tail_page;
+
+ spin_lock_irqsave(&cpu_buffer->lock, flags);
+
+ rb_inc_page(cpu_buffer, &next_page);
+
+ head_page = cpu_buffer->head_page;
+ reader_page = cpu_buffer->reader_page;
+
+ /* we grabbed the lock before incrementing */
+ RB_WARN_ON(cpu_buffer, next_page == reader_page);
+
+ /*
+ * If for some reason, we had an interrupt storm that made
+ * it all the way around the buffer, bail, and warn
+ * about it.
+ */
+ if (unlikely(next_page == cpu_buffer->commit_page)) {
+ WARN_ON_ONCE(1);
+ goto out_unlock;
+ }
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE)) {
+ /* reset write */
+ if (tail <= BUF_PAGE_SIZE)
+ local_set(&tail_page->write, tail);
+ goto out_unlock;
+ }
+
+ /* tail_page has not moved yet? */
+ if (tail_page == cpu_buffer->tail_page) {
+ /* count overflows */
+ rb_update_overflow(cpu_buffer);
+
+ rb_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ cpu_buffer->head_page->read = 0;
+ }
+ }
+
+ /*
+ * If the tail page is still the same as what we think
+ * it is, then it is up to us to update the tail
+ * pointer.
+ */
+ if (tail_page == cpu_buffer->tail_page) {
+ local_set(&next_page->write, 0);
+ local_set(&next_page->commit, 0);
+ cpu_buffer->tail_page = next_page;
+
+ /* reread the time stamp */
+ *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+ cpu_buffer->tail_page->time_stamp = *ts;
+ }
+
+ /*
+ * The actual tail page has moved forward.
+ */
+ if (tail < BUF_PAGE_SIZE) {
+ /* Mark the rest of the page with padding */
+ event = __rb_page_index(tail_page, tail);
+ event->type = RINGBUF_TYPE_PADDING;
+ }
+
+ if (tail <= BUF_PAGE_SIZE)
+ /* Set the write back to the previous setting */
+ local_set(&tail_page->write, tail);
+
+ /*
+ * If this was a commit entry that failed,
+ * increment that too
+ */
+ if (tail_page == cpu_buffer->commit_page &&
+ tail == rb_commit_index(cpu_buffer)) {
+ rb_set_commit_to_write(cpu_buffer);
+ }
+
+ spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+
+ /* fail and let the caller try again */
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We reserved something on the buffer */
+
+ BUG_ON(write > BUF_PAGE_SIZE);
+
+ event = __rb_page_index(tail_page, tail);
+ rb_update_event(event, type, length);
+
+ /*
+ * If this is a commit and the tail is zero, then update
+ * this page's time stamp.
+ */
+ if (!tail && rb_is_commit(cpu_buffer, event))
+ cpu_buffer->commit_page->time_stamp = *ts;
+
+ return event;
+
+ out_unlock:
+ /* failure paths taken with cpu_buffer->lock held end up here */
+ spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+ return NULL;
+}
+
+/*
+ * Reserve a RINGBUF_TYPE_TIME_EXTEND event carrying *@delta.
+ *
+ * Returns 1 when the time extend event is the commit (write_stamp is
+ * then set to *@ts), 0 when it is not (the event is filled with zeros),
+ * -EBUSY when no event could be reserved, and -EAGAIN when the
+ * reservation has to be retried.  On the 1 and 0 paths *@delta is reset
+ * to zero.
+ */
+static int
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ u64 *ts, u64 *delta)
+{
+ struct ring_buffer_event *event;
+ static int once;
+ int ret;
+
+ /* complain about an absurd delta, but only the first time it is seen */
+ if (unlikely(*delta > (1ULL << 59) && !once++)) {
+ printk(KERN_WARNING "Delta way too big! %llu"
+ " ts=%llu write stamp = %llu\n",
+ *delta, *ts, cpu_buffer->write_stamp);
+ WARN_ON(1);
+ }
+
+ /*
+ * The delta is too big, we need to add a
+ * new timestamp.
+ */
+ event = __rb_reserve_next(cpu_buffer,
+ RINGBUF_TYPE_TIME_EXTEND,
+ RB_LEN_TIME_EXTEND,
+ ts);
+ if (!event)
+ return -EBUSY;
+
+ if (PTR_ERR(event) == -EAGAIN)
+ return -EAGAIN;
+
+ /* Only a committed time event can update the write stamp */
+ if (rb_is_commit(cpu_buffer, event)) {
+ /*
+ * If this is the first on the page, then we need to
+ * update the page itself, and just put in a zero.
+ */
+ if (rb_event_index(event)) {
+ /* split the delta: low bits in time_delta, the rest in array[0] */
+ event->time_delta = *delta & TS_MASK;
+ event->array[0] = *delta >> TS_SHIFT;
+ } else {
+ cpu_buffer->commit_page->time_stamp = *ts;
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
+ cpu_buffer->write_stamp = *ts;
+ /* let the caller know this was the commit */
+ ret = 1;
+ } else {
+ /* Darn, this is just wasted space */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ ret = 0;
+ }
+
+ *delta = 0;
+
+ return ret;
+}
+
+/*
+ * Reserve an event of @type and @length (full event length, header
+ * included) on @cpu_buffer, inserting a time extend event in front of it
+ * when test_time_stamp() flags the delta since write_stamp.
+ *
+ * The whole sequence is restarted whenever a reservation reports -EAGAIN.
+ * Returns the reserved event with its time_delta filled in, or NULL when
+ * nothing could be reserved.
+ */
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int commit = 0;
+
+ again:
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ /*
+ * Only the first commit can update the timestamp.
+ * Yes there is a race here. If an interrupt comes in
+ * just after the conditional and it traces too, then it
+ * will also check the deltas. More than one timestamp may
+ * also be made. But only the entry that did the actual
+ * commit will be something other than zero.
+ */
+ if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
+ rb_page_write(cpu_buffer->tail_page) ==
+ rb_commit_index(cpu_buffer)) {
+
+ delta = ts - cpu_buffer->write_stamp;
+
+ /* make sure this delta is calculated here */
+ barrier();
+
+ /* Did the write stamp get updated already? */
+ if (unlikely(ts < cpu_buffer->write_stamp))
+ goto again;
+
+ if (test_time_stamp(delta)) {
+
+ /* commit becomes 1 if the time extend event took the commit */
+ commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
+
+ if (commit == -EBUSY)
+ return NULL;
+
+ if (commit == -EAGAIN)
+ goto again;
+
+ RB_WARN_ON(cpu_buffer, commit < 0);
+ }
+ } else
+ /* Non commits have zero deltas */
+ delta = 0;
+
+ event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+ if (PTR_ERR(event) == -EAGAIN)
+ goto again;
+
+ if (!event) {
+ if (unlikely(commit))
+ /*
+ * Ouch! We needed a timestamp and it was committed. But
+ * we didn't get our event reserved.
+ */
+ rb_set_commit_to_write(cpu_buffer);
+ return NULL;
+ }
+
+ /*
+ * If the timestamp was committed, make the commit our entry
+ * now so that we will update it when needed.
+ */
+ if (commit)
+ rb_set_commit_event(cpu_buffer, event);
+ else if (!rb_is_commit(cpu_buffer, event))
+ delta = 0;
+
+ event->time_delta = delta;
+
+ return event;
+}
+
+static DEFINE_PER_CPU(int, rb_need_resched);
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags (not touched by this
+ *	   implementation)
+ *
+ * Returns a reserved event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ *
+ * NULL is returned when recording is disabled on the buffer or on this
+ * CPU's buffer, when this CPU is not part of the buffer, when the event
+ * would not fit into one buffer page, or when no space could be reserved.
+ * On success preemption stays disabled until the matching commit.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+			 unsigned long length,
+			 unsigned long *flags)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+	int cpu, resched;
+
+	if (atomic_read(&buffer->record_disabled))
+		return NULL;
+
+	/* If we are tracing schedule, we don't want to recurse */
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		goto out;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	if (atomic_read(&cpu_buffer->record_disabled))
+		goto out;
+
+	/* from here on 'length' is the full event length, header included */
+	length = rb_calculate_event_length(length);
+	if (length > BUF_PAGE_SIZE)
+		goto out;
+
+	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+	if (!event)
+		goto out;
+
+	/*
+	 * Need to store resched state on this cpu.
+	 * Only the first needs to.
+	 */
+
+	if (preempt_count() == 1)
+		per_cpu(rb_need_resched, cpu) = resched;
+
+	return event;
+
+ out:
+	/*
+	 * If a reschedule was already pending when we came in, re-enable
+	 * preemption without rescheduling, so that tracing the scheduler
+	 * does not recurse (same handling as in ring_buffer_write()).
+	 */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+	return NULL;
+}
+
+/*
+ * Account @event as written.  When @event is the commit, also fold its
+ * time delta into write_stamp and move the commit up to the write
+ * position.
+ */
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+		      struct ring_buffer_event *event)
+{
+	cpu_buffer->entries++;
+
+	/* Nothing more to do unless we own the commit */
+	if (rb_is_commit(cpu_buffer, event)) {
+		cpu_buffer->write_stamp += event->time_delta;
+		rb_set_commit_to_write(cpu_buffer);
+	}
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved event
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer and re-enables the preemption
+ * that ring_buffer_lock_reserve disabled.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ *
+ * Always returns 0.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+			      struct ring_buffer_event *event,
+			      unsigned long flags)
+{
+	int cpu = raw_smp_processor_id();
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+
+	rb_commit(cpu_buffer, event);
+
+	/*
+	 * A reschedule is only allowed from the outermost level, and only
+	 * when none was pending at the time of the reserve; every other
+	 * case re-enables preemption without rescheduling.
+	 */
+	if (preempt_count() == 1 && !per_cpu(rb_need_resched, cpu))
+		preempt_enable_notrace();
+	else
+		preempt_enable_no_resched_notrace();
+
+	return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ *
+ * Returns 0 on success, or -EBUSY when recording is disabled, this CPU is
+ * not part of the buffer, the event would not fit into one buffer page,
+ * or no space could be reserved.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+		      unsigned long length,
+		      void *data)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+	unsigned long event_length;
+	void *body;
+	int ret = -EBUSY;
+	int cpu, resched;
+
+	if (atomic_read(&buffer->record_disabled))
+		return -EBUSY;
+
+	/* remember whether a reschedule was pending before we got here */
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		goto out;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	if (atomic_read(&cpu_buffer->record_disabled))
+		goto out;
+
+	event_length = rb_calculate_event_length(length);
+	/*
+	 * An event has to fit into a single buffer page.  Refuse anything
+	 * bigger, exactly as ring_buffer_lock_reserve() does: such a
+	 * reservation overruns every page it is tried on and would only
+	 * keep being retried.
+	 */
+	if (event_length > BUF_PAGE_SIZE)
+		goto out;
+
+	event = rb_reserve_next_event(cpu_buffer,
+				      RINGBUF_TYPE_DATA, event_length);
+	if (!event)
+		goto out;
+
+	body = rb_event_data(event);
+
+	memcpy(body, data, length);
+
+	rb_commit(cpu_buffer, event);
+
+	ret = 0;
+ out:
+	/* do not reschedule on the way out if one was already pending */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+
+	return ret;
+}
+
+/*
+ * A cpu buffer is empty when the reader page has been read up to its
+ * commit and, in addition, either the commit is on the reader page or
+ * the commit is on the head page and that page has been read up to the
+ * commit as well.  Returns non-zero when empty.
+ */
+static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *reader = cpu_buffer->reader_page;
+	struct buffer_page *head = cpu_buffer->head_page;
+	struct buffer_page *commit = cpu_buffer->commit_page;
+
+	/* unread data left on the reader page */
+	if (reader->read != rb_page_commit(reader))
+		return 0;
+
+	if (commit == reader)
+		return 1;
+
+	return commit == head && head->read == rb_page_commit(commit);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * Bumps the buffer's record_disabled count. While the count is non-zero,
+ * any attempt to write to the buffer fails and returns NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+	atomic_t *disabled = &buffer->record_disabled;
+
+	atomic_inc(disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Drops the buffer's record_disabled count by one. Note, multiple
+ * disables will need the same number of enables to truly enable the
+ * writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+	atomic_t *disabled = &buffer->record_disabled;
+
+	atomic_dec(disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * Bumps the record_disabled count of that CPU's buffer. While the count
+ * is non-zero, any attempt to write to it fails and returns NULL.
+ * Nothing is done when @cpu is not part of the buffer.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+	if (cpu_isset(cpu, buffer->cpumask))
+		atomic_inc(&buffer->buffers[cpu]->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Drops the record_disabled count of that CPU's buffer by one. Note,
+ * multiple disables will need the same number of enables to truly
+ * enable the writing (much like preempt_disable). Nothing is done when
+ * @cpu is not part of the buffer.
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+	if (cpu_isset(cpu, buffer->cpumask))
+		atomic_dec(&buffer->buffers[cpu]->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ *
+ * Returns the entry count of that CPU's buffer, or 0 when @cpu is not
+ * part of the buffer.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+	if (cpu_isset(cpu, buffer->cpumask))
+		return buffer->buffers[cpu]->entries;
+
+	return 0;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ *
+ * Returns the overrun count of that CPU's buffer, or 0 when @cpu is not
+ * part of the buffer.
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+	if (cpu_isset(cpu, buffer->cpumask))
+		return buffer->buffers[cpu]->overrun;
+
+	return 0;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (summed over all CPU buffers)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	/* unlocked sum: lock the buffer if you need an exact value */
+	for_each_buffer_cpu(buffer, cpu)
+		total += buffer->buffers[cpu]->entries;
+
+	return total;
+}
+
+/**
+ * ring_buffer_overruns - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (summed over all CPU buffers)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	/* unlocked sum: lock the buffer if you need an exact value */
+	for_each_buffer_cpu(buffer, cpu)
+		total += buffer->buffers[cpu]->overrun;
+
+	return total;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again: on the head page when the reader page's list is empty,
+ * otherwise on the reader page, at that page's current read offset.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	struct buffer_page *start;
+
+	/* Iterator usage is expected to have record disabled */
+	if (list_empty(&cpu_buffer->reader_page->list))
+		start = cpu_buffer->head_page;
+	else
+		start = cpu_buffer->reader_page;
+
+	iter->head_page = start;
+	iter->head = start->read;
+
+	/*
+	 * Part way into a page the running read stamp applies; at the
+	 * very start of a page the page's own time stamp does.
+	 */
+	if (iter->head == 0)
+		iter->read_stamp = iter->head_page->time_stamp;
+	else
+		iter->read_stamp = cpu_buffer->read_stamp;
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ *
+ * Returns non-zero when the iterator sits exactly at the commit
+ * position of its cpu buffer.
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+	if (iter->head_page != cpu_buffer->commit_page)
+		return 0;
+
+	return iter->head == rb_commit_index(cpu_buffer);
+}
+
+/*
+ * Advance the read_stamp of @cpu_buffer by the time information carried
+ * in @event.  An unknown event type hits BUG().
+ */
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+		     struct ring_buffer_event *event)
+{
+	u64 extend;
+
+	switch (event->type) {
+	case RINGBUF_TYPE_PADDING:
+		/* padding does not move the stamp */
+		break;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		/* array[0] holds the bits above TS_SHIFT, time_delta the rest */
+		extend = event->array[0];
+		extend = (extend << TS_SHIFT) + event->time_delta;
+		cpu_buffer->read_stamp += extend;
+		break;
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		/* FIXME: not implemented */
+		break;
+
+	case RINGBUF_TYPE_DATA:
+		cpu_buffer->read_stamp += event->time_delta;
+		break;
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Advance the read_stamp of @iter by the time information carried in
+ * @event.  An unknown event type hits BUG().
+ */
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+			  struct ring_buffer_event *event)
+{
+	u64 extend;
+
+	switch (event->type) {
+	case RINGBUF_TYPE_PADDING:
+		/* padding does not move the stamp */
+		break;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		/* array[0] holds the bits above TS_SHIFT, time_delta the rest */
+		extend = event->array[0];
+		extend = (extend << TS_SHIFT) + event->time_delta;
+		iter->read_stamp += extend;
+		break;
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		/* FIXME: not implemented */
+		break;
+
+	case RINGBUF_TYPE_DATA:
+		iter->read_stamp += event->time_delta;
+		break;
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Return the page a consuming reader should read from next.
+ *
+ * If the current reader page still has unread committed data it is
+ * returned as is.  Otherwise, unless the commit is on the reader page
+ * (nothing left to read: NULL is returned), the exhausted reader page is
+ * spliced into the page list in place of the head page, the old head
+ * page becomes the new reader page, and the check is repeated.
+ *
+ * Runs entirely under cpu_buffer->lock with interrupts disabled.
+ */
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *reader = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cpu_buffer->lock, flags);
+
+ again:
+ reader = cpu_buffer->reader_page;
+
+ /* If there's more to read, return this page */
+ if (cpu_buffer->reader_page->read < rb_page_size(reader))
+ goto out;
+
+ /* Never should we have an index greater than the size */
+ RB_WARN_ON(cpu_buffer,
+ cpu_buffer->reader_page->read > rb_page_size(reader));
+
+ /* check if we caught up to the tail */
+ reader = NULL;
+ if (cpu_buffer->commit_page == cpu_buffer->reader_page)
+ goto out;
+
+ /*
+ * Splice the empty reader page into the list around the head.
+ * Reset the reader page to size zero.
+ */
+
+ reader = cpu_buffer->head_page;
+ /* the old reader page takes over the head page's neighbours ... */
+ cpu_buffer->reader_page->list.next = reader->list.next;
+ cpu_buffer->reader_page->list.prev = reader->list.prev;
+
+ local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->commit, 0);
+
+ /* Make the reader page now replace the head */
+ reader->list.prev->next = &cpu_buffer->reader_page->list;
+ reader->list.next->prev = &cpu_buffer->reader_page->list;
+
+ /*
+ * If the tail is on the reader, then we must set the head
+ * to the inserted page, otherwise we set it one before.
+ */
+ cpu_buffer->head_page = cpu_buffer->reader_page;
+
+ if (cpu_buffer->commit_page != reader)
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+
+ /* Finally update the reader page to the new head */
+ cpu_buffer->reader_page = reader;
+ rb_reset_reader_page(cpu_buffer);
+
+ /* re-run the checks against the freshly swapped-in reader page */
+ goto again;
+
+ out:
+ spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+
+ return reader;
+}
+
+/*
+ * Consume the event at the current reader position: drop it from the
+ * entry count if it is a data event, fold its time information into
+ * read_stamp and move the reader page's read offset past it.
+ */
+static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *reader = rb_get_reader_page(cpu_buffer);
+	struct ring_buffer_event *event;
+
+	/* Calling this on an empty buffer is a bug */
+	BUG_ON(!reader);
+
+	event = rb_reader_event(cpu_buffer);
+
+	if (event->type == RINGBUF_TYPE_DATA)
+		cpu_buffer->entries--;
+
+	rb_update_read_stamp(cpu_buffer, event);
+
+	cpu_buffer->reader_page->read += rb_event_length(event);
+}
+
+/*
+ * Move @iter past the event it currently points at.
+ *
+ * When the iterator is already at (or beyond) the end of its page it is
+ * simply moved to the next page.  Otherwise the event's time information
+ * is folded into the iterator's read_stamp and the offset is advanced by
+ * the event length; if that reaches the end of a page that is not the
+ * commit page, the function calls itself once more to step onto the next
+ * page.
+ */
+static void rb_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (iter->head >= rb_page_size(iter->head_page)) {
+ /* being at the end of the commit page means there is nothing to advance to */
+ BUG_ON(iter->head_page == cpu_buffer->commit_page);
+ rb_inc_iter(iter);
+ return;
+ }
+
+ event = rb_iter_head_event(iter);
+
+ length = rb_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
+ (iter->head + length > rb_commit_index(cpu_buffer)));
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ if ((iter->head >= rb_page_size(iter->head_page)) &&
+ (iter->head_page != cpu_buffer->commit_page))
+ rb_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peek at
+ * @ts: The timestamp counter of this event (may be NULL).
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data. Time extend and time stamp events found in
+ * front of it are consumed on the way.
+ *
+ * Returns NULL when @cpu is not part of the buffer, when there is no
+ * page left to read, or when a padding event is found at the reader
+ * position (which also triggers a warning).
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ struct buffer_page *reader;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ reader = rb_get_reader_page(cpu_buffer);
+ if (!reader)
+ return NULL;
+
+ event = rb_reader_event(cpu_buffer);
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ /* padding is not expected at the reader position: warn, skip it, give up */
+ RB_WARN_ON(cpu_buffer, 1);
+ rb_advance_reader(cpu_buffer);
+ return NULL;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* Internal data, OK to advance */
+ rb_advance_reader(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_reader(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_DATA:
+ if (ts) {
+ /* read_stamp itself is left alone; only the reported value includes this delta */
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event (may be NULL).
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator past it. Padding, time extend and time
+ * stamp events in front of it are stepped over, which does move the
+ * iterator.
+ *
+ * Returns NULL when the iterator or the cpu buffer is empty.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = rb_iter_head_event(iter);
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ /* padding: continue on the next page */
+ rb_inc_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* Internal data, OK to advance */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_DATA:
+ if (ts) {
+ /* the iterator's read_stamp is left alone; only the reported value includes this delta */
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ * @cpu: The cpu buffer to consume from
+ * @ts: The timestamp counter of the event (passed on to ring_buffer_peek)
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ *
+ * Returns NULL when @cpu is not part of the buffer or when
+ * ring_buffer_peek finds nothing.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+	struct ring_buffer_event *event;
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		return NULL;
+
+	event = ring_buffer_peek(buffer, cpu, ts);
+	if (event)
+		rb_advance_reader(buffer->buffers[cpu]);
+
+	return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Returns the new iterator, or NULL when @cpu is not part of the buffer
+ * or the iterator could not be allocated.
+ *
+ * Must be paired with ring_buffer_read_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_iter *iter;
+	unsigned long flags;
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		return NULL;
+
+	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+	if (iter == NULL)
+		return NULL;
+
+	cpu_buffer = buffer->buffers[cpu];
+	iter->cpu_buffer = cpu_buffer;
+
+	/* stop recording on this cpu buffer and wait before touching it */
+	atomic_inc(&cpu_buffer->record_disabled);
+	synchronize_sched();
+
+	/* position the iterator under the buffer lock */
+	spin_lock_irqsave(&cpu_buffer->lock, flags);
+	ring_buffer_iter_reset(iter);
+	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+
+	return iter;
+}
+
+/**
+ * ring_buffer_read_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_read_start
+ *
+ * This drops the record_disabled count taken by ring_buffer_read_start,
+ * and frees the iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+	atomic_dec(&iter->cpu_buffer->record_disabled);
+	kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ * Returns NULL, leaving the iterator where ring_buffer_iter_peek left it,
+ * when there is no event to read.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+	struct ring_buffer_event *event = ring_buffer_iter_peek(iter, ts);
+
+	if (event)
+		rb_advance_iter(iter);
+
+	return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ *
+ * Computed as BUF_PAGE_SIZE times the buffer's page count.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+	unsigned long bytes = BUF_PAGE_SIZE * buffer->pages;
+
+	return bytes;
+}
+
+/*
+ * Put @cpu_buffer back into its empty state: head, tail and commit all
+ * point at the first page of the list, the reader page is detached and
+ * cleared, and the overrun and entry counters are zeroed.
+ */
+static void
+rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *first =
+		list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+	struct buffer_page *reader = cpu_buffer->reader_page;
+
+	/* the first page of the list becomes the head and is emptied */
+	cpu_buffer->head_page = first;
+	local_set(&first->write, 0);
+	local_set(&first->commit, 0);
+
+	first->read = 0;
+
+	cpu_buffer->tail_page = first;
+	cpu_buffer->commit_page = first;
+
+	/* the reader page stands alone again, with nothing on it */
+	INIT_LIST_HEAD(&reader->list);
+	local_set(&reader->write, 0);
+	local_set(&reader->commit, 0);
+	reader->read = 0;
+
+	cpu_buffer->overrun = 0;
+	cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ *
+ * Does nothing when @cpu is not part of the buffer. The reset itself
+ * runs under the cpu buffer's lock with interrupts disabled.
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		return;
+
+	/* only index buffers[] once @cpu is known to belong to the buffer */
+	cpu_buffer = buffer->buffers[cpu];
+
+	spin_lock_irqsave(&cpu_buffer->lock, flags);
+
+	rb_reset_cpu(cpu_buffer);
+
+	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+}
+
+/**
+ * ring_buffer_reset - reset a ring buffer
+ * @buffer: The ring buffer to reset all cpu buffers
+ *
+ * Calls ring_buffer_reset_cpu for every CPU of the buffer.
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+	int cpu;
+
+	for_each_buffer_cpu(buffer, cpu) {
+		ring_buffer_reset_cpu(buffer, cpu);
+	}
+}
+
+/**
+ * ring_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ *
+ * Returns 1 when every CPU buffer is empty, 0 as soon as one is not.
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+	int cpu;
+
+	/* racy by design: lock the buffer if you cannot live with that */
+	for_each_buffer_cpu(buffer, cpu) {
+		if (!rb_per_cpu_empty(buffer->buffers[cpu]))
+			return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ *
+ * A CPU that is not part of the buffer is reported as empty (1).
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+	if (cpu_isset(cpu, buffer->cpumask))
+		return rb_per_cpu_empty(buffer->buffers[cpu]);
+
+	return 1;
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ * @cpu: The CPU whose per cpu buffers are exchanged
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ *
+ * Returns 0 on success, or -EINVAL when @cpu is missing from either
+ * buffer or the two buffers differ in size or page count.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ if (!cpu_isset(cpu, buffer_a->cpumask) ||
+ !cpu_isset(cpu, buffer_b->cpumask))
+ return -EINVAL;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ /*
+ * We can't do a synchronize_sched here because this
+ * function can be called in atomic context.
+ * Normally this will be called from the same CPU as cpu.
+ * If not it's up to the caller to protect this.
+ */
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ /* exchange the per cpu buffers between the two ring buffers ... */
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ /* ... and point each per cpu buffer back at its new owner */
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f3fb3d..d345d64 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -14,6 +14,7 @@
#include <linux/utsrelease.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
+#include <linux/notifier.h>
#include <linux/debugfs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
@@ -22,6 +23,7 @@
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
+#include <linux/kdebug.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/poll.h>
@@ -31,25 +33,36 @@
#include <linux/writeback.h>
#include <linux/stacktrace.h>
+#include <linux/ring_buffer.h>
#include "trace.h"
+#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
+
unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
unsigned long __read_mostly tracing_thresh;
-static unsigned long __read_mostly tracing_nr_buffers;
+static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+
+static inline void ftrace_disable_cpu(void)
+{
+ preempt_disable();
+ local_inc(&__get_cpu_var(ftrace_cpu_disabled));
+}
+
+static inline void ftrace_enable_cpu(void)
+{
+ local_dec(&__get_cpu_var(ftrace_cpu_disabled));
+ preempt_enable();
+}
+
static cpumask_t __read_mostly tracing_buffer_mask;
#define for_each_tracing_cpu(cpu) \
for_each_cpu_mask(cpu, tracing_buffer_mask)
-static int trace_alloc_page(void);
-static int trace_free_page(void);
-
static int tracing_disabled = 1;
-static unsigned long tracing_pages_allocated;
-
long
ns2usecs(cycle_t nsec)
{
@@ -60,7 +73,9 @@
cycle_t ftrace_now(int cpu)
{
- return cpu_clock(cpu);
+ u64 ts = ring_buffer_time_stamp(cpu);
+ ring_buffer_normalize_time_stamp(cpu, &ts);
+ return ts;
}
/*
@@ -100,11 +115,18 @@
int ftrace_function_enabled;
/*
- * trace_nr_entries is the number of entries that is allocated
- * for a buffer. Note, the number of entries is always rounded
- * to ENTRIES_PER_PAGE.
+ * trace_buf_size is the size in bytes that is allocated
+ * for a buffer. Note, the number of bytes is always rounded
+ * to page size.
+ *
+ * This number is purposely set low: room for 16384 entries.
+ * If the dump on oops happens, it will be much appreciated
+ * to not have to wait for all that output. Anyway, this can be
+ * configured at both boot time and run time.
*/
-static unsigned long trace_nr_entries = 65536UL;
+#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */
+
+static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
/* trace_types holds a link list of available tracers. */
static struct tracer *trace_types __read_mostly;
@@ -133,24 +155,6 @@
/* trace_flags holds iter_ctrl options */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
-static notrace void no_trace_init(struct trace_array *tr)
-{
- int cpu;
-
- ftrace_function_enabled = 0;
- if(tr->ctrl)
- for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
- tracer_enabled = 0;
-}
-
-/* dummy trace to disable tracing */
-static struct tracer no_tracer __read_mostly = {
- .name = "none",
- .init = no_trace_init
-};
-
-
/**
* trace_wake_up - wake up tasks waiting for trace input
*
@@ -167,23 +171,21 @@
wake_up(&trace_wait);
}
-#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
-
-static int __init set_nr_entries(char *str)
+static int __init set_buf_size(char *str)
{
- unsigned long nr_entries;
+ unsigned long buf_size;
int ret;
if (!str)
return 0;
- ret = strict_strtoul(str, 0, &nr_entries);
+ ret = strict_strtoul(str, 0, &buf_size);
/* nr_entries can not be zero */
- if (ret < 0 || nr_entries == 0)
+ if (ret < 0 || buf_size == 0)
return 0;
- trace_nr_entries = nr_entries;
+ trace_buf_size = buf_size;
return 1;
}
-__setup("trace_entries=", set_nr_entries);
+__setup("trace_buf_size=", set_buf_size);
unsigned long nsecs_to_usecs(unsigned long nsecs)
{
@@ -191,21 +193,6 @@
}
/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * NEED_RESCED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- */
-enum trace_flag_type {
- TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_NEED_RESCHED = 0x02,
- TRACE_FLAG_HARDIRQ = 0x04,
- TRACE_FLAG_SOFTIRQ = 0x08,
-};
-
-/*
* TRACE_ITER_SYM_MASK masks the options in trace_flags that
* control the output of kernel symbols.
*/
@@ -224,6 +211,7 @@
"block",
"stacktrace",
"sched-tree",
+ "ftrace_printk",
NULL
};
@@ -266,54 +254,6 @@
tracing_record_cmdline(current);
}
-#define CHECK_COND(cond) \
- if (unlikely(cond)) { \
- tracing_disabled = 1; \
- WARN_ON(1); \
- return -1; \
- }
-
-/**
- * check_pages - integrity check of trace buffers
- *
- * As a safty measure we check to make sure the data pages have not
- * been corrupted.
- */
-int check_pages(struct trace_array_cpu *data)
-{
- struct page *page, *tmp;
-
- CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
- CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
-
- list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
- CHECK_COND(page->lru.next->prev != &page->lru);
- CHECK_COND(page->lru.prev->next != &page->lru);
- }
-
- return 0;
-}
-
-/**
- * head_page - page address of the first page in per_cpu buffer.
- *
- * head_page returns the page address of the first page in
- * a per_cpu buffer. This also preforms various consistency
- * checks to make sure the buffer has not been corrupted.
- */
-void *head_page(struct trace_array_cpu *data)
-{
- struct page *page;
-
- if (list_empty(&data->trace_pages))
- return NULL;
-
- page = list_entry(data->trace_pages.next, struct page, lru);
- BUG_ON(&page->lru == &data->trace_pages);
-
- return page_address(page);
-}
-
/**
* trace_seq_printf - sequence printing of trace information
* @s: trace sequence descriptor
@@ -395,28 +335,23 @@
return len;
}
-#define HEX_CHARS 17
-static const char hex2asc[] = "0123456789abcdef";
+#define MAX_MEMHEX_BYTES 8
+#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
static int
trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
{
unsigned char hex[HEX_CHARS];
unsigned char *data = mem;
- unsigned char byte;
int i, j;
- BUG_ON(len >= HEX_CHARS);
-
#ifdef __BIG_ENDIAN
for (i = 0, j = 0; i < len; i++) {
#else
for (i = len-1, j = 0; i >= 0; i--) {
#endif
- byte = data[i];
-
- hex[j++] = hex2asc[byte & 0x0f];
- hex[j++] = hex2asc[byte >> 4];
+ hex[j++] = hex_asc_hi(data[i]);
+ hex[j++] = hex_asc_lo(data[i]);
}
hex[j++] = ' ';
@@ -460,34 +395,6 @@
trace_seq_reset(s);
}
-/*
- * flip the trace buffers between two trace descriptors.
- * This usually is the buffers between the global_trace and
- * the max_tr to record a snapshot of a current trace.
- *
- * The ftrace_max_lock must be held.
- */
-static void
-flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
-{
- struct list_head flip_pages;
-
- INIT_LIST_HEAD(&flip_pages);
-
- memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
- sizeof(struct trace_array_cpu) -
- offsetof(struct trace_array_cpu, trace_head_idx));
-
- check_pages(tr1);
- check_pages(tr2);
- list_splice_init(&tr1->trace_pages, &flip_pages);
- list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
- list_splice_init(&flip_pages, &tr2->trace_pages);
- BUG_ON(!list_empty(&flip_pages));
- check_pages(tr1);
- check_pages(tr2);
-}
-
/**
* update_max_tr - snapshot all trace buffers from global_trace to max_tr
* @tr: tracer
@@ -500,17 +407,17 @@
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct trace_array_cpu *data;
- int i;
+ struct ring_buffer *buf = tr->buffer;
WARN_ON_ONCE(!irqs_disabled());
__raw_spin_lock(&ftrace_max_lock);
- /* clear out all the previous traces */
- for_each_tracing_cpu(i) {
- data = tr->data[i];
- flip_trace(max_tr.data[i], data);
- tracing_reset(data);
- }
+
+ tr->buffer = max_tr.buffer;
+ max_tr.buffer = buf;
+
+ ftrace_disable_cpu();
+ ring_buffer_reset(tr->buffer);
+ ftrace_enable_cpu();
__update_max_tr(tr, tsk, cpu);
__raw_spin_unlock(&ftrace_max_lock);
@@ -527,16 +434,19 @@
void
update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct trace_array_cpu *data = tr->data[cpu];
- int i;
+ int ret;
WARN_ON_ONCE(!irqs_disabled());
__raw_spin_lock(&ftrace_max_lock);
- for_each_tracing_cpu(i)
- tracing_reset(max_tr.data[i]);
- flip_trace(max_tr.data[cpu], data);
- tracing_reset(data);
+ ftrace_disable_cpu();
+
+ ring_buffer_reset(max_tr.buffer);
+ ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
+
+ ftrace_enable_cpu();
+
+ WARN_ON_ONCE(ret);
__update_max_tr(tr, tsk, cpu);
__raw_spin_unlock(&ftrace_max_lock);
@@ -573,7 +483,6 @@
#ifdef CONFIG_FTRACE_STARTUP_TEST
if (type->selftest) {
struct tracer *saved_tracer = current_trace;
- struct trace_array_cpu *data;
struct trace_array *tr = &global_trace;
int saved_ctrl = tr->ctrl;
int i;
@@ -585,10 +494,7 @@
* If we fail, we do not register this tracer.
*/
for_each_tracing_cpu(i) {
- data = tr->data[i];
- if (!head_page(data))
- continue;
- tracing_reset(data);
+ tracing_reset(tr, i);
}
current_trace = type;
tr->ctrl = 0;
@@ -604,10 +510,7 @@
}
/* Only reset on passing, to avoid touching corrupted buffers */
for_each_tracing_cpu(i) {
- data = tr->data[i];
- if (!head_page(data))
- continue;
- tracing_reset(data);
+ tracing_reset(tr, i);
}
printk(KERN_CONT "PASSED\n");
}
@@ -653,13 +556,11 @@
mutex_unlock(&trace_types_lock);
}
-void tracing_reset(struct trace_array_cpu *data)
+void tracing_reset(struct trace_array *tr, int cpu)
{
- data->trace_idx = 0;
- data->overrun = 0;
- data->trace_head = data->trace_tail = head_page(data);
- data->trace_head_idx = 0;
- data->trace_tail_idx = 0;
+ ftrace_disable_cpu();
+ ring_buffer_reset_cpu(tr->buffer, cpu);
+ ftrace_enable_cpu();
}
#define SAVED_CMDLINES 128
@@ -745,82 +646,16 @@
trace_save_cmdline(tsk);
}
-static inline struct list_head *
-trace_next_list(struct trace_array_cpu *data, struct list_head *next)
-{
- /*
- * Roundrobin - but skip the head (which is not a real page):
- */
- next = next->next;
- if (unlikely(next == &data->trace_pages))
- next = next->next;
- BUG_ON(next == &data->trace_pages);
-
- return next;
-}
-
-static inline void *
-trace_next_page(struct trace_array_cpu *data, void *addr)
-{
- struct list_head *next;
- struct page *page;
-
- page = virt_to_page(addr);
-
- next = trace_next_list(data, &page->lru);
- page = list_entry(next, struct page, lru);
-
- return page_address(page);
-}
-
-static inline struct trace_entry *
-tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
-{
- unsigned long idx, idx_next;
- struct trace_entry *entry;
-
- data->trace_idx++;
- idx = data->trace_head_idx;
- idx_next = idx + 1;
-
- BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
-
- entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
-
- if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
- data->trace_head = trace_next_page(data, data->trace_head);
- idx_next = 0;
- }
-
- if (data->trace_head == data->trace_tail &&
- idx_next == data->trace_tail_idx) {
- /* overrun */
- data->overrun++;
- data->trace_tail_idx++;
- if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
- data->trace_tail =
- trace_next_page(data, data->trace_tail);
- data->trace_tail_idx = 0;
- }
- }
-
- data->trace_head_idx = idx_next;
-
- return entry;
-}
-
-static inline void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
+void
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
+ int pc)
{
struct task_struct *tsk = current;
- unsigned long pc;
- pc = preempt_count();
-
- entry->preempt_count = pc & 0xff;
- entry->pid = (tsk) ? tsk->pid : 0;
- entry->t = ftrace_now(raw_smp_processor_id());
- entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+ entry->preempt_count = pc & 0xff;
+ entry->pid = (tsk) ? tsk->pid : 0;
+ entry->flags =
+ (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
@@ -828,119 +663,110 @@
void
trace_function(struct trace_array *tr, struct trace_array_cpu *data,
- unsigned long ip, unsigned long parent_ip, unsigned long flags)
+ unsigned long ip, unsigned long parent_ip, unsigned long flags,
+ int pc)
{
- struct trace_entry *entry;
+ struct ring_buffer_event *event;
+ struct ftrace_entry *entry;
unsigned long irq_flags;
- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, flags);
- entry->type = TRACE_FN;
- entry->fn.ip = ip;
- entry->fn.parent_ip = parent_ip;
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
+ /* If we are reading the ring buffer, don't trace */
+ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ return;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_FN;
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
}
void
ftrace(struct trace_array *tr, struct trace_array_cpu *data,
- unsigned long ip, unsigned long parent_ip, unsigned long flags)
+ unsigned long ip, unsigned long parent_ip, unsigned long flags,
+ int pc)
{
if (likely(!atomic_read(&data->disabled)))
- trace_function(tr, data, ip, parent_ip, flags);
+ trace_function(tr, data, ip, parent_ip, flags, pc);
}
-#ifdef CONFIG_MMIOTRACE
-void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
- struct mmiotrace_rw *rw)
+static void ftrace_trace_stack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags,
+ int skip, int pc)
{
- struct trace_entry *entry;
+ struct ring_buffer_event *event;
+ struct stack_entry *entry;
+ struct stack_trace trace;
unsigned long irq_flags;
- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
+ if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ return;
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, 0);
- entry->type = TRACE_MMIO_RW;
- entry->mmiorw = *rw;
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_STACK;
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
+ memset(&entry->caller, 0, sizeof(entry->caller));
- trace_wake_up();
+ trace.nr_entries = 0;
+ trace.max_entries = FTRACE_STACK_ENTRIES;
+ trace.skip = skip;
+ trace.entries = entry->caller;
+
+ save_stack_trace(&trace);
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
}
-void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
- struct mmiotrace_map *map)
-{
- struct trace_entry *entry;
- unsigned long irq_flags;
-
- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
-
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, 0);
- entry->type = TRACE_MMIO_MAP;
- entry->mmiomap = *map;
-
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
-
- trace_wake_up();
-}
-#endif
-
void __trace_stack(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long flags,
int skip)
{
- struct trace_entry *entry;
- struct stack_trace trace;
+ ftrace_trace_stack(tr, data, flags, skip, preempt_count());
+}
- if (!(trace_flags & TRACE_ITER_STACKTRACE))
+static void
+ftrace_trace_special(void *__tr, void *__data,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3,
+ int pc)
+{
+ struct ring_buffer_event *event;
+ struct trace_array_cpu *data = __data;
+ struct trace_array *tr = __tr;
+ struct special_entry *entry;
+ unsigned long irq_flags;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, pc);
+ entry->ent.type = TRACE_SPECIAL;
+ entry->arg1 = arg1;
+ entry->arg2 = arg2;
+ entry->arg3 = arg3;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ ftrace_trace_stack(tr, data, irq_flags, 4, pc);
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, flags);
- entry->type = TRACE_STACK;
-
- memset(&entry->stack, 0, sizeof(entry->stack));
-
- trace.nr_entries = 0;
- trace.max_entries = FTRACE_STACK_ENTRIES;
- trace.skip = skip;
- trace.entries = entry->stack.caller;
-
- save_stack_trace(&trace);
+ trace_wake_up();
}
void
__trace_special(void *__tr, void *__data,
unsigned long arg1, unsigned long arg2, unsigned long arg3)
{
- struct trace_array_cpu *data = __data;
- struct trace_array *tr = __tr;
- struct trace_entry *entry;
- unsigned long irq_flags;
-
- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, 0);
- entry->type = TRACE_SPECIAL;
- entry->special.arg1 = arg1;
- entry->special.arg2 = arg2;
- entry->special.arg3 = arg3;
- __trace_stack(tr, data, irq_flags, 4);
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
-
- trace_wake_up();
+ ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count());
}
void
@@ -948,25 +774,28 @@
struct trace_array_cpu *data,
struct task_struct *prev,
struct task_struct *next,
- unsigned long flags)
+ unsigned long flags, int pc)
{
- struct trace_entry *entry;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
unsigned long irq_flags;
- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, flags);
- entry->type = TRACE_CTX;
- entry->ctx.prev_pid = prev->pid;
- entry->ctx.prev_prio = prev->prio;
- entry->ctx.prev_state = prev->state;
- entry->ctx.next_pid = next->pid;
- entry->ctx.next_prio = next->prio;
- entry->ctx.next_state = next->state;
- __trace_stack(tr, data, flags, 5);
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_CTX;
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ ftrace_trace_stack(tr, data, flags, 5, pc);
}
void
@@ -974,25 +803,28 @@
struct trace_array_cpu *data,
struct task_struct *wakee,
struct task_struct *curr,
- unsigned long flags)
+ unsigned long flags, int pc)
{
- struct trace_entry *entry;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
unsigned long irq_flags;
- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, flags);
- entry->type = TRACE_WAKE;
- entry->ctx.prev_pid = curr->pid;
- entry->ctx.prev_prio = curr->prio;
- entry->ctx.prev_state = curr->state;
- entry->ctx.next_pid = wakee->pid;
- entry->ctx.next_prio = wakee->prio;
- entry->ctx.next_state = wakee->state;
- __trace_stack(tr, data, flags, 6);
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_WAKE;
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ ftrace_trace_stack(tr, data, flags, 6, pc);
trace_wake_up();
}
@@ -1002,23 +834,21 @@
{
struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
int cpu;
+ int pc;
- if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
+ if (tracing_disabled || !tr->ctrl)
return;
- local_irq_save(flags);
+ pc = preempt_count();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1))
- __trace_special(tr, data, arg1, arg2, arg3);
+ if (likely(!atomic_read(&data->disabled)))
+ ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ preempt_enable_notrace();
}
#ifdef CONFIG_FTRACE
@@ -1029,7 +859,8 @@
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
- int cpu;
+ int cpu, resched;
+ int pc;
if (unlikely(!ftrace_function_enabled))
return;
@@ -1037,16 +868,22 @@
if (skip_trace(ip))
return;
- local_irq_save(flags);
+ pc = preempt_count();
+ resched = need_resched();
+ preempt_disable_notrace();
+ local_save_flags(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1))
- trace_function(tr, data, ip, parent_ip, flags);
+ trace_function(tr, data, ip, parent_ip, flags, pc);
atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_ops __read_mostly =
@@ -1073,111 +910,96 @@
TRACE_FILE_LAT_FMT = 1,
};
-static struct trace_entry *
-trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
- struct trace_iterator *iter, int cpu)
+static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
{
- struct page *page;
- struct trace_entry *array;
+ /* Don't allow ftrace to trace into the ring buffers */
+ ftrace_disable_cpu();
- if (iter->next_idx[cpu] >= tr->entries ||
- iter->next_idx[cpu] >= data->trace_idx ||
- (data->trace_head == data->trace_tail &&
- data->trace_head_idx == data->trace_tail_idx))
- return NULL;
+ iter->idx++;
+ if (iter->buffer_iter[iter->cpu])
+ ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
- if (!iter->next_page[cpu]) {
- /* Initialize the iterator for this cpu trace buffer */
- WARN_ON(!data->trace_tail);
- page = virt_to_page(data->trace_tail);
- iter->next_page[cpu] = &page->lru;
- iter->next_page_idx[cpu] = data->trace_tail_idx;
- }
-
- page = list_entry(iter->next_page[cpu], struct page, lru);
- BUG_ON(&data->trace_pages == &page->lru);
-
- array = page_address(page);
-
- WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
- return &array[iter->next_page_idx[cpu]];
+ ftrace_enable_cpu();
}
static struct trace_entry *
-find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
{
- struct trace_array *tr = iter->tr;
+ struct ring_buffer_event *event;
+ struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
+
+ /* Don't allow ftrace to trace into the ring buffers */
+ ftrace_disable_cpu();
+
+ if (buf_iter)
+ event = ring_buffer_iter_peek(buf_iter, ts);
+ else
+ event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+
+ ftrace_enable_cpu();
+
+ return event ? ring_buffer_event_data(event) : NULL;
+}
+
+static struct trace_entry *
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+{
+ struct ring_buffer *buffer = iter->tr->buffer;
struct trace_entry *ent, *next = NULL;
+ u64 next_ts = 0, ts;
int next_cpu = -1;
int cpu;
for_each_tracing_cpu(cpu) {
- if (!head_page(tr->data[cpu]))
+
+ if (ring_buffer_empty_cpu(buffer, cpu))
continue;
- ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+
+ ent = peek_next_entry(iter, cpu, &ts);
+
/*
* Pick the entry with the smallest timestamp:
*/
- if (ent && (!next || ent->t < next->t)) {
+ if (ent && (!next || ts < next_ts)) {
next = ent;
next_cpu = cpu;
+ next_ts = ts;
}
}
if (ent_cpu)
*ent_cpu = next_cpu;
+ if (ent_ts)
+ *ent_ts = next_ts;
+
return next;
}
-static void trace_iterator_increment(struct trace_iterator *iter)
+/* Find the next real entry, without updating the iterator itself */
+static struct trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
{
- iter->idx++;
- iter->next_idx[iter->cpu]++;
- iter->next_page_idx[iter->cpu]++;
+ return __find_next_entry(iter, ent_cpu, ent_ts);
+}
- if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
- struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+/* Find the next real entry, and increment the iterator to the next entry */
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+ iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
- iter->next_page_idx[iter->cpu] = 0;
- iter->next_page[iter->cpu] =
- trace_next_list(data, iter->next_page[iter->cpu]);
- }
+ if (iter->ent)
+ trace_iterator_increment(iter, iter->cpu);
+
+ return iter->ent ? iter : NULL;
}
static void trace_consume(struct trace_iterator *iter)
{
- struct trace_array_cpu *data = iter->tr->data[iter->cpu];
-
- data->trace_tail_idx++;
- if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
- data->trace_tail = trace_next_page(data, data->trace_tail);
- data->trace_tail_idx = 0;
- }
-
- /* Check if we empty it, then reset the index */
- if (data->trace_head == data->trace_tail &&
- data->trace_head_idx == data->trace_tail_idx)
- data->trace_idx = 0;
-}
-
-static void *find_next_entry_inc(struct trace_iterator *iter)
-{
- struct trace_entry *next;
- int next_cpu = -1;
-
- next = find_next_entry(iter, &next_cpu);
-
- iter->prev_ent = iter->ent;
- iter->prev_cpu = iter->cpu;
-
- iter->ent = next;
- iter->cpu = next_cpu;
-
- if (next)
- trace_iterator_increment(iter);
-
- return next ? iter : NULL;
+ /* Don't allow ftrace to trace into the ring buffers */
+ ftrace_disable_cpu();
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+ ftrace_enable_cpu();
}
static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1210,7 +1032,7 @@
struct trace_iterator *iter = m->private;
void *p = NULL;
loff_t l = 0;
- int i;
+ int cpu;
mutex_lock(&trace_types_lock);
@@ -1229,14 +1051,15 @@
iter->ent = NULL;
iter->cpu = 0;
iter->idx = -1;
- iter->prev_ent = NULL;
- iter->prev_cpu = -1;
- for_each_tracing_cpu(i) {
- iter->next_idx[i] = 0;
- iter->next_page[i] = NULL;
+ ftrace_disable_cpu();
+
+ for_each_tracing_cpu(cpu) {
+ ring_buffer_iter_reset(iter->buffer_iter[cpu]);
}
+ ftrace_enable_cpu();
+
for (p = iter; p && l < *pos; p = s_next(m, p, &l))
;
@@ -1330,21 +1153,21 @@
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n");
- seq_puts(m, "# / _-----=> irqs-off \n");
- seq_puts(m, "# | / _----=> need-resched \n");
- seq_puts(m, "# || / _---=> hardirq/softirq \n");
- seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| / \n");
- seq_puts(m, "# ||||| delay \n");
- seq_puts(m, "# cmd pid ||||| time | caller \n");
- seq_puts(m, "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _------=> CPU# \n");
+ seq_puts(m, "# / _-----=> irqs-off \n");
+ seq_puts(m, "# | / _----=> need-resched \n");
+ seq_puts(m, "# || / _---=> hardirq/softirq \n");
+ seq_puts(m, "# ||| / _--=> preempt-depth \n");
+ seq_puts(m, "# |||| / \n");
+ seq_puts(m, "# ||||| delay \n");
+ seq_puts(m, "# cmd pid ||||| time | caller \n");
+ seq_puts(m, "# \\ / ||||| \\ | / \n");
}
static void print_func_help_header(struct seq_file *m)
{
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | | |\n");
+ seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | |\n");
}
@@ -1355,23 +1178,16 @@
struct trace_array *tr = iter->tr;
struct trace_array_cpu *data = tr->data[tr->cpu];
struct tracer *type = current_trace;
- unsigned long total = 0;
- unsigned long entries = 0;
- int cpu;
+ unsigned long total;
+ unsigned long entries;
const char *name = "preemption";
if (type)
name = type->name;
- for_each_tracing_cpu(cpu) {
- if (head_page(tr->data[cpu])) {
- total += tr->data[cpu]->trace_idx;
- if (tr->data[cpu]->trace_idx > tr->entries)
- entries += tr->entries;
- else
- entries += tr->data[cpu]->trace_idx;
- }
- }
+ entries = ring_buffer_entries(iter->tr->buffer);
+ total = entries +
+ ring_buffer_overruns(iter->tr->buffer);
seq_printf(m, "%s latency trace v1.1.5 on %s\n",
name, UTS_RELEASE);
@@ -1428,7 +1244,7 @@
comm = trace_find_cmdline(entry->pid);
trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
- trace_seq_printf(s, "%d", cpu);
+ trace_seq_printf(s, "%3d", cpu);
trace_seq_printf(s, "%c%c",
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
@@ -1457,7 +1273,7 @@
unsigned long preempt_mark_thresh = 100;
static void
-lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
+lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
unsigned long rel_usecs)
{
trace_seq_printf(s, " %4lldus", abs_usecs);
@@ -1471,34 +1287,76 @@
static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-static int
+/*
+ * The message is supposed to contain an ending newline.
+ * If the printing stops prematurely, try to add a newline of our own.
+ */
+void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
+{
+ struct trace_entry *ent;
+ struct trace_field_cont *cont;
+ bool ok = true;
+
+ ent = peek_next_entry(iter, iter->cpu, NULL);
+ if (!ent || ent->type != TRACE_CONT) {
+ trace_seq_putc(s, '\n');
+ return;
+ }
+
+ do {
+ cont = (struct trace_field_cont *)ent;
+ if (ok)
+ ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
+
+ ftrace_disable_cpu();
+
+ if (iter->buffer_iter[iter->cpu])
+ ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+ else
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+
+ ftrace_enable_cpu();
+
+ ent = peek_next_entry(iter, iter->cpu, NULL);
+ } while (ent && ent->type == TRACE_CONT);
+
+ if (!ok)
+ trace_seq_putc(s, '\n');
+}
+
+static enum print_line_t
print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
{
struct trace_seq *s = &iter->seq;
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
- struct trace_entry *next_entry = find_next_entry(iter, NULL);
+ struct trace_entry *next_entry;
unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
struct trace_entry *entry = iter->ent;
unsigned long abs_usecs;
unsigned long rel_usecs;
+ u64 next_ts;
char *comm;
int S, T;
int i;
unsigned state;
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
+ next_entry = find_next_entry(iter, NULL, &next_ts);
if (!next_entry)
- next_entry = entry;
- rel_usecs = ns2usecs(next_entry->t - entry->t);
- abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
+ next_ts = iter->ts;
+ rel_usecs = ns2usecs(next_ts - iter->ts);
+ abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
if (verbose) {
comm = trace_find_cmdline(entry->pid);
- trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
+ trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
" %ld.%03ldms (+%ld.%03ldms): ",
comm,
entry->pid, cpu, entry->flags,
entry->preempt_count, trace_idx,
- ns2usecs(entry->t),
+ ns2usecs(iter->ts),
abs_usecs/1000,
abs_usecs % 1000, rel_usecs/1000,
rel_usecs % 1000);
@@ -1507,52 +1365,85 @@
lat_print_timestamp(s, abs_usecs, rel_usecs);
}
switch (entry->type) {
- case TRACE_FN:
- seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_ip_sym(s, field->ip, sym_flags);
trace_seq_puts(s, " (");
- if (kretprobed(entry->fn.parent_ip))
+ if (kretprobed(field->parent_ip))
trace_seq_puts(s, KRETPROBE_MSG);
else
- seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+ seq_print_ip_sym(s, field->parent_ip, sym_flags);
trace_seq_puts(s, ")\n");
break;
+ }
case TRACE_CTX:
- case TRACE_WAKE:
- T = entry->ctx.next_state < sizeof(state_to_char) ?
- state_to_char[entry->ctx.next_state] : 'X';
+ case TRACE_WAKE: {
+ struct ctx_switch_entry *field;
- state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+ trace_assign_type(field, entry);
+
+ T = field->next_state < sizeof(state_to_char) ?
+ state_to_char[field->next_state] : 'X';
+
+ state = field->prev_state ?
+ __ffs(field->prev_state) + 1 : 0;
S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
- comm = trace_find_cmdline(entry->ctx.next_pid);
- trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
- entry->ctx.prev_pid,
- entry->ctx.prev_prio,
+ comm = trace_find_cmdline(field->next_pid);
+ trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
S, entry->type == TRACE_CTX ? "==>" : " +",
- entry->ctx.next_pid,
- entry->ctx.next_prio,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
T, comm);
break;
- case TRACE_SPECIAL:
+ }
+ case TRACE_SPECIAL: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
trace_seq_printf(s, "# %ld %ld %ld\n",
- entry->special.arg1,
- entry->special.arg2,
- entry->special.arg3);
+ field->arg1,
+ field->arg2,
+ field->arg3);
break;
- case TRACE_STACK:
+ }
+ case TRACE_STACK: {
+ struct stack_entry *field;
+
+ trace_assign_type(field, entry);
+
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
if (i)
trace_seq_puts(s, " <= ");
- seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
+ seq_print_ip_sym(s, field->caller[i], sym_flags);
}
trace_seq_puts(s, "\n");
break;
+ }
+ case TRACE_PRINT: {
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_ip_sym(s, field->ip, sym_flags);
+ trace_seq_printf(s, ": %s", field->buf);
+ if (entry->flags & TRACE_FLAG_CONT)
+ trace_seq_print_cont(s, iter);
+ break;
+ }
default:
trace_seq_printf(s, "Unknown type %d\n", entry->type);
}
- return 1;
+ return TRACE_TYPE_HANDLED;
}
-static int print_trace_fmt(struct trace_iterator *iter)
+static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1567,90 +1458,126 @@
entry = iter->ent;
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
comm = trace_find_cmdline(iter->ent->pid);
- t = ns2usecs(entry->t);
+ t = ns2usecs(iter->ts);
usec_rem = do_div(t, 1000000ULL);
secs = (unsigned long)t;
ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
if (!ret)
- return 0;
- ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+ return TRACE_TYPE_PARTIAL_LINE;
+ ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
switch (entry->type) {
- case TRACE_FN:
- ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ ret = seq_print_ip_sym(s, field->ip, sym_flags);
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
- entry->fn.parent_ip) {
+ field->parent_ip) {
ret = trace_seq_printf(s, " <-");
if (!ret)
- return 0;
- if (kretprobed(entry->fn.parent_ip))
+ return TRACE_TYPE_PARTIAL_LINE;
+ if (kretprobed(field->parent_ip))
ret = trace_seq_puts(s, KRETPROBE_MSG);
else
- ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+ ret = seq_print_ip_sym(s,
+ field->parent_ip,
sym_flags);
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
}
ret = trace_seq_printf(s, "\n");
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
break;
+ }
case TRACE_CTX:
- case TRACE_WAKE:
- S = entry->ctx.prev_state < sizeof(state_to_char) ?
- state_to_char[entry->ctx.prev_state] : 'X';
- T = entry->ctx.next_state < sizeof(state_to_char) ?
- state_to_char[entry->ctx.next_state] : 'X';
- ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
- entry->ctx.prev_pid,
- entry->ctx.prev_prio,
+ case TRACE_WAKE: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ S = field->prev_state < sizeof(state_to_char) ?
+ state_to_char[field->prev_state] : 'X';
+ T = field->next_state < sizeof(state_to_char) ?
+ state_to_char[field->next_state] : 'X';
+ ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
+ field->prev_pid,
+ field->prev_prio,
S,
entry->type == TRACE_CTX ? "==>" : " +",
- entry->ctx.next_pid,
- entry->ctx.next_prio,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
T);
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
break;
- case TRACE_SPECIAL:
+ }
+ case TRACE_SPECIAL: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
ret = trace_seq_printf(s, "# %ld %ld %ld\n",
- entry->special.arg1,
- entry->special.arg2,
- entry->special.arg3);
+ field->arg1,
+ field->arg2,
+ field->arg3);
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
break;
- case TRACE_STACK:
+ }
+ case TRACE_STACK: {
+ struct stack_entry *field;
+
+ trace_assign_type(field, entry);
+
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
if (i) {
ret = trace_seq_puts(s, " <= ");
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
}
- ret = seq_print_ip_sym(s, entry->stack.caller[i],
+ ret = seq_print_ip_sym(s, field->caller[i],
sym_flags);
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
}
ret = trace_seq_puts(s, "\n");
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
break;
}
- return 1;
+ case TRACE_PRINT: {
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_ip_sym(s, field->ip, sym_flags);
+ trace_seq_printf(s, ": %s", field->buf);
+ if (entry->flags & TRACE_FLAG_CONT)
+ trace_seq_print_cont(s, iter);
+ break;
+ }
+ }
+ return TRACE_TYPE_HANDLED;
}
-static int print_raw_fmt(struct trace_iterator *iter)
+static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
@@ -1659,47 +1586,77 @@
entry = iter->ent;
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
ret = trace_seq_printf(s, "%d %d %llu ",
- entry->pid, iter->cpu, entry->t);
+ entry->pid, iter->cpu, iter->ts);
if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
switch (entry->type) {
- case TRACE_FN:
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
ret = trace_seq_printf(s, "%x %x\n",
- entry->fn.ip, entry->fn.parent_ip);
+ field->ip,
+ field->parent_ip);
if (!ret)
- return 0;
- break;
- case TRACE_CTX:
- case TRACE_WAKE:
- S = entry->ctx.prev_state < sizeof(state_to_char) ?
- state_to_char[entry->ctx.prev_state] : 'X';
- T = entry->ctx.next_state < sizeof(state_to_char) ?
- state_to_char[entry->ctx.next_state] : 'X';
- if (entry->type == TRACE_WAKE)
- S = '+';
- ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
- entry->ctx.prev_pid,
- entry->ctx.prev_prio,
- S,
- entry->ctx.next_pid,
- entry->ctx.next_prio,
- T);
- if (!ret)
- return 0;
- break;
- case TRACE_SPECIAL:
- case TRACE_STACK:
- ret = trace_seq_printf(s, "# %ld %ld %ld\n",
- entry->special.arg1,
- entry->special.arg2,
- entry->special.arg3);
- if (!ret)
- return 0;
+ return TRACE_TYPE_PARTIAL_LINE;
break;
}
- return 1;
+ case TRACE_CTX:
+ case TRACE_WAKE: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ S = field->prev_state < sizeof(state_to_char) ?
+ state_to_char[field->prev_state] : 'X';
+ T = field->next_state < sizeof(state_to_char) ?
+ state_to_char[field->next_state] : 'X';
+ if (entry->type == TRACE_WAKE)
+ S = '+';
+ ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_SPECIAL:
+ case TRACE_STACK: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+ field->arg1,
+ field->arg2,
+ field->arg3);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_PRINT: {
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
+ if (entry->flags & TRACE_FLAG_CONT)
+ trace_seq_print_cont(s, iter);
+ break;
+ }
+ }
+ return TRACE_TYPE_HANDLED;
}
#define SEQ_PUT_FIELD_RET(s, x) \
@@ -1710,11 +1667,12 @@
#define SEQ_PUT_HEX_FIELD_RET(s, x) \
do { \
+ BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
return 0; \
} while (0)
-static int print_hex_fmt(struct trace_iterator *iter)
+static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
unsigned char newline = '\n';
@@ -1723,97 +1681,139 @@
entry = iter->ent;
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
- SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+ SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
switch (entry->type) {
- case TRACE_FN:
- SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
- SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->ip);
+ SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
break;
+ }
case TRACE_CTX:
- case TRACE_WAKE:
- S = entry->ctx.prev_state < sizeof(state_to_char) ?
- state_to_char[entry->ctx.prev_state] : 'X';
- T = entry->ctx.next_state < sizeof(state_to_char) ?
- state_to_char[entry->ctx.next_state] : 'X';
+ case TRACE_WAKE: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ S = field->prev_state < sizeof(state_to_char) ?
+ state_to_char[field->prev_state] : 'X';
+ T = field->next_state < sizeof(state_to_char) ?
+ state_to_char[field->next_state] : 'X';
if (entry->type == TRACE_WAKE)
S = '+';
- SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
- SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
+ SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
SEQ_PUT_HEX_FIELD_RET(s, S);
- SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
- SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
- SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
SEQ_PUT_HEX_FIELD_RET(s, T);
break;
+ }
case TRACE_SPECIAL:
- case TRACE_STACK:
- SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
- SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
- SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
+ case TRACE_STACK: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
break;
}
+ }
SEQ_PUT_FIELD_RET(s, newline);
- return 1;
+ return TRACE_TYPE_HANDLED;
}
-static int print_bin_fmt(struct trace_iterator *iter)
+/*
+ * Emit one trace entry for the binary (TRACE_ITER_BIN) output: pid, cpu and
+ * timestamp, followed by the fields specific to the entry type. TRACE_CONT
+ * records are skipped. Returns TRACE_TYPE_HANDLED once the entry has been
+ * written or skipped.
+ * NOTE(review): SEQ_PUT_FIELD_RET() is defined outside this hunk and
+ * presumably returns early when a field does not fit - confirm that the
+ * value it returns matches TRACE_TYPE_PARTIAL_LINE.
+ */
+static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
entry = iter->ent;
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, entry->cpu);
- SEQ_PUT_FIELD_RET(s, entry->t);
+ SEQ_PUT_FIELD_RET(s, iter->cpu);
+ SEQ_PUT_FIELD_RET(s, iter->ts);
switch (entry->type) {
- case TRACE_FN:
- SEQ_PUT_FIELD_RET(s, entry->fn.ip);
- SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->ip);
+ SEQ_PUT_FIELD_RET(s, field->parent_ip);
break;
- case TRACE_CTX:
- SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
- SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
- SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
- SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
- SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
- SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+ }
+ case TRACE_CTX: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->prev_pid);
+ SEQ_PUT_FIELD_RET(s, field->prev_prio);
+ SEQ_PUT_FIELD_RET(s, field->prev_state);
+ SEQ_PUT_FIELD_RET(s, field->next_pid);
+ SEQ_PUT_FIELD_RET(s, field->next_prio);
+ SEQ_PUT_FIELD_RET(s, field->next_state);
break;
+ }
case TRACE_SPECIAL:
- case TRACE_STACK:
- SEQ_PUT_FIELD_RET(s, entry->special.arg1);
- SEQ_PUT_FIELD_RET(s, entry->special.arg2);
- SEQ_PUT_FIELD_RET(s, entry->special.arg3);
+ case TRACE_STACK: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->arg1);
+ SEQ_PUT_FIELD_RET(s, field->arg2);
+ SEQ_PUT_FIELD_RET(s, field->arg3);
break;
}
+ }
- return 1;
+ /* use the enum like the other print_*_fmt() helpers, not a bare 1 */
+ return TRACE_TYPE_HANDLED;
}
static int trace_empty(struct trace_iterator *iter)
{
- struct trace_array_cpu *data;
int cpu;
for_each_tracing_cpu(cpu) {
- data = iter->tr->data[cpu];
-
- if (head_page(data) && data->trace_idx &&
- (data->trace_tail != data->trace_head ||
- data->trace_tail_idx != data->trace_head_idx))
- return 0;
+ if (iter->buffer_iter[cpu]) {
+ if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+ return 0;
+ } else {
+ if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+ return 0;
+ }
}
+
return 1;
}
-static int print_trace_line(struct trace_iterator *iter)
+static enum print_line_t print_trace_line(struct trace_iterator *iter)
{
- if (iter->trace && iter->trace->print_line)
- return iter->trace->print_line(iter);
+ enum print_line_t ret;
+
+ if (iter->trace && iter->trace->print_line) {
+ ret = iter->trace->print_line(iter);
+ if (ret != TRACE_TYPE_UNHANDLED)
+ return ret;
+ }
if (trace_flags & TRACE_ITER_BIN)
return print_bin_fmt(iter);
@@ -1869,6 +1869,8 @@
__tracing_open(struct inode *inode, struct file *file, int *ret)
{
struct trace_iterator *iter;
+ struct seq_file *m;
+ int cpu;
if (tracing_disabled) {
*ret = -ENODEV;
@@ -1889,28 +1891,45 @@
iter->trace = current_trace;
iter->pos = -1;
+ for_each_tracing_cpu(cpu) {
+
+ iter->buffer_iter[cpu] =
+ ring_buffer_read_start(iter->tr->buffer, cpu);
+
+ if (!iter->buffer_iter[cpu])
+ goto fail_buffer;
+ }
+
/* TODO stop tracer */
*ret = seq_open(file, &tracer_seq_ops);
- if (!*ret) {
- struct seq_file *m = file->private_data;
- m->private = iter;
+ if (*ret)
+ goto fail_buffer;
- /* stop the trace while dumping */
- if (iter->tr->ctrl) {
- tracer_enabled = 0;
- ftrace_function_enabled = 0;
- }
+ m = file->private_data;
+ m->private = iter;
- if (iter->trace && iter->trace->open)
- iter->trace->open(iter);
- } else {
- kfree(iter);
- iter = NULL;
+ /* stop the trace while dumping */
+ if (iter->tr->ctrl) {
+ tracer_enabled = 0;
+ ftrace_function_enabled = 0;
}
+
+ if (iter->trace && iter->trace->open)
+ iter->trace->open(iter);
+
mutex_unlock(&trace_types_lock);
out:
return iter;
+
+ fail_buffer:
+ /*
+ * Error exit: tear down whatever per-cpu read iterators were set up,
+ * drop the lock and release the iterator itself. The code this hunk
+ * replaces freed iter when seq_open() failed; without the kfree() the
+ * iterator is leaked on every failed open.
+ */
+ for_each_tracing_cpu(cpu) {
+ if (iter->buffer_iter[cpu])
+ ring_buffer_read_finish(iter->buffer_iter[cpu]);
+ }
+ mutex_unlock(&trace_types_lock);
+ kfree(iter);
+
+ return ERR_PTR(-ENOMEM);
}
int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -1926,8 +1945,14 @@
{
struct seq_file *m = (struct seq_file *)file->private_data;
struct trace_iterator *iter = m->private;
+ int cpu;
mutex_lock(&trace_types_lock);
+ for_each_tracing_cpu(cpu) {
+ if (iter->buffer_iter[cpu])
+ ring_buffer_read_finish(iter->buffer_iter[cpu]);
+ }
+
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
@@ -2352,9 +2377,11 @@
struct tracer *t;
char buf[max_tracer_type_len+1];
int i;
+ size_t ret;
if (cnt > max_tracer_type_len)
cnt = max_tracer_type_len;
+ ret = cnt;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
@@ -2370,7 +2397,11 @@
if (strcmp(t->name, buf) == 0)
break;
}
- if (!t || t == current_trace)
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (t == current_trace)
goto out;
if (current_trace && current_trace->reset)
@@ -2383,9 +2414,10 @@
out:
mutex_unlock(&trace_types_lock);
- filp->f_pos += cnt;
+ if (ret == cnt)
+ filp->f_pos += cnt;
- return cnt;
+ return ret;
}
static ssize_t
@@ -2500,20 +2532,12 @@
size_t cnt, loff_t *ppos)
{
struct trace_iterator *iter = filp->private_data;
- struct trace_array_cpu *data;
- static cpumask_t mask;
- unsigned long flags;
-#ifdef CONFIG_FTRACE
- int ftrace_save;
-#endif
- int cpu;
ssize_t sret;
/* return any leftover data */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
if (sret != -EBUSY)
return sret;
- sret = 0;
trace_seq_reset(&iter->seq);
@@ -2524,6 +2548,8 @@
goto out;
}
+waitagain:
+ sret = 0;
while (trace_empty(iter)) {
if ((filp->f_flags & O_NONBLOCK)) {
@@ -2588,46 +2614,12 @@
offsetof(struct trace_iterator, seq));
iter->pos = -1;
- /*
- * We need to stop all tracing on all CPUS to read the
- * the next buffer. This is a bit expensive, but is
- * not done often. We fill all what we can read,
- * and then release the locks again.
- */
-
- cpus_clear(mask);
- local_irq_save(flags);
-#ifdef CONFIG_FTRACE
- ftrace_save = ftrace_enabled;
- ftrace_enabled = 0;
-#endif
- smp_wmb();
- for_each_tracing_cpu(cpu) {
- data = iter->tr->data[cpu];
-
- if (!head_page(data) || !data->trace_idx)
- continue;
-
- atomic_inc(&data->disabled);
- cpu_set(cpu, mask);
- }
-
- for_each_cpu_mask(cpu, mask) {
- data = iter->tr->data[cpu];
- __raw_spin_lock(&data->lock);
-
- if (data->overrun > iter->last_overrun[cpu])
- iter->overrun[cpu] +=
- data->overrun - iter->last_overrun[cpu];
- iter->last_overrun[cpu] = data->overrun;
- }
-
while (find_next_entry_inc(iter) != NULL) {
- int ret;
+ enum print_line_t ret;
int len = iter->seq.len;
ret = print_trace_line(iter);
- if (!ret) {
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
/* don't print partial lines */
iter->seq.len = len;
break;
@@ -2639,26 +2631,17 @@
break;
}
- for_each_cpu_mask(cpu, mask) {
- data = iter->tr->data[cpu];
- __raw_spin_unlock(&data->lock);
- }
-
- for_each_cpu_mask(cpu, mask) {
- data = iter->tr->data[cpu];
- atomic_dec(&data->disabled);
- }
-#ifdef CONFIG_FTRACE
- ftrace_enabled = ftrace_save;
-#endif
- local_irq_restore(flags);
-
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
if (iter->seq.readpos >= iter->seq.len)
trace_seq_reset(&iter->seq);
+
+ /*
+ * If there was nothing to send to user, in spite of consuming trace
+ * entries, go back to wait for more entries.
+ */
if (sret == -EBUSY)
- sret = 0;
+ goto waitagain;
out:
mutex_unlock(&trace_types_lock);
@@ -2684,7 +2667,8 @@
{
unsigned long val;
char buf[64];
- int i, ret;
+ int ret;
+ struct trace_array *tr = filp->private_data;
if (cnt >= sizeof(buf))
return -EINVAL;
@@ -2704,59 +2688,38 @@
mutex_lock(&trace_types_lock);
- if (current_trace != &no_tracer) {
+ if (tr->ctrl) {
cnt = -EBUSY;
- pr_info("ftrace: set current_tracer to none"
+ pr_info("ftrace: please disable tracing"
" before modifying buffer size\n");
goto out;
}
- if (val > global_trace.entries) {
- long pages_requested;
- unsigned long freeable_pages;
-
- /* make sure we have enough memory before mapping */
- pages_requested =
- (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
-
- /* account for each buffer (and max_tr) */
- pages_requested *= tracing_nr_buffers * 2;
-
- /* Check for overflow */
- if (pages_requested < 0) {
- cnt = -ENOMEM;
+ if (val != global_trace.entries) {
+ ret = ring_buffer_resize(global_trace.buffer, val);
+ if (ret < 0) {
+ cnt = ret;
goto out;
}
- freeable_pages = determine_dirtyable_memory();
-
- /* we only allow to request 1/4 of useable memory */
- if (pages_requested >
- ((freeable_pages + tracing_pages_allocated) / 4)) {
- cnt = -ENOMEM;
- goto out;
- }
-
- while (global_trace.entries < val) {
- if (trace_alloc_page()) {
- cnt = -ENOMEM;
- goto out;
+ ret = ring_buffer_resize(max_tr.buffer, val);
+ if (ret < 0) {
+ int r;
+ cnt = ret;
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.entries);
+ if (r < 0) {
+ /* AARGH! We are left with different
+ * size max buffer!!!! */
+ WARN_ON(1);
+ tracing_disabled = 1;
}
- /* double check that we don't go over the known pages */
- if (tracing_pages_allocated > pages_requested)
- break;
+ goto out;
}
- } else {
- /* include the number of entries in val (inc of page entries) */
- while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
- trace_free_page();
+ global_trace.entries = val;
}
- /* check integrity */
- for_each_tracing_cpu(i)
- check_pages(global_trace.data[i]);
-
filp->f_pos += cnt;
/* If check pages failed, return ENOMEM */
@@ -2769,6 +2732,52 @@
return cnt;
}
+/*
+ * printf-style front end to trace_vprintk(): packs the variable arguments
+ * into a va_list, passes 0 as the ip and hands back trace_vprintk()'s result.
+ */
+static int mark_printk(const char *fmt, ...)
+{
+ va_list ap;
+ int len;
+
+ va_start(ap, fmt);
+ len = trace_vprintk(0, fmt, ap);
+ va_end(ap);
+
+ return len;
+}
+
+/*
+ * tracing_mark_write - write handler that records user text in the trace
+ * @filp: file being written (not used here)
+ * @ubuf: user buffer holding the text
+ * @cnt: number of bytes offered by the caller
+ * @fpos: file position, advanced by the number of bytes accepted
+ *
+ * At most TRACE_BUF_SIZE bytes are looked at and only the first line of
+ * them is recorded. Returns the number of bytes of @ubuf that were accepted
+ * (up to and including the first newline, so a caller that retries a short
+ * write continues with the next line), -EINVAL when tracing is off, or
+ * -ENOMEM / -EFAULT when the text cannot be copied in.
+ */
+static ssize_t
+tracing_mark_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ char *buf;
+ char *end;
+ struct trace_array *tr = &global_trace;
+
+ if (!tr->ctrl || tracing_disabled)
+ return -EINVAL;
+
+ if (cnt > TRACE_BUF_SIZE)
+ cnt = TRACE_BUF_SIZE;
+
+ buf = kmalloc(cnt + 1, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, ubuf, cnt)) {
+ kfree(buf);
+ return -EFAULT;
+ }
+
+ /* Cut from the first nil or newline. */
+ buf[cnt] = '\0';
+ end = strchr(buf, '\n');
+ if (end) {
+ *end = '\0';
+ /* only the bytes up to and including the newline were used */
+ cnt = end - buf + 1;
+ }
+
+ /*
+ * Report the input bytes consumed, not the formatted length:
+ * mark_printk() appends a newline, so its result can exceed @cnt, and
+ * it yields 0 when the entry is dropped. Neither is a valid result
+ * for a write of @cnt bytes.
+ */
+ mark_printk("%s\n", buf);
+ kfree(buf);
+ *fpos += cnt;
+
+ return cnt;
+}
+
static struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -2800,6 +2809,11 @@
.write = tracing_entries_write,
};
+/* File operations with no read method: writes are handled by tracing_mark_write(). */
+static struct file_operations tracing_mark_fops = {
+ .open = tracing_open_generic,
+ .write = tracing_mark_write,
+};
+
#ifdef CONFIG_DYNAMIC_FTRACE
static ssize_t
@@ -2846,7 +2860,7 @@
#include "trace_selftest.c"
#endif
-static __init void tracer_init_debugfs(void)
+static __init int tracer_init_debugfs(void)
{
struct dentry *d_tracer;
struct dentry *entry;
@@ -2881,12 +2895,12 @@
entry = debugfs_create_file("available_tracers", 0444, d_tracer,
&global_trace, &show_traces_fops);
if (!entry)
- pr_warning("Could not create debugfs 'trace' entry\n");
+ pr_warning("Could not create debugfs 'available_tracers' entry\n");
entry = debugfs_create_file("current_tracer", 0444, d_tracer,
&global_trace, &set_tracer_fops);
if (!entry)
- pr_warning("Could not create debugfs 'trace' entry\n");
+ pr_warning("Could not create debugfs 'current_tracer' entry\n");
entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
&tracing_max_latency,
@@ -2899,7 +2913,7 @@
&tracing_thresh, &tracing_max_lat_fops);
if (!entry)
pr_warning("Could not create debugfs "
- "'tracing_threash' entry\n");
+ "'tracing_thresh' entry\n");
entry = debugfs_create_file("README", 0644, d_tracer,
NULL, &tracing_readme_fops);
if (!entry)
@@ -2909,13 +2923,19 @@
NULL, &tracing_pipe_fops);
if (!entry)
pr_warning("Could not create debugfs "
- "'tracing_threash' entry\n");
+ "'trace_pipe' entry\n");
entry = debugfs_create_file("trace_entries", 0644, d_tracer,
&global_trace, &tracing_entries_fops);
if (!entry)
pr_warning("Could not create debugfs "
- "'tracing_threash' entry\n");
+ "'trace_entries' entry\n");
+
+ entry = debugfs_create_file("trace_marker", 0220, d_tracer,
+ NULL, &tracing_mark_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'trace_marker' entry\n");
#ifdef CONFIG_DYNAMIC_FTRACE
entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
@@ -2928,230 +2948,263 @@
#ifdef CONFIG_SYSPROF_TRACER
init_tracer_sysprof_debugfs(d_tracer);
#endif
-}
-
-static int trace_alloc_page(void)
-{
- struct trace_array_cpu *data;
- struct page *page, *tmp;
- LIST_HEAD(pages);
- void *array;
- unsigned pages_allocated = 0;
- int i;
-
- /* first allocate a page for each CPU */
- for_each_tracing_cpu(i) {
- array = (void *)__get_free_page(GFP_KERNEL);
- if (array == NULL) {
- printk(KERN_ERR "tracer: failed to allocate page"
- "for trace buffer!\n");
- goto free_pages;
- }
-
- pages_allocated++;
- page = virt_to_page(array);
- list_add(&page->lru, &pages);
-
-/* Only allocate if we are actually using the max trace */
-#ifdef CONFIG_TRACER_MAX_TRACE
- array = (void *)__get_free_page(GFP_KERNEL);
- if (array == NULL) {
- printk(KERN_ERR "tracer: failed to allocate page"
- "for trace buffer!\n");
- goto free_pages;
- }
- pages_allocated++;
- page = virt_to_page(array);
- list_add(&page->lru, &pages);
-#endif
- }
-
- /* Now that we successfully allocate a page per CPU, add them */
- for_each_tracing_cpu(i) {
- data = global_trace.data[i];
- page = list_entry(pages.next, struct page, lru);
- list_del_init(&page->lru);
- list_add_tail(&page->lru, &data->trace_pages);
- ClearPageLRU(page);
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- data = max_tr.data[i];
- page = list_entry(pages.next, struct page, lru);
- list_del_init(&page->lru);
- list_add_tail(&page->lru, &data->trace_pages);
- SetPageLRU(page);
-#endif
- }
- tracing_pages_allocated += pages_allocated;
- global_trace.entries += ENTRIES_PER_PAGE;
-
return 0;
-
- free_pages:
- list_for_each_entry_safe(page, tmp, &pages, lru) {
- list_del_init(&page->lru);
- __free_page(page);
- }
- return -ENOMEM;
}
-static int trace_free_page(void)
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
+ static DEFINE_SPINLOCK(trace_buf_lock);
+ static char trace_buf[TRACE_BUF_SIZE];
+
+ struct ring_buffer_event *event;
+ struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
- struct page *page;
- struct list_head *p;
- int i;
- int ret = 0;
+ struct print_entry *entry;
+ unsigned long flags, irq_flags;
+ int cpu, len = 0, size, pc;
- /* free one page from each buffer */
- for_each_tracing_cpu(i) {
- data = global_trace.data[i];
- p = data->trace_pages.next;
- if (p == &data->trace_pages) {
- /* should never happen */
- WARN_ON(1);
- tracing_disabled = 1;
- ret = -1;
- break;
- }
- page = list_entry(p, struct page, lru);
- ClearPageLRU(page);
- list_del(&page->lru);
- tracing_pages_allocated--;
- tracing_pages_allocated--;
- __free_page(page);
+ if (!tr->ctrl || tracing_disabled)
+ return 0;
- tracing_reset(data);
+ pc = preempt_count();
+ preempt_disable_notrace();
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
-#ifdef CONFIG_TRACER_MAX_TRACE
- data = max_tr.data[i];
- p = data->trace_pages.next;
- if (p == &data->trace_pages) {
- /* should never happen */
- WARN_ON(1);
- tracing_disabled = 1;
- ret = -1;
- break;
- }
- page = list_entry(p, struct page, lru);
- ClearPageLRU(page);
- list_del(&page->lru);
- __free_page(page);
+ if (unlikely(atomic_read(&data->disabled)))
+ goto out;
- tracing_reset(data);
-#endif
- }
- global_trace.entries -= ENTRIES_PER_PAGE;
+ spin_lock_irqsave(&trace_buf_lock, flags);
+ len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+ len = min(len, TRACE_BUF_SIZE-1);
+ trace_buf[len] = 0;
+
+ size = sizeof(*entry) + len + 1;
+ event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
+ if (!event)
+ goto out_unlock;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_PRINT;
+ entry->ip = ip;
+
+ memcpy(&entry->buf, trace_buf, len);
+ entry->buf[len] = 0;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out_unlock:
+ spin_unlock_irqrestore(&trace_buf_lock, flags);
+
+ out:
+ preempt_enable_notrace();
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_vprintk);
+
+int __ftrace_printk(unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_vprintk(ip, fmt, ap);
+ va_end(ap);
return ret;
}
+EXPORT_SYMBOL_GPL(__ftrace_printk);
+
+/*
+ * Notifier callback installed in trace_panic_notifier: dumps the ftrace
+ * buffer regardless of @event and always returns NOTIFY_OK.
+ */
+static int trace_panic_handler(struct notifier_block *this,
+ unsigned long event, void *unused)
+{
+ ftrace_dump();
+ return NOTIFY_OK;
+}
+
+/* Notifier block that hands its events to trace_panic_handler(). */
+static struct notifier_block trace_panic_notifier = {
+ .notifier_call = trace_panic_handler,
+ .next = NULL,
+ .priority = 150 /* priority: INT_MAX >= x >= 0 */
+};
+
+/*
+ * Callback for trace_die_notifier: dump the ftrace buffer when the reported
+ * event is DIE_OOPS and ignore every other event. Always returns NOTIFY_OK.
+ */
+static int trace_die_handler(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ if (val == DIE_OOPS)
+ ftrace_dump();
+
+ return NOTIFY_OK;
+}
+
+/* Notifier block that hands its events to trace_die_handler(). */
+static struct notifier_block trace_die_notifier = {
+ .notifier_call = trace_die_handler,
+ .priority = 200
+};
+
+/*
+ * printk is set to max of 1024, we really don't need it that big.
+ * Nothing should be printing 1000 characters anyway.
+ */
+#define TRACE_MAX_PRINT 1000
+
+/*
+ * Define here KERN_TRACE so that we have one place to modify
+ * it if we decide to change what log level the ftrace dump
+ * should be at.
+ */
+#define KERN_TRACE KERN_INFO
+
+/*
+ * Print the text collected in @s with printk() at KERN_TRACE level, then
+ * reset @s. Anything beyond TRACE_MAX_PRINT characters is cut off.
+ */
+static void
+trace_printk_seq(struct trace_seq *s)
+{
+ /*
+ * Clip to the limit defined for this purpose instead of repeating the
+ * number here. Probably should print a warning when this happens.
+ */
+ if (s->len >= TRACE_MAX_PRINT)
+ s->len = TRACE_MAX_PRINT;
+
+ /* should be zero ended, but we are paranoid. */
+ s->buffer[s->len] = 0;
+
+ printk(KERN_TRACE "%s", s->buffer);
+
+ trace_seq_reset(s);
+}
+
+
+/*
+ * ftrace_dump - print the contents of the global trace buffer with printk()
+ *
+ * Only the first call does any work: dump_ran is set under ftrace_dump_lock
+ * and later calls return straight away. Before reading, tracing is shut
+ * down with ftrace_kill_atomic() and the "disabled" counter of every tracing
+ * cpu is raised; nothing in this function turns tracing back on.
+ * Each entry is formatted by print_trace_line() with TRACE_FILE_LAT_FMT set,
+ * consumed, and printed through trace_printk_seq().
+ */
+void ftrace_dump(void)
+{
+ static DEFINE_SPINLOCK(ftrace_dump_lock);
+ /* use static because iter can be a bit big for the stack */
+ static struct trace_iterator iter;
+ static cpumask_t mask;
+ static int dump_ran;
+ unsigned long flags;
+ int cnt = 0, cpu;
+
+ /* only one dump */
+ spin_lock_irqsave(&ftrace_dump_lock, flags);
+ if (dump_ran)
+ goto out;
+
+ dump_ran = 1;
+
+ /* No turning back! */
+ ftrace_kill_atomic();
+
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&global_trace.data[cpu]->disabled);
+ }
+
+ printk(KERN_TRACE "Dumping ftrace buffer:\n");
+
+ iter.tr = &global_trace;
+ iter.trace = current_trace;
+
+ /*
+ * We need to stop all tracing on all CPUS to read the
+ * next buffer. This is a bit expensive, but is
+ * not done often. We fill all what we can read,
+ * and then release the locks again.
+ *
+ * NOTE(review): this comment looks stale here - tracing was already
+ * shut down above, the only lock taken in this function is
+ * ftrace_dump_lock, and "mask" is cleared below but not used again.
+ */
+
+ cpus_clear(mask);
+
+ while (!trace_empty(&iter)) {
+
+ /* separator line before the first entry only */
+ if (!cnt)
+ printk(KERN_TRACE "---------------------------------\n");
+
+ cnt++;
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter.seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter.iter_flags |= TRACE_FILE_LAT_FMT;
+ iter.pos = -1;
+
+ /* format one entry into iter.seq and remove it from the buffer */
+ if (find_next_entry_inc(&iter) != NULL) {
+ print_trace_line(&iter);
+ trace_consume(&iter);
+ }
+
+ trace_printk_seq(&iter.seq);
+ }
+
+ if (!cnt)
+ printk(KERN_TRACE " (ftrace buffer empty)\n");
+ else
+ printk(KERN_TRACE "---------------------------------\n");
+
+ out:
+ spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+}
__init static int tracer_alloc_buffers(void)
{
struct trace_array_cpu *data;
- void *array;
- struct page *page;
- int pages = 0;
- int ret = -ENOMEM;
int i;
/* TODO: make the number of buffers hot pluggable with CPUS */
- tracing_nr_buffers = num_possible_cpus();
tracing_buffer_mask = cpu_possible_map;
+ global_trace.buffer = ring_buffer_alloc(trace_buf_size,
+ TRACE_BUFFER_FLAGS);
+ if (!global_trace.buffer) {
+ printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
+ WARN_ON(1);
+ return 0;
+ }
+ global_trace.entries = ring_buffer_size(global_trace.buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ max_tr.buffer = ring_buffer_alloc(trace_buf_size,
+ TRACE_BUFFER_FLAGS);
+ if (!max_tr.buffer) {
+ printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
+ WARN_ON(1);
+ ring_buffer_free(global_trace.buffer);
+ return 0;
+ }
+ max_tr.entries = ring_buffer_size(max_tr.buffer);
+ WARN_ON(max_tr.entries != global_trace.entries);
+#endif
+
/* Allocate the first page for all buffers */
for_each_tracing_cpu(i) {
data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
max_tr.data[i] = &per_cpu(max_data, i);
-
- array = (void *)__get_free_page(GFP_KERNEL);
- if (array == NULL) {
- printk(KERN_ERR "tracer: failed to allocate page"
- "for trace buffer!\n");
- goto free_buffers;
- }
-
- /* set the array to the list */
- INIT_LIST_HEAD(&data->trace_pages);
- page = virt_to_page(array);
- list_add(&page->lru, &data->trace_pages);
- /* use the LRU flag to differentiate the two buffers */
- ClearPageLRU(page);
-
- data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
- max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-
-/* Only allocate if we are actually using the max trace */
-#ifdef CONFIG_TRACER_MAX_TRACE
- array = (void *)__get_free_page(GFP_KERNEL);
- if (array == NULL) {
- printk(KERN_ERR "tracer: failed to allocate page"
- "for trace buffer!\n");
- goto free_buffers;
- }
-
- INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
- page = virt_to_page(array);
- list_add(&page->lru, &max_tr.data[i]->trace_pages);
- SetPageLRU(page);
-#endif
}
- /*
- * Since we allocate by orders of pages, we may be able to
- * round up a bit.
- */
- global_trace.entries = ENTRIES_PER_PAGE;
- pages++;
-
- while (global_trace.entries < trace_nr_entries) {
- if (trace_alloc_page())
- break;
- pages++;
- }
- max_tr.entries = global_trace.entries;
-
- pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
- pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
- pr_info(" actual entries %ld\n", global_trace.entries);
-
- tracer_init_debugfs();
-
trace_init_cmdlines();
- register_tracer(&no_tracer);
- current_trace = &no_tracer;
+ register_tracer(&nop_trace);
+#ifdef CONFIG_BOOT_TRACER
+ register_tracer(&boot_tracer);
+ current_trace = &boot_tracer;
+ current_trace->init(&global_trace);
+#else
+ current_trace = &nop_trace;
+#endif
/* All seems OK, enable tracing */
global_trace.ctrl = tracer_enabled;
tracing_disabled = 0;
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &trace_panic_notifier);
+
+ register_die_notifier(&trace_die_notifier);
+
return 0;
-
- free_buffers:
- for (i-- ; i >= 0; i--) {
- struct page *page, *tmp;
- struct trace_array_cpu *data = global_trace.data[i];
-
- if (data) {
- list_for_each_entry_safe(page, tmp,
- &data->trace_pages, lru) {
- list_del_init(&page->lru);
- __free_page(page);
- }
- }
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- data = max_tr.data[i];
- if (data) {
- list_for_each_entry_safe(page, tmp,
- &data->trace_pages, lru) {
- list_del_init(&page->lru);
- __free_page(page);
- }
- }
-#endif
- }
- return ret;
}
-fs_initcall(tracer_alloc_buffers);
+early_initcall(tracer_alloc_buffers);
+fs_initcall(tracer_init_debugfs);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f69f867..f1f9957 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -5,7 +5,9 @@
#include <asm/atomic.h>
#include <linux/sched.h>
#include <linux/clocksource.h>
+#include <linux/ring_buffer.h>
#include <linux/mmiotrace.h>
+#include <linux/ftrace.h>
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -13,38 +15,60 @@
TRACE_FN,
TRACE_CTX,
TRACE_WAKE,
+ TRACE_CONT,
TRACE_STACK,
+ TRACE_PRINT,
TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
+ TRACE_BOOT,
__TRACE_LAST_TYPE
};
/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+ /* entry kind; the tracers store an enum trace_type value here */
+ unsigned char type;
+ /* presumably the CPU that recorded the entry - TODO confirm */
+ unsigned char cpu;
+ /* bitmask of TRACE_FLAG_* values (see enum trace_flag_type) */
+ unsigned char flags;
+ /* presumably the preempt count at record time - TODO confirm */
+ unsigned char preempt_count;
+ /* presumably the pid of the task that was running - TODO confirm */
+ int pid;
+};
+
+/*
* Function trace entry - function address and parent function address:
*/
struct ftrace_entry {
+ struct trace_entry ent;
unsigned long ip;
unsigned long parent_ip;
};
+extern struct tracer boot_tracer;
/*
* Context switch trace entry - which task (and prio) we switched from/to:
*/
struct ctx_switch_entry {
+ struct trace_entry ent;
unsigned int prev_pid;
unsigned char prev_prio;
unsigned char prev_state;
unsigned int next_pid;
unsigned char next_prio;
unsigned char next_state;
+ unsigned int next_cpu;
};
/*
* Special (free-form) trace entry:
*/
struct special_entry {
+ struct trace_entry ent;
unsigned long arg1;
unsigned long arg2;
unsigned long arg3;
@@ -57,33 +81,60 @@
#define FTRACE_STACK_ENTRIES 8
struct stack_entry {
+ struct trace_entry ent;
unsigned long caller[FTRACE_STACK_ENTRIES];
};
/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
+ * ftrace_printk entry:
*/
-struct trace_entry {
- char type;
- char cpu;
- char flags;
- char preempt_count;
- int pid;
- cycle_t t;
- union {
- struct ftrace_entry fn;
- struct ctx_switch_entry ctx;
- struct special_entry special;
- struct stack_entry stack;
- struct mmiotrace_rw mmiorw;
- struct mmiotrace_map mmiomap;
- };
+struct print_entry {
+ struct trace_entry ent;
+ unsigned long ip;
+ char buf[];
};
-#define TRACE_ENTRY_SIZE sizeof(struct trace_entry)
+#define TRACE_OLD_SIZE 88
+
+/*
+ * Continuation record, matched to TRACE_CONT by trace_assign_type().
+ * Unlike the other entry types it does not embed struct trace_entry;
+ * it only starts with the same one-byte "type" member.
+ */
+struct trace_field_cont {
+ unsigned char type;
+ /* Temporary till we get rid of this completely */
+ char buf[TRACE_OLD_SIZE - 1];
+};
+
+/* TRACE_MMIO_RW entry: common header followed by a struct mmiotrace_rw copy. */
+struct trace_mmiotrace_rw {
+ struct trace_entry ent;
+ struct mmiotrace_rw rw;
+};
+
+/* TRACE_MMIO_MAP entry: common header followed by a struct mmiotrace_map copy. */
+struct trace_mmiotrace_map {
+ struct trace_entry ent;
+ struct mmiotrace_map map;
+};
+
+/* TRACE_BOOT entry: common header followed by a struct boot_trace copy. */
+struct trace_boot {
+ struct trace_entry ent;
+ struct boot_trace initcall;
+};
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ * IRQS_OFF - interrupts were disabled
+ * NEED_RESCHED - reschedule is requested
+ * HARDIRQ - inside an interrupt handler
+ * SOFTIRQ - inside a softirq handler
+ * CONT - multiple entries hold the trace item
+ *
+ * Each value is a distinct bit, so they can be OR-ed together and
+ * tested with '&'.
+ */
+enum trace_flag_type {
+ TRACE_FLAG_IRQS_OFF = 0x01,
+ TRACE_FLAG_NEED_RESCHED = 0x02,
+ TRACE_FLAG_HARDIRQ = 0x04,
+ TRACE_FLAG_SOFTIRQ = 0x08,
+ TRACE_FLAG_CONT = 0x10,
+};
+
+#define TRACE_BUF_SIZE 1024
/*
* The CPU trace array - it consists of thousands of trace entries
@@ -91,16 +142,9 @@
* the trace, etc.)
*/
struct trace_array_cpu {
- struct list_head trace_pages;
atomic_t disabled;
- raw_spinlock_t lock;
- struct lock_class_key lock_key;
/* these fields get copied into max-trace: */
- unsigned trace_head_idx;
- unsigned trace_tail_idx;
- void *trace_head; /* producer */
- void *trace_tail; /* consumer */
unsigned long trace_idx;
unsigned long overrun;
unsigned long saved_latency;
@@ -124,6 +168,7 @@
* They have on/off state as well:
*/
struct trace_array {
+ struct ring_buffer *buffer;
unsigned long entries;
long ctrl;
int cpu;
@@ -132,6 +177,56 @@
struct trace_array_cpu *data[NR_CPUS];
};
+/* Compile-time test: is "var" declared as a pointer to "type"? */
+#define FTRACE_CMP_TYPE(var, type) \
+ __builtin_types_compatible_p(typeof(var), type *)
+
+/*
+ * One arm of trace_assign_type(). If "var" is a pointer to "etype",
+ * point it at "entry", warn when "id" is non-zero and does not match
+ * the recorded entry type, and "break" out of the enclosing
+ * do { } while (0) so that no later arm is evaluated.
+ */
+#undef IF_ASSIGN
+#define IF_ASSIGN(var, entry, etype, id) \
+ if (FTRACE_CMP_TYPE(var, etype)) { \
+ var = (typeof(var))(entry); \
+ WARN_ON(id && (entry)->type != id); \
+ break; \
+ }
+
+/*
+ * Will cause compile errors if type is not found.
+ * NOTE(review): only a declaration is visible here; presumably the
+ * function is never defined, so the failure would show up when the
+ * call below is not optimized away - confirm.
+ */
+extern void __ftrace_bad_type(void);
+
+/*
+ * The trace_assign_type is a verifier that the entry type is
+ * the same as the type being assigned. To add new types simply
+ * add a line with the following format:
+ *
+ * IF_ASSIGN(var, ent, type, id);
+ *
+ * Where "type" is the trace type that includes the trace_entry
+ * as the "ent" item. And "id" is the trace identifier that is
+ * used in the trace_type enum.
+ *
+ * If the type can have more than one id, then use zero.
+ *
+ * Every IF_ASSIGN arm that matches breaks out of the do/while, so
+ * __ftrace_bad_type() is reached only when "var" matches none of
+ * the listed types.
+ */
+#define trace_assign_type(var, ent) \
+ do { \
+ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
+ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
+ IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
+ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
+ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
+ IF_ASSIGN(var, ent, struct special_entry, 0); \
+ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
+ TRACE_MMIO_RW); \
+ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
+ TRACE_MMIO_MAP); \
+ IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
+ __ftrace_bad_type(); \
+ } while (0)
+
+/*
+ * Return values for print_line callback
+ *
+ * The tracers in this patch return PARTIAL_LINE when trace_seq_printf()
+ * reports failure, HANDLED once the entry has been formatted, and
+ * UNHANDLED for entry types they do not format themselves.
+ */
+enum print_line_t {
+ TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
+ TRACE_TYPE_HANDLED = 1,
+ TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
+};
+
/*
* A specific tracer, represented by methods that operate on a trace array:
*/
@@ -152,7 +247,7 @@
int (*selftest)(struct tracer *trace,
struct trace_array *tr);
#endif
- int (*print_line)(struct trace_iterator *iter);
+ enum print_line_t (*print_line)(struct trace_iterator *iter);
struct tracer *next;
int print_max;
};
@@ -171,57 +266,58 @@
struct trace_array *tr;
struct tracer *trace;
void *private;
- long last_overrun[NR_CPUS];
- long overrun[NR_CPUS];
+ struct ring_buffer_iter *buffer_iter[NR_CPUS];
/* The below is zeroed out in pipe_read */
struct trace_seq seq;
struct trace_entry *ent;
int cpu;
-
- struct trace_entry *prev_ent;
- int prev_cpu;
+ u64 ts;
unsigned long iter_flags;
loff_t pos;
- unsigned long next_idx[NR_CPUS];
- struct list_head *next_page[NR_CPUS];
- unsigned next_page_idx[NR_CPUS];
long idx;
};
-void tracing_reset(struct trace_array_cpu *data);
+void trace_wake_up(void);
+void tracing_reset(struct trace_array *tr, int cpu);
int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *tracing_init_dentry(void);
void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
+struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
+ struct trace_array_cpu *data);
+void tracing_generic_entry_update(struct trace_entry *entry,
+ unsigned long flags,
+ int pc);
+
void ftrace(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags);
+ unsigned long flags, int pc);
void tracing_sched_switch_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *prev,
struct task_struct *next,
- unsigned long flags);
+ unsigned long flags, int pc);
void tracing_record_cmdline(struct task_struct *tsk);
void tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *wakee,
struct task_struct *cur,
- unsigned long flags);
+ unsigned long flags, int pc);
void trace_special(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long arg1,
unsigned long arg2,
- unsigned long arg3);
+ unsigned long arg3, int pc);
void trace_function(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags);
+ unsigned long flags, int pc);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -268,51 +364,33 @@
extern int DYN_FTRACE_TEST_NAME(void);
#endif
-#ifdef CONFIG_MMIOTRACE
-extern void __trace_mmiotrace_rw(struct trace_array *tr,
- struct trace_array_cpu *data,
- struct mmiotrace_rw *rw);
-extern void __trace_mmiotrace_map(struct trace_array *tr,
- struct trace_array_cpu *data,
- struct mmiotrace_map *map);
-#endif
-
#ifdef CONFIG_FTRACE_STARTUP_TEST
-#ifdef CONFIG_FTRACE
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
-#endif
-#ifdef CONFIG_IRQSOFF_TRACER
extern int trace_selftest_startup_irqsoff(struct tracer *trace,
struct trace_array *tr);
-#endif
-#ifdef CONFIG_PREEMPT_TRACER
extern int trace_selftest_startup_preemptoff(struct tracer *trace,
struct trace_array *tr);
-#endif
-#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
struct trace_array *tr);
-#endif
-#ifdef CONFIG_SCHED_TRACER
extern int trace_selftest_startup_wakeup(struct tracer *trace,
struct trace_array *tr);
-#endif
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern int trace_selftest_startup_nop(struct tracer *trace,
+ struct trace_array *tr);
extern int trace_selftest_startup_sched_switch(struct tracer *trace,
struct trace_array *tr);
-#endif
-#ifdef CONFIG_SYSPROF_TRACER
extern int trace_selftest_startup_sysprof(struct tracer *trace,
struct trace_array *tr);
-#endif
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern void trace_seq_print_cont(struct trace_seq *s,
+ struct trace_iterator *iter);
extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
size_t cnt);
extern long ns2usecs(cycle_t nsec);
+extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern unsigned long trace_flags;
@@ -334,6 +412,9 @@
TRACE_ITER_BLOCK = 0x80,
TRACE_ITER_STACKTRACE = 0x100,
TRACE_ITER_SCHED_TREE = 0x200,
+ TRACE_ITER_PRINTK = 0x400,
};
+extern struct tracer nop_trace;
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
new file mode 100644
index 0000000..d0a5e50
--- /dev/null
+++ b/kernel/trace/trace_boot.c
@@ -0,0 +1,126 @@
+/*
+ * ring buffer based initcalls tracer
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+
+#include "trace.h"
+
+static struct trace_array *boot_trace;
+static int trace_boot_enabled;
+
+
+/*
+ * Switch on recording of initcall entries: trace_boot() returns early
+ * for as long as trace_boot_enabled is zero.
+ *
+ * Should be started after do_pre_smp_initcalls() in init/main.c
+ */
+void start_boot_trace(void)
+{
+ trace_boot_enabled = 1;
+}
+
+/* Switch off recording: trace_boot() becomes a no-op again. */
+void stop_boot_trace(void)
+{
+ trace_boot_enabled = 0;
+}
+
+/*
+ * .reset callback of boot_tracer. It only stops recording; @tr is not
+ * used and the buffers are left as they are.
+ */
+void reset_boot_trace(struct trace_array *tr)
+{
+ stop_boot_trace();
+}
+
+/*
+ * .init callback of boot_tracer: remember the trace array that
+ * trace_boot() will write to, leave recording switched off, and reset
+ * the buffer of every possible CPU.
+ */
+static void boot_trace_init(struct trace_array *tr)
+{
+	int cpu;
+
+	/* Recording stays off until start_boot_trace() is called. */
+	trace_boot_enabled = 0;
+	boot_trace = tr;
+
+	for_each_cpu_mask(cpu, cpu_possible_map) {
+		tracing_reset(tr, cpu);
+	}
+}
+
+/* .ctrl_update callback of boot_tracer: mirror tr->ctrl into the enable flag. */
+static void boot_trace_ctrl_update(struct trace_array *tr)
+{
+	if (!tr->ctrl) {
+		stop_boot_trace();
+		return;
+	}
+
+	start_boot_trace();
+}
+
+/*
+ * .print_line callback of boot_tracer.
+ *
+ * Formats a TRACE_BOOT entry as two lines: one for the call and one for
+ * the return of the initcall.
+ *
+ * Returns TRACE_TYPE_UNHANDLED for any other entry type,
+ * TRACE_TYPE_PARTIAL_LINE when trace_seq_printf() reports failure, and
+ * TRACE_TYPE_HANDLED otherwise.
+ */
+static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct trace_boot *field;
+	struct boot_trace *it;
+	struct timespec calltime;
+	struct timespec rettime;
+	int ret;
+
+	/*
+	 * Check the type before looking at the payload: an entry of
+	 * another type is not a struct trace_boot, so its initcall
+	 * fields must not be read.
+	 */
+	if (entry->type != TRACE_BOOT)
+		return TRACE_TYPE_UNHANDLED;
+
+	field = (struct trace_boot *)entry;
+	it = &field->initcall;
+	calltime = ktime_to_timespec(it->calltime);
+	rettime = ktime_to_timespec(it->rettime);
+
+	ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
+			       calltime.tv_sec,
+			       calltime.tv_nsec,
+			       it->func, it->caller);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
+			       "returned %d after %lld msecs\n",
+			       rettime.tv_sec,
+			       rettime.tv_nsec,
+			       it->func, it->result, it->duration);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+/*
+ * The initcall tracer, registered under the name "initcall".
+ * It provides init/reset/ctrl_update hooks and its own print_line
+ * formatter for TRACE_BOOT entries.
+ */
+struct tracer boot_tracer __read_mostly =
+{
+ .name = "initcall",
+ .init = boot_trace_init,
+ .reset = reset_boot_trace,
+ .ctrl_update = boot_trace_ctrl_update,
+ .print_line = initcall_print_line,
+};
+
+/*
+ * trace_boot - record one initcall in the boot tracer's ring buffer
+ * @it: timing/result record for the initcall; it->func is filled in here
+ * @fn: the initcall function, used to resolve its symbol name
+ *
+ * Does nothing unless recording was enabled with start_boot_trace() and
+ * a trace array was installed by boot_trace_init(). The entry is dropped
+ * silently if no ring buffer space can be reserved.
+ */
+void trace_boot(struct boot_trace *it, initcall_t fn)
+{
+	struct ring_buffer_event *event;
+	struct trace_boot *entry;
+	unsigned long irq_flags;
+	struct trace_array *tr = boot_trace;
+
+	/*
+	 * start_boot_trace() sets the enable flag unconditionally, while
+	 * boot_trace is only assigned in boot_trace_init(). Guard against
+	 * the flag being set with no trace array installed.
+	 */
+	if (!trace_boot_enabled || !tr)
+		return;
+
+	/* Get its name now since this function could
+	 * disappear because it is in the .init section.
+	 */
+	sprint_symbol(it->func, (unsigned long)fn);
+	preempt_disable();
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry = ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_BOOT;
+	entry->initcall = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 3121448..e90eb0c 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -23,7 +23,7 @@
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
+ tracing_reset(tr, cpu);
}
static void start_function_trace(struct trace_array *tr)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ece6cfb..a7db7f0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -95,7 +95,7 @@
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1))
- trace_function(tr, data, ip, parent_ip, flags);
+ trace_function(tr, data, ip, parent_ip, flags, preempt_count());
atomic_dec(&data->disabled);
}
@@ -130,6 +130,7 @@
unsigned long latency, t0, t1;
cycle_t T0, T1, delta;
unsigned long flags;
+ int pc;
/*
* usecs conversion is slow so we try to delay the conversion
@@ -141,6 +142,8 @@
local_save_flags(flags);
+ pc = preempt_count();
+
if (!report_latency(delta))
goto out;
@@ -150,7 +153,7 @@
if (!report_latency(delta))
goto out_unlock;
- trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+ trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
latency = nsecs_to_usecs(delta);
@@ -173,8 +176,8 @@
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
- tracing_reset(data);
- trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+ tracing_reset(tr, cpu);
+ trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
}
static inline void
@@ -203,11 +206,11 @@
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
data->critical_start = parent_ip ? : ip;
- tracing_reset(data);
+ tracing_reset(tr, cpu);
local_save_flags(flags);
- trace_function(tr, data, ip, parent_ip, flags);
+ trace_function(tr, data, ip, parent_ip, flags, preempt_count());
per_cpu(tracing_cpu, cpu) = 1;
@@ -234,14 +237,14 @@
data = tr->data[cpu];
- if (unlikely(!data) || unlikely(!head_page(data)) ||
+ if (unlikely(!data) ||
!data->critical_start || atomic_read(&data->disabled))
return;
atomic_inc(&data->disabled);
local_save_flags(flags);
- trace_function(tr, data, ip, parent_ip, flags);
+ trace_function(tr, data, ip, parent_ip, flags, preempt_count());
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index b13dc19..f284846 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -27,7 +27,7 @@
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
+ tracing_reset(tr, cpu);
}
static void mmio_trace_init(struct trace_array *tr)
@@ -130,10 +130,14 @@
{
int cpu;
unsigned long cnt = 0;
+/* FIXME: */
+#if 0
for_each_online_cpu(cpu) {
cnt += iter->overrun[cpu];
iter->overrun[cpu] = 0;
}
+#endif
+ (void)cpu;
return cnt;
}
@@ -171,17 +175,21 @@
return (ret == -EBUSY) ? 0 : ret;
}
-static int mmio_print_rw(struct trace_iterator *iter)
+static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
{
struct trace_entry *entry = iter->ent;
- struct mmiotrace_rw *rw = &entry->mmiorw;
+ struct trace_mmiotrace_rw *field;
+ struct mmiotrace_rw *rw;
struct trace_seq *s = &iter->seq;
- unsigned long long t = ns2usecs(entry->t);
+ unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, 1000000ULL);
unsigned secs = (unsigned long)t;
int ret = 1;
- switch (entry->mmiorw.opcode) {
+ trace_assign_type(field, entry);
+ rw = &field->rw;
+
+ switch (rw->opcode) {
case MMIO_READ:
ret = trace_seq_printf(s,
"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
@@ -209,21 +217,25 @@
break;
}
if (ret)
- return 1;
- return 0;
+ return TRACE_TYPE_HANDLED;
+ return TRACE_TYPE_PARTIAL_LINE;
}
-static int mmio_print_map(struct trace_iterator *iter)
+static enum print_line_t mmio_print_map(struct trace_iterator *iter)
{
struct trace_entry *entry = iter->ent;
- struct mmiotrace_map *m = &entry->mmiomap;
+ struct trace_mmiotrace_map *field;
+ struct mmiotrace_map *m;
struct trace_seq *s = &iter->seq;
- unsigned long long t = ns2usecs(entry->t);
+ unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, 1000000ULL);
unsigned secs = (unsigned long)t;
- int ret = 1;
+ int ret;
- switch (entry->mmiorw.opcode) {
+ trace_assign_type(field, entry);
+ m = &field->map;
+
+ switch (m->opcode) {
case MMIO_PROBE:
ret = trace_seq_printf(s,
"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
@@ -241,20 +253,43 @@
break;
}
if (ret)
- return 1;
- return 0;
+ return TRACE_TYPE_HANDLED;
+ return TRACE_TYPE_PARTIAL_LINE;
}
-/* return 0 to abort printing without consuming current entry in pipe mode */
-static int mmio_print_line(struct trace_iterator *iter)
+/*
+ * Format a TRACE_PRINT entry as a "MARK <secs>.<usecs> <message>" line.
+ *
+ * Returns TRACE_TYPE_PARTIAL_LINE when trace_seq_printf() reports
+ * failure, TRACE_TYPE_HANDLED otherwise. Continuation data is appended
+ * when the entry carries TRACE_FLAG_CONT.
+ */
+static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct print_entry *print = (struct print_entry *)entry;
+	const char *msg = print->buf;
+	struct trace_seq *s = &iter->seq;
+	unsigned long long t = ns2usecs(iter->ts);
+	/* do_div() leaves the quotient in t and returns the remainder */
+	unsigned long usec_rem = do_div(t, 1000000ULL);
+	/* unsigned long, so that the argument matches the %lu below */
+	unsigned long secs = (unsigned long)t;
+	int ret;
+
+	/* The trailing newline must be in the message. */
+	ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (entry->flags & TRACE_FLAG_CONT)
+		trace_seq_print_cont(s, iter);
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t mmio_print_line(struct trace_iterator *iter)
{
switch (iter->ent->type) {
case TRACE_MMIO_RW:
return mmio_print_rw(iter);
case TRACE_MMIO_MAP:
return mmio_print_map(iter);
+ case TRACE_PRINT:
+ return mmio_print_mark(iter);
default:
- return 1; /* ignore unknown entries */
+ return TRACE_TYPE_HANDLED; /* ignore unknown entries */
}
}
@@ -276,6 +311,27 @@
}
device_initcall(init_mmio_trace);
+/*
+ * Append a TRACE_MMIO_RW entry holding a copy of @rw to @tr's ring
+ * buffer. The record is dropped silently when no space can be reserved.
+ * @data is not used.
+ */
+static void __trace_mmiotrace_rw(struct trace_array *tr,
+				 struct trace_array_cpu *data,
+				 struct mmiotrace_rw *rw)
+{
+	struct trace_mmiotrace_rw *rec;
+	struct ring_buffer_event *ev;
+	unsigned long irq_flags;
+
+	ev = ring_buffer_lock_reserve(tr->buffer,
+				      sizeof(struct trace_mmiotrace_rw),
+				      &irq_flags);
+	if (!ev)
+		return;
+
+	rec = ring_buffer_event_data(ev);
+
+	/* Fill in the common header first, then tag and copy the payload. */
+	tracing_generic_entry_update(&rec->ent, 0, preempt_count());
+	rec->rw = *rw;
+	rec->ent.type = TRACE_MMIO_RW;
+
+	ring_buffer_unlock_commit(tr->buffer, ev, irq_flags);
+
+	trace_wake_up();
+}
+
void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
@@ -283,6 +339,27 @@
__trace_mmiotrace_rw(tr, data, rw);
}
+/*
+ * Append a TRACE_MMIO_MAP entry holding a copy of @map to @tr's ring
+ * buffer. The record is dropped silently when no space can be reserved.
+ * @data is not used.
+ */
+static void __trace_mmiotrace_map(struct trace_array *tr,
+				  struct trace_array_cpu *data,
+				  struct mmiotrace_map *map)
+{
+	struct trace_mmiotrace_map *rec;
+	struct ring_buffer_event *ev;
+	unsigned long irq_flags;
+
+	ev = ring_buffer_lock_reserve(tr->buffer,
+				      sizeof(struct trace_mmiotrace_map),
+				      &irq_flags);
+	if (!ev)
+		return;
+
+	rec = ring_buffer_event_data(ev);
+
+	/* Fill in the common header first, then tag and copy the payload. */
+	tracing_generic_entry_update(&rec->ent, 0, preempt_count());
+	rec->map = *map;
+	rec->ent.type = TRACE_MMIO_MAP;
+
+	ring_buffer_unlock_commit(tr->buffer, ev, irq_flags);
+
+	trace_wake_up();
+}
+
void mmio_trace_mapping(struct mmiotrace_map *map)
{
struct trace_array *tr = mmio_trace_array;
@@ -293,3 +370,8 @@
__trace_mmiotrace_map(tr, data, map);
preempt_enable();
}
+
+/*
+ * Forward a printf-style message to trace_vprintk() with an ip of 0
+ * and return whatever trace_vprintk() returns.
+ */
+int mmio_trace_printk(const char *fmt, va_list args)
+{
+ return trace_vprintk(0, fmt, args);
+}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
new file mode 100644
index 0000000..4592b48
--- /dev/null
+++ b/kernel/trace/trace_nop.c
@@ -0,0 +1,64 @@
+/*
+ * nop tracer
+ *
+ * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array *ctx_trace;
+
+/* Start hook of the nop tracer: intentionally empty, @tr is unused. */
+static void start_nop_trace(struct trace_array *tr)
+{
+ /* Nothing to do! */
+}
+
+/* Stop hook of the nop tracer: intentionally empty, @tr is unused. */
+static void stop_nop_trace(struct trace_array *tr)
+{
+ /* Nothing to do! */
+}
+
+/*
+ * .init callback of the nop tracer: remember the trace array, reset the
+ * buffer of every online CPU and, if tracing is switched on, run the
+ * (empty) start hook.
+ */
+static void nop_trace_init(struct trace_array *tr)
+{
+	int cpu;
+
+	ctx_trace = tr;
+
+	for_each_online_cpu(cpu) {
+		tracing_reset(tr, cpu);
+	}
+
+	if (!tr->ctrl)
+		return;
+
+	start_nop_trace(tr);
+}
+
+/* .reset callback of the nop tracer: run the (empty) stop hook if tracing is on. */
+static void nop_trace_reset(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ stop_nop_trace(tr);
+}
+
+/* .ctrl_update callback of the nop tracer. */
+static void nop_trace_ctrl_update(struct trace_array *tr)
+{
+ /*
+  * Dispatch on the new control state. Both hooks are empty, so
+  * nothing is reset or changed here.
+  */
+ if (tr->ctrl)
+ start_nop_trace(tr);
+ else
+ stop_nop_trace(tr);
+}
+
+/*
+ * The nop tracer, registered under the name "nop". Its start and stop
+ * hooks are empty. The selftest hook is only wired up when
+ * CONFIG_FTRACE_SELFTEST is set.
+ */
+struct tracer nop_trace __read_mostly =
+{
+ .name = "nop",
+ .init = nop_trace_init,
+ .reset = nop_trace_reset,
+ .ctrl_update = nop_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_nop,
+#endif
+};
+
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb817a2..b8f56be 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -9,8 +9,8 @@
#include <linux/debugfs.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
-#include <linux/marker.h>
#include <linux/ftrace.h>
+#include <trace/sched.h>
#include "trace.h"
@@ -19,15 +19,16 @@
static atomic_t sched_ref;
static void
-sched_switch_func(void *private, void *__rq, struct task_struct *prev,
+probe_sched_switch(struct rq *__rq, struct task_struct *prev,
struct task_struct *next)
{
- struct trace_array **ptr = private;
- struct trace_array *tr = *ptr;
struct trace_array_cpu *data;
unsigned long flags;
- long disabled;
int cpu;
+ int pc;
+
+ if (!atomic_read(&sched_ref))
+ return;
tracing_record_cmdline(prev);
tracing_record_cmdline(next);
@@ -35,97 +36,41 @@
if (!tracer_enabled)
return;
+ pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ data = ctx_trace->data[cpu];
- if (likely(disabled == 1))
- tracing_sched_switch_trace(tr, data, prev, next, flags);
+ if (likely(!atomic_read(&data->disabled)))
+ tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
- atomic_dec(&data->disabled);
local_irq_restore(flags);
}
-static notrace void
-sched_switch_callback(void *probe_data, void *call_data,
- const char *format, va_list *args)
-{
- struct task_struct *prev;
- struct task_struct *next;
- struct rq *__rq;
-
- if (!atomic_read(&sched_ref))
- return;
-
- /* skip prev_pid %d next_pid %d prev_state %ld */
- (void)va_arg(*args, int);
- (void)va_arg(*args, int);
- (void)va_arg(*args, long);
- __rq = va_arg(*args, typeof(__rq));
- prev = va_arg(*args, typeof(prev));
- next = va_arg(*args, typeof(next));
-
- /*
- * If tracer_switch_func only points to the local
- * switch func, it still needs the ptr passed to it.
- */
- sched_switch_func(probe_data, __rq, prev, next);
-}
-
static void
-wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
- task_struct *curr)
+probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
{
- struct trace_array **ptr = private;
- struct trace_array *tr = *ptr;
struct trace_array_cpu *data;
unsigned long flags;
- long disabled;
- int cpu;
+ int cpu, pc;
- if (!tracer_enabled)
+ if (!likely(tracer_enabled))
return;
- tracing_record_cmdline(curr);
+ pc = preempt_count();
+ tracing_record_cmdline(current);
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ data = ctx_trace->data[cpu];
- if (likely(disabled == 1))
- tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+ if (likely(!atomic_read(&data->disabled)))
+ tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
+ flags, pc);
- atomic_dec(&data->disabled);
local_irq_restore(flags);
}
-static notrace void
-wake_up_callback(void *probe_data, void *call_data,
- const char *format, va_list *args)
-{
- struct task_struct *curr;
- struct task_struct *task;
- struct rq *__rq;
-
- if (likely(!tracer_enabled))
- return;
-
- /* Skip pid %d state %ld */
- (void)va_arg(*args, int);
- (void)va_arg(*args, long);
- /* now get the meat: "rq %p task %p rq->curr %p" */
- __rq = va_arg(*args, typeof(__rq));
- task = va_arg(*args, typeof(task));
- curr = va_arg(*args, typeof(curr));
-
- tracing_record_cmdline(task);
- tracing_record_cmdline(curr);
-
- wakeup_func(probe_data, __rq, task, curr);
-}
-
static void sched_switch_reset(struct trace_array *tr)
{
int cpu;
@@ -133,67 +78,47 @@
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
+ tracing_reset(tr, cpu);
}
static int tracing_sched_register(void)
{
int ret;
- ret = marker_probe_register("kernel_sched_wakeup",
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- wake_up_callback,
- &ctx_trace);
+ ret = register_trace_sched_wakeup(probe_sched_wakeup);
if (ret) {
- pr_info("wakeup trace: Couldn't add marker"
+ pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return ret;
}
- ret = marker_probe_register("kernel_sched_wakeup_new",
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- wake_up_callback,
- &ctx_trace);
+ ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
if (ret) {
- pr_info("wakeup trace: Couldn't add marker"
+ pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = marker_probe_register("kernel_sched_schedule",
- "prev_pid %d next_pid %d prev_state %ld "
- "## rq %p prev %p next %p",
- sched_switch_callback,
- &ctx_trace);
+ ret = register_trace_sched_switch(probe_sched_switch);
if (ret) {
- pr_info("sched trace: Couldn't add marker"
+ pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_schedule\n");
goto fail_deprobe_wake_new;
}
return ret;
fail_deprobe_wake_new:
- marker_probe_unregister("kernel_sched_wakeup_new",
- wake_up_callback,
- &ctx_trace);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup);
fail_deprobe:
- marker_probe_unregister("kernel_sched_wakeup",
- wake_up_callback,
- &ctx_trace);
+ unregister_trace_sched_wakeup(probe_sched_wakeup);
return ret;
}
static void tracing_sched_unregister(void)
{
- marker_probe_unregister("kernel_sched_schedule",
- sched_switch_callback,
- &ctx_trace);
- marker_probe_unregister("kernel_sched_wakeup_new",
- wake_up_callback,
- &ctx_trace);
- marker_probe_unregister("kernel_sched_wakeup",
- wake_up_callback,
- &ctx_trace);
+ unregister_trace_sched_switch(probe_sched_switch);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup);
+ unregister_trace_sched_wakeup(probe_sched_wakeup);
}
static void tracing_start_sched_switch(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e303ccb..fe4a252 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
-#include <linux/marker.h>
+#include <trace/sched.h>
#include "trace.h"
@@ -44,10 +44,12 @@
long disabled;
int resched;
int cpu;
+ int pc;
if (likely(!wakeup_task))
return;
+ pc = preempt_count();
resched = need_resched();
preempt_disable_notrace();
@@ -70,7 +72,7 @@
if (task_cpu(wakeup_task) != cpu)
goto unlock;
- trace_function(tr, data, ip, parent_ip, flags);
+ trace_function(tr, data, ip, parent_ip, flags, pc);
unlock:
__raw_spin_unlock(&wakeup_lock);
@@ -112,17 +114,18 @@
}
static void notrace
-wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
+probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
unsigned long latency = 0, t0 = 0, t1 = 0;
- struct trace_array **ptr = private;
- struct trace_array *tr = *ptr;
struct trace_array_cpu *data;
cycle_t T0, T1, delta;
unsigned long flags;
long disabled;
int cpu;
+ int pc;
+
+ tracing_record_cmdline(prev);
if (unlikely(!tracer_enabled))
return;
@@ -139,12 +142,14 @@
if (next != wakeup_task)
return;
+ pc = preempt_count();
+
/* The task we are waiting for is waking up */
- data = tr->data[wakeup_cpu];
+ data = wakeup_trace->data[wakeup_cpu];
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+ disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
if (likely(disabled != 1))
goto out;
@@ -155,7 +160,7 @@
if (unlikely(!tracer_enabled || next != wakeup_task))
goto out_unlock;
- trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+ trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
/*
* usecs conversion is slow so we try to delay the conversion
@@ -174,39 +179,14 @@
t0 = nsecs_to_usecs(T0);
t1 = nsecs_to_usecs(T1);
- update_max_tr(tr, wakeup_task, wakeup_cpu);
+ update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
out_unlock:
- __wakeup_reset(tr);
+ __wakeup_reset(wakeup_trace);
__raw_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
- atomic_dec(&tr->data[cpu]->disabled);
-}
-
-static notrace void
-sched_switch_callback(void *probe_data, void *call_data,
- const char *format, va_list *args)
-{
- struct task_struct *prev;
- struct task_struct *next;
- struct rq *__rq;
-
- /* skip prev_pid %d next_pid %d prev_state %ld */
- (void)va_arg(*args, int);
- (void)va_arg(*args, int);
- (void)va_arg(*args, long);
- __rq = va_arg(*args, typeof(__rq));
- prev = va_arg(*args, typeof(prev));
- next = va_arg(*args, typeof(next));
-
- tracing_record_cmdline(prev);
-
- /*
- * If tracer_switch_func only points to the local
- * switch func, it still needs the ptr passed to it.
- */
- wakeup_sched_switch(probe_data, __rq, prev, next);
+ atomic_dec(&wakeup_trace->data[cpu]->disabled);
}
static void __wakeup_reset(struct trace_array *tr)
@@ -216,7 +196,7 @@
for_each_possible_cpu(cpu) {
data = tr->data[cpu];
- tracing_reset(data);
+ tracing_reset(tr, cpu);
}
wakeup_cpu = -1;
@@ -240,19 +220,26 @@
}
static void
-wakeup_check_start(struct trace_array *tr, struct task_struct *p,
- struct task_struct *curr)
+probe_wakeup(struct rq *rq, struct task_struct *p)
{
int cpu = smp_processor_id();
unsigned long flags;
long disabled;
+ int pc;
+
+ if (likely(!tracer_enabled))
+ return;
+
+ tracing_record_cmdline(p);
+ tracing_record_cmdline(current);
if (likely(!rt_task(p)) ||
p->prio >= wakeup_prio ||
- p->prio >= curr->prio)
+ p->prio >= current->prio)
return;
- disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+ pc = preempt_count();
+ disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -264,7 +251,7 @@
goto out_locked;
/* reset the trace */
- __wakeup_reset(tr);
+ __wakeup_reset(wakeup_trace);
wakeup_cpu = task_cpu(p);
wakeup_prio = p->prio;
@@ -274,74 +261,37 @@
local_save_flags(flags);
- tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
- trace_function(tr, tr->data[wakeup_cpu],
- CALLER_ADDR1, CALLER_ADDR2, flags);
+ wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
+ trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
+ CALLER_ADDR1, CALLER_ADDR2, flags, pc);
out_locked:
__raw_spin_unlock(&wakeup_lock);
out:
- atomic_dec(&tr->data[cpu]->disabled);
-}
-
-static notrace void
-wake_up_callback(void *probe_data, void *call_data,
- const char *format, va_list *args)
-{
- struct trace_array **ptr = probe_data;
- struct trace_array *tr = *ptr;
- struct task_struct *curr;
- struct task_struct *task;
- struct rq *__rq;
-
- if (likely(!tracer_enabled))
- return;
-
- /* Skip pid %d state %ld */
- (void)va_arg(*args, int);
- (void)va_arg(*args, long);
- /* now get the meat: "rq %p task %p rq->curr %p" */
- __rq = va_arg(*args, typeof(__rq));
- task = va_arg(*args, typeof(task));
- curr = va_arg(*args, typeof(curr));
-
- tracing_record_cmdline(task);
- tracing_record_cmdline(curr);
-
- wakeup_check_start(tr, task, curr);
+ atomic_dec(&wakeup_trace->data[cpu]->disabled);
}
static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
- ret = marker_probe_register("kernel_sched_wakeup",
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- wake_up_callback,
- &wakeup_trace);
+ ret = register_trace_sched_wakeup(probe_wakeup);
if (ret) {
- pr_info("wakeup trace: Couldn't add marker"
+ pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return;
}
- ret = marker_probe_register("kernel_sched_wakeup_new",
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- wake_up_callback,
- &wakeup_trace);
+ ret = register_trace_sched_wakeup_new(probe_wakeup);
if (ret) {
- pr_info("wakeup trace: Couldn't add marker"
+ pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = marker_probe_register("kernel_sched_schedule",
- "prev_pid %d next_pid %d prev_state %ld "
- "## rq %p prev %p next %p",
- sched_switch_callback,
- &wakeup_trace);
+ ret = register_trace_sched_switch(probe_wakeup_sched_switch);
if (ret) {
- pr_info("sched trace: Couldn't add marker"
+ pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_schedule\n");
goto fail_deprobe_wake_new;
}
@@ -363,28 +313,18 @@
return;
fail_deprobe_wake_new:
- marker_probe_unregister("kernel_sched_wakeup_new",
- wake_up_callback,
- &wakeup_trace);
+ unregister_trace_sched_wakeup_new(probe_wakeup);
fail_deprobe:
- marker_probe_unregister("kernel_sched_wakeup",
- wake_up_callback,
- &wakeup_trace);
+ unregister_trace_sched_wakeup(probe_wakeup);
}
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
- marker_probe_unregister("kernel_sched_schedule",
- sched_switch_callback,
- &wakeup_trace);
- marker_probe_unregister("kernel_sched_wakeup_new",
- wake_up_callback,
- &wakeup_trace);
- marker_probe_unregister("kernel_sched_wakeup",
- wake_up_callback,
- &wakeup_trace);
+ unregister_trace_sched_switch(probe_wakeup_sched_switch);
+ unregister_trace_sched_wakeup_new(probe_wakeup);
+ unregister_trace_sched_wakeup(probe_wakeup);
}
static void wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 0911b7e..09cf230 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,65 +9,29 @@
case TRACE_FN:
case TRACE_CTX:
case TRACE_WAKE:
+ case TRACE_CONT:
case TRACE_STACK:
+ case TRACE_PRINT:
case TRACE_SPECIAL:
return 1;
}
return 0;
}
-static int
-trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
+static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
{
- struct trace_entry *entries;
- struct page *page;
- int idx = 0;
- int i;
+ struct ring_buffer_event *event;
+ struct trace_entry *entry;
- BUG_ON(list_empty(&data->trace_pages));
- page = list_entry(data->trace_pages.next, struct page, lru);
- entries = page_address(page);
+ while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
+ entry = ring_buffer_event_data(event);
- check_pages(data);
- if (head_page(data) != entries)
- goto failed;
-
- /*
- * The starting trace buffer always has valid elements,
- * if any element exists.
- */
- entries = head_page(data);
-
- for (i = 0; i < tr->entries; i++) {
-
- if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
+ if (!trace_valid_entry(entry)) {
printk(KERN_CONT ".. invalid entry %d ",
- entries[idx].type);
+ entry->type);
goto failed;
}
-
- idx++;
- if (idx >= ENTRIES_PER_PAGE) {
- page = virt_to_page(entries);
- if (page->lru.next == &data->trace_pages) {
- if (i != tr->entries - 1) {
- printk(KERN_CONT ".. entries buffer mismatch");
- goto failed;
- }
- } else {
- page = list_entry(page->lru.next, struct page, lru);
- entries = page_address(page);
- }
- idx = 0;
- }
}
-
- page = virt_to_page(entries);
- if (page->lru.next != &data->trace_pages) {
- printk(KERN_CONT ".. too many entries");
- goto failed;
- }
-
return 0;
failed:
@@ -89,13 +53,11 @@
/* Don't allow flipping of max traces now */
raw_local_irq_save(flags);
__raw_spin_lock(&ftrace_max_lock);
+
+ cnt = ring_buffer_entries(tr->buffer);
+
for_each_possible_cpu(cpu) {
- if (!head_page(tr->data[cpu]))
- continue;
-
- cnt += tr->data[cpu]->trace_idx;
-
- ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
+ ret = trace_test_buffer_cpu(tr, cpu);
if (ret)
break;
}
@@ -120,11 +82,11 @@
struct trace_array *tr,
int (*func)(void))
{
- unsigned long count;
- int ret;
int save_ftrace_enabled = ftrace_enabled;
int save_tracer_enabled = tracer_enabled;
+ unsigned long count;
char *func_name;
+ int ret;
/* The ftrace test PASSED */
printk(KERN_CONT "PASSED\n");
@@ -157,6 +119,7 @@
/* enable tracing */
tr->ctrl = 1;
trace->init(tr);
+
/* Sleep for a 1/10 of a second */
msleep(100);
@@ -212,10 +175,10 @@
int
trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
{
- unsigned long count;
- int ret;
int save_ftrace_enabled = ftrace_enabled;
int save_tracer_enabled = tracer_enabled;
+ unsigned long count;
+ int ret;
/* make sure msleep has been recorded */
msleep(1);
@@ -415,6 +378,15 @@
}
#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+#ifdef CONFIG_NOP_TRACER
+/*
+ * Startup selftest for the nop tracer: performs no checks and always
+ * reports success by returning 0. Both arguments are unused.
+ */
+int
+trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
+{
+ /* What could possibly go wrong? */
+ return 0;
+}
+#endif
+
#ifdef CONFIG_SCHED_TRACER
static int trace_wakeup_test_thread(void *data)
{
@@ -486,6 +458,9 @@
wake_up_process(p);
+ /* give a little time to let the thread wake up */
+ msleep(100);
+
/* stop the tracing. */
tr->ctrl = 0;
trace->ctrl_update(tr);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
new file mode 100644
index 0000000..74c5d9a
--- /dev/null
+++ b/kernel/trace/trace_stack.c
@@ -0,0 +1,310 @@
+/*
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/stacktrace.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include "trace.h"
+
+/* Capacity of the recorded stack trace (stack_dump_trace has one extra slot) */
+#define STACK_TRACE_ENTRIES 500
+
+/*
+ * Addresses of the deepest stack recorded so far. Every slot starts out as
+ * ULONG_MAX, which t_next()/t_show() treat as the end of the trace.
+ */
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
+ { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
+/* Per-entry stack depth in bytes, filled in by check_stack() */
+static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+
+/* Descriptor handed to save_stack_trace(); its entries live in stack_dump_trace */
+static struct stack_trace max_stack_trace = {
+ .max_entries = STACK_TRACE_ENTRIES,
+ .entries = stack_dump_trace,
+};
+
+/* Largest stack usage seen so far, in bytes; exposed as "stack_max_size" */
+static unsigned long max_stack_size;
+/* Taken (with interrupts off) around updates and reads of the data above */
+static raw_spinlock_t max_stack_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/* When non-zero, stack_trace_call() returns without checking the stack */
+static int stack_trace_disabled __read_mostly;
+/* Per-cpu nesting counter used by stack_trace_call() to avoid recursion */
+static DEFINE_PER_CPU(int, trace_active);
+
+/*
+ * Measure how much of the current stack is in use and, when that exceeds
+ * max_stack_size, record the new maximum together with a stack trace and
+ * the stack depth at which each traced address was found.
+ */
+static inline void check_stack(void)
+{
+ unsigned long this_size, flags;
+ unsigned long *p, *top, *start;
+ int i;
+
+ /* offset of this local inside a THREAD_SIZE-sized window ... */
+ this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
+ /* ... converted to the number of bytes up to the top of that window */
+ this_size = THREAD_SIZE - this_size;
+
+ /* unlocked fast path: only a new maximum needs any work */
+ if (this_size <= max_stack_size)
+ return;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&max_stack_lock);
+
+ /* a race could have already updated it */
+ if (this_size <= max_stack_size)
+ goto out;
+
+ max_stack_size = this_size;
+
+ /* restart the trace from scratch, skipping the first 3 frames */
+ max_stack_trace.nr_entries = 0;
+ max_stack_trace.skip = 3;
+
+ save_stack_trace(&max_stack_trace);
+
+ /*
+ * Now find where in the stack these are.
+ */
+ i = 0;
+ start = &this_size;
+ /* round start down to the THREAD_SIZE boundary, then add THREAD_SIZE */
+ top = (unsigned long *)
+ (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
+
+ /*
+ * Loop through all the entries. One of the entries may
+ * for some reason be missed on the stack, so we may
+ * have to account for them. If they are all there, this
+ * loop will only happen once. This code only takes place
+ * on a new max, so it is far from a fast path.
+ */
+ while (i < max_stack_trace.nr_entries) {
+
+ /* default depth for entry i in case it is not found below */
+ stack_dump_index[i] = this_size;
+ p = start;
+
+ for (; p < top && i < max_stack_trace.nr_entries; p++) {
+ if (*p == stack_dump_trace[i]) {
+ /* depth of entry i = bytes between p and top */
+ this_size = stack_dump_index[i++] =
+ (top - p) * sizeof(unsigned long);
+ /* Start the search from here */
+ start = p + 1;
+ }
+ }
+
+ /* entry i was not found on the stack: move on to the next one */
+ i++;
+ }
+
+ out:
+ __raw_spin_unlock(&max_stack_lock);
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * ftrace callback (installed through trace_ops): runs check_stack() with
+ * preemption disabled, using the per-cpu trace_active counter so that a
+ * nested invocation on the same CPU does not run check_stack() again.
+ * The ip and parent_ip arguments are not used.
+ */
+static void
+stack_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+ int cpu, resched;
+
+ if (unlikely(!ftrace_enabled || stack_trace_disabled))
+ return;
+
+ /* sample need_resched() before disabling preemption; used on exit */
+ resched = need_resched();
+ preempt_disable_notrace();
+
+ cpu = raw_smp_processor_id();
+ /* no atomic needed, we only modify this variable by this cpu */
+ if (per_cpu(trace_active, cpu)++ != 0)
+ goto out;
+
+ check_stack();
+
+ out:
+ /* balances the increment above on both the normal and nested path */
+ per_cpu(trace_active, cpu)--;
+ /* prevent recursion in schedule */
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+}
+
+/* ftrace hook registered by stack_trace_init(); calls stack_trace_call() */
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = stack_trace_call,
+};
+
+/*
+ * Read handler for the "stack_max_size" debugfs file: formats the
+ * unsigned long that filp->private_data points to as a decimal number
+ * followed by a newline and copies it to user space.
+ *
+ * Returns whatever simple_read_from_buffer() returns.
+ */
+static ssize_t
+stack_max_size_read(struct file *filp, char __user *ubuf,
+		    size_t count, loff_t *ppos)
+{
+	unsigned long *ptr = filp->private_data;
+	char buf[64];
+	int r;
+
+	/* the value is unsigned (it is parsed with strict_strtoul on write) */
+	r = snprintf(buf, sizeof(buf), "%lu\n", *ptr);
+	/*
+	 * snprintf() returns the untruncated length; never report more
+	 * characters than were actually stored in buf.
+	 */
+	if (r >= (int)sizeof(buf))
+		r = sizeof(buf) - 1;
+	return simple_read_from_buffer(ubuf, count, ppos, buf, r);
+}
+
+/*
+ * Write handler for the "stack_max_size" debugfs file: parses a decimal
+ * number from user space and stores it, under max_stack_lock with
+ * interrupts disabled, in the variable filp->private_data points to.
+ *
+ * Returns count on success, -EINVAL when the input does not fit the local
+ * buffer, -EFAULT when the user copy fails, or the strict_strtoul() error.
+ */
+static ssize_t
+stack_max_size_write(struct file *filp, const char __user *ubuf,
+		     size_t count, loff_t *ppos)
+{
+	unsigned long *target = filp->private_data;
+	unsigned long parsed, irq_flags;
+	char kbuf[64];
+	int err;
+
+	/* leave room for the terminating NUL */
+	if (count >= sizeof(kbuf))
+		return -EINVAL;
+
+	if (copy_from_user(kbuf, ubuf, count))
+		return -EFAULT;
+	kbuf[count] = 0;
+
+	err = strict_strtoul(kbuf, 10, &parsed);
+	if (err < 0)
+		return err;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&max_stack_lock);
+	*target = parsed;
+	__raw_spin_unlock(&max_stack_lock);
+	raw_local_irq_restore(irq_flags);
+
+	return count;
+}
+
+/* File operations for the "stack_max_size" debugfs file */
+static struct file_operations stack_max_size_fops = {
+ .open = tracing_open_generic,
+ .read = stack_max_size_read,
+ .write = stack_max_size_write,
+};
+
+/*
+ * seq_file ->next: advance the index stored in m->private by one.
+ * Returns &m->private while the new index refers to a recorded entry,
+ * or NULL once it is past nr_entries or hits a ULONG_MAX slot.
+ * The v argument is not used; *pos is always incremented.
+ */
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ long i = (long)m->private;
+
+ (*pos)++;
+
+ i++;
+
+ if (i >= max_stack_trace.nr_entries ||
+ stack_dump_trace[i] == ULONG_MAX)
+ return NULL;
+
+ /* the index is only stored when it is still valid */
+ m->private = (void *)i;
+
+ return &m->private;
+}
+
+/*
+ * seq_file ->start: disable interrupts, take max_stack_lock (both are
+ * undone in t_stop()) and position the iterator at *pos.
+ *
+ * Position 0 is the header (index -1 in m->private); position k is trace
+ * entry k-1. Returns &m->private, or NULL when *pos is past the end.
+ */
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	void *t = &m->private;
+	loff_t l = 0;
+
+	local_irq_disable();
+	__raw_spin_lock(&max_stack_lock);
+
+	/*
+	 * m->private still holds the index reached by the previous
+	 * start/next sequence. Rewind to the header before stepping
+	 * forward *pos times, otherwise a restarted read (second buffer,
+	 * lseek) would skip entries or end early.
+	 */
+	m->private = (void *)-1;
+
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+/* seq_file ->stop: drop max_stack_lock and re-enable interrupts (see t_start) */
+static void t_stop(struct seq_file *m, void *p)
+{
+ __raw_spin_unlock(&max_stack_lock);
+ local_irq_enable();
+}
+
+/*
+ * Print trace entry i on its own line: as a symbol string when
+ * CONFIG_KALLSYMS is set, as a raw pointer otherwise.
+ * Returns the seq_printf() result.
+ */
+static int trace_lookup_stack(struct seq_file *m, long i)
+{
+ unsigned long addr = stack_dump_trace[i];
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+
+ sprint_symbol(str, addr);
+
+ return seq_printf(m, "%s\n", str);
+#else
+ return seq_printf(m, "%p\n", (void*)addr);
+#endif
+}
+
+/*
+ * seq_file ->show: v points at the current index. A negative index prints
+ * the table header; otherwise one line with the entry number, its depth,
+ * the size of its frame and the location is printed. Always returns 0.
+ */
+static int t_show(struct seq_file *m, void *v)
+{
+ long i = *(long *)v;
+ int size;
+
+ if (i < 0) {
+ seq_printf(m, " Depth Size Location"
+ " (%d entries)\n"
+ " ----- ---- --------\n",
+ max_stack_trace.nr_entries);
+ return 0;
+ }
+
+ /* nothing to print for an index outside the recorded trace */
+ if (i >= max_stack_trace.nr_entries ||
+ stack_dump_trace[i] == ULONG_MAX)
+ return 0;
+
+ /* last entry: its size is its whole depth; else the delta to the next */
+ if (i+1 == max_stack_trace.nr_entries ||
+ stack_dump_trace[i+1] == ULONG_MAX)
+ size = stack_dump_index[i];
+ else
+ size = stack_dump_index[i] - stack_dump_index[i+1];
+
+ seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size);
+
+ trace_lookup_stack(m, i);
+
+ return 0;
+}
+
+/* seq_file iterator used by the "stack_trace" debugfs file */
+static struct seq_operations stack_trace_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+/*
+ * Open handler for the "stack_trace" debugfs file: sets up the seq_file
+ * iterator and starts its index at -1, the value t_show() prints the
+ * header for. Returns the seq_open() result.
+ */
+static int stack_trace_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int err = seq_open(file, &stack_trace_seq_ops);
+
+	if (err)
+		return err;
+
+	seq = file->private_data;
+	seq->private = (void *)-1;
+
+	return 0;
+}
+
+/* File operations for the seq_file based "stack_trace" debugfs file */
+static struct file_operations stack_trace_fops = {
+	.open		= stack_trace_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	/* pairs with seq_open() in stack_trace_open(); frees the seq_file */
+	.release	= seq_release,
+};
+
+/*
+ * Initcall: create the "stack_max_size" and "stack_trace" debugfs files
+ * in the tracing directory and register the stack tracer's ftrace hook.
+ * A file that cannot be created only produces a warning; the function
+ * always returns 0.
+ */
+static __init int stack_trace_init(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ /* read/write: the data pointer is max_stack_size itself */
+ entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
+ &max_stack_size, &stack_max_size_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+
+ /* read-only view of the recorded trace */
+ entry = debugfs_create_file("stack_trace", 0444, d_tracer,
+ NULL, &stack_trace_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'stack_trace' entry\n");
+
+ /* NOTE(review): the return value of the registration is not checked */
+ register_ftrace_function(&trace_ops);
+
+ return 0;
+}
+
+device_initcall(stack_trace_init);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index db58fb6..9587d3b 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -241,7 +241,7 @@
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
+ tracing_reset(tr, cpu);
}
static void start_stack_trace(struct trace_array *tr)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
new file mode 100644
index 0000000..f2b7c28
--- /dev/null
+++ b/kernel/tracepoint.c
@@ -0,0 +1,477 @@
+/*
+ * Copyright (C) 2008 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/tracepoint.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+extern struct tracepoint __start___tracepoints[];
+extern struct tracepoint __stop___tracepoints[];
+
+/* Set to 1 to enable tracepoint debug output */
+static const int tracepoint_debug;
+
+/*
+ * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
+ * builtin and module tracepoints and the hash table.
+ */
+static DEFINE_MUTEX(tracepoints_mutex);
+
+/*
+ * Tracepoint hash table, containing the active tracepoints.
+ * Protected by tracepoints_mutex.
+ */
+#define TRACEPOINT_HASH_BITS 6
+#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
+
+/*
+ * Note about RCU :
+ * It is used to delay the free of multiple probes array until a quiescent
+ * state is reached.
+ * Tracepoint entries modifications are protected by the tracepoints_mutex.
+ */
+struct tracepoint_entry {
+ struct hlist_node hlist; /* link in a tracepoint_table bucket */
+ void **funcs; /* NULL-terminated probe array, or NULL */
+ int refcount; /* Number of times armed. 0 if disarmed. */
+ struct rcu_head rcu; /* used to defer the kfree of oldptr */
+ void *oldptr; /* previous funcs array awaiting free */
+ unsigned char rcu_pending:1; /* set while the deferred free is queued */
+ char name[0]; /* tracepoint name, allocated inline */
+};
+
+static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+
+/*
+ * RCU callback queued by tracepoint_entry_free_old(): frees the old probe
+ * array saved in entry->oldptr and then clears entry->rcu_pending.
+ */
+static void free_old_closure(struct rcu_head *head)
+{
+ struct tracepoint_entry *entry = container_of(head,
+ struct tracepoint_entry, rcu);
+ kfree(entry->oldptr);
+ /* Make sure we free the data before setting the pending flag to 0 */
+ smp_wmb();
+ entry->rcu_pending = 0;
+}
+
+/*
+ * Schedule @old (a probe array no longer referenced by @entry) to be freed
+ * by free_old_closure() through call_rcu_sched(). Does nothing when @old
+ * is NULL. The entry's single oldptr/rcu slot is overwritten, which is why
+ * callers run rcu_barrier_sched() first when rcu_pending is set.
+ */
+static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old)
+{
+ if (!old)
+ return;
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu_sched(&entry->rcu, free_old_closure);
+}
+
+/*
+ * Debug helper: when tracepoint_debug is set, print every probe in the
+ * entry's NULL-terminated funcs array.
+ */
+static void debug_print_probes(struct tracepoint_entry *entry)
+{
+	int i;
+
+	/*
+	 * funcs is NULL for an entry without probes (freshly added, or after
+	 * the last probe was removed); there is nothing to walk then.
+	 */
+	if (!tracepoint_debug || !entry->funcs)
+		return;
+
+	for (i = 0; entry->funcs[i]; i++)
+		printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
+}
+
+/*
+ * Append @probe to the entry's probe array by building a new
+ * NULL-terminated array one slot larger.
+ *
+ * Returns the previous array (NULL if there was none), which this
+ * function does not free, or ERR_PTR(-EEXIST) when @probe is already
+ * present, or ERR_PTR(-ENOMEM) when the allocation fails. On error the
+ * entry is left unchanged.
+ */
+static void *
+tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
+{
+ int nr_probes = 0;
+ void **old, **new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->funcs;
+ if (old) {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old[nr_probes]; nr_probes++)
+ if (old[nr_probes] == probe)
+ return ERR_PTR(-EEXIST);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (old)
+ memcpy(new, old, nr_probes * sizeof(void *));
+ new[nr_probes] = probe;
+ /* kzalloc() zeroed the array, so the last slot is the NULL terminator */
+ entry->refcount = nr_probes + 1;
+ entry->funcs = new;
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
+ * Remove @probe from the entry's probe array, or every probe when @probe
+ * is NULL. The surviving probes are copied into a freshly allocated
+ * NULL-terminated array; when none survive, funcs becomes NULL.
+ *
+ * Returns the previous array, which this function does not free, NULL
+ * when the entry had no probe array, or ERR_PTR(-ENOMEM) when the new
+ * array cannot be allocated (the entry is left unchanged then).
+ */
+static void *
+tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
+{
+	int nr_probes = 0, nr_del = 0, i;
+	void **old, **new;
+
+	old = entry->funcs;
+
+	/* no probe array at all: nothing to remove and nothing to free */
+	if (!old)
+		return NULL;
+
+	debug_print_probes(entry);
+	/* (N -> M), (N > 1, M >= 0) probes */
+	for (nr_probes = 0; old[nr_probes]; nr_probes++) {
+		if ((!probe || old[nr_probes] == probe))
+			nr_del++;
+	}
+
+	if (nr_probes - nr_del == 0) {
+		/* N -> 0, (N > 1) */
+		entry->funcs = NULL;
+		entry->refcount = 0;
+		debug_print_probes(entry);
+		return old;
+	} else {
+		int j = 0;
+		/* N -> M, (N > 1, M > 0) */
+		/* + 1 for NULL */
+		new = kzalloc((nr_probes - nr_del + 1)
+			* sizeof(void *), GFP_KERNEL);
+		if (new == NULL)
+			return ERR_PTR(-ENOMEM);
+		for (i = 0; old[i]; i++)
+			if ((probe && old[i] != probe))
+				new[j++] = old[i];
+		entry->refcount = nr_probes - nr_del;
+		entry->funcs = new;
+	}
+	debug_print_probes(entry);
+	return old;
+}
+
+/*
+ * Look a tracepoint up by name in the tracepoint hash table.
+ * Must be called with tracepoints_mutex held.
+ * Returns the matching entry, or NULL when the name is not present.
+ */
+static struct tracepoint_entry *get_tracepoint(const char *name)
+{
+	u32 bucket = jhash(name, strlen(name), 0) & (TRACEPOINT_TABLE_SIZE - 1);
+	struct tracepoint_entry *entry;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(entry, pos, &tracepoint_table[bucket], hlist) {
+		if (strcmp(name, entry->name) == 0)
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Insert a new, probe-less entry for @name into the tracepoint hash table.
+ * Must be called with tracepoints_mutex held.
+ * Returns the new entry, ERR_PTR(-EEXIST) when the name is already in the
+ * table, or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+static struct tracepoint_entry *add_tracepoint(const char *name)
+{
+	size_t size_with_nul = strlen(name) + 1;
+	u32 hash = jhash(name, size_with_nul - 1, 0);
+	struct hlist_head *bucket;
+	struct tracepoint_entry *entry;
+	struct hlist_node *pos;
+
+	bucket = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
+	hlist_for_each_entry(entry, pos, bucket, hlist) {
+		if (strcmp(name, entry->name))
+			continue;
+		printk(KERN_NOTICE
+			"tracepoint %s busy\n", name);
+		return ERR_PTR(-EEXIST);	/* Already there */
+	}
+	/*
+	 * The name is stored inline after the structure, hence the variable
+	 * length kmalloc. Could cause some memory fragmentation if overused.
+	 */
+	entry = kmalloc(sizeof(struct tracepoint_entry) + size_with_nul,
+			GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+	entry->funcs = NULL;
+	entry->refcount = 0;
+	entry->rcu_pending = 0;
+	memcpy(&entry->name[0], name, size_with_nul);
+	hlist_add_head(&entry->hlist, bucket);
+	return entry;
+}
+
+/*
+ * Remove the tracepoint from the tracepoint hash table and free it.
+ * Must be called with tracepoints_mutex held.
+ * Returns 0 on success, -ENOENT when the name is not in the table, or
+ * -EBUSY when the entry still has a non-zero refcount.
+ */
+static int remove_tracepoint(const char *name)
+{
+	size_t size_with_nul = strlen(name) + 1;
+	u32 hash = jhash(name, size_with_nul - 1, 0);
+	struct tracepoint_entry *cur, *match = NULL;
+	struct hlist_head *bucket;
+	struct hlist_node *pos;
+
+	bucket = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
+	hlist_for_each_entry(cur, pos, bucket, hlist) {
+		if (strcmp(name, cur->name) == 0) {
+			match = cur;
+			break;
+		}
+	}
+	if (!match)
+		return -ENOENT;
+	if (match->refcount)
+		return -EBUSY;
+
+	hlist_del(&match->hlist);
+	/* Make sure the call_rcu_sched has been executed */
+	if (match->rcu_pending)
+		rcu_barrier_sched();
+	kfree(match);
+	return 0;
+}
+
+/*
+ * Sets the probe callback corresponding to one tracepoint: publishes the
+ * entry's probe array in elem->funcs and sets elem->state to @active.
+ * Warns when the entry and the tracepoint names differ.
+ */
+static void set_tracepoint(struct tracepoint_entry **entry,
+ struct tracepoint *elem, int active)
+{
+ WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+
+ /*
+ * rcu_assign_pointer has a smp_wmb() which makes sure that the new
+ * probe callbacks array is consistent before setting a pointer to it.
+ * This array is referenced by __DO_TRACE from
+ * include/linux/tracepoint.h. A matching smp_read_barrier_depends()
+ * is used.
+ */
+ rcu_assign_pointer(elem->funcs, (*entry)->funcs);
+ elem->state = active;
+}
+
+/*
+ * Disable a tracepoint and its probe callback by clearing elem->state.
+ * Note: only waiting an RCU period after disabling the tracepoint ensures
+ * that the original callback is not used anymore. This is ensured by
+ * preempt_disable around the call site.
+ */
+static void disable_tracepoint(struct tracepoint *elem)
+{
+ elem->state = 0;
+}
+
+/**
+ * tracepoint_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Updates the probe callback corresponding to a range of tracepoints:
+ * a tracepoint whose name is in the hash table gets the entry's probes and
+ * is enabled when the entry's refcount is non-zero; any other tracepoint
+ * is disabled. Takes tracepoints_mutex.
+ */
+void tracepoint_update_probe_range(struct tracepoint *begin,
+	struct tracepoint *end)
+{
+	struct tracepoint_entry *entry;
+	struct tracepoint *tp;
+
+	mutex_lock(&tracepoints_mutex);
+	for (tp = begin; tp < end; tp++) {
+		entry = get_tracepoint(tp->name);
+		if (!entry) {
+			disable_tracepoint(tp);
+			continue;
+		}
+		set_tracepoint(&entry, tp, !!entry->refcount);
+	}
+	mutex_unlock(&tracepoints_mutex);
+}
+
+/*
+ * Update probes, removing the faulty probes.
+ * Applies the hash table state to the core kernel tracepoint section and
+ * then to the tracepoints in modules. Must be called without
+ * tracepoints_mutex held, as tracepoint_update_probe_range() takes it.
+ */
+static void tracepoint_update_probes(void)
+{
+ /* Core kernel tracepoints */
+ tracepoint_update_probe_range(__start___tracepoints,
+ __stop___tracepoints);
+ /* tracepoints in modules. */
+ module_update_tracepoints();
+}
+
+/**
+ * tracepoint_probe_register -  Connect a probe to a tracepoint
+ * @name: tracepoint name
+ * @probe: probe handler
+ *
+ * Returns 0 if ok, error value on error (-EEXIST when the probe is already
+ * connected, -ENOMEM when an allocation fails).
+ * The probe address must at least be aligned on the architecture pointer size.
+ */
+int tracepoint_probe_register(const char *name, void *probe)
+{
+	struct tracepoint_entry *entry;
+	int ret = 0;
+	void *old;
+
+	mutex_lock(&tracepoints_mutex);
+	entry = get_tracepoint(name);
+	if (!entry) {
+		entry = add_tracepoint(name);
+		if (IS_ERR(entry)) {
+			ret = PTR_ERR(entry);
+			goto end;
+		}
+	}
+	/*
+	 * If we detect that a call_rcu_sched is pending for this tracepoint,
+	 * make sure it's executed now.
+	 */
+	if (entry->rcu_pending)
+		rcu_barrier_sched();
+	old = tracepoint_entry_add_probe(entry, probe);
+	if (IS_ERR(old)) {
+		ret = PTR_ERR(old);
+		/*
+		 * Do not leave a freshly added, probe-less entry behind in
+		 * the hash table. remove_tracepoint() returns -EBUSY and
+		 * keeps the entry when probes are still attached to it.
+		 */
+		remove_tracepoint(name);
+		goto end;
+	}
+	/* the update helpers take tracepoints_mutex themselves */
+	mutex_unlock(&tracepoints_mutex);
+	tracepoint_update_probes();		/* may update entry */
+	mutex_lock(&tracepoints_mutex);
+	entry = get_tracepoint(name);
+	if (WARN_ON(!entry)) {
+		/*
+		 * The entry went away while the mutex was dropped, so its
+		 * rcu head cannot be used to defer the free: wait for the
+		 * preempt-disabled call sites, then free the old array here.
+		 */
+		synchronize_sched();
+		kfree(old);
+		goto end;
+	}
+	if (entry->rcu_pending)
+		rcu_barrier_sched();
+	tracepoint_entry_free_old(entry, old);
+end:
+	mutex_unlock(&tracepoints_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register);
+
+/**
+ * tracepoint_probe_unregister -  Disconnect a probe from a tracepoint
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ *
+ * Returns 0 if ok, -ENOENT when the tracepoint is not in the hash table,
+ * or -ENOMEM when the reduced probe array cannot be allocated (nothing is
+ * changed in that case).
+ *
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt disabled
+ * section has finished.
+ */
+int tracepoint_probe_unregister(const char *name, void *probe)
+{
+	struct tracepoint_entry *entry;
+	void *old;
+	int ret = -ENOENT;
+
+	mutex_lock(&tracepoints_mutex);
+	entry = get_tracepoint(name);
+	if (!entry)
+		goto end;
+	if (entry->rcu_pending)
+		rcu_barrier_sched();
+	old = tracepoint_entry_remove_probe(entry, probe);
+	if (IS_ERR(old)) {
+		/*
+		 * The entry was left untouched. An error pointer must never
+		 * reach tracepoint_entry_free_old(), which would kfree() it.
+		 */
+		ret = PTR_ERR(old);
+		goto end;
+	}
+	/* the update helpers take tracepoints_mutex themselves */
+	mutex_unlock(&tracepoints_mutex);
+	tracepoint_update_probes();		/* may update entry */
+	mutex_lock(&tracepoints_mutex);
+	entry = get_tracepoint(name);
+	if (!entry)
+		goto end;
+	if (entry->rcu_pending)
+		rcu_barrier_sched();
+	tracepoint_entry_free_old(entry, old);
+	remove_tracepoint(name);	/* Ignore busy error message */
+	ret = 0;
+end:
+	mutex_unlock(&tracepoints_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+
+/**
+ * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
+ * @tracepoint: current tracepoints (in), next tracepoint (out)
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Returns whether a next tracepoint has been found (1) or not (0).
+ * Will return the first tracepoint in the range if the input tracepoint is
+ * NULL.
+ */
+int tracepoint_get_iter_range(struct tracepoint **tracepoint,
+	struct tracepoint *begin, struct tracepoint *end)
+{
+	struct tracepoint *cur = *tracepoint;
+
+	/* no current position yet: start at the head of a non-empty range */
+	if (!cur && begin != end) {
+		*tracepoint = begin;
+		return 1;
+	}
+
+	/* otherwise the current position must lie inside [begin, end) */
+	return cur >= begin && cur < end;
+}
+EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
+
+/*
+ * Validate the iterator position: the core kernel tracepoints are tried
+ * first (only while no module is selected), then the tracepoints in
+ * modules. The iterator is reset when neither yields a tracepoint.
+ */
+static void tracepoint_get_iter(struct tracepoint_iter *iter)
+{
+	/* Core kernel tracepoints */
+	if (!iter->module &&
+	    tracepoint_get_iter_range(&iter->tracepoint,
+			__start___tracepoints, __stop___tracepoints))
+		return;
+
+	/* tracepoints in modules. */
+	if (!module_get_iter_tracepoints(iter))
+		tracepoint_iter_reset(iter);
+}
+
+/*
+ * Start (or resume) an iteration: validates the iterator's current
+ * position through tracepoint_get_iter().
+ */
+void tracepoint_iter_start(struct tracepoint_iter *iter)
+{
+ tracepoint_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_start);
+
+/* Advance the iterator to the next tracepoint, if there is one. */
+void tracepoint_iter_next(struct tracepoint_iter *iter)
+{
+ iter->tracepoint++;
+ /*
+ * iter->tracepoint may be invalid because we blindly incremented it.
+ * Make sure it is valid by marshalling on the tracepoints, getting the
+ * tracepoints from following modules if necessary.
+ */
+ tracepoint_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_next);
+
+/* End an iteration. Currently a no-op: there is nothing to release. */
+void tracepoint_iter_stop(struct tracepoint_iter *iter)
+{
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
+
+/* Rewind the iterator: no module selected and no current tracepoint. */
+void tracepoint_iter_reset(struct tracepoint_iter *iter)
+{
+ iter->module = NULL;
+ iter->tracepoint = NULL;
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 712ae47..0797589 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -175,6 +175,21 @@
return nr;
}
+/*
+ * Returns 1 when x lies in [MODULES_VADDR, MODULES_END) (only checked when
+ * both CONFIG_MODULES and MODULES_VADDR are defined); otherwise returns
+ * the result of is_vmalloc_addr(x).
+ */
+static inline int is_vmalloc_or_module_addr(const void *x)
+{
+ /*
+ * x86-64 and sparc64 put modules in a special place,
+ * and fall back on vmalloc() if that fails. Others
+ * just put it in the vmalloc space.
+ */
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+ unsigned long addr = (unsigned long)x;
+ if (addr >= MODULES_VADDR && addr < MODULES_END)
+ return 1;
+#endif
+ return is_vmalloc_addr(x);
+}
+
+
/*
* Walk a vmap address to the struct page it maps.
*/
@@ -188,8 +203,7 @@
* XXX we might need to change this if we add VIRTUAL_BUG_ON for
* architectures that do not vmalloc module space
*/
- VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) &&
- !is_module_address(addr));
+ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
if (!pgd_none(*pgd)) {
pud_t *pud = pud_offset(pgd, addr);
diff --git a/samples/Kconfig b/samples/Kconfig
index e1fb471..4b02f5a 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -13,6 +13,12 @@
help
This build markers example modules.
+config SAMPLE_TRACEPOINTS
+ tristate "Build tracepoints examples -- loadable modules only"
+ depends on TRACEPOINTS && m
+ help
+ This builds the tracepoints example modules.
+
config SAMPLE_KOBJECT
tristate "Build kobject examples"
help
diff --git a/samples/Makefile b/samples/Makefile
index 2e02575..10eaca8 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,3 +1,3 @@
# Makefile for Linux samples code
-obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/
+obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/
diff --git a/samples/markers/probe-example.c b/samples/markers/probe-example.c
index c8e099d..2dfb3b3 100644
--- a/samples/markers/probe-example.c
+++ b/samples/markers/probe-example.c
@@ -81,6 +81,7 @@
probe_array[i].probe_func, &probe_array[i]);
printk(KERN_INFO "Number of event b : %u\n",
atomic_read(&eventb_count));
+ marker_synchronize_unregister();
}
module_init(probe_init);
diff --git a/samples/tracepoints/Makefile b/samples/tracepoints/Makefile
new file mode 100644
index 0000000..36479ad
--- /dev/null
+++ b/samples/tracepoints/Makefile
@@ -0,0 +1,6 @@
+# builds the tracepoint example kernel modules;
+# then to use one (as root): insmod <module_name.ko>
+
+obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-sample.o
+obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-probe-sample.o
+obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-probe-sample2.o
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
new file mode 100644
index 0000000..0216b55
--- /dev/null
+++ b/samples/tracepoints/tp-samples-trace.h
@@ -0,0 +1,13 @@
+#ifndef _TP_SAMPLES_TRACE_H
+#define _TP_SAMPLES_TRACE_H
+
+#include <linux/proc_fs.h> /* for struct inode and struct file */
+#include <linux/tracepoint.h>
+
+/*
+ * subsys_event: tracepoint carrying a struct inode and a struct file.
+ * The sample module fires it with trace_subsys_event(); the probe modules
+ * attach to it with register_trace_subsys_event().
+ */
+DEFINE_TRACE(subsys_event,
+ TPPROTO(struct inode *inode, struct file *file),
+ TPARGS(inode, file));
+/*
+ * subsys_eventb: tracepoint without arguments, fired with
+ * trace_subsys_eventb().
+ */
+DEFINE_TRACE(subsys_eventb,
+ TPPROTO(void),
+ TPARGS());
+#endif
diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c
new file mode 100644
index 0000000..55abfdd
--- /dev/null
+++ b/samples/tracepoints/tracepoint-probe-sample.c
@@ -0,0 +1,55 @@
+/*
+ * tracepoint-probe-sample.c
+ *
+ * sample tracepoint probes.
+ */
+
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/dcache.h>
+#include "tp-samples-trace.h"
+
+/*
+ * Probe for subsys_event: log the name of the dentry behind @file.
+ *
+ * The caller only guarantees locking for struct file and struct inode, so
+ * the probe takes its own references on the path and on the dentry for as
+ * long as it reads the dentry name, and drops them afterwards.
+ */
+static void probe_subsys_event(struct inode *inode, struct file *file)
+{
+	struct dentry *entry;
+
+	path_get(&file->f_path);
+	entry = dget(file->f_path.dentry);
+	printk(KERN_INFO "Event is encountered with filename %s\n",
+		entry->d_name.name);
+	dput(entry);
+	path_put(&file->f_path);
+}
+
+/* Probe for subsys_eventb: takes no arguments and only logs a fixed line. */
+static void probe_subsys_eventb(void)
+{
+ printk(KERN_INFO "Event B is encountered\n");
+}
+
+/*
+ * Module entry point: attach both sample probes to their tracepoints.
+ * A failed registration is only reported through WARN_ON(); the function
+ * returns 0 regardless.
+ */
+int __init tp_sample_trace_init(void)
+{
+	int err = register_trace_subsys_event(probe_subsys_event);
+
+	WARN_ON(err);
+	err = register_trace_subsys_eventb(probe_subsys_eventb);
+	WARN_ON(err);
+
+	return 0;
+}
+
+module_init(tp_sample_trace_init);
+
+/*
+ * Module exit point: detach both probes, subsys_eventb first and then
+ * subsys_event.
+ */
+void __exit tp_sample_trace_exit(void)
+{
+ unregister_trace_subsys_eventb(probe_subsys_eventb);
+ unregister_trace_subsys_event(probe_subsys_event);
+}
+
+module_exit(tp_sample_trace_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Tracepoint Probes Samples");
diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c
new file mode 100644
index 0000000..5e9fcf4
--- /dev/null
+++ b/samples/tracepoints/tracepoint-probe-sample2.c
@@ -0,0 +1,42 @@
+/*
+ * tracepoint-probe-sample2.c
+ *
+ * 2nd sample tracepoint probes.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include "tp-samples-trace.h"
+
+/*
+ * Probe for subsys_event: log the inode number.
+ *
+ * Here the caller only guarantees locking for struct file and struct inode.
+ * This probe only reads inode->i_ino, so it takes no reference or lock of
+ * its own; @file is unused.
+ */
+static void probe_subsys_event(struct inode *inode, struct file *file)
+{
+ printk(KERN_INFO "Event is encountered with inode number %lu\n",
+ inode->i_ino);
+}
+
+/*
+ * Module entry point: attach the inode-number probe to subsys_event.
+ * A failed registration is only reported through WARN_ON(); the function
+ * returns 0 regardless.
+ */
+int __init tp_sample_trace_init(void)
+{
+	int err = register_trace_subsys_event(probe_subsys_event);
+
+	WARN_ON(err);
+
+	return 0;
+}
+
+module_init(tp_sample_trace_init);
+
+/* Module exit point: detach the probe registered on subsys_event. */
+void __exit tp_sample_trace_exit(void)
+{
+ unregister_trace_subsys_event(probe_subsys_event);
+}
+
+module_exit(tp_sample_trace_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Tracepoint Probes Samples");
diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c
new file mode 100644
index 0000000..4ae4b7f
--- /dev/null
+++ b/samples/tracepoints/tracepoint-sample.c
@@ -0,0 +1,53 @@
+/* tracepoint-sample.c
+ *
+ * Executes a tracepoint when /proc/tracepoint-example is opened.
+ *
+ * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include "tp-samples-trace.h"
+
+struct proc_dir_entry *pentry_example;
+
+/*
+ * open() handler of the sample /proc entry: fire subsys_event once with the
+ * inode and file being opened, fire subsys_eventb ten times, then refuse the
+ * open with -EPERM.
+ */
+static int my_open(struct inode *inode, struct file *file)
+{
+	int remaining = 10;
+
+	trace_subsys_event(inode, file);
+	while (remaining-- > 0)
+		trace_subsys_eventb();
+	return -EPERM;
+}
+
+/* File operations of the /proc entry; only open() is provided. */
+static struct file_operations mark_ops = {
+ .open = my_open,
+};
+
+/*
+ * Module entry point: create the read-only "tracepoint-example" proc entry
+ * backed by mark_ops. Returns 0 on success, -EPERM if the entry could not
+ * be created.
+ */
+static int example_init(void)
+{
+	printk(KERN_ALERT "example init\n");
+	pentry_example = proc_create("tracepoint-example", 0444, NULL,
+					&mark_ops);
+	return pentry_example ? 0 : -EPERM;
+}
+
+/* Module exit point: remove the "tracepoint-example" proc entry. */
+static void example_exit(void)
+{
+ printk(KERN_ALERT "example exit\n");
+ remove_proc_entry("tracepoint-example", NULL);
+}
+
+module_init(example_init)
+module_exit(example_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Tracepoint example");
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 277cfe0..5ed4cbf 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -198,10 +198,17 @@
fi;
endif
+ifdef CONFIG_FTRACE_MCOUNT_RECORD
+cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl \
+ "$(ARCH)" "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" \
+ "$(MV)" "$(@)";
+endif
+
define rule_cc_o_c
$(call echo-cmd,checksrc) $(cmd_checksrc) \
$(call echo-cmd,cc_o_c) $(cmd_cc_o_c); \
$(cmd_modversions) \
+ $(cmd_record_mcount) \
scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,cc_o_c)' > \
$(dot-target).tmp; \
rm -f $(depfile); \
diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore
index 7304e19..bf8b199 100644
--- a/scripts/basic/.gitignore
+++ b/scripts/basic/.gitignore
@@ -1,3 +1,3 @@
+hash
fixdep
-split-include
docproc
diff --git a/scripts/bootgraph.pl b/scripts/bootgraph.pl
index 2243353..5e7316e 100644
--- a/scripts/bootgraph.pl
+++ b/scripts/bootgraph.pl
@@ -37,13 +37,13 @@
# dmesg | perl scripts/bootgraph.pl > output.svg
#
-my @rows;
-my %start, %end, %row;
+my %start, %end;
my $done = 0;
-my $rowcount = 0;
my $maxtime = 0;
my $firsttime = 100;
my $count = 0;
+my %pids;
+
while (<>) {
my $line = $_;
if ($line =~ /([0-9\.]+)\] calling ([a-zA-Z0-9\_]+)\+/) {
@@ -54,14 +54,8 @@
$firsttime = $1;
}
}
- $row{$func} = 1;
if ($line =~ /\@ ([0-9]+)/) {
- my $pid = $1;
- if (!defined($rows[$pid])) {
- $rowcount = $rowcount + 1;
- $rows[$pid] = $rowcount;
- }
- $row{$func} = $rows[$pid];
+ $pids{$func} = $1;
}
$count = $count + 1;
}
@@ -109,17 +103,25 @@
my $mult = 950.0 / ($maxtime - $firsttime);
my $threshold = ($maxtime - $firsttime) / 60.0;
my $stylecounter = 0;
+my %rows;
+my $rowscount = 1;
while (($key,$value) = each %start) {
my $duration = $end{$key} - $start{$key};
if ($duration >= $threshold) {
my $s, $s2, $e, $y;
+ $pid = $pids{$key};
+
+ if (!defined($rows{$pid})) {
+ $rows{$pid} = $rowscount;
+ $rowscount = $rowscount + 1;
+ }
$s = ($value - $firsttime) * $mult;
$s2 = $s + 6;
$e = ($end{$key} - $firsttime) * $mult;
$w = $e - $s;
- $y = $row{$key} * 150;
+ $y = $rows{$pid} * 150;
$y2 = $y + 4;
$style = $styles[$stylecounter];
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index e30bac1..f88bb3e 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -1,5 +1,5 @@
#!/usr/bin/perl -w
-# (c) 2001, Dave Jones. <davej@codemonkey.org.uk> (the file handling bit)
+# (c) 2001, Dave Jones. <davej@redhat.com> (the file handling bit)
# (c) 2005, Joel Schopp <jschopp@austin.ibm.com> (the ugly bit)
# (c) 2007, Andy Whitcroft <apw@uk.ibm.com> (new conditions, test suite, etc)
# Licensed under the terms of the GNU GPL License version 2
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
new file mode 100755
index 0000000..f56d760
--- /dev/null
+++ b/scripts/recordmcount.pl
@@ -0,0 +1,395 @@
+#!/usr/bin/perl -w
+# (c) 2008, Steven Rostedt <srostedt@redhat.com>
+# Licensed under the terms of the GNU GPL License version 2
+#
+# recordmcount.pl - makes a section called __mcount_loc that holds
+# all the offsets to the calls to mcount.
+#
+#
+# What we want to end up with is a section in vmlinux called
+# __mcount_loc that contains a list of pointers to all the
+# call sites in the kernel that call mcount. Later on boot up, the kernel
+# will read this list, save the locations and turn them into nops.
+# When tracing or profiling is later enabled, these locations will then
+# be converted back to pointers to some function.
+#
+# This is no easy feat. This script is called just after the original
+# object is compiled and before it is linked.
+#
+# The references to the call sites are offsets from the section of text
+# that the call site is in. Hence, all functions in a section that
+# has a call site to mcount, will have the offset from the beginning of
+# the section and not the beginning of the function.
+#
+# The trick is to find a way to record the beginning of the section.
+# The way we do this is to look at the first function in the section
+# which will also be the location of that section after final link.
+# e.g.
+#
+# .section ".text.sched"
+# .globl my_func
+# my_func:
+# [...]
+# call mcount (offset: 0x5)
+# [...]
+# ret
+# other_func:
+# [...]
+# call mcount (offset: 0x1b)
+# [...]
+#
+# Both relocation offsets for the mcounts in the above example will be
+# offset from .text.sched. If we make another file called tmp.s with:
+#
+# .section __mcount_loc
+# .quad my_func + 0x5
+# .quad my_func + 0x1b
+#
+# We can then compile this tmp.s into tmp.o, and link it to the original
+# object.
+#
+# But this gets hard if my_func is not globl (a static function).
+# In such a case we have:
+#
+# .section ".text.sched"
+# my_func:
+# [...]
+# call mcount (offset: 0x5)
+# [...]
+# ret
+# .globl other_func
+# other_func:
+# [...]
+# call mcount (offset: 0x1b)
+# [...]
+#
+# If we make the tmp.s the same as above, when we link together with
+# the original object, we will end up with two symbols for my_func:
+# one local, one global. After final compile, we will end up with
+# an undefined reference to my_func.
+#
+# Since local objects can reference local variables, we need to find
+# a way to make tmp.o reference the local objects of the original object
+# file after it is linked together. To do this, we convert the my_func
+# into a global symbol before linking tmp.o. Then after we link tmp.o
+# we will only have a single symbol for my_func that is global.
+# We can convert my_func back into a local symbol and we are done.
+#
+# Here are the steps we take:
+#
+# 1) Record all the local symbols by using 'nm'
+# 2) Use objdump to find all the call site offsets and sections for
+# mcount.
+# 3) Compile the list into its own object.
+# 4) Do we have to deal with local functions? If not, go to step 8.
+# 5) Make an object that converts these local functions to global symbols
+# with objcopy.
+# 6) Link together this new object with the list object.
+# 7) Convert the local functions back to local symbols and rename
+# the result as the original object.
+# End.
+# 8) Link the object with the list object.
+# 9) Move the result back to the original object.
+# End.
+#
+
+use strict;
+
+# Name of this script without its directory part, used in the usage message.
+my $P = $0;
+$P =~ s@.*/@@g;
+
+my $V = '0.1';
+
+# All nine positional arguments are required. $#ARGV is the index of the
+# last argument, so it must be at least 8; with fewer arguments $inputfile
+# (and possibly $mv) would be left undefined below.
+if ($#ARGV < 8) {
+    print "usage: $P arch objdump objcopy cc ld nm rm mv inputfile\n";
+    print "version: $V\n";
+    exit(1);
+}
+
+my ($arch, $objdump, $objcopy, $cc, $ld, $nm, $rm, $mv, $inputfile) = @ARGV;
+
+# A tool passed as an empty string falls back to its default command name.
+$objdump = "objdump" if ((length $objdump) == 0);
+$objcopy = "objcopy" if ((length $objcopy) == 0);
+$cc = "gcc" if ((length $cc) == 0);
+$ld = "ld" if ((length $ld) == 0);
+$nm = "nm" if ((length $nm) == 0);
+$rm = "rm" if ((length $rm) == 0);
+$mv = "mv" if ((length $mv) == 0);
+
+#print STDERR "running: $P '$arch' '$objdump' '$objcopy' '$cc' '$ld' " .
+# "'$nm' '$rm' '$mv' '$inputfile'\n";
+
+my %locals; # List of local (static) functions
+my %weak; # List of weak functions
+my %convert; # List of local functions used that needs conversion
+
+my $type;
+my $section_regex; # Find the start of a section
+my $function_regex; # Find the name of a function
+ # (return offset and func name)
+my $mcount_regex; # Find the call site to mcount (return offset)
+
+# Per-architecture setup: the patterns used to parse objdump output, the
+# assembler directive used for one pointer, and the flags forced onto the
+# tools. The section and function patterns are the same for both supported
+# architectures, so they are set once up front.
+$section_regex = "Disassembly of section";
+$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
+
+if ($arch eq "i386") {
+    $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$";
+    $type = ".long";
+
+    # force flags for this arch
+    $ld .= " -m elf_i386";
+    $objdump .= " -M i386";
+    $objcopy .= " -O elf32-i386";
+    $cc .= " -m32";
+
+} elsif ($arch eq "x86_64") {
+    $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$";
+    $type = ".quad";
+
+    # force flags for this arch
+    $ld .= " -m elf_x86_64";
+    $objdump .= " -M x86-64";
+    $objcopy .= " -O elf64-x86-64";
+    $cc .= " -m64";
+
+} else {
+    die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD";
+}
+
+my $text_found = 0;
+my $read_function = 0;
+my $opened = 0;
+my $mcount_section = "__mcount_loc";
+
+my $dirname;
+my $filename;
+my $prefix;
+my $ext;
+
+# Split the input path at its last "/" into directory and file name; a path
+# without any "/" is taken to live in the current directory.
+($dirname, $filename) = ($inputfile =~ m,^(.*)/([^/]*)$,)
+    ? ($1, $2)
+    : (".", $inputfile);
+
+# Split the file name at its last dot that is followed by a non-blank
+# character: $prefix is what precedes it, $ext is the dot plus that one
+# character. Without such a dot the whole name is the prefix.
+($prefix, $ext) = ($filename =~ m,^(.*)(\.\S),)
+    ? ($1, $2)
+    : ($filename, "");
+
+my $mcount_s = $dirname . "/.tmp_mc_" . $prefix . ".s";
+my $mcount_o = $dirname . "/.tmp_mc_" . $prefix . ".o";
+
+#
+# --globalize-symbols came out in 2.17, we must test the version
+# of objcopy, and if it is less than 2.17, then we can not
+# record local functions.
+# $use_locals says whether local (static) functions may be used as section
+# references; that requires objcopy 2.17 or newer.
+my $use_locals = 1;
+my $local_warn_once = 0;
+my $found_version = 0;
+
+# Take the first "major.minor" pair on a line mentioning objcopy as the
+# version of the tool.
+open (IN, "$objcopy --version |") || die "error running $objcopy";
+while (<IN>) {
+    if (/objcopy.*\s(\d+)\.(\d+)/) {
+        my $major = $1;
+        my $minor = $2;
+
+        $found_version = 1;
+        if ($major < 2 ||
+            ($major == 2 && $minor < 17)) {
+            $use_locals = 0;
+        }
+        last;
+    }
+}
+close (IN);
+
+if (!$found_version) {
+    print STDERR "WARNING: could not find objcopy version.\n" .
+        "\tDisabling local function references.\n";
+    # Actually do what the warning announces: with an unknown version we
+    # cannot rely on objcopy being able to globalize symbols.
+    $use_locals = 0;
+}
+
+
+#
+# Step 1: find all the local (static functions) and weak symbols.
+# 't' is local, 'w/W' is weak (we never use a weak function)
+#
+# Read the symbol table: remember every local text symbol in %locals and
+# every weak symbol (with its type letter) in %weak.
+open (IN, "$nm $inputfile|") || die "error running $nm";
+while (my $symbol_line = <IN>) {
+    if ($symbol_line =~ /^[0-9a-fA-F]+\s+t\s+(\S+)/) {
+        $locals{$1} = 1;
+        next;
+    }
+    if ($symbol_line =~ /^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) {
+        $weak{$2} = $1;
+    }
+}
+close(IN);
+
+my @offsets; # Array of offsets of mcount callers
+my $ref_func; # reference function to use for offsets
+my $offset = 0; # offset of ref_func to section beginning
+
+##
+# update_funcs - flush the mcount call sites recorded for one section
+#
+# Writes one "<type> <ref_func> + <delta>" line per recorded offset to the
+# assembler file, creating that file and its section header on first use.
+# Nothing is written when no offsets were recorded, when the reference
+# function is weak, or when it is local and objcopy cannot globalize
+# symbols.
+#
+sub update_funcs
+{
+    return unless @offsets;
+
+    defined($ref_func) || die "No function to reference";
+
+    # The section only had a weak function to represent it. A weak
+    # function may be overridden by another function of the same name,
+    # which would make every offset wrong, so warn and write nothing.
+    if (defined $weak{$ref_func}) {
+        print STDERR
+            "$inputfile: WARNING: referencing weak function" .
+            " $ref_func for mcount\n";
+        return;
+    }
+
+    # A static reference function has to be converted to a global symbol
+    # later on, which only works if objcopy supports globalize-symbols.
+    if (defined $locals{$ref_func}) {
+        return if (!$use_locals);
+        $convert{$ref_func} = 1;
+    }
+
+    # First call that has something to write: create the output file.
+    if (!$opened) {
+        open(FILE, ">$mcount_s") || die "can't create $mcount_s\n";
+        $opened = 1;
+        print FILE "\t.section $mcount_section,\"a\",\@progbits\n";
+    }
+
+    # One reference per call site, expressed relative to the ref_func.
+    foreach my $call_site (@offsets) {
+        printf FILE "\t%s %s + %d\n", $type, $ref_func, $call_site - $offset;
+    }
+}
+
+#
+# Step 2: find the sections and mcount call sites
+#
+# Walk the disassembly line by line. For every section, pick a reference
+# function ($ref_func, located at $offset from the section start) and
+# collect the offsets of all mcount call sites; the collected data is
+# flushed by update_funcs() whenever a new section starts and at the end.
+open(IN, "$objdump -dr $inputfile|") || die "error running $objdump";
+
+my $text;
+
+while (<IN>) {
+    # is it a section?
+    if (/$section_regex/) {
+        $read_function = 1;
+        # print out any recorded offsets
+        update_funcs() if ($text_found);
+
+        # reset all markers and arrays
+        $text_found = 0;
+        undef($ref_func);
+        undef(@offsets);
+
+    # section found, now is this a start of a function?
+    } elsif ($read_function && /$function_regex/) {
+        my $func_offset = hex $1;
+
+        $text_found = 1;
+        $text = $2;
+
+        # $offset must always describe $ref_func, so it is only updated
+        # together with it. Updating it for a function that is skipped
+        # would make every emitted "ref_func + delta" value wrong.
+        if (!defined($locals{$text}) && !defined($weak{$text})) {
+            # a global, non-weak function: safe to use, stop looking
+            $ref_func = $text;
+            $offset = $func_offset;
+            $read_function = 0;
+        } elsif (!defined($ref_func) || !defined($weak{$text})) {
+            # local or weak: use it for now but keep looking for a
+            # global one; a weak function never replaces an existing
+            # reference
+            $ref_func = $text;
+            $offset = $func_offset;
+        }
+    }
+
+    # is this a call site to mcount? If so, record it to print later
+    if ($text_found && /$mcount_regex/) {
+        $offsets[$#offsets + 1] = hex $1;
+    }
+}
+close(IN);
+
+# dump out anymore offsets that may have been found
+update_funcs() if ($text_found);
+
+# If we did not find any mcount callers, we are done (do nothing).
+# $opened is only set by update_funcs() once it has created $mcount_s.
+if (!$opened) {
+ exit(0);
+}
+
+close(FILE);
+
+# NOTE(review): every external command below is run with backticks and its
+# exit status is not checked, so a failing step is not reported here.
+
+#
+# Step 3: Compile the file that holds the list of call sites to mcount.
+#
+`$cc -o $mcount_o -c $mcount_s`;
+
+# Names of the local functions that update_funcs() used as references.
+my @converts = keys %convert;
+
+#
+# Step 4: Do we have sections that started with local functions?
+#
+if ($#converts >= 0) {
+ my $globallist = "";
+ my $locallist = "";
+
+ # Build matching objcopy option lists: one to make each symbol global,
+ # one to turn it back into a local symbol afterwards.
+ foreach my $con (@converts) {
+ $globallist .= " --globalize-symbol $con";
+ $locallist .= " --localize-symbol $con";
+ }
+
+ my $globalobj = $dirname . "/.tmp_gl_" . $filename;
+ my $globalmix = $dirname . "/.tmp_mx_" . $filename;
+
+ #
+ # Step 5: set up each local function as a global
+ #
+ `$objcopy $globallist $inputfile $globalobj`;
+
+ #
+ # Step 6: Link the global version to our list.
+ #
+ `$ld -r $globalobj $mcount_o -o $globalmix`;
+
+ #
+ # Step 7: Convert the local functions back into local symbols
+ # (the result is written over the original input file)
+ #
+ `$objcopy $locallist $globalmix $inputfile`;
+
+ # Remove the temp files
+ `$rm $globalobj $globalmix`;
+
+} else {
+
+ my $mix = $dirname . "/.tmp_mx_" . $filename;
+
+ #
+ # Step 8: Link the object with our list of call sites object.
+ #
+ `$ld -r $inputfile $mcount_o -o $mix`;
+
+ #
+ # Step 9: Move the result back to the original object.
+ #
+ `$mv $mix $inputfile`;
+}
+
+# Clean up the temp files
+`$rm $mcount_o $mcount_s`;
+
+exit(0);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 576e511..3e3fde7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -75,6 +75,7 @@
#include <linux/string.h>
#include <linux/selinux.h>
#include <linux/mutex.h>
+#include <linux/posix-timers.h>
#include "avc.h"
#include "objsec.h"
@@ -2322,13 +2323,7 @@
initrlim = init_task.signal->rlim+i;
rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
}
- if (current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
- /*
- * This will cause RLIMIT_CPU calculations
- * to be refigured.
- */
- current->it_prof_expires = jiffies_to_cputime(1);
- }
+ update_rlimit_cpu(rlim->rlim_cur);
}
/* Wake up the parent if it is waiting so that it can